scripts/tinybird/retrieve_legacy_data_from_circleci.py
"""Helper script to retrieve historical data and load into tinybird parity dashboard
The script is intended to be run locally. It was executed once, to retrieve the data from the past successful master builds
in order to get more data into the parity dashboard for a hackathon project.
"""
import datetime
import http.client
import json
import os
import urllib
from scripts.tinybird.upload_raw_test_metrics_and_coverage import (
send_implemented_coverage,
send_metric_report,
)
# CircleCI project slug that is interpolated into the API request paths of this script
PROJECT_SLUG = "github/localstack/localstack"
# branch whose workflow runs are queried; also exported as CIRCLE_BRANCH before the upload
MASTER_BRANCH = "master"
def send_request_to_connection(conn, url):
    """
    Send a GET request for ``url`` over an already created connection.

    :param conn: connection object providing ``request(...)`` and ``getresponse()``
        (e.g. ``http.client.HTTPSConnection``); it is not closed by this function
    :param url: request target (path plus query string) to fetch
    :return: the raw response body (bytes) if the response status is 200, otherwise ``None``
    """
    print(f"sending request to url: {url}")
    headers = {"accept": "application/json"}
    conn.request("GET", url=url, headers=headers)
    res = conn.getresponse()
    # always consume the body: http.client requires the whole response to be read
    # before another request can be sent on the same connection
    data = res.read()
    if res.status == 200:
        return data

    print(f"connection failed: {res.status}")
    return None
def extract_artifacts_url_for_path(artifacts, path):
    """
    Return the download url of the single artifact whose path starts with ``path``.

    :param artifacts: dict with an "items" list; every item has a "path" and a "url" entry
    :param path: prefix the artifact's path has to start with
    :return: the url of the matching artifact, or ``None`` (after printing a note) if the
        number of matching artifacts is not exactly one
    """
    matching_urls = []
    for artifact in artifacts["items"]:
        if artifact["path"].startswith(path):
            matching_urls.append(artifact["url"])

    if len(matching_urls) == 1:
        return matching_urls[0]

    # zero or multiple matches: the caller cannot know which file to use
    print(f"unexpected artifacts count for {path}, unexpected content: {matching_urls}")
    return None
def collect_workflows_past_30_days():
    """
    Retrieves the workflow runs of the past 30 days from CircleCI on the 'master' branch,
    and retrieves the artifacts for each successful workflow run, that are collected in the 'report' job.
    The artifacts for coverage implementation, and raw-data collection are downloaded into the current working
    directory, and then processed and sent to the tinybird backend.

    Workflows whose id is listed in ``already_sent`` are skipped. Processing stops at the first workflow for which
    a request fails, or for which the 'report' job or one of the expected artifacts cannot be identified.
    In every case the list of handled workflow ids, the last used timestamp and the number of uploads are printed
    before the function returns, and the connection is closed.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded
    import urllib.request

    # this is just for tracking the current status - we already uploaded data for all of these workflows-ids:
    already_sent = [
        "0b4e29e5-b6c2-42b6-8f2d-9bbd3d3bc8aa",
        "3780cc96-10a0-4c41-9b5a-98d16b83dd94",
        "7ec971e9-4ee2-4269-857e-f3641961ecde",
        "3e02b8c5-6c9b-40d0-84df-c4e2d0a7797d",
        "015202d7-5071-4773-b223-854ccffe969f",
        "c8dd0d5d-b00c-4507-9129-669c3cc9f55a",
        "a87bf4f8-3adb-4d0a-b11c-32c0a3318ee9",
        "0b1a2ddb-ed17-426c-ba0c-23c4771ecb22",
        "97d01dac-15a1-4791-8e90-ce1fed09538d",
        "83fb8b2f-dab2-465f-be52-83342820f448",
        "2ae81ec5-2d18-48bf-b4ad-6bed8309f281",
        "63aa8ee8-4242-43fa-8408-4720c8fdd04b",
        "32c09e00-0733-443e-9b3a-9ca7e2ae32eb",
        "e244742d-c90b-4301-9d0f-1c6a06e3eec9",
        "0821f4ca-640d-4cce-9af8-a593f261aa75",
        "b181f475-192c-49c5-9f80-f33201a2d11b",
        "90b57b93-4a01-4612-bd92-fe9c4566da64",
        "dd8e4e20-2f85-41d3-b664-39304feec01b",
        "6122ea91-f0e4-4ea4-aca6-b67feec9d81b",
        "c035931f-90b0-4c48-a82c-0b7e343ebf49",
        "d8b03fae-b7e2-4871-a480-84edd531bfb9",
        "f499c3c1-ac46-403a-8a73-2daaebcf063d",
        "a310a406-b37a-4556-89e3-a6475bbb114f",
        "bab3f52c-0ed2-4390-b4b4-d34b5cb6e1ad",
        "c2245fe6-258f-4248-a296-224fe3f213d1",
        "67e8e834-3ab6-497e-b2d3-1e6df4575380",
        "3b367c58-f208-4e98-aa92-816cd649094b",
        "cc63b1b1-61ff-44f9-b3bf-cc24e23cf54b",
        "4eff4f42-770e-414a-ad5d-dde8e49b244f",
        "8092d5a8-c9a8-4812-ac22-d620a5e04003",
        "d682debe-17d7-4e31-9df1-e2f70758302f",
        "b8a3e0ea-25ca-47df-afec-48ac3a0de811",
        "450f335f-cd9c-45f3-a69f-1db5f9f16082",
        "4467264f-8a57-4a05-ad0d-8d224221ec69",
        "9e91a4d6-147b-4a64-bcb6-2d311164c3d8",
        "4a0c989a-31e7-4d9d-afdc-dc31c697fd11",
        "5b1a604c-12a9-4b9c-ba1e-abd8be05e135",
        "a9291b6e-eefe-466f-8802-64083abbfb0f",
        "0210fe7b-55a9-4bb0-a496-fbbff2831dd5",
        "1d5056aa-4d8c-4435-8a90-b3b48c8849e6",
        "1b339b55-fd27-4527-aff3-4a31109297e4",
        "f9c79715-ff09-4a1a-acea-ac4acd0eedc4",
        "93cddbf6-b48d-4086-b089-869ff2b7af0f",
        "f96e2531-cde6-490f-be26-076b3b3deaa4",
        "2dec1ba3-c306-4868-95bf-668689c10f4f",
        "ce8bedd9-618c-4475-b76e-b429ac49f84b",
        "7f2ae078-41cd-4f64-88ec-ef0f45185020",
        "271ba76a-3c7d-4b6e-abbd-294050608ebf",
        "afa647e9-ad38-467f-9ebc-fa7283586c19",
        "2cef06d8-98dc-415e-a8af-758689711c68",
        "8c859042-b37a-4447-9d3e-07d1ae160765",
        "b5ba1234-1983-4805-a9be-c4ca9c52b799",
        "b6614e63-4538-4583-8f9d-0c220db602a8",
        "71453fae-a689-4e28-995f-bd6e2c7cadaf",
        "53e43bae-3c70-4df5-8490-fe9208fbd952",
        "d1776b0e-7ddc-42e0-bd2d-7561ae72ae8b",
        "ad88f81e-6526-44f4-9208-ea64efdbde87",
        "503226e6-6671-4248-9fba-7b31f4684c0c",
        "c8e688aa-b63d-4e11-a14e-4ea1a2ad5257",
        "48002330-8ecb-41c5-9acc-95ae260a7a15",
        "e5550424-bec4-48a1-9354-0ad1f14510c4",
        "304dc6fc-9807-46b6-9665-fe8d6cc2d9b7",
        "24fe00ef-6c48-4260-9bca-125e2b16e7b2",
        "12e6470d-f923-4358-9fbb-185ff981903c",
        "32b53e7f-f0d3-446b-9b56-9cb4cdd5134d",
        "fe786b67-dc09-41e0-aba5-33e7aa8dcdf7",
        "a7c06a4b-2954-4660-8072-3c10c7d2823b",
        "c1dedfce-2619-484b-8a10-bc9b2bda39ff",
        "618a7511-e82b-4e7f-9d4a-4b4a4247f6e0",
        "00bec0f4-7844-4ad9-8d01-e3833aae9697",
        "8cb2fb8f-b840-4f5b-b151-744fb425298c",
        "8c2a8d3d-f05a-4c27-9df6-bc7f4f6106b8",
        "9dfc79d6-952e-4ae4-9dd8-493ac9a30065",
        "edf9a307-0e80-4a80-97f4-f53c78910554",
        "3c9c12e5-0fe7-4b1a-b224-7570808f8e19",
    ]
    # everything the `finally` block reads is bound before entering the `try`, so the final
    # status report cannot fail with an UnboundLocalError and hide the actual outcome
    timestamp = None
    count = 0
    # creating the connection object does not open the socket yet; that happens on the first request
    conn = http.client.HTTPSConnection("circleci.com")
    try:
        end = datetime.datetime.utcnow()
        start = end - datetime.timedelta(days=30)
        get_workflows_request = (
            f"/api/v2/insights/{PROJECT_SLUG}/workflows/main"
            f"?&branch={MASTER_BRANCH}&start-date={start.isoformat()}&end-date={end.isoformat()}"
        )

        data = send_request_to_connection(conn, get_workflows_request)
        if not data:
            print(f"could not resolve {get_workflows_request}")
            return

        # TODO check "next_page_token"
        # -> wasn't required for the initial run, as on master everything was on one page for the past 30 days
        workflows = json.loads(data.decode("utf-8"))

        # a response without "items" is treated like an empty result
        for item in workflows.get("items") or []:
            if item["status"] != "success":
                continue
            workflow_id = item["id"]
            if workflow_id in already_sent:
                continue

            print(f"checking workflow_id {workflow_id}")
            date_created_at = item["created_at"]
            converted_date = datetime.datetime.strptime(date_created_at, "%Y-%m-%dT%H:%M:%S.%fZ")
            # create the same time format we use when uploading data in CircleCI
            timestamp = converted_date.strftime("%Y-%m-%d %H:%M:%S")

            # get the details for the job (we need the job_number of the report step)
            job_request = f"/api/v2/workflow/{workflow_id}/job"
            job_data = send_request_to_connection(conn, job_request)
            if not job_data:
                print("could not retrieve job_data")
                return

            jobs = json.loads(job_data.decode("utf-8"))
            report_job = [job for job in jobs["items"] if job["name"] == "report"]
            if len(report_job) != 1:
                print(f"report job should be exactly 1, unexpected content: {report_job}")
                return
            job_number = report_job[0]["job_number"]

            # request artifacts for the report job
            artifacts_request = f"/api/v2/project/{PROJECT_SLUG}/{job_number}/artifacts"
            artifacts_data = send_request_to_connection(conn, artifacts_request)
            if not artifacts_data:
                print("could not retrieve artifacts data")
                return
            artifacts = json.loads(artifacts_data.decode("utf-8"))

            # extract the required urls for metric-data-raw, and coverage data for community/pro
            metric_data_url = extract_artifacts_url_for_path(
                artifacts=artifacts, path="parity_metrics/metric-report-raw-data-all"
            )
            community_cov_url = extract_artifacts_url_for_path(
                artifacts=artifacts, path="community/implementation_coverage_full.csv"
            )
            pro_cov_url = extract_artifacts_url_for_path(
                artifacts=artifacts, path="pro/implementation_coverage_full.csv"
            )
            if not metric_data_url or not community_cov_url or not pro_cov_url:
                print("At least one artifact url could not be found. existing..")
                return

            # download files locally (the same file names are overwritten for every workflow)
            metric_report_file_path = "./metric_report_raw.csv"
            community_coverage_file_path = "./community_coverage.csv"
            pro_coverage_file_path = "./pro_coverage.csv"
            downloads = (
                (metric_data_url, metric_report_file_path),
                (community_cov_url, community_coverage_file_path),
                (pro_cov_url, pro_coverage_file_path),
            )
            for artifact_url, target_path in downloads:
                print(f"trying to download {artifact_url}")
                urllib.request.urlretrieve(artifact_url, target_path)

            # update required ENVs with the data from the current workflow/job
            os.environ["CIRCLE_BRANCH"] = MASTER_BRANCH
            os.environ["CIRCLE_PULL_REQUESTS"] = ""
            os.environ["CIRCLE_BUILD_NUM"] = str(job_number)
            os.environ["CIRCLE_BUILD_URL"] = ""
            os.environ["CIRCLE_WORKFLOW_ID"] = str(workflow_id)

            # trigger the tinybird_upload
            send_metric_report(
                metric_report_file_path, source_type="community", timestamp=timestamp
            )
            send_implemented_coverage(
                community_coverage_file_path, timestamp=timestamp, type="community"
            )
            send_implemented_coverage(pro_coverage_file_path, timestamp=timestamp, type="pro")

            already_sent.append(workflow_id)
            count += 1
    finally:
        # always report the progress, also when the run was aborted or raised
        print(already_sent)
        if timestamp:
            print(f"last timestamp: {timestamp}")
        if count:
            print(f"sent {count} workflow data to tinybird")
        conn.close()
def main():
    """Script entry point: collect the workflow data of the past 30 days and upload it."""
    collect_workflows_past_30_days()


# only start the collection when the file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()