From aea421811394da000f689b217053f85832140527 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 10 Dec 2024 22:41:06 +0100 Subject: [PATCH] Fixes to CrUX pipeline (#36) * skip null technologies * ignore null technologies * sql review * updated data fixed --- .../output/core_web_vitals/technologies.js | 39 +++++++++---------- .../output/reports/cwv_tech_categories.js | 2 +- infra/dataform-trigger/index.js | 2 +- infra/tf/function_dataform_trigger.tf | 2 +- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index 66f99c8..d307929 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -61,7 +61,7 @@ crux AS ( WHEN 10000 THEN 'Top 10k' WHEN 1000 THEN 'Top 1k' END AS rank, - CONCAT(origin, '/') AS root_page_url, + CONCAT(origin, '/') AS root_page, IF(device = 'desktop', 'desktop', 'mobile') AS client, # CWV @@ -94,9 +94,9 @@ crux AS ( technologies AS ( SELECT - technology.technology AS app, + technology.technology, client, - page AS url + page FROM ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology WHERE @@ -106,9 +106,9 @@ technologies AS ( technology.technology != '' UNION ALL SELECT - 'ALL' AS app, + 'ALL' AS technology, client, - page AS url + page FROM ${ctx.ref('crawl', 'pages')} WHERE date = '${pastMonth}' @@ -117,7 +117,7 @@ UNION ALL categories AS ( SELECT - technology.technology AS app, + technology.technology, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category FROM ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology, @@ -125,10 +125,10 @@ categories AS ( WHERE date = '${pastMonth}' ${constants.devRankFilter} - GROUP BY app + GROUP BY technology UNION ALL SELECT - 'ALL' AS app, + 'ALL' AS technology, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category FROM ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology, @@ -142,8 +142,8 @@ UNION ALL summary_stats AS ( SELECT client, - page AS url, - root_page AS root_page_url, + page, + root_page AS root_page, SAFE.INT64(summary.bytesTotal) AS bytesTotal, SAFE.INT64(summary.bytesJS) AS bytesJS, SAFE.INT64(summary.bytesImg) AS bytesImg, @@ -161,8 +161,8 @@ summary_stats AS ( lab_data AS ( SELECT client, - root_page_url, - app, + root_page, + technology, ANY_VALUE(category) AS category, AVG(bytesTotal) AS bytesTotal, AVG(bytesJS) AS bytesJS, @@ -174,13 +174,13 @@ lab_data AS ( AVG(seo) AS seo FROM summary_stats JOIN technologies - USING (client, url) + USING (client, page) JOIN categories - USING (app) + USING (technology) GROUP BY client, - root_page_url, - app + root_page, + technology ) SELECT @@ -188,7 +188,7 @@ SELECT geo, rank, ANY_VALUE(category) AS category, - app, + technology AS app, client, COUNT(0) AS origins, @@ -226,9 +226,8 @@ SELECT SAFE_CAST(APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS INT64) AS median_bytes_image FROM lab_data -JOIN crux -USING - (client, root_page_url) +INNER JOIN crux +USING (client, root_page) GROUP BY app, geo, diff --git a/definitions/output/reports/cwv_tech_categories.js b/definitions/output/reports/cwv_tech_categories.js index 62664fe..9b637a9 100644 --- a/definitions/output/reports/cwv_tech_categories.js +++ b/definitions/output/reports/cwv_tech_categories.js @@ -40,7 +40,7 @@ technologies AS ( SELECT category, categories.origins, - ARRAY_AGG(technology ORDER BY technologies.origins DESC) AS technologies + ARRAY_AGG(technology IGNORE NULLS ORDER BY technologies.origins DESC) AS technologies FROM categories JOIN technologies USING (category) diff --git a/infra/dataform-trigger/index.js b/infra/dataform-trigger/index.js index 252b268..345c623 100644 --- a/infra/dataform-trigger/index.js +++ b/infra/dataform-trigger/index.js @@ -12,7 +12,7 @@ DECLARE previousMonth_YYYYMM STRING DEFAULT SUBSTR(previousMonth, 1, 6); WITH crux AS ( SELECT LOGICAL_AND(total_rows > 0) AS rows_available, - LOGICAL_AND(TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), last_modified_time, HOUR) < 7) AS recent_last_modified + LOGICAL_OR(TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), last_modified_time, HOUR) < 8) AS recent_last_modified FROM chrome-ux-report.materialized.INFORMATION_SCHEMA.PARTITIONS WHERE table_name IN ('device_summary', 'country_summary') AND partition_id IN (previousMonth, previousMonth_YYYYMM) diff --git a/infra/tf/function_dataform_trigger.tf b/infra/tf/function_dataform_trigger.tf index 3d9fd14..4b4af59 100644 --- a/infra/tf/function_dataform_trigger.tf +++ b/infra/tf/function_dataform_trigger.tf @@ -105,7 +105,7 @@ resource "google_cloud_scheduler_job" "bq-poller-crux-ready" { paused = false project = local.project region = local.region - schedule = "0 */7 8-14 * *" + schedule = "0 */8 8-14 * *" time_zone = "Etc/UTC" http_target { body = base64encode(local.crux_ready_scheduler_body)