diff --git a/definitions/declarations/httparchive.js b/definitions/declarations/httparchive.js index 7cbc384b..aa6d183d 100644 --- a/definitions/declarations/httparchive.js +++ b/definitions/declarations/httparchive.js @@ -1,12 +1,42 @@ -const stagingTables = ['pages', 'requests', 'parsed_css'] -for (const table of stagingTables) { +// Staging tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py +['pages', 'requests', 'parsed_css'].forEach(table => declare({ schema: 'crawl_staging', name: table }) -} +) -declare({ - schema: 'wappalyzer', - name: 'apps' -}) +// See https://github.com/HTTPArchive/dataform/issues/43 +assert('corrupted_technology_values') + .tags(['crawl_complete']) + .query(ctx => ` +SELECT + date, + client, + tech, + COUNT(DISTINCT page) AS cnt_pages, + ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages +FROM ${ctx.ref('crawl_staging', 'pages')} AS pages +LEFT JOIN pages.technologies AS tech +LEFT JOIN tech.categories AS category +WHERE + date = '${constants.currentMonth}' AND + ( + tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies) + OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories) + OR ARRAY_LENGTH(tech.categories) = 0 + ) +GROUP BY + date, + client, + tech +ORDER BY cnt_pages DESC +`); + +// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml +['technologies', 'categories'].forEach(table => + declare({ + schema: 'wappalyzer', + name: table + }) +) diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index 4a9c1d5c..b39624f2 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -52,23 +52,97 @@ publish('pages', { DELETE FROM ${ctx.self()} WHERE date = '${constants.currentMonth}' AND client = 'desktop'; -`).query(ctx => ` + +INSERT INTO ${ctx.self()} SELECT * FROM ${ctx.ref('crawl_staging', 'pages')} WHERE date = '${constants.currentMonth}' AND client = 'desktop' - ${constants.devRankFilter} -`).postOps(ctx => ` + ${constants.devRankFilter}; + DELETE FROM ${ctx.self()} WHERE date = '${constants.currentMonth}' AND client = 'mobile'; - -INSERT INTO ${ctx.self()} +`).query(ctx => ` SELECT * FROM ${ctx.ref('crawl_staging', 'pages')} WHERE date = '${constants.currentMonth}' AND client = 'mobile' ${constants.devRankFilter} +`).postOps(ctx => ` +CREATE TEMP TABLE technologies_cleaned AS ( + WITH wappalyzer AS ( + SELECT DISTINCT + name AS technology, + categories + FROM ${ctx.ref('wappalyzer', 'technologies')} + ), + + pages AS ( + SELECT + client, + page, + tech.technology, + tech.categories, + tech.info + FROM ${ctx.self()} AS pages + LEFT JOIN pages.technologies AS tech + WHERE date = '${constants.currentMonth}' ${constants.devRankFilter} + ), + + -- Identify impacted pages + impacted_pages AS ( + SELECT DISTINCT + client, + page + FROM pages + LEFT JOIN pages.categories AS category + WHERE + -- Technology is corrupted + technology NOT IN (SELECT DISTINCT technology FROM wappalyzer) OR + -- Technology's category is corrupted + CONCAT(technology, category) NOT IN ( + SELECT DISTINCT + CONCAT(technology, category) + FROM wappalyzer + LEFT JOIN wappalyzer.categories AS category + ) + ), + + -- Keep valid technologies and use correct categories + reconstructed_technologies AS ( + SELECT + client, + page, + ARRAY_AGG(STRUCT( + pages.technology, + wappalyzer.categories, + pages.info + )) AS technologies + FROM pages + INNER JOIN impacted_pages + USING (client, page) + INNER JOIN wappalyzer + ON pages.technology = wappalyzer.technology + GROUP BY + client, + page + ) + + SELECT + client, + page, + technologies + FROM reconstructed_technologies +); + +-- Update the crawl.pages table with the cleaned and restored technologies +UPDATE ${ctx.self()} AS pages +SET technologies = technologies_cleaned.technologies +FROM technologies_cleaned +WHERE pages.date = '${constants.currentMonth}' AND + pages.client = technologies_cleaned.client AND + pages.page = technologies_cleaned.page; `) diff --git a/definitions/output/reports/cwv_tech_categories.js b/definitions/output/reports/cwv_tech_categories.js index 8db5a557..5232bb6c 100644 --- a/definitions/output/reports/cwv_tech_categories.js +++ b/definitions/output/reports/cwv_tech_categories.js @@ -7,44 +7,69 @@ publish('cwv_tech_categories', { }).query(ctx => ` /* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */ WITH pages AS ( - SELECT + SELECT DISTINCT + client, root_page, technologies FROM ${ctx.ref('crawl', 'pages')} WHERE - date = '${pastMonth}' AND - client = 'mobile' + date = '${pastMonth}' ${constants.devRankFilter} -), categories AS ( +), + +category_descriptions AS ( + SELECT + name AS category, + description + FROM ${ctx.ref('wappalyzer', 'categories')} +), + +category_stats AS ( SELECT category, - COUNT(DISTINCT root_page) AS origins - FROM pages, - UNNEST(technologies) AS t, - UNNEST(t.categories) AS category + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS origins + FROM ( + SELECT + client, + category, + COUNT(DISTINCT root_page) AS origins + FROM pages + LEFT JOIN pages.technologies AS tech + LEFT JOIN tech.categories AS category + GROUP BY + client, + category + ) GROUP BY category -), technologies AS ( +), + +technology_stats AS ( SELECT - category, technology, - COUNT(DISTINCT root_page) AS origins - FROM pages, - UNNEST(technologies) AS t, - UNNEST(t.categories) AS category + category_obj AS categories, + SUM(origins) AS total_origins + FROM ${ctx.ref('reports', 'cwv_tech_technologies')} GROUP BY - category, - technology + technology, + categories ) SELECT category, - categories.origins, - ARRAY_AGG(technology IGNORE NULLS ORDER BY technologies.origins DESC) AS technologies -FROM categories -JOIN technologies + description, + origins, + ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.total_origins DESC) AS technologies +FROM category_stats +INNER JOIN technology_stats +ON category_stats.category IN UNNEST(technology_stats.categories) +INNER JOIN category_descriptions USING (category) GROUP BY category, - categories.origins -ORDER BY categories.origins DESC + description, + origins +ORDER BY category ASC `) diff --git a/definitions/output/reports/cwv_tech_technologies.js b/definitions/output/reports/cwv_tech_technologies.js index cef2f258..fc586cde 100644 --- a/definitions/output/reports/cwv_tech_technologies.js +++ b/definitions/output/reports/cwv_tech_technologies.js @@ -36,7 +36,7 @@ technologies AS ( STRING_AGG(DISTINCT category, ', ' ORDER BY category ASC) AS category, categories AS category_obj, NULL AS similar_technologies - FROM ${ctx.ref('wappalyzer', 'apps')}, + FROM ${ctx.ref('wappalyzer', 'technologies')}, UNNEST(categories) AS category GROUP BY technology, diff --git a/definitions/output/wappalyzer/tech_detections.js b/definitions/output/wappalyzer/tech_detections.js index 4a2a0d82..35f74dfa 100644 --- a/definitions/output/wappalyzer/tech_detections.js +++ b/definitions/output/wappalyzer/tech_detections.js @@ -112,7 +112,7 @@ tech_deprecated_gone_origins AS ( -- aggregation of technology adoption/deprecation metrics SELECT DATE('${constants.currentMonth}') AS date, - COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology, apps.name) AS technology, + COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology, technologies.name) AS technology, -- origins summary 0-COALESCE(total_origins_deprecated_existing, 0) AS total_origins_deprecated_existing, @@ -139,6 +139,6 @@ LEFT JOIN tech_deprecated_existing_origins ON before_summary.technology = tech_deprecated_existing_origins.technology LEFT JOIN tech_deprecated_gone_origins ON before_summary.technology = tech_deprecated_gone_origins.technology -FULL OUTER JOIN wappalyzer.apps - ON before_summary.technology = apps.name +FULL OUTER JOIN wappalyzer.technologies + ON before_summary.technology = technologies.name `)