Skip to content

Commit

Permalink
Merge branch 'main' into cooing-swallow
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko committed Jan 20, 2025
2 parents c68f485 + de144b0 commit 8a44b31
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 38 deletions.
44 changes: 37 additions & 7 deletions definitions/declarations/httparchive.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,42 @@
const stagingTables = ['pages', 'requests', 'parsed_css']
for (const table of stagingTables) {
// Staging tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py
['pages', 'requests', 'parsed_css'].forEach(table =>
declare({
schema: 'crawl_staging',
name: table
})
}
)

declare({
schema: 'wappalyzer',
name: 'apps'
})
// Data-quality assertion over the current month's staged pages.
// The query returns one row per (date, client, tech) group whose detection
// does not line up with the Wappalyzer reference tables, i.e. when:
//   - the technology name is absent from wappalyzer.technologies, or
//   - one of its categories is absent from wappalyzer.categories, or
//   - its category list is empty.
// Each row carries the number of distinct affected pages and up to three
// sample pages to help with debugging.
// The reference tables are resolved through ctx.ref (like the staging table)
// so Dataform tracks them as dependencies and applies any project/schema
// overrides instead of relying on hard-coded dataset names.
// See https://github.com/HTTPArchive/dataform/issues/43
assert('corrupted_technology_values')
.tags(['crawl_complete'])
.query(ctx => `
SELECT
date,
client,
tech,
COUNT(DISTINCT page) AS cnt_pages,
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
LEFT JOIN pages.technologies AS tech
LEFT JOIN tech.categories AS category
WHERE
date = '${constants.currentMonth}' AND
(
tech.technology NOT IN (SELECT DISTINCT name FROM ${ctx.ref('wappalyzer', 'technologies')})
OR category NOT IN (SELECT DISTINCT name FROM ${ctx.ref('wappalyzer', 'categories')})
OR ARRAY_LENGTH(tech.categories) = 0
)
GROUP BY
date,
client,
tech
ORDER BY cnt_pages DESC
`);

// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
// Build one declaration config per reference table, then declare each of them
// under the `wappalyzer` schema (same order: technologies, categories).
['technologies', 'categories']
  .map((name) => ({ schema: 'wappalyzer', name }))
  .forEach((config) => declare(config));
84 changes: 79 additions & 5 deletions definitions/output/crawl/pages.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,97 @@ publish('pages', {
DELETE FROM ${ctx.self()}
WHERE date = '${constants.currentMonth}' AND
client = 'desktop';
`).query(ctx => `
INSERT INTO ${ctx.self()}
SELECT
*
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}' AND
client = 'desktop'
${constants.devRankFilter}
`).postOps(ctx => `
${constants.devRankFilter};
DELETE FROM ${ctx.self()}
WHERE date = '${constants.currentMonth}' AND
client = 'mobile';
INSERT INTO ${ctx.self()}
`).query(ctx => `
SELECT
*
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}' AND
client = 'mobile'
${constants.devRankFilter}
`).postOps(ctx => `
CREATE TEMP TABLE technologies_cleaned AS (
WITH wappalyzer AS (
SELECT DISTINCT
name AS technology,
categories
FROM ${ctx.ref('wappalyzer', 'technologies')}
),
pages AS (
SELECT
client,
page,
tech.technology,
tech.categories,
tech.info
FROM ${ctx.self()} AS pages
LEFT JOIN pages.technologies AS tech
WHERE date = '${constants.currentMonth}' ${constants.devRankFilter}
),
-- Identify impacted pages
impacted_pages AS (
SELECT DISTINCT
client,
page
FROM pages
LEFT JOIN pages.categories AS category
WHERE
-- Technology is corrupted
technology NOT IN (SELECT DISTINCT technology FROM wappalyzer) OR
-- Technology's category is corrupted
CONCAT(technology, category) NOT IN (
SELECT DISTINCT
CONCAT(technology, category)
FROM wappalyzer
LEFT JOIN wappalyzer.categories AS category
)
),
-- Keep valid technologies and use correct categories
reconstructed_technologies AS (
SELECT
client,
page,
ARRAY_AGG(STRUCT(
pages.technology,
wappalyzer.categories,
pages.info
)) AS technologies
FROM pages
INNER JOIN impacted_pages
USING (client, page)
INNER JOIN wappalyzer
ON pages.technology = wappalyzer.technology
GROUP BY
client,
page
)
SELECT
client,
page,
technologies
FROM reconstructed_technologies
);
-- Update the crawl.pages table with the cleaned and restored technologies
UPDATE ${ctx.self()} AS pages
SET technologies = technologies_cleaned.technologies
FROM technologies_cleaned
WHERE pages.date = '${constants.currentMonth}' AND
pages.client = technologies_cleaned.client AND
pages.page = technologies_cleaned.page;
`)
69 changes: 47 additions & 22 deletions definitions/output/reports/cwv_tech_categories.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,44 +7,69 @@ publish('cwv_tech_categories', {
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */
WITH pages AS (
SELECT
SELECT DISTINCT
client,
root_page,
technologies
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}' AND
client = 'mobile'
date = '${pastMonth}'
${constants.devRankFilter}
), categories AS (
),
category_descriptions AS (
SELECT
name AS category,
description
FROM ${ctx.ref('wappalyzer', 'categories')}
),
category_stats AS (
SELECT
category,
COUNT(DISTINCT root_page) AS origins
FROM pages,
UNNEST(technologies) AS t,
UNNEST(t.categories) AS category
STRUCT(
COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop,
COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile
) AS origins
FROM (
SELECT
client,
category,
COUNT(DISTINCT root_page) AS origins
FROM pages
LEFT JOIN pages.technologies AS tech
LEFT JOIN tech.categories AS category
GROUP BY
client,
category
)
GROUP BY category
), technologies AS (
),
technology_stats AS (
SELECT
category,
technology,
COUNT(DISTINCT root_page) AS origins
FROM pages,
UNNEST(technologies) AS t,
UNNEST(t.categories) AS category
category_obj AS categories,
SUM(origins) AS total_origins
FROM ${ctx.ref('reports', 'cwv_tech_technologies')}
GROUP BY
category,
technology
technology,
categories
)
SELECT
category,
categories.origins,
ARRAY_AGG(technology IGNORE NULLS ORDER BY technologies.origins DESC) AS technologies
FROM categories
JOIN technologies
description,
origins,
ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.total_origins DESC) AS technologies
FROM category_stats
INNER JOIN technology_stats
ON category_stats.category IN UNNEST(technology_stats.categories)
INNER JOIN category_descriptions
USING (category)
GROUP BY
category,
categories.origins
ORDER BY categories.origins DESC
description,
origins
ORDER BY category ASC
`)
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_technologies.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ technologies AS (
STRING_AGG(DISTINCT category, ', ' ORDER BY category ASC) AS category,
categories AS category_obj,
NULL AS similar_technologies
FROM ${ctx.ref('wappalyzer', 'apps')},
FROM ${ctx.ref('wappalyzer', 'technologies')},
UNNEST(categories) AS category
GROUP BY
technology,
Expand Down
6 changes: 3 additions & 3 deletions definitions/output/wappalyzer/tech_detections.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ tech_deprecated_gone_origins AS (
-- aggregation of technology adoption/deprecation metrics
SELECT
DATE('${constants.currentMonth}') AS date,
COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology, apps.name) AS technology,
COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology, technologies.name) AS technology,
-- origins summary
0-COALESCE(total_origins_deprecated_existing, 0) AS total_origins_deprecated_existing,
Expand All @@ -139,6 +139,6 @@ LEFT JOIN tech_deprecated_existing_origins
ON before_summary.technology = tech_deprecated_existing_origins.technology
LEFT JOIN tech_deprecated_gone_origins
ON before_summary.technology = tech_deprecated_gone_origins.technology
FULL OUTER JOIN wappalyzer.apps
ON before_summary.technology = apps.name
FULL OUTER JOIN wappalyzer.technologies
ON before_summary.technology = technologies.name
`)

0 comments on commit 8a44b31

Please sign in to comment.