Skip to content

Commit

Permalink
comments
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko committed Jan 20, 2025
1 parent c1af19a commit 233619e
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions definitions/declarations/httparchive.js
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
// Tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py
// Staging tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py
// Register one declaration per table in the 'crawl_staging' schema.
const crawlStagingTables = ['pages', 'requests', 'parsed_css'];
for (const tableName of crawlStagingTables) {
declare({
schema: 'crawl_staging',
name: tableName
});
}

assert('corrupted_technology_values').query(ctx => `
assert('corrupted_technology_values')
.tags(['crawl_complete'])
.query(ctx => `
SELECT
date,
client,
tech,
COUNT(DISTINCT page) AS cnt_pages,
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
FROM ${ctx.ref('crawl_staging', 'pages')}
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
LEFT JOIN pages.technologies AS tech
LEFT JOIN tech.categories AS category
WHERE
Expand All @@ -27,10 +29,10 @@ GROUP BY
date,
client,
tech
ORDER BY cnt DESC
ORDER BY cnt_pages DESC
`);

// Tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
['technologies', 'categories'].forEach(table =>
declare({
schema: 'wappalyzer',
Expand Down

0 comments on commit 233619e

Please sign in to comment.