diff --git a/sql/all-observed-domains-query.sql b/sql/all-observed-domains-query.sql index 810a75c..7685b0a 100644 --- a/sql/all-observed-domains-query.sql +++ b/sql/all-observed-domains-query.sql @@ -1,54 +1,61 @@ -CREATE TEMP -TABLE entities (name STRING, domain STRING) AS -SELECT JSON_VALUE(entity, '$.name') AS name, JSON_VALUE(entity, '$.domain') as domain -FROM UNNEST ( - JSON_QUERY_ARRAY (@entities_string, '$') - ) AS entity - -; +CREATE TEMP TABLE + entities (name STRING, domain STRING) AS +SELECT + JSON_VALUE(entity, '$.name') AS name, + JSON_VALUE(entity, '$.domain') as domain +FROM + UNNEST (JSON_QUERY_ARRAY(@entities_string, '$')) AS entity; -- Map each observed domain and count to entity -CREATE TEMP -TABLE entity_domain_count ( - domain STRING, - totalOccurrences INT, - name STRING -) AS +CREATE TEMP TABLE + entity_domain_count (domain STRING, totalOccurrences INT, name STRING) AS SELECT domain_occurrences.domain as domain, domain_occurrences.totalOccurrences as totalOccurrences, entities.name as name, -FROM ( +FROM + ( -- How many times an observed domain is called in archive - SELECT domain, COUNT(*) AS totalOccurrences - FROM ( - SELECT page, NET.HOST (url) AS domain + SELECT + domain, + COUNT(0) AS totalOccurrences + FROM + ( + SELECT + page, + NET.HOST(url) AS domain FROM `httparchive.requests.2022_01_01_mobile` GROUP BY - page, domain + page, + domain ) GROUP BY domain ) as domain_occurrences JOIN -- Mapping between a domain and an entity - entities ON domain_occurrences.domain LIKE -REPLACE (entities.domain, '*', '%'); + entities ON domain_occurrences.domain LIKE REPLACE(entities.domain, '*', '%'); -- Get entities with at least 50 observed domains WITH entity_count AS ( - SELECT name, SUM(totalOccurrences) as totalOccurrences - FROM entity_domain_count + SELECT + name, + SUM(totalOccurrences) as totalOccurrences + FROM + entity_domain_count GROUP BY name HAVING totalOccurrences >= 50 ) -- Get observed domains owned by entities with at least 50 observed domains -SELECT entity_domain_count.domain, entity_domain_count.totalOccurrences +SELECT + entity_domain_count.domain, + entity_domain_count.totalOccurrences FROM entity_domain_count JOIN entity_count ON entity_count.name = entity_domain_count.name -ORDER BY totalOccurrences DESC \ No newline at end of file +ORDER BY + totalOccurrences DESC \ No newline at end of file diff --git a/sql/entity-per-page.sql b/sql/entity-per-page.sql index 35eb627..a2e0997 100644 --- a/sql/entity-per-page.sql +++ b/sql/entity-per-page.sql @@ -5,40 +5,46 @@ SELECT SUM(totalOccurrences) AS totalScripts, AVG(totalExecutionTime) AS averageExecutionTime, AVG(averageExecutionTime) AS averageScriptExecutionTime -FROM ( - SELECT - pageUrl, - canonicalDomain, - SUM(executionTime) AS totalExecutionTime, - COUNT(executionTime) AS totalOccurrences, - AVG(executionTime) AS averageExecutionTime - FROM ( +FROM + ( SELECT - domain, - canonicalDomain, - category, - JSON_VALUE(report, '$.requestedUrl') AS pageUrl, - NET.HOST(JSON_VALUE(bootupTimeItems, '$.url')) AS observedDomain, - JSON_VALUE(bootupTimeItems, '$.url') AS scriptUrl, - SAFE_CAST(JSON_VALUE(bootupTimeItems, - "$.scripting") AS FLOAT64) AS executionTime -FROM ( - SELECT - url AS page, - report - FROM - `httparchive.lighthouse.2022_01_01_mobile` - ), - UNNEST(JSON_QUERY_ARRAY(report, - '$.audits.bootup-time.details.items')) AS bootupTimeItems - INNER JOIN - `lighthouse-infrastructure.third_party_web.2022_01_01` - ON NET.HOST(JSON_VALUE(bootupTimeItems, "$.url")) = domain + pageUrl, + canonicalDomain, + SUM(executionTime) AS totalExecutionTime, + COUNT(executionTime) AS totalOccurrences, + AVG(executionTime) AS averageExecutionTime + FROM + ( + SELECT + domain, + canonicalDomain, + category, + JSON_VALUE(report, '$.requestedUrl') AS pageUrl, + NET.HOST(JSON_VALUE(bootupTimeItems, '$.url')) AS observedDomain, + JSON_VALUE(bootupTimeItems, '$.url') AS scriptUrl, + SAFE_CAST( + JSON_VALUE(bootupTimeItems, "$.scripting") AS FLOAT64 + ) AS executionTime + FROM + ( + SELECT + url AS page, + report + FROM + `httparchive.lighthouse.2022_01_01_mobile` + ), + UNNEST ( + JSON_QUERY_ARRAY(report, '$.audits.bootup-time.details.items') + ) AS bootupTimeItems + INNER JOIN `lighthouse-infrastructure.third_party_web.2022_01_01` ON NET.HOST(JSON_VALUE(bootupTimeItems, "$.url")) = domain + ) + WHERE + canonicalDomain IS NOT NULL + GROUP BY + pageUrl, + canonicalDomain ) - WHERE canonicalDomain IS NOT NULL - GROUP BY pageUrl, canonicalDomain -) GROUP BY canonicalDomain ORDER BY - totalOccurrences DESC + totalOccurrences DESC \ No newline at end of file