Skip to content

Commit

Permalink
chore: format for bigquery
Browse files Browse the repository at this point in the history
  • Loading branch information
Guillaume NICOLAS committed Aug 9, 2024
1 parent e97267d commit 0bda124
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 57 deletions.
57 changes: 32 additions & 25 deletions sql/all-observed-domains-query.sql
Original file line number Diff line number Diff line change
@@ -1,54 +1,61 @@
CREATE TEMP
TABLE entities (name STRING, domain STRING) AS
SELECT JSON_VALUE(entity, '$.name') AS name, JSON_VALUE(entity, '$.domain') as domain
FROM UNNEST (
JSON_QUERY_ARRAY (@entities_string, '$')
) AS entity

;
CREATE TEMP TABLE
entities (name STRING, domain STRING) AS
SELECT
JSON_VALUE(entity, '$.name') AS name,
JSON_VALUE(entity, '$.domain') as domain
FROM
UNNEST (JSON_QUERY_ARRAY(@entities_string, '$')) AS entity;

-- Map each observed domain and count to entity
CREATE TEMP
TABLE entity_domain_count (
domain STRING,
totalOccurrences INT,
name STRING
) AS
CREATE TEMP TABLE
entity_domain_count (domain STRING, totalOccurrences INT, name STRING) AS
SELECT
domain_occurrences.domain as domain,
domain_occurrences.totalOccurrences as totalOccurrences,
entities.name as name,
FROM (
FROM
(
-- How many times an observed domain is called in archive
SELECT domain, COUNT(*) AS totalOccurrences
FROM (
SELECT page, NET.HOST (url) AS domain
SELECT
domain,
COUNT(0) AS totalOccurrences
FROM
(
SELECT
page,
NET.HOST(url) AS domain
FROM
`httparchive.requests.2022_01_01_mobile`
GROUP BY
page, domain
page,
domain
)
GROUP BY
domain
) as domain_occurrences
JOIN
-- Mapping between a domain and an entity
entities ON domain_occurrences.domain LIKE
REPLACE (entities.domain, '*', '%');
entities ON domain_occurrences.domain LIKE REPLACE(entities.domain, '*', '%');

-- Get entities with at least 50 observed domains
WITH
entity_count AS (
SELECT name, SUM(totalOccurrences) as totalOccurrences
FROM entity_domain_count
SELECT
name,
SUM(totalOccurrences) as totalOccurrences
FROM
entity_domain_count
GROUP BY
name
HAVING
totalOccurrences >= 50
)
-- Get observed domains owned by entities with at least 50 observed domains
SELECT entity_domain_count.domain, entity_domain_count.totalOccurrences
SELECT
entity_domain_count.domain,
entity_domain_count.totalOccurrences
FROM
entity_domain_count
JOIN entity_count ON entity_count.name = entity_domain_count.name
ORDER BY totalOccurrences DESC
ORDER BY
totalOccurrences DESC
70 changes: 38 additions & 32 deletions sql/entity-per-page.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,40 +5,46 @@ SELECT
SUM(totalOccurrences) AS totalScripts,
AVG(totalExecutionTime) AS averageExecutionTime,
AVG(averageExecutionTime) AS averageScriptExecutionTime
FROM (
SELECT
pageUrl,
canonicalDomain,
SUM(executionTime) AS totalExecutionTime,
COUNT(executionTime) AS totalOccurrences,
AVG(executionTime) AS averageExecutionTime
FROM (
FROM
(
SELECT
domain,
canonicalDomain,
category,
JSON_VALUE(report, '$.requestedUrl') AS pageUrl,
NET.HOST(JSON_VALUE(bootupTimeItems, '$.url')) AS observedDomain,
JSON_VALUE(bootupTimeItems, '$.url') AS scriptUrl,
SAFE_CAST(JSON_VALUE(bootupTimeItems,
"$.scripting") AS FLOAT64) AS executionTime
FROM (
SELECT
url AS page,
report
FROM
`httparchive.lighthouse.2022_01_01_mobile`
),
UNNEST(JSON_QUERY_ARRAY(report,
'$.audits.bootup-time.details.items')) AS bootupTimeItems
INNER JOIN
`lighthouse-infrastructure.third_party_web.2022_01_01`
ON NET.HOST(JSON_VALUE(bootupTimeItems, "$.url")) = domain
pageUrl,
canonicalDomain,
SUM(executionTime) AS totalExecutionTime,
COUNT(executionTime) AS totalOccurrences,
AVG(executionTime) AS averageExecutionTime
FROM
(
SELECT
domain,
canonicalDomain,
category,
JSON_VALUE(report, '$.requestedUrl') AS pageUrl,
NET.HOST(JSON_VALUE(bootupTimeItems, '$.url')) AS observedDomain,
JSON_VALUE(bootupTimeItems, '$.url') AS scriptUrl,
SAFE_CAST(
JSON_VALUE(bootupTimeItems, "$.scripting") AS FLOAT64
) AS executionTime
FROM
(
SELECT
url AS page,
report
FROM
`httparchive.lighthouse.2022_01_01_mobile`
),
UNNEST (
JSON_QUERY_ARRAY(report, '$.audits.bootup-time.details.items')
) AS bootupTimeItems
INNER JOIN `lighthouse-infrastructure.third_party_web.2022_01_01` ON NET.HOST(JSON_VALUE(bootupTimeItems, "$.url")) = domain
)
WHERE
canonicalDomain IS NOT NULL
GROUP BY
pageUrl,
canonicalDomain
)
WHERE canonicalDomain IS NOT NULL
GROUP BY pageUrl, canonicalDomain
)
GROUP BY
canonicalDomain
ORDER BY
totalOccurrences DESC
totalOccurrences DESC

0 comments on commit 0bda124

Please sign in to comment.