diff --git a/definitions/output/reports/reports_dynamic.js b/definitions/output/reports/reports_dynamic.js index a4c8b9d..b5cd016 100644 --- a/definitions/output/reports/reports_dynamic.js +++ b/definitions/output/reports/reports_dynamic.js @@ -24,7 +24,7 @@ const EXPORT_CONFIG = { bucket: constants.bucket, storagePath: constants.storagePath, dataset: 'reports', - testSuffix: '.json' + fileFormat: '.json' } // Date range for report generation @@ -54,7 +54,7 @@ function buildExportPath(reportConfig) { throw new Error(`Unknown SQL type: ${sql.type}`) } - return objectPath + EXPORT_CONFIG.testSuffix + return objectPath + EXPORT_CONFIG.fileFormat } /** @@ -74,17 +74,19 @@ function buildExportQuery(reportConfig) { WHERE date = '${date}' AND metric = '${metric.id}' AND lens = '${lens.name}' - ORDER BY bin ASC + ORDER BY client, bin ASC ` } else if (sql.type === 'timeseries') { query = ` SELECT - FORMAT_DATE('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, * EXCEPT(date, metric, lens) FROM \`${EXPORT_CONFIG.dataset}.${tableName}\` - WHERE metric = '${metric.id}' + WHERE + date = '${date}' + AND metric = '${metric.id}' AND lens = '${lens.name}' - ORDER BY date DESC + ORDER BY date, client DESC ` } else { throw new Error(`Unknown SQL type: ${sql.type}`) @@ -110,7 +112,7 @@ function createReportConfig(date, metric, sql, lensName, lensSQL) { sql, lens: { name: lensName, sql: lensSQL }, devRankFilter: constants.devRankFilter, - tableName: `${metric.id}_${sql.type}` + tableName: sql.type === 'timeseries' ? 
sql.type : `${metric.id}_${sql.type}` } } @@ -180,6 +182,7 @@ INSERT INTO ${EXPORT_CONFIG.dataset}.${tableName} --*/ SELECT + DATE('${date}') AS date, '${metric.id}' AS metric, '${lens.name}' AS lens, * diff --git a/includes/reports.js b/includes/reports.js index d4190bf..3a73a0e 100644 --- a/includes/reports.js +++ b/includes/reports.js @@ -42,25 +42,212 @@ class DataformTemplateBuilder { const config = { _metrics: { - bytesTotal: { + bytesCss: { SQL: [ { type: 'histogram', query: DataformTemplateBuilder.create((ctx, params) => ` -WITH pages AS ( +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(summary.bytesCss) / 10240) * 10 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.bytesCss) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + ttci: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER 
(PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(CAST(IFNULL( + FLOAT64(lighthouse.audits.interactive.numericValue), + IFNULL( + FLOAT64(lighthouse.audits['consistently-interactive'].rawValue), + FLOAT64(lighthouse.audits.interactive.rawValue) + ) + ) AS FLOAT64) / 1000) AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + HAVING + bin IS NOT NULL + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(750)], 2) AS p75, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 +FROM ( SELECT - date, client, - CAST(FLOOR(FLOAT64(summary.bytesTotal) / 1024 / 100) * 100 AS INT64) AS bin + date, + IFNULL( + FLOAT64(lighthouse.audits.interactive.numericValue), + IFNULL( + FLOAT64(lighthouse.audits.interactive.rawValue), + FLOAT64(lighthouse.audits['consistently-interactive'].rawValue) + ) + ) / 1000 AS value FROM ${ctx.ref('crawl', 'pages')} WHERE - date = '${params.date}' - ${params.devRankFilter} - ${params.lens.sql} - AND is_root_page - AND FLOAT64(summary.bytesTotal) > 0 + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page ) - +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + pctHttps: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(STARTS_WITH(url, 'https'), 1, 0)) * 100 / COUNT(0), 2) AS percent 
+FROM ${ctx.ref('crawl', 'requests')} +INNER JOIN ${ctx.ref('crawl', 'pages')} +USING (date, client, is_root_page, rank, page) +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + storageEstimate: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '1371' OR feat.feature = 'DurableStorageEstimate') +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + bootupJs: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` SELECT *, SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf @@ -70,14 +257,20 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - *, - COUNT(0) AS volume - FROM pages - WHERE bin IS NOT NULL - GROUP BY - date, client, - bin + COUNT(0) AS volume, + FLOOR(FLOAT64(IFNULL(lighthouse.audits['bootup-time'].numericValue, lighthouse.audits['bootup-time'].rawValue)) / 100) / 10 AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + HAVING + bin IS NOT NULL ) ) ORDER BY @@ -88,37 +281,2500 @@ ORDER BY { type: 'timeseries', query: DataformTemplateBuilder.create((ctx, params) => ` -WITH pages AS ( +SELECT + date, + client, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25, + ROUND(APPROX_QUANTILES(value, 
1000)[OFFSET(500)], 2) AS p50, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(750)], 2) AS p75, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 +FROM ( SELECT date, client, - FLOAT64(summary.bytesTotal) AS bytesTotal + IFNULL( + FLOAT64(lighthouse.audits['bootup-time'].numericValue), + FLOAT64(lighthouse.audits['bootup-time'].rawValue) + ) / 1000 AS value FROM ${ctx.ref('crawl', 'pages')} WHERE - date = '${params.date}' - ${params.devRankFilter} - ${params.lens.sql} - AND is_root_page - AND INT64(summary.bytesTotal) > 0 + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + is_root_page ) - +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + bytesFont: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(summary.bytesFont) / 10240) * 10 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` SELECT date, client, - UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(901)] / 1024, 2) AS p90 -FROM pages + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 
1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.bytesFont) > 0 GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + bytesHtml: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(summary.bytesHtml) / 10240) * 10 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT date, client, - timestamp + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + 
FLOAT64(summary.bytesHtml) > 0 +GROUP BY + date, + timestamp, + client ORDER BY + date DESC, + client +`) + } + ] + }, + bytesImg: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(summary.bytesImg) / 102400) * 100 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.bytesImg) > 0 +GROUP BY date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + bytesJs: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(summary.bytesJS) / 10240) * 10 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + 
${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.bytesJS) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + bytesOther: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(summary.bytesOther) / 10240) * 10 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 
1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.bytesOther) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + bytesTotal: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(summary.bytesTotal) / 1024 / 100) * 100 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + INT64(summary.bytesTotal) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + bytesVideo: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY 
client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(summary.bytesVideo) / 10240) * 10 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.bytesVideo) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + compileJs: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + INT64(payload['_cpu.v8.compile']) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + HAVING + bin >= 0 + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 
2) AS p25, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(750)], 2) AS p75, + ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 +FROM ( + SELECT + date, + client, + INT64(payload['_cpu.v8.compile']) AS value + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + INT64(payload['_cpu.v8.compile']) IS NOT NULL AND + INT64(payload['_cpu.v8.compile']) >= 0 +) +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + dcl: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOOR(FLOAT64(summary.onContentLoaded) / 1000) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.onContentLoaded) > 0 + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.onContentLoaded) > 0 +GROUP BY + date, + 
timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + evalJs: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin + FROM ${ctx.ref('crawl', 'requests')} r + INNER JOIN ${ctx.ref('crawl', 'pages')} + USING (date, client, is_root_page, rank, page) + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + HAVING + bin >= 0 + ) +) +ORDER BY + bin, + client +`) + } + ] + }, + fcp: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']) / 1000) AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + HAVING + bin >= 0 + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 
1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page +GROUP BY + date, + timestamp, + client +HAVING + p50 IS NOT NULL +ORDER BY + date DESC, + client +`) + } + ] + }, + gzipSavings: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(payload._gzip_savings) / (1024 * 2)) * 2 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + HAVING + bin IS NOT NULL + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + ol: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER 
(PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOOR(FLOAT64(summary.onLoad) / 1000) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.onLoad) > 0 + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(101)] / 1000, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(251)] / 1000, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(501)] / 1000, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(751)] / 1000, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(901)] / 1000, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.onLoad) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + reqCss: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOAT64(summary.reqCss) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 
1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.reqCss) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + reqFont: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOAT64(summary.reqFont) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.reqFont) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + reqHtml: { + SQL: [ + { + type: 'histogram', + 
query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOAT64(summary.reqHtml) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.reqHtml) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + reqImg: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOAT64(summary.reqImg) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 
1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.reqImg) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + reqJs: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOAT64(summary.reqJS) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.reqJS) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + reqOther: { + SQL: [ + { + type: 'histogram', + query: 
DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOAT64(summary.reqOther) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} + ${params.lens.sql} + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.reqOther) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + reqTotal: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOOR(FLOAT64(summary.reqTotal) / 10) * 10 AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} + ${params.lens.sql} + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + 
ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.reqTotal) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + reqVideo: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + FLOAT64(summary.reqVideo) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} + ${params.lens.sql} + is_root_page + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(901)], 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page AND + FLOAT64(summary.reqVideo) > 0 +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + 
client +`) + } + ] + }, + imgSavings: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(payload._image_savings) / (1024 * 10)) * 10 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} + ${params.lens.sql} + is_root_page + GROUP BY + bin, + client + HAVING + bin IS NOT NULL + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + is_root_page AND + ${params.devRankFilter} + ${params.lens.sql} + date = '${params.date}' +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + offscreenImages: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(IFNULL( + INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), + INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024 + ) / 10240) * 10 AS INT64) AS bin + FROM 
${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' + date = '${params.date}' AND + ${params.devRankFilter} + ${params.lens.sql} + is_root_page + GROUP BY + bin, + client + HAVING + bin IS NOT NULL + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + is_root_page AND + ${params.devRankFilter} + ${params.lens.sql} + date = '${params.date}' +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + optimizedImages: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + 
client, + COUNT(0) AS volume, + CAST(FLOOR(IFNULL( + INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), + INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024 + ) / 10240) * 10 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} + ${params.lens.sql} + is_root_page + GROUP BY + bin, + client + HAVING + bin IS NOT NULL + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + 
] + }, + speedIndex: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(FLOAT64(payload._SpeedIndex) / (1000)) * 1000 AS INT64) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} + ${params.lens.sql} + is_root_page + GROUP BY + bin, + client + HAVING + bin IS NOT NULL + ) +) +ORDER BY + bin, + client +`) + }, + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(101)] / 1000, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(251)] / 1000, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(501)] / 1000, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(751)] / 1000, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(901)] / 1000, 2) AS p90 +FROM ${ctx.ref('crawl', 'pages')} +WHERE + is_root_page AND + ${params.devRankFilter} + ${params.lens.sql} + date = '${params.date}' +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + tcp: { + SQL: [ + { + type: 'histogram', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + *, + SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf +FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( + SELECT + client, + COUNT(0) AS volume, + INT64(summary._connections) AS bin + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${params.date}' AND + ${params.devRankFilter} + ${params.lens.sql} + is_root_page AND + INT64(summary._connections) > 0 + GROUP BY + bin, + client + ) +) +ORDER BY + bin, + client +`) 
+ } + ] + }, + imgLazy: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(COUNT(DISTINCT IF(LOWER(LAX_STRING(attr)) = 'lazy', page, NULL)) * 100 / COUNT(DISTINCT page), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT JOIN + UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.other['img-loading-attr'])) AS attr +WHERE + is_root_page AND + ${params.devRankFilter} + ${params.lens.sql} + date > '2016-01-01' +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + h2: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(r.summary.respHttpVersion) = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'requests')} r +INNER JOIN ${ctx.ref('crawl', 'pages')} +USING (date, client, is_root_page, rank, page) +WHERE + is_root_page AND + ${params.devRankFilter} + ${params.lens.sql} + date = '${params.date}' +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + h3: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND( + SUM( + IF( + LAX_STRING(r.summary.respHttpVersion) IN ('HTTP/3', 'h3', 'h3-29') OR + REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3=%' OR + REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3-29=%', + 1, 0 + ) + ) * 100 / COUNT(0), 2 + ) AS percent +FROM ${ctx.ref('crawl', 'requests')} r +LEFT OUTER JOIN +UNNEST(response_headers) AS resp +ON (resp.name = 'alt-svc') +INNER JOIN ${ctx.ref('crawl', 'pages')} +USING (date, client, is_root_page, rank, page) +WHERE + date = '${params.date}' AND + ${params.devRankFilter} AND + ${params.lens.sql} AND + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + 
fontDisplay: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-display'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page AND + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + LAX_STRING(lighthouse.audits['font-display'].score) IS NOT NULL +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + canonical: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.canonical.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + a11yButtonName: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['button-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + is_root_page AND + ${params.devRankFilter} + ${params.lens.sql} + date = '${params.date}' +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + hreflang: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.hreflang.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 
'pages')} +WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page AND + LAX_STRING(lighthouse.audits.hreflang.score) IS NOT NULL +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + numUrls: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + COUNT(0) AS urls +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + contentIndex: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2983' OR feat.feature = 'ContentIndexAdd') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + legible: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-size'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + lighthouse IS NOT NULL AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page AND + LAX_STRING(lighthouse.audits['font-size'].score) IS NOT NULL +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + a11yColorContrast: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, 
+ client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['color-contrast'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + a11yImageAlt: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['image-alt'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + a11yLabel: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.label.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + a11yLinkName: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + 
timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + a11yScores: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +CREATE TEMPORARY FUNCTION getA11yScore(reportCategories JSON) +RETURNS FLOAT64 DETERMINISTIC +LANGUAGE js AS """ + if(reportCategories) { + return reportCategories.find(i => i.name === 'Accessibility').score; + } +"""; + +SELECT + date, + client, + ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(100)], 2) AS p10, + ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(250)], 2) AS p25, + ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(500)], 2) AS p50, + ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(750)], 2) AS p75, + ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(900)], 2) AS p90 +FROM ( + SELECT + date, + client, + IFNULL(LAX_FLOAT64(lighthouse.categories.accessibility.score) * 100, getA11yScore(lighthouse.reportCategories)) AS score + FROM ${ctx.ref('crawl', 'pages')} + WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +) +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + asyncClipboardRead: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2369' OR feat.feature = 'AsyncClipboardAPIRead') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + badgeClear: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + 
ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2727' OR feat.feature = 'BadgeClear') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + badgeSet: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2726' OR feat.feature = 'BadgeSet') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + getInstalledRelatedApps: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '1870' OR feat.feature = 'V8Navigator_GetInstalledRelatedApps_Method') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + idleDetection: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2834' OR 
feat.feature = 'IdleDetectionStart') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + linkText: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-text'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent +FROM ${ctx.ref('crawl', 'pages')} +WHERE + lighthouse IS NOT NULL AND + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page AND + LAX_STRING(lighthouse.audits['link-text'].score) IS NOT NULL +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client +`) + } + ] + }, + notificationTriggers: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3017' OR feat.feature = 'NotificationShowTrigger') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + periodicBackgroundSync: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2930' OR feat.feature = 'PeriodicBackgroundSync') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + 
client, + num_urls DESC +`) + } + ] + }, + periodicBackgroundSyncRegister: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2931' OR feat.feature = 'PeriodicBackgroundSyncRegister') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + quicTransport: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3184' OR feat.feature = 'QuicTransport') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + screenWakeLock: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3005' OR feat.feature = 'WakeLockAcquireScreenLock') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + storagePersist: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` 
+SELECT + date, + client, + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN + UNNEST(features) AS feat +ON (feat.id = '3018' OR feat.feature = 'DurableStoragePersist') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + swControlledPages: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN + UNNEST(features) AS feat +ON (feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC +`) + } + ] + }, + webSocketStream: { + SQL: [ + { + type: 'timeseries', + query: DataformTemplateBuilder.create((ctx, params) => ` +SELECT + date, + client, + SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) / COUNT(0) * 100, 5) AS percent +FROM ${ctx.ref('crawl', 'pages')} +LEFT OUTER JOIN + UNNEST(features) AS feat +ON (feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor') +WHERE + date = '${params.date}' + ${params.devRankFilter} + ${params.lens.sql} + is_root_page +GROUP BY + date, + timestamp, + client +ORDER BY + date DESC, + client, + num_urls DESC `) } ] diff --git a/script/histogram_storage_sync.js b/script/histogram_storage_sync.js new file mode 100644 
index 0000000..36a7dc9
--- /dev/null
+++ b/script/histogram_storage_sync.js
@@ -0,0 +1,440 @@
+import { Storage } from '@google-cloud/storage'
+import { BigQuery } from '@google-cloud/bigquery'
+import { Readable } from 'stream'
+
+// Configuration
+const CONFIG = {
+  storage: { bucket: 'httparchive', prefix: 'reports/' },
+  bigquery: { projectId: 'httparchive', datasetId: 'reports', tableId: 'histogram1' },
+  // Dates (YYYY-MM-DD) that the main import loop skips
+  skipDates: []
+}
+
+// Object paths to retry before the main run, e.g. entries copied from the
+// "FAILED TASKS (for BACKLOG)" list printed at the end of a previous run
+const BACKLOG = []
+
+const storage = new Storage()
+const bigquery = new BigQuery({ projectId: CONFIG.bigquery.projectId })
+
+// Path segment of each lens; '' addresses the files stored directly under the prefix
+const lenses = ['', 'drupal/', 'magento/', 'top100k/', 'top10k/', 'top1k/', 'top1m/', 'wordpress/']
+
+/**
+ * Builds the list of report dates between two dates (inclusive).
+ *
+ * Every month contributes its 1st; months in years up to and including 2018
+ * also contribute their 15th.
+ *
+ * @param {string} startDate - First date to include, formatted YYYY-MM-DD.
+ * @param {string} endDate - Last date to include, formatted YYYY-MM-DD.
+ * @returns {string[]} Matching dates as YYYY-MM-DD strings, sorted ascending.
+ * @throws {Error} If a date cannot be parsed or startDate is after endDate.
+ */
+function generateHTTPArchiveDates(startDate, endDate) {
+  const dates = []
+  const start = new Date(startDate)
+  const end = new Date(endDate)
+
+  // Validate dates
+  if (isNaN(start.getTime()) || isNaN(end.getTime())) {
+    throw new Error('Invalid date format. Use YYYY-MM-DD format.')
+  }
+
+  if (start > end) {
+    throw new Error('Start date must be before or equal to end date.')
+  }
+
+  // Date-only ISO strings parse as UTC midnight, so read the parts back with
+  // the UTC getters. The local-time getters return the previous day in time
+  // zones behind UTC, which silently dropped the final month of the range.
+  const startYear = start.getUTCFullYear()
+  const startMonth = start.getUTCMonth() + 1
+  const endYear = end.getUTCFullYear()
+  const endMonth = end.getUTCMonth() + 1
+
+  for (let year = startYear; year <= endYear; year++) {
+    const monthStart = (year === startYear) ? startMonth : 1
+    const monthEnd = (year === endYear) ? endMonth : 12
+
+    for (let month = monthStart; month <= monthEnd; month++) {
+      const monthStr = String(month).padStart(2, '0')
+
+      // Always include 1st of month. Comparing strings is safe here because
+      // YYYY-MM-DD values sort lexicographically in chronological order.
+      const firstDate = `${year}-${monthStr}-01`
+      if (firstDate >= startDate && firstDate <= endDate) {
+        dates.push(firstDate)
+      }
+
+      // Add 15th for years up to 2018 (HTTPArchive historical pattern)
+      if (year <= 2018) {
+        const fifteenthDate = `${year}-${monthStr}-15`
+        if (fifteenthDate >= startDate && fifteenthDate <= endDate) {
+          dates.push(fifteenthDate)
+        }
+      }
+    }
+  }
+
+  return dates.sort()
+}
+
+const dates = generateHTTPArchiveDates('2011-06-01', '2025-07-01')
+
+const histogramMetrics = [
+  'bytesCss', 'bytesImg', 'bytesJs', 'bytesOther', 'bytesTotal', 'evalJs', 'gzipSavings', 'speedIndex', 'dcl',
+  'bootupJs', 'bytesFont', 'bytesHtml', 'bytesVideo', 'compileJs', 'fcp', 'imgSavings', 'offscreenImages', 'ol',
+  'optimizedImages', 'reqCss', 'reqFont', 'reqHtml', 'reqImg', 'reqJs', 'reqOther', 'reqTotal', 'reqVideo',
+  'tcp', 'ttci', 'cruxTtfb', 'cruxOl', 'cruxLcp', 'cruxInp', 'cruxFp', 'cruxFcp', 'cruxDcl', 'cruxCls'
+]
+
+// Column layout of the destination table, also sent with every load job
+const SCHEMA = [
+  { name: 'date', type: 'DATE' },
+  { name: 'lens', type: 'STRING' },
+  { name: 'client', type: 'STRING' },
+  { name: 'metric', type: 'STRING' },
+  { name: 'bin', type: 'FLOAT64' },
+  { name: 'volume', type: 'FLOAT64' },
+  { name: 'cdf', type: 'FLOAT64' },
+  { name: 'pdf', type: 'FLOAT64' }
+]
+
+/**
+ * Builds the storage object path of one report file.
+ *
+ * @param {string} date - Report date, formatted YYYY-MM-DD.
+ * @param {string} lens - Lens path segment from `lenses` ('' or 'name/').
+ * @param {string} metric - Metric id from `histogramMetrics`.
+ * @returns {string} Path of the form <prefix><lens>YYYY_MM_DD/<metric>.json.
+ */
+function buildReportFilename(date, lens, metric) {
+  return `${CONFIG.storage.prefix}${lens}${date.replace(/-/g, '_')}/${metric}.json`
+}
+
+/**
+ * Downloads a storage object and returns its contents as text.
+ *
+ * @param {string} filename - Object path inside the configured bucket.
+ * @returns {Promise<string>} The object contents converted to a string.
+ */
+const downloadObject = async (filename) => {
+  const [contents] = await storage.bucket(CONFIG.storage.bucket).file(filename).download()
+  return contents.toString()
+}
+
+/**
+ * Appends rows to the destination table with one load job.
+ *
+ * @param {Object[]} rows - Row objects whose keys match SCHEMA.
+ * @returns {Promise<void>} Resolves when the job completes; rejects with the stream error.
+ */
+async function uploadToBigQuery(rows) {
+  // An empty batch has nothing to load, so no job is submitted
+  if (rows.length === 0) {
+    return
+  }
+
+  return new Promise((resolve, reject) => {
+    const table = bigquery.dataset(CONFIG.bigquery.datasetId).table(CONFIG.bigquery.tableId)
+    const jsonlData = rows.map(row => JSON.stringify(row)).join('\n')
+    const dataStream = Readable.from([jsonlData])
+
+    const writeStream = table.createWriteStream({
+      sourceFormat: 'NEWLINE_DELIMITED_JSON',
+      schema: { fields: SCHEMA },
+      writeDisposition: 'WRITE_APPEND',
+      createDisposition: 'CREATE_IF_NEEDED'
+    })
+
+    writeStream.on('complete', () => {
+      resolve()
+    })
+
+    writeStream.on('error', (error) => {
+      console.error('Upload failed:', error.message)
+      reject(error)
+    })
+
+    dataStream.pipe(writeStream)
+  })
+}
+
+/**
+ * Downloads one report file and converts its entries into table rows.
+ *
+ * Never throws: failures are reported through the returned object.
+ *
+ * @param {string} filename - Object path of the report file.
+ * @param {string} date - Report date (YYYY-MM-DD) stored on every row.
+ * @param {string} lens - Lens name or path segment; the first '/' is removed.
+ * @param {string} metric - Metric id stored on every row.
+ * @returns {Promise<Object>} { filename, success, rows, rowCount, error, isNotFound };
+ *   isNotFound is true when the download failed because the object is missing.
+ */
+async function downloadAndParseFile(filename, date, lens, metric) {
+  try {
+    const data = await downloadObject(filename)
+    const rows = JSON.parse(data).map(item => ({
+      date,
+      lens: lens.replace('/', ''),
+      client: item.client,
+      metric,
+      bin: item.bin,
+      volume: item.volume,
+      cdf: item.cdf,
+      pdf: item.pdf
+    }))
+
+    return {
+      filename,
+      success: true,
+      rows,
+      rowCount: rows.length,
+      error: null,
+      isNotFound: false
+    }
+  } catch (error) {
+    return {
+      filename,
+      success: false,
+      rows: [],
+      rowCount: 0,
+      error: error.message,
+      isNotFound: error.code === 404 || error.message.includes('No such object')
+    }
+  }
+}
+
+/**
+ * Re-imports a single backlog file: downloads, parses and uploads it.
+ *
+ * @param {string} filename - Path like reports/[lens/]YYYY_MM_DD/metric, with or without .json.
+ * @returns {Promise<Object>} Outcome with at least filename and success; `uploaded` is set when an upload was attempted.
+ */
+async function processBacklogFile(filename) {
+  // Extract metadata from filename: reports/[lens]/YYYY_MM_DD/metric.json
+  const match = filename.match(/reports\/(?:([^/]+)\/)?(\d{4}_\d{2}_\d{2})\/(.+?)(?:\.json)?$/)
+  if (!match) {
+    console.error(`Invalid backlog filename format: ${filename}`)
+    return { filename, success: false, error: 'Invalid format' }
+  }
+
+  const [, lensPath = '', dateStr, metric] = match
+  const date = dateStr.replace(/_/g, '-')
+  const lens = lensPath
+
+  // Ensure filename has .json extension
+  const fullFilename = filename.endsWith('.json') ? filename : `${filename}.json`
+
+  const result = await downloadAndParseFile(fullFilename, date, lens, metric)
+
+  // For backlog processing, upload immediately (single files)
+  if (result.success && result.rows.length > 0) {
+    try {
+      await uploadToBigQuery(result.rows)
+      return { ...result, uploaded: true }
+    } catch (error) {
+      return { ...result, success: false, error: error.message, uploaded: false }
+    }
+  }
+
+  return result
+}
+
+/**
+ * Runs one download task and merges the outcome into the task description.
+ *
+ * @param {Object} task - { date, lens, metric, filename, id }.
+ * @returns {Promise<Object>} The task fields overlaid with the downloadAndParseFile result.
+ */
+async function processImportTask(task) {
+  const { date, lens, metric, filename } = task
+  const result = await downloadAndParseFile(filename, date, lens, metric)
+
+  return {
+    ...task,
+    ...result
+  }
+}
+
+/**
+ * Processes every file listed in BACKLOG, one at a time, and logs a summary.
+ *
+ * @returns {Promise<void>}
+ */
+async function processBacklog() {
+  if (!BACKLOG || BACKLOG.length === 0) {
+    console.log('No backlog files to process')
+    return
+  }
+
+  console.log(`\nProcessing ${BACKLOG.length} backlog files...`)
+
+  let successCount = 0
+  let failCount = 0
+
+  for (const filename of BACKLOG) {
+    const result = await processBacklogFile(filename)
+
+    if (result.success) {
+      console.log(`✓ ${result.filename} (${result.rowCount} rows)`)
+      successCount++
+    } else {
+      console.log(`✗ ${result.filename}: ${result.error}`)
+      failCount++
+    }
+  }
+
+  console.log(`\nBacklog completed: ${successCount} successful, ${failCount} failed\n`)
+}
+
+/**
+ * Imports every metric and lens of one date with a single load job.
+ *
+ * @param {string} date - Report date, formatted YYYY-MM-DD.
+ * @returns {Promise<Object>} { date, successCount, notFoundCount, errorCount,
+ *   totalRows, failedTasks }. totalRows counts the rows actually uploaded
+ *   (0 when the upload failed); failedTasks lists the files to retry.
+ */
+async function processDateData(date) {
+  console.log(`\nProcessing date: ${date}`)
+
+  const allRows = []
+  const downloadedFiles = []
+  const failedTasks = []
+  let totalSuccess = 0
+  let totalNotFound = 0
+  let totalErrors = 0
+
+  // Process each metric sequentially
+  for (const [metricIndex, metric] of histogramMetrics.entries()) {
+    console.log(` Processing metric: ${metric} (${metricIndex + 1}/${histogramMetrics.length})`)
+
+    // Download all lenses for this metric in parallel
+    const lensPromises = lenses.map((lens) => processImportTask({
+      date,
+      lens,
+      metric,
+      filename: buildReportFilename(date, lens, metric),
+      id: `${date}-${lens || 'all'}-${metric}`
+    }))
+
+    const results = await Promise.allSettled(lensPromises)
+
+    // Process results for this metric
+    let metricSuccess = 0
+    let metricNotFound = 0
+    let metricErrors = 0
+
+    results.forEach((result, index) => {
+      if (result.status === 'fulfilled') {
+        const taskResult = result.value
+        if (taskResult.success) {
+          // Append row by row; spreading a large array into push() can
+          // overflow the call stack
+          for (const row of taskResult.rows) {
+            allRows.push(row)
+          }
+          downloadedFiles.push(taskResult.filename)
+          metricSuccess++
+          totalSuccess++
+        } else if (taskResult.isNotFound) {
+          metricNotFound++
+          totalNotFound++
+        } else {
+          metricErrors++
+          totalErrors++
+          failedTasks.push(taskResult.filename)
+          console.error(` ✗ ${taskResult.id}: ${taskResult.error}`)
+        }
+      } else {
+        metricErrors++
+        totalErrors++
+        const lens = lenses[index]
+        failedTasks.push(buildReportFilename(date, lens, metric))
+        console.error(` ✗ ${date}-${lens || 'all'}-${metric}: ${result.reason?.message || 'Unknown error'}`)
+      }
+    })
+
+    console.log(` ${metricSuccess} success, ${metricNotFound} not found, ${metricErrors} errors`)
+  }
+
+  console.log(` Total files: ${totalSuccess} success, ${totalNotFound} not found, ${totalErrors} errors`)
+
+  // Upload all data for this date in a single operation
+  let uploadedRows = 0
+  if (allRows.length > 0) {
+    console.log(` Uploading ${allRows.length.toLocaleString()} rows to BigQuery...`)
+    try {
+      await uploadToBigQuery(allRows)
+      uploadedRows = allRows.length
+      console.log(` ✓ Successfully uploaded all data for ${date}`)
+    } catch (error) {
+      console.error(` ✗ Failed to upload data for ${date}: ${error.message}`)
+      // Nothing was loaded, so every downloaded file has to be retried. Files
+      // that were not found are left out because a retry cannot succeed.
+      failedTasks.push(...downloadedFiles)
+    }
+  } else {
+    console.log(` No data to upload for ${date}`)
+  }
+
+  return {
+    date,
+    successCount: totalSuccess,
+    notFoundCount: totalNotFound,
+    errorCount: totalErrors,
+    totalRows: uploadedRows,
+    failedTasks
+  }
+}
+
+/**
+ * Entry point: processes the backlog, then every generated date in order,
+ * and prints a summary plus the files that should go into BACKLOG.
+ *
+ * @returns {Promise<void>}
+ */
+async function importHistogramData() {
+  // Process backlog first
+  await processBacklog()
+
+  console.log(`Processing ${dates.length} dates`)
+
+  let totalSuccess = 0
+  let totalNotFound = 0
+  let totalErrors = 0
+  let totalRows = 0
+  const allFailedTasks = []
+
+  for (const date of dates) {
+    if (CONFIG.skipDates.includes(date)) {
+      console.log(`Skipping date: ${date}`)
+      continue
+    }
+
+    const dateResult = await processDateData(date)
+
+    totalSuccess += dateResult.successCount
+    totalNotFound += dateResult.notFoundCount
+    totalErrors += dateResult.errorCount
+    totalRows += dateResult.totalRows
+    allFailedTasks.push(...dateResult.failedTasks)
+  }
+
+  console.log('\n=== FINAL SUMMARY ===')
+  console.log(`Dates processed: ${dates.filter(d => !CONFIG.skipDates.includes(d)).length}`)
+  console.log(`Total files successful: ${totalSuccess}`)
+  console.log(`Total files not found: ${totalNotFound}`)
+  console.log(`Total files with errors: ${totalErrors}`)
+  console.log(`Total rows uploaded: ${totalRows.toLocaleString()}`)
+
+  if (allFailedTasks.length > 0) {
+    console.log('\n=== FAILED TASKS (for BACKLOG) ===')
+    allFailedTasks.forEach(filename => console.log(` '${filename}',`))
+  }
+}
+
+importHistogramData().catch(console.error)
diff --git a/script/package.json b/script/package.json
new file mode 100644
index 0000000..dc5df04
--- /dev/null
+++ b/script/package.json
@@ -0,0 +1,7 @@
+{
+  "type": "module",
+  "dependencies": {
+    "@google-cloud/bigquery": "^7.9.1",
+    "@google-cloud/storage": "^7.14.0"
+  }
+}
diff --git a/script/timeseries_storage_sync.js b/script/timeseries_storage_sync.js
new file mode 100644
index 0000000..fc0b38b
--- /dev/null
+++ b/script/timeseries_storage_sync.js
@@ -0,0 +1,256 @@
+import { Storage } from '@google-cloud/storage'
+import { BigQuery } from '@google-cloud/bigquery'
+import { Readable } from 'stream'
+
+const storage = new Storage()
+const bucketName = 'httparchive'
+const storagePathPrefix = 'reports/'
+
+const bigquery = new BigQuery({ projectId: 'httparchive' })
+const datasetId = 'reports'
+const tableId = 'timeseries'
+
+// Column layout of the destination table, shared by table creation and load jobs
+const SCHEMA = [
+  { name: 'date', type: 'DATE' },
+  { name: 'client', type: 'STRING' },
+  { name: 'lens', type: 'STRING' },
+  { name: 'metric', type: 'STRING' },
+  { name: 'percent', type: 'FLOAT64' }
+]
+
+// Path segment of each lens; '' addresses the files stored directly under the prefix
+const lenses = [
+  '',
+  'drupal/',
+  'magento/',
+  'top100k/',
+  'top10k/',
+  'top1k/',
+  'top1m/',
+  'wordpress/'
+]
+
+// Metrics whose <metric>.json timeseries file is imported for every lens
+const timeseriesMetrics = new Set([
+  'a11yButtonName',
+  'a11yColorContrast',
+  'a11yImageAlt',
+  'a11yLabel',
+  'a11yLinkName',
+  'a11yScores',
+  'asyncClipboardRead',
+  'badgeClear',
+  'badgeSet',
+  'bootupJs',
+  'bytesCss',
+  'bytesFont',
+  'bytesHtml',
+  'bytesImg',
+  'bytesJs',
+  'bytesOther',
+  'bytesTotal',
+  'bytesVideo',
+  'canonical',
+  'contentIndex',
+  'cruxFastDcl',
+  'cruxFastFcp',
+  'cruxFastFp',
+  'cruxFastInp',
+  'cruxFastLcp',
+  'cruxFastOl',
+  'cruxFastTtfb',
+  'cruxLargeCls',
+  'cruxPassesCWV',
+  'cruxSlowFcp',
+  'cruxSlowInp',
+  'cruxSlowLcp',
+  'cruxSlowTtfb',
+  'cruxSmallCls',
+  'dcl',
+  'fcp',
+  'fontDisplay',
+  'getInstalledRelatedApps',
+  'gzipSavings',
+  'h2',
+  'h3',
+  'hreflang',
+  'idleDetection',
+  'imgLazy',
+  'imgSavings',
+  'legible',
+  'linkText',
+  'notificationTriggers',
+  'numUrls',
+  'offscreenImages',
+  'ol',
+  'optimizedImages',
+  'pctHttps',
+  'periodicBackgroundSync',
+  'periodicBackgroundSyncRegister',
+  'quicTransport',
+  'reqCss',
+  'reqFont',
+  'reqHtml',
+  'reqImg',
+  'reqJs',
+  'reqOther',
+  'reqTotal',
+  'reqVideo',
+  'screenWakeLock',
+  'speedIndex',
+  'storageEstimate',
+  'storagePersist',
+  'swControlledPages',
+  'tcp',
+  'ttci',
+  'webSocketStream'
+])
+
+/**
+ * Downloads a storage object and returns its contents as text.
+ *
+ * @param {string} bucketName - Bucket that holds the object.
+ * @param {string} srcFilename - Object path inside the bucket.
+ * @returns {Promise<string>} The object contents converted to a string.
+ */
+async function downloadObject(bucketName, srcFilename) {
+  const [contents] = await storage.bucket(bucketName).file(srcFilename).download()
+
+  return contents.toString()
+}
+
+/**
+ * Creates the destination table when it does not exist yet.
+ *
+ * @returns {Promise<void>}
+ * @throws Rethrows any error raised while checking for or creating the table.
+ */
+async function ensureTableExists() {
+  const table = bigquery.dataset(datasetId).table(tableId)
+
+  try {
+    const [exists] = await table.exists()
+    if (!exists) {
+      console.log(`Creating table ${datasetId}.${tableId}`)
+      await table.create({
+        schema: SCHEMA,
+        location: 'US',
+        timePartitioning: {
+          type: 'DAY',
+          field: 'date'
+        },
+        clustering: {
+          fields: ['client', 'lens']
+        }
+      })
+      console.log(`Table ${datasetId}.${tableId} created successfully with partitioning and clustering`)
+    } else {
+      console.log(`Table ${datasetId}.${tableId} already exists`)
+    }
+  } catch (error) {
+    console.error('Error checking/creating table:', error)
+    throw error
+  }
+}
+
+/**
+ * Appends rows to the destination table with one load job.
+ *
+ * @param {Object[]} rows - Row objects whose keys match SCHEMA.
+ * @returns {Promise<Object|undefined>} The completed job, or undefined when rows is empty.
+ */
+async function uploadToBigQuery(rows) {
+  // An empty batch has nothing to load, so no job is submitted
+  if (rows.length === 0) {
+    console.log('No rows to upload - skipping')
+    return undefined
+  }
+
+  return new Promise((resolve, reject) => {
+    try {
+      const table = bigquery.dataset(datasetId).table(tableId)
+
+      // Convert rows to JSONL format
+      const jsonlData = rows.map(row => JSON.stringify(row)).join('\n')
+
+      // Create a readable stream from the JSONL data
+      const dataStream = Readable.from([jsonlData])
+
+      // Create write stream with metadata
+      const writeStream = table.createWriteStream({
+        sourceFormat: 'NEWLINE_DELIMITED_JSON',
+        schema: {
+          fields: SCHEMA
+        },
+        writeDisposition: 'WRITE_APPEND',
+        createDisposition: 'CREATE_NEVER' // Table should already exist
+      })
+
+      // Handle events
+      writeStream.on('job', (job) => {
+        console.log(`Write stream job ${job.id} started`)
+      })
+
+      writeStream.on('complete', (job) => {
+        console.log(`Successfully uploaded ${rows.length} rows using write stream`)
+        resolve(job)
+      })
+
+      writeStream.on('error', (error) => {
+        console.error('Error in write stream:', error)
+        reject(error)
+      })
+
+      // Pipe the data stream to the write stream
+      dataStream.pipe(writeStream)
+    } catch (error) {
+      console.error('Error setting up write stream:', error)
+      reject(error)
+    }
+  })
+}
+
+/**
+ * Entry point: imports the timeseries file of every metric and lens.
+ *
+ * Best effort: a missing or failing file is logged and the run continues.
+ *
+ * @returns {Promise<void>}
+ */
+async function importTimeseriesData() {
+  // Ensure the destination table exists before importing data
+  await ensureTableExists()
+
+  for (const metric of timeseriesMetrics) {
+    for (const lens of lenses) {
+      const srcFilename = `${storagePathPrefix}${lens}${metric}.json`
+      console.log(`Downloading ${srcFilename}`)
+
+      try {
+        const data = await downloadObject(bucketName, srcFilename)
+
+        const rows = JSON.parse(data).map(point => ({
+          date: point.date.replace(/_/g, '-'),
+          client: point.client,
+          lens: lens.replace('/', ''),
+          metric,
+          percent: point.percent
+        }))
+
+        console.log(`Uploading ${rows.length} rows to BigQuery`)
+
+        await uploadToBigQuery(rows)
+      } catch (error) {
+        // Log the failure and carry on with the next file
+        if (error.code === 404 || error.message.includes('No such object')) {
+          console.log(`File not found: ${srcFilename} - skipping`)
+        } else {
+          console.error(`Error processing ${srcFilename}:`, error.message)
+        }
+      }
+    }
+  }
+}
+
+importTimeseriesData().catch(console.error)