Skip to content

Commit

Permalink
Added additional queries, and a way to process twice grouped data
Browse files Browse the repository at this point in the history
  • Loading branch information
rmarx committed Jan 25, 2024
1 parent 12c90a3 commit 2c768bd
Show file tree
Hide file tree
Showing 18 changed files with 226 additions and 9 deletions.
1 change: 1 addition & 0 deletions data-cache/country_devicetype.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data-cache/devicemodel.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data-cache/devicemodel_country_devicetype.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data-cache/protocol.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data-output/beacontype_devicetype.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data-output/country_devicetype.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data-output/devicemodel.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data-output/devicemodel_country_devicetype.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data-output/devicetype.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data-output/os_devicetype.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data-output/protocol.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"date":"2021_10_01","percent":"33.5","protocol":null,"timestamp":"1633046400000"},{"date":"2021_10_01","percent":"59.0","protocol":"h2","timestamp":"1633046400000"},{"date":"2021_10_01","percent":"7.5","protocol":"http/1.1","timestamp":"1633046400000"},{"date":"2021_11_01","percent":"30.8","protocol":null,"timestamp":"1635724800000"},{"date":"2021_11_01","percent":"63.0","protocol":"h2","timestamp":"1635724800000"},{"date":"2021_11_01","percent":"6.2","protocol":"http/1.1","timestamp":"1635724800000"},{"date":"2021_12_01","percent":"27.8","protocol":null,"timestamp":"1638316800000"},{"date":"2021_12_01","percent":"66.6","protocol":"h2","timestamp":"1638316800000"},{"date":"2021_12_01","percent":"5.6","protocol":"http/1.1","timestamp":"1638316800000"},{"date":"2022_01_01","percent":"32.1","protocol":null,"timestamp":"1640995200000"},{"date":"2022_01_01","percent":"64.4","protocol":"h2","timestamp":"1640995200000"},{"date":"2022_01_01","percent":"3.5","protocol":"http/1.1","timestamp":"1640995200000"},{"date":"2022_02_01","percent":"28.2","protocol":null,"timestamp":"1643673600000"},{"date":"2022_02_01","percent":"67.1","protocol":"h2","timestamp":"1643673600000"},{"date":"2022_02_01","percent":"4.7","protocol":"http/1.1","timestamp":"1643673600000"},{"date":"2022_03_01","percent":"28.1","protocol":null,"timestamp":"1646092800000"},{"date":"2022_03_01","percent":"67.6","protocol":"h2","timestamp":"1646092800000"},{"date":"2022_03_01","percent":"4.3","protocol":"http/1.1","timestamp":"1646092800000"},{"date":"2022_04_01","percent":"27.1","protocol":null,"timestamp":"1648771200000"},{"date":"2022_04_01","percent":"66.9","protocol":"h2","timestamp":"1648771200000"},{"date":"2022_04_01","percent":"6.0","protocol":"http/1.1","timestamp":"1648771200000"},{"date":"2022_05_01","percent":"27.4","protocol":null,"timestamp":"1651363200000"},{"date":"2022_05_01","percent":"67.8","protocol":"h2","timestamp":"1651363200000"},{"date":"2022_05_01","percent":"4.8","protocol":"http/1
.1","timestamp":"1651363200000"},{"date":"2022_06_01","percent":"25.1","protocol":null,"timestamp":"1654041600000"},{"date":"2022_06_01","percent":"68.3","protocol":"h2","timestamp":"1654041600000"},{"date":"2022_06_01","percent":"6.6","protocol":"http/1.1","timestamp":"1654041600000"},{"date":"2022_07_01","percent":"21.5","protocol":null,"timestamp":"1656633600000"},{"date":"2022_07_01","percent":"72.2","protocol":"h2","timestamp":"1656633600000"},{"date":"2022_07_01","percent":"6.3","protocol":"http/1.1","timestamp":"1656633600000"},{"date":"2022_08_01","percent":"20.9","protocol":null,"timestamp":"1659312000000"},{"date":"2022_08_01","percent":"73.7","protocol":"h2","timestamp":"1659312000000"},{"date":"2022_08_01","percent":"5.5","protocol":"http/1.1","timestamp":"1659312000000"},{"date":"2022_09_01","percent":"20.1","protocol":null,"timestamp":"1661990400000"},{"date":"2022_09_01","percent":"74.0","protocol":"h2","timestamp":"1661990400000"},{"date":"2022_09_01","percent":"0.3","protocol":"h3","timestamp":"1661990400000"},{"date":"2022_09_01","percent":"5.5","protocol":"http/1.1","timestamp":"1661990400000"},{"date":"2022_10_01","percent":"27.7","protocol":null,"timestamp":"1664582400000"},{"date":"2022_10_01","percent":"68.0","protocol":"h2","timestamp":"1664582400000"},{"date":"2022_10_01","percent":"0.3","protocol":"h3","timestamp":"1664582400000"},{"date":"2022_10_01","percent":"3.9","protocol":"http/1.1","timestamp":"1664582400000"},{"date":"2022_11_01","percent":"24.7","protocol":null,"timestamp":"1667260800000"},{"date":"2022_11_01","percent":"70.6","protocol":"h2","timestamp":"1667260800000"},{"date":"2022_11_01","percent":"0.3","protocol":"h3","timestamp":"1667260800000"},{"date":"2022_11_01","percent":"4.4","protocol":"http/1.1","timestamp":"1667260800000"},{"date":"2022_12_01","percent":"24.4","protocol":null,"timestamp":"1669852800000"},{"date":"2022_12_01","percent":"71.8","protocol":"h2","timestamp":"1669852800000"},{"date":"2022_12_01","percent":
"0.3","protocol":"h3","timestamp":"1669852800000"},{"date":"2022_12_01","percent":"3.4","protocol":"http/1.1","timestamp":"1669852800000"},{"date":"2023_01_01","percent":"25.5","protocol":null,"timestamp":"1672531200000"},{"date":"2023_01_01","percent":"70.7","protocol":"h2","timestamp":"1672531200000"},{"date":"2023_01_01","percent":"0.3","protocol":"h3","timestamp":"1672531200000"},{"date":"2023_01_01","percent":"3.4","protocol":"http/1.1","timestamp":"1672531200000"},{"date":"2023_02_01","percent":"26.8","protocol":null,"timestamp":"1675209600000"},{"date":"2023_02_01","percent":"67.5","protocol":"h2","timestamp":"1675209600000"},{"date":"2023_02_01","percent":"0.3","protocol":"h3","timestamp":"1675209600000"},{"date":"2023_02_01","percent":"5.4","protocol":"http/1.1","timestamp":"1675209600000"},{"date":"2023_03_01","percent":"26.3","protocol":null,"timestamp":"1677628800000"},{"date":"2023_03_01","percent":"68.4","protocol":"h2","timestamp":"1677628800000"},{"date":"2023_03_01","percent":"0.3","protocol":"h3","timestamp":"1677628800000"},{"date":"2023_03_01","percent":"5.0","protocol":"http/1.1","timestamp":"1677628800000"},{"date":"2023_04_01","percent":"26.9","protocol":null,"timestamp":"1680307200000"},{"date":"2023_04_01","percent":"69.0","protocol":"h2","timestamp":"1680307200000"},{"date":"2023_04_01","percent":"0.3","protocol":"h3","timestamp":"1680307200000"},{"date":"2023_04_01","percent":"3.8","protocol":"http/1.1","timestamp":"1680307200000"},{"date":"2023_05_01","percent":"25.5","protocol":null,"timestamp":"1682899200000"},{"date":"2023_05_01","percent":"68.8","protocol":"h2","timestamp":"1682899200000"},{"date":"2023_05_01","percent":"0.3","protocol":"h3","timestamp":"1682899200000"},{"date":"2023_05_01","percent":"5.4","protocol":"http/1.1","timestamp":"1682899200000"},{"date":"2023_06_01","percent":"22.8","protocol":null,"timestamp":"1685577600000"},{"date":"2023_06_01","percent":"72.9","protocol":"h2","timestamp":"1685577600000"},{"date":"2023_0
6_01","percent":"0.4","protocol":"h3","timestamp":"1685577600000"},{"date":"2023_06_01","percent":"3.9","protocol":"http/1.1","timestamp":"1685577600000"},{"date":"2023_07_01","percent":"26.9","protocol":null,"timestamp":"1688169600000"},{"date":"2023_07_01","percent":"68.2","protocol":"h2","timestamp":"1688169600000"},{"date":"2023_07_01","percent":"0.4","protocol":"h3","timestamp":"1688169600000"},{"date":"2023_07_01","percent":"4.6","protocol":"http/1.1","timestamp":"1688169600000"},{"date":"2023_08_01","percent":"23.3","protocol":null,"timestamp":"1690848000000"},{"date":"2023_08_01","percent":"72.3","protocol":"h2","timestamp":"1690848000000"},{"date":"2023_08_01","percent":"0.6","protocol":"h3","timestamp":"1690848000000"},{"date":"2023_08_01","percent":"3.8","protocol":"http/1.1","timestamp":"1690848000000"},{"date":"2023_09_01","percent":"18.9","protocol":null,"timestamp":"1693526400000"},{"date":"2023_09_01","percent":"75.9","protocol":"h2","timestamp":"1693526400000"},{"date":"2023_09_01","percent":"1.3","protocol":"h3","timestamp":"1693526400000"},{"date":"2023_09_01","percent":"3.9","protocol":"http/1.1","timestamp":"1693526400000"},{"date":"2023_10_01","percent":"21.5","protocol":null,"timestamp":"1696118400000"},{"date":"2023_10_01","percent":"72.4","protocol":"h2","timestamp":"1696118400000"},{"date":"2023_10_01","percent":"1.4","protocol":"h3","timestamp":"1696118400000"},{"date":"2023_10_01","percent":"4.7","protocol":"http/1.1","timestamp":"1696118400000"},{"date":"2023_11_01","percent":"21.0","protocol":null,"timestamp":"1698796800000"},{"date":"2023_11_01","percent":"73.1","protocol":"h2","timestamp":"1698796800000"},{"date":"2023_11_01","percent":"1.6","protocol":"h3","timestamp":"1698796800000"},{"date":"2023_11_01","percent":"4.3","protocol":"http/1.1","timestamp":"1698796800000"},{"date":"2023_12_01","percent":"20.5","protocol":null,"timestamp":"1701388800000"},{"date":"2023_12_01","percent":"73.9","protocol":"h2","timestamp":"1701388800000"}
,{"date":"2023_12_01","percent":"1.6","protocol":"h3","timestamp":"1701388800000"},{"date":"2023_12_01","percent":"4.1","protocol":"http/1.1","timestamp":"1701388800000"}]
2 changes: 1 addition & 1 deletion data-output/useragentfamily_devicetype.json

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions pipeline.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ async function processResults( bigQueryResults ) {
await fs.writeFile( outputPath, JSON.stringify(processedData), "utf8" );

}
else if ( query.processingtype === "groupedMetricPerDevice" ){
let processedData = [];
processedData = processing.processGroupedMetricPerDevicetype( data, query.extractmetric, query.groupby );

await fs.writeFile( outputPath, JSON.stringify(processedData), "utf8" );
}
else if ( query.processingtype === "metricGlobal" ){
let processedData = [];
processedData = processing.processSingleMetricGlobal( data, query.extractmetric );
Expand Down
16 changes: 16 additions & 0 deletions queries/country_devicetype.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"description": "Timeseries of the different countries per device type, ordered by occurrence seen in the RUM Archive dataset",
"datetype": "timeseries",
"processingtype": "metricPerDevice",
"extractmetric": "country",
"sql": "
SELECT DATE as date,
COUNTRY as country,
DEVICETYPE as device,
COUNT(*) as rowcount,
SUM(BEACONS) as beaconcount
FROM `akamai-mpulse-rumarchive.rumarchive.rumarchive_page_loads`
WHERE {{TIMESERIES_DATES}}
GROUP BY DATE, DEVICETYPE, COUNTRY
ORDER BY DATE ASC, DEVICETYPE ASC, beaconcount DESC"
}
15 changes: 15 additions & 0 deletions queries/devicemodel.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"description": "Timeseries of the different device models seen in the RUM Archive dataset",
"datetype": "timeseries",
"processingtype": "metricGlobal",
"extractmetric": "model",
"sql": "
SELECT DATE as date,
DEVICEMODEL as model,
COUNT(*) as rowcount,
SUM(BEACONS) as beaconcount
FROM `akamai-mpulse-rumarchive.rumarchive.rumarchive_page_loads`
WHERE {{TIMESERIES_DATES}}
GROUP BY DATE, DEVICEMODEL
ORDER BY DATE ASC, BEACONCOUNT ASC"
}
19 changes: 19 additions & 0 deletions queries/devicemodel_country_devicetype.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"description": "Timeseries of the different device models per country per device type, ordered by occurrence seen in the RUM Archive dataset",
"datetype": "single",
"processingtype": "groupedMetricPerDevice",
"extractmetric": "model",
"groupby": "country",
"comments": "We exclude null values here due to the already high cardinality of results. For the same reason, we limit to a single date instead of a timeseries.",
"sql": "
SELECT DATE as date,
DEVICEMODEL as model,
COUNTRY as country,
DEVICETYPE as device,
COUNT(*) as rowcount,
SUM(BEACONS) as beaconcount
FROM `akamai-mpulse-rumarchive.rumarchive.rumarchive_page_loads`
WHERE DEVICETYPE IS NOT NULL and COUNTRY IS NOT NULL AND DEVICEMODEL IS NOT NULL AND {{LAST_DATE}}
GROUP BY DATE, DEVICETYPE, COUNTRY, DEVICEMODEL
ORDER BY DATE ASC, DEVICETYPE ASC, COUNTRY ASC, beaconcount DESC"
}
15 changes: 15 additions & 0 deletions queries/protocol.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"description": "Timeseries of the different HTTP protocol versions seen in the RUM Archive dataset",
"datetype": "timeseries",
"processingtype": "metricGlobal",
"extractmetric": "protocol",
"sql": "
SELECT DATE as date,
PROTOCOL as protocol,
COUNT(*) as rowcount,
SUM(BEACONS) as beaconcount
FROM `akamai-mpulse-rumarchive.rumarchive.rumarchive_page_loads`
WHERE {{TIMESERIES_DATES}}
GROUP BY DATE, PROTOCOL
ORDER BY DATE ASC, PROTOCOL ASC"
}
148 changes: 143 additions & 5 deletions src/processing.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,18 @@ function processSingleMetricGlobal(rows, metricFieldName){
datapoint.date = row.date.value.replaceAll("-", "_"); // "2022-12-01" to "2022_12_01"

const dateTotalBeaconCount = dateCounts.get( row.date.value );
datapoint.percent = ((row.beaconcount / dateTotalBeaconCount) * 100).toFixed(1);
let percent = ((row.beaconcount / dateTotalBeaconCount) * 100);
datapoint.percent = percent.toFixed(1);

datapoint[metricFieldName] = row[metricFieldName]; // e.g., .device

// highcharts uses raw timestamps, so pre-calculate them
datapoint.timestamp = "" + (new Date( row.date.value ).getTime());

output.push( datapoint );

if ( percent > 0.1 ) {
// skip negligible entries (at or below 0.1 percent, per the check above); especially in high-cardinality queries, these often account for many megabytes of output data
output.push( datapoint );
}
}

return output;
Expand Down Expand Up @@ -164,14 +168,147 @@ function processSingleMetricPerDevicetype(rows, metricFieldName) {
datapoint.date = row.date.value.replaceAll("-", "_"); // "2022-12-01" to "2022_12_01"

const deviceCount = dateCounts.get( row.date.value ).get( row.device );
datapoint.percent = ((row.beaconcount / deviceCount.beaconCount) * 100).toFixed(1);
const percent = ((row.beaconcount / deviceCount.beaconCount) * 100);
datapoint.percent = percent.toFixed(1);

datapoint[metricFieldName] = row[metricFieldName]; // e.g., .protocol, .useragent, etc.

// highcharts uses raw timestamps, so pre-calculate them
datapoint.timestamp = "" + (new Date( row.date.value ).getTime());

output.push( datapoint );
if( percent > 0.1 ) {
// skip negligible entries (at or below 0.1 percent, per the check above); especially in high-cardinality queries, these often account for many megabytes of output data
output.push( datapoint );
}
}

return output;
}


function processGroupedMetricPerDevicetype(rows, metricFieldName, groupbyFieldName) {
/*
Computes, for every row, the share (in percent) of that row's beacons within its
date + devicetype + groupby-dimension bucket, and converts the row to the flat output format.

This is similar to processSingleMetricPerDevicetype, but the metrics are grouped in another dimension
For example, SingleMetric would be
deviceModel per deviceType
while GroupedMetric would be
deviceModel per country per deviceType
Where the results are again grouped by country.

Parameters:
rows: array of raw bigquery result rows (see example below); each row is read for
.date.value, .device, .rowcount, .beaconcount, [metricFieldName] and [groupbyFieldName]
metricFieldName: name of the row field holding the metric value (e.g., "model"); copied as-is to the output
groupbyFieldName: name of the row field holding the extra grouping dimension (e.g., "country");
the output key is the lowercased field name

Returns:
array of datapoints { client, [groupbyFieldName.toLowerCase()], date, [metricFieldName], percent, timestamp },
in the same order as the input rows, with rows at or below 0.1 percent left out.

Incoming data are rows of raw bigquery results.
We assume the rows are ordered by ASC date!
(the output simply preserves the input order; nothing below re-sorts)
For example:
{
date: { value: '2022-12-01' },
model: 'Apple iPhone',
country: 'US',
device: 'Desktop',
rowcount: 1,
beaconcount: 9
},
We need to do multiple things:
1. Group by extra dimension (groupbyFieldName) + devicetype + date to find out the total row and beacon counts of that dimension+device+date
--> (so we can have a percentage for each individual value for that date, since not all dates have an equal amount of beacons)
2. Transform the output to the expected format (same as the one HTTPArchive uses)
(see processSingleMetricPerDevicetype for more details on that)
*/

// TODO: this could benefit from a more generic implementation that allows any sequence of groupings (i.e., treating date and devicetype as a normal grouping as well)
// For now, I decided to keep them split for clarity and to wait until clear patterns emerge before deciding to refactor

// Note: conceptually, I could use subsequent .filter() calls to get only the individual sets I need, but this would mean iterating over the data multiple times and creating new arrays each time
// the approach below is more annoying to program, but should be more efficient (looping over data just once to set up the main data structure, then once to transform to output format)


// 1. Aggregate counts per groupbyFieldName + device + date so we can calculate percentages later
// the output is grouped by date first, so keep that structure here (so dateCounts = Map<DATESTRING, Map<DEVICESTRING, Map<GROUPBYVALUE, Counts>>>)
const dateCounts = new Map();

for( const row of rows ) {
const date = row.date.value;
const device = row.device;
const groupbyDimension = row[ groupbyFieldName ]; // e.g., "country"

// lazily create the nested maps the first time a date / device is seen
let dateEntry = dateCounts.get( date );
if ( !dateEntry ){
dateEntry = new Map();
dateCounts.set( date, dateEntry );
}

let deviceEntry = dateEntry.get( device );
if ( !deviceEntry ) {
deviceEntry = new Map();
dateEntry.set( device, deviceEntry );
}

// first row of a bucket seeds the counters, later rows add to them
let groupbyEntry = deviceEntry.get( groupbyDimension );
if ( !groupbyEntry ) {
groupbyEntry = {
rowCount: row.rowcount,
beaconCount: row.beaconcount
};
deviceEntry.set( groupbyDimension, groupbyEntry );
}
else {
groupbyEntry.rowCount += row.rowcount;
groupbyEntry.beaconCount += row.beaconcount;
}
}

// 2. Transform to output format + calculate percentages
const output = [];
for( const row of rows ) {
/*
From (with metricFieldName = "model" and groupbyFieldName = "country")
{
date: { value: '2022-12-01' },
model: 'Apple iPhone',
country: 'US',
device: 'Desktop',
rowcount: 1,
beaconcount: 9
},
To
{
"client": "desktop",
"country": "US",
"date": "2022_12_01",
"model": "Apple iPhone",
"percent": "12.3", (illustrative value only)
"timestamp": "1669852800000"
},
*/

const datapoint = {};

// rows without a device value are reported under the "unknown" client
if ( row.device )
datapoint.client = row.device.toLowerCase();
else
datapoint.client = "unknown";

const groupbyValue = row[ groupbyFieldName ];

// same fallback for a missing groupby value (only the key name is lowercased, not the value)
if ( groupbyValue ) {
datapoint[ groupbyFieldName.toLowerCase() ] = groupbyValue;
}
else {
datapoint[ groupbyFieldName.toLowerCase() ] = "unknown";
}

datapoint.date = row.date.value.replaceAll("-", "_"); // "2022-12-01" to "2022_12_01"

datapoint[metricFieldName] = row[metricFieldName]; // e.g., .protocol, .useragent, .model, etc.

// look up the bucket totals with the raw row values (not the "unknown" placeholders), since those are the keys used in step 1
const dimensionCount = dateCounts.get( row.date.value ).get( row.device ).get( groupbyValue );

// share of this row's beacons within its date + device + groupby bucket
// NOTE(review): a bucket whose beaconCount sums to 0 would yield NaN here; such rows are then dropped by the check below
const percent = ((row.beaconcount / dimensionCount.beaconCount) * 100);
datapoint.percent = percent.toFixed(1);

// highcharts uses raw timestamps, so pre-calculate them
datapoint.timestamp = "" + (new Date( row.date.value ).getTime());

if( percent > 0.1 ) {
// skip negligible entries (especially in high-cardinality queries, these often account for many megabytes of output data)
// NOTE(review): this drops everything at or below 0.1 percent, not just values that print as "0.0"
// (e.g., 0.07 would print as "0.1" but is skipped) - confirm whether the intended cutoff is 0.05
output.push( datapoint );
}
}

return output;
Expand Down Expand Up @@ -429,6 +566,7 @@ function processCWVperUseragent(data) {
module.exports = {
processSingleMetricGlobal,
processSingleMetricPerDevicetype,
processGroupedMetricPerDevicetype,
processHistogramPerDevicetype,
processCWVperUseragent
}

0 comments on commit 2c768bd

Please sign in to comment.