Added additional queries, and a way to process twice grouped data

rum-archive · Jan 25, 2024 · 2c768bd · 2c768bd
1 parent 12c90a3
commit 2c768bd
Show file tree

Hide file tree

Showing 18 changed files with 226 additions and 9 deletions.
diff --git a/data-cache/country_devicetype.json b/data-cache/country_devicetype.json
diff --git a/data-cache/devicemodel.json b/data-cache/devicemodel.json
diff --git a/data-cache/devicemodel_country_devicetype.json b/data-cache/devicemodel_country_devicetype.json
diff --git a/data-cache/protocol.json b/data-cache/protocol.json
diff --git a/data-output/beacontype_devicetype.json b/data-output/beacontype_devicetype.json
diff --git a/data-output/country_devicetype.json b/data-output/country_devicetype.json
diff --git a/data-output/devicemodel.json b/data-output/devicemodel.json
diff --git a/data-output/devicemodel_country_devicetype.json b/data-output/devicemodel_country_devicetype.json
diff --git a/data-output/devicetype.json b/data-output/devicetype.json
diff --git a/data-output/os_devicetype.json b/data-output/os_devicetype.json
diff --git a/data-output/protocol.json b/data-output/protocol.json
@@ -0,0 +1 @@
+[{"date":"2021_10_01","percent":"33.5","protocol":null,"timestamp":"1633046400000"},{"date":"2021_10_01","percent":"59.0","protocol":"h2","timestamp":"1633046400000"},{"date":"2021_10_01","percent":"7.5","protocol":"http/1.1","timestamp":"1633046400000"},{"date":"2021_11_01","percent":"30.8","protocol":null,"timestamp":"1635724800000"},{"date":"2021_11_01","percent":"63.0","protocol":"h2","timestamp":"1635724800000"},{"date":"2021_11_01","percent":"6.2","protocol":"http/1.1","timestamp":"1635724800000"},{"date":"2021_12_01","percent":"27.8","protocol":null,"timestamp":"1638316800000"},{"date":"2021_12_01","percent":"66.6","protocol":"h2","timestamp":"1638316800000"},{"date":"2021_12_01","percent":"5.6","protocol":"http/1.1","timestamp":"1638316800000"},{"date":"2022_01_01","percent":"32.1","protocol":null,"timestamp":"1640995200000"},{"date":"2022_01_01","percent":"64.4","protocol":"h2","timestamp":"1640995200000"},{"date":"2022_01_01","percent":"3.5","protocol":"http/1.1","timestamp":"1640995200000"},{"date":"2022_02_01","percent":"28.2","protocol":null,"timestamp":"1643673600000"},{"date":"2022_02_01","percent":"67.1","protocol":"h2","timestamp":"1643673600000"},{"date":"2022_02_01","percent":"4.7","protocol":"http/1.1","timestamp":"1643673600000"},{"date":"2022_03_01","percent":"28.1","protocol":null,"timestamp":"1646092800000"},{"date":"2022_03_01","percent":"67.6","protocol":"h2","timestamp":"1646092800000"},{"date":"2022_03_01","percent":"4.3","protocol":"http/1.1","timestamp":"1646092800000"},{"date":"2022_04_01","percent":"27.1","protocol":null,"timestamp":"1648771200000"},{"date":"2022_04_01","percent":"66.9","protocol":"h2","timestamp":"1648771200000"},{"date":"2022_04_01","percent":"6.0","protocol":"http/1.1","timestamp":"1648771200000"},{"date":"2022_05_01","percent":"27.4","protocol":null,"timestamp":"1651363200000"},{"date":"2022_05_01","percent":"67.8","protocol":"h2","timestamp":"1651363200000"},{"date":"2022_05_01","percent":"4.8","protocol":"http/
1.1","timestamp":"1651363200000"},{"date":"2022_06_01","percent":"25.1","protocol":null,"timestamp":"1654041600000"},{"date":"2022_06_01","percent":"68.3","protocol":"h2","timestamp":"1654041600000"},{"date":"2022_06_01","percent":"6.6","protocol":"http/1.1","timestamp":"1654041600000"},{"date":"2022_07_01","percent":"21.5","protocol":null,"timestamp":"1656633600000"},{"date":"2022_07_01","percent":"72.2","protocol":"h2","timestamp":"1656633600000"},{"date":"2022_07_01","percent":"6.3","protocol":"http/1.1","timestamp":"1656633600000"},{"date":"2022_08_01","percent":"20.9","protocol":null,"timestamp":"1659312000000"},{"date":"2022_08_01","percent":"73.7","protocol":"h2","timestamp":"1659312000000"},{"date":"2022_08_01","percent":"5.5","protocol":"http/1.1","timestamp":"1659312000000"},{"date":"2022_09_01","percent":"20.1","protocol":null,"timestamp":"1661990400000"},{"date":"2022_09_01","percent":"74.0","protocol":"h2","timestamp":"1661990400000"},{"date":"2022_09_01","percent":"0.3","protocol":"h3","timestamp":"1661990400000"},{"date":"2022_09_01","percent":"5.5","protocol":"http/1.1","timestamp":"1661990400000"},{"date":"2022_10_01","percent":"27.7","protocol":null,"timestamp":"1664582400000"},{"date":"2022_10_01","percent":"68.0","protocol":"h2","timestamp":"1664582400000"},{"date":"2022_10_01","percent":"0.3","protocol":"h3","timestamp":"1664582400000"},{"date":"2022_10_01","percent":"3.9","protocol":"http/1.1","timestamp":"1664582400000"},{"date":"2022_11_01","percent":"24.7","protocol":null,"timestamp":"1667260800000"},{"date":"2022_11_01","percent":"70.6","protocol":"h2","timestamp":"1667260800000"},{"date":"2022_11_01","percent":"0.3","protocol":"h3","timestamp":"1667260800000"},{"date":"2022_11_01","percent":"4.4","protocol":"http/1.1","timestamp":"1667260800000"},{"date":"2022_12_01","percent":"24.4","protocol":null,"timestamp":"1669852800000"},{"date":"2022_12_01","percent":"71.8","protocol":"h2","timestamp":"1669852800000"},{"date":"2022_12_01","percent"
:"0.3","protocol":"h3","timestamp":"1669852800000"},{"date":"2022_12_01","percent":"3.4","protocol":"http/1.1","timestamp":"1669852800000"},{"date":"2023_01_01","percent":"25.5","protocol":null,"timestamp":"1672531200000"},{"date":"2023_01_01","percent":"70.7","protocol":"h2","timestamp":"1672531200000"},{"date":"2023_01_01","percent":"0.3","protocol":"h3","timestamp":"1672531200000"},{"date":"2023_01_01","percent":"3.4","protocol":"http/1.1","timestamp":"1672531200000"},{"date":"2023_02_01","percent":"26.8","protocol":null,"timestamp":"1675209600000"},{"date":"2023_02_01","percent":"67.5","protocol":"h2","timestamp":"1675209600000"},{"date":"2023_02_01","percent":"0.3","protocol":"h3","timestamp":"1675209600000"},{"date":"2023_02_01","percent":"5.4","protocol":"http/1.1","timestamp":"1675209600000"},{"date":"2023_03_01","percent":"26.3","protocol":null,"timestamp":"1677628800000"},{"date":"2023_03_01","percent":"68.4","protocol":"h2","timestamp":"1677628800000"},{"date":"2023_03_01","percent":"0.3","protocol":"h3","timestamp":"1677628800000"},{"date":"2023_03_01","percent":"5.0","protocol":"http/1.1","timestamp":"1677628800000"},{"date":"2023_04_01","percent":"26.9","protocol":null,"timestamp":"1680307200000"},{"date":"2023_04_01","percent":"69.0","protocol":"h2","timestamp":"1680307200000"},{"date":"2023_04_01","percent":"0.3","protocol":"h3","timestamp":"1680307200000"},{"date":"2023_04_01","percent":"3.8","protocol":"http/1.1","timestamp":"1680307200000"},{"date":"2023_05_01","percent":"25.5","protocol":null,"timestamp":"1682899200000"},{"date":"2023_05_01","percent":"68.8","protocol":"h2","timestamp":"1682899200000"},{"date":"2023_05_01","percent":"0.3","protocol":"h3","timestamp":"1682899200000"},{"date":"2023_05_01","percent":"5.4","protocol":"http/1.1","timestamp":"1682899200000"},{"date":"2023_06_01","percent":"22.8","protocol":null,"timestamp":"1685577600000"},{"date":"2023_06_01","percent":"72.9","protocol":"h2","timestamp":"1685577600000"},{"date":"2023_
06_01","percent":"0.4","protocol":"h3","timestamp":"1685577600000"},{"date":"2023_06_01","percent":"3.9","protocol":"http/1.1","timestamp":"1685577600000"},{"date":"2023_07_01","percent":"26.9","protocol":null,"timestamp":"1688169600000"},{"date":"2023_07_01","percent":"68.2","protocol":"h2","timestamp":"1688169600000"},{"date":"2023_07_01","percent":"0.4","protocol":"h3","timestamp":"1688169600000"},{"date":"2023_07_01","percent":"4.6","protocol":"http/1.1","timestamp":"1688169600000"},{"date":"2023_08_01","percent":"23.3","protocol":null,"timestamp":"1690848000000"},{"date":"2023_08_01","percent":"72.3","protocol":"h2","timestamp":"1690848000000"},{"date":"2023_08_01","percent":"0.6","protocol":"h3","timestamp":"1690848000000"},{"date":"2023_08_01","percent":"3.8","protocol":"http/1.1","timestamp":"1690848000000"},{"date":"2023_09_01","percent":"18.9","protocol":null,"timestamp":"1693526400000"},{"date":"2023_09_01","percent":"75.9","protocol":"h2","timestamp":"1693526400000"},{"date":"2023_09_01","percent":"1.3","protocol":"h3","timestamp":"1693526400000"},{"date":"2023_09_01","percent":"3.9","protocol":"http/1.1","timestamp":"1693526400000"},{"date":"2023_10_01","percent":"21.5","protocol":null,"timestamp":"1696118400000"},{"date":"2023_10_01","percent":"72.4","protocol":"h2","timestamp":"1696118400000"},{"date":"2023_10_01","percent":"1.4","protocol":"h3","timestamp":"1696118400000"},{"date":"2023_10_01","percent":"4.7","protocol":"http/1.1","timestamp":"1696118400000"},{"date":"2023_11_01","percent":"21.0","protocol":null,"timestamp":"1698796800000"},{"date":"2023_11_01","percent":"73.1","protocol":"h2","timestamp":"1698796800000"},{"date":"2023_11_01","percent":"1.6","protocol":"h3","timestamp":"1698796800000"},{"date":"2023_11_01","percent":"4.3","protocol":"http/1.1","timestamp":"1698796800000"},{"date":"2023_12_01","percent":"20.5","protocol":null,"timestamp":"1701388800000"},{"date":"2023_12_01","percent":"73.9","protocol":"h2","timestamp":"1701388800000"
},{"date":"2023_12_01","percent":"1.6","protocol":"h3","timestamp":"1701388800000"},{"date":"2023_12_01","percent":"4.1","protocol":"http/1.1","timestamp":"1701388800000"}]
diff --git a/data-output/useragentfamily_devicetype.json b/data-output/useragentfamily_devicetype.json
diff --git a/pipeline.js b/pipeline.js
@@ -46,6 +46,12 @@ async function processResults( bigQueryResults ) {
                 await fs.writeFile( outputPath, JSON.stringify(processedData), "utf8" );
 
             }
+            else if ( query.processingtype === "groupedMetricPerDevice" ){
+                let processedData = [];
+                processedData = processing.processGroupedMetricPerDevicetype( data, query.extractmetric, query.groupby );
+
+                await fs.writeFile( outputPath, JSON.stringify(processedData), "utf8" );
+            }
             else if ( query.processingtype === "metricGlobal" ){
                 let processedData = [];
                 processedData = processing.processSingleMetricGlobal( data, query.extractmetric );

diff --git a/queries/country_devicetype.jsonl b/queries/country_devicetype.jsonl
@@ -0,0 +1,16 @@
+{
+    "description": "Timeseries of the different countries per device type, ordered by occurrence seen in the RUM Archive dataset",
+    "datetype": "timeseries",
+    "processingtype": "metricPerDevice",
+    "extractmetric": "country",
+    "sql": "
+        SELECT DATE as date, 
+        COUNTRY as country, 
+        DEVICETYPE as device, 
+        COUNT(*) as rowcount, 
+        SUM(BEACONS) as beaconcount
+        FROM `akamai-mpulse-rumarchive.rumarchive.rumarchive_page_loads` 
+        WHERE {{TIMESERIES_DATES}}
+        GROUP BY DATE, DEVICETYPE, COUNTRY 
+        ORDER BY DATE ASC, DEVICETYPE ASC, beaconcount DESC"
+}
diff --git a/queries/devicemodel.jsonl b/queries/devicemodel.jsonl
@@ -0,0 +1,15 @@
+{
+    "description": "Timeseries of the different device models seen in the RUM Archive dataset",
+    "datetype": "timeseries",
+    "processingtype": "metricGlobal",
+    "extractmetric": "model",
+    "sql": "
+        SELECT DATE as date,
+        DEVICEMODEL as model, 
+        COUNT(*) as rowcount, 
+        SUM(BEACONS) as beaconcount
+        FROM `akamai-mpulse-rumarchive.rumarchive.rumarchive_page_loads` 
+        WHERE {{TIMESERIES_DATES}}
+        GROUP BY DATE, DEVICEMODEL
+        ORDER BY DATE ASC, BEACONCOUNT ASC"
+}
diff --git a/queries/devicemodel_country_devicetype.jsonl b/queries/devicemodel_country_devicetype.jsonl
@@ -0,0 +1,19 @@
+{
+    "description": "Timeseries of the different device models per country per device type, ordered by occurrence seen in the RUM Archive dataset",
+    "datetype": "single",
+    "processingtype": "groupedMetricPerDevice",
+    "extractmetric": "model",
+    "groupby": "country",
+    "comments": "We exclude null values here due to the already high cardinality of results. For the same reason, we limit to a single date instead of a timeseries.",
+    "sql": "
+        SELECT DATE as date, 
+        DEVICEMODEL as model,
+        COUNTRY as country, 
+        DEVICETYPE as device, 
+        COUNT(*) as rowcount, 
+        SUM(BEACONS) as beaconcount
+        FROM `akamai-mpulse-rumarchive.rumarchive.rumarchive_page_loads` 
+        WHERE DEVICETYPE IS NOT NULL and COUNTRY IS NOT NULL AND DEVICEMODEL IS NOT NULL AND {{LAST_DATE}}
+        GROUP BY DATE, DEVICETYPE, COUNTRY, DEVICEMODEL 
+        ORDER BY DATE ASC, DEVICETYPE ASC, COUNTRY ASC, beaconcount DESC"
+}
diff --git a/queries/protocol.jsonl b/queries/protocol.jsonl
@@ -0,0 +1,15 @@
+{
+    "description": "Timeseries of the different HTTP protocol versions seen in the RUM Archive dataset",
+    "datetype": "timeseries",
+    "processingtype": "metricGlobal",
+    "extractmetric": "protocol",
+    "sql": "
+        SELECT DATE as date,
+        PROTOCOL as protocol, 
+        COUNT(*) as rowcount, 
+        SUM(BEACONS) as beaconcount
+        FROM `akamai-mpulse-rumarchive.rumarchive.rumarchive_page_loads` 
+        WHERE {{TIMESERIES_DATES}}
+        GROUP BY DATE, PROTOCOL
+        ORDER BY DATE ASC, PROTOCOL ASC"
+}
diff --git a/src/processing.js b/src/processing.js
@@ -42,14 +42,18 @@ function processSingleMetricGlobal(rows, metricFieldName){
         datapoint.date = row.date.value.replaceAll("-", "_"); // "2022-12-01" to "2022_12_01"
 
         const dateTotalBeaconCount = dateCounts.get( row.date.value );
-        datapoint.percent = ((row.beaconcount / dateTotalBeaconCount) * 100).toFixed(1);
+        let percent = ((row.beaconcount / dateTotalBeaconCount) * 100);
+        datapoint.percent = percent.toFixed(1);
 
         datapoint[metricFieldName] = row[metricFieldName]; // e.g., .device
 
         // highcharts uses raw timestamps, so pre-calculate them
         datapoint.timestamp = "" + (new Date( row.date.value ).getTime());
-
-        output.push( datapoint );
+
+        if ( percent > 0.1 ) {
+            // skip entries at or below 0.1 percent, compared before rounding (this covers everything that would print as "0.0"; especially in high-cardinality queries, these often account for many megabytes of output data)
+            output.push( datapoint );
+        }
     }
 
     return output;
@@ -164,14 +168,147 @@ function processSingleMetricPerDevicetype(rows, metricFieldName) {
         datapoint.date = row.date.value.replaceAll("-", "_"); // "2022-12-01" to "2022_12_01"
 
         const deviceCount = dateCounts.get( row.date.value ).get( row.device );
-        datapoint.percent = ((row.beaconcount / deviceCount.beaconCount) * 100).toFixed(1);
+        const percent = ((row.beaconcount / deviceCount.beaconCount) * 100);
+        datapoint.percent = percent.toFixed(1);
 
         datapoint[metricFieldName] = row[metricFieldName]; // e.g., .protocol, .useragent, etc.
 
         // highcharts uses raw timestamps, so pre-calculate them
         datapoint.timestamp = "" + (new Date( row.date.value ).getTime());
 
-        output.push( datapoint );
+        if( percent > 0.1 ) {
+            // skip entries at or below 0.1 percent, compared before rounding (this covers everything that would print as "0.0"; especially in high-cardinality queries, these often account for many megabytes of output data)
+            output.push( datapoint );
+        }
+    }
+
+    return output;
+}
+
+
+function processGroupedMetricPerDevicetype(rows, metricFieldName, groupbyFieldName) {
+    /*
+        This is similar to processSingleMetricPerDevicetype, but the metrics are grouped in another dimension
+        For example, SingleMetric would be
+            deviceModel per deviceType
+        while GroupedMetric would be
+            deviceModel per country per deviceType
+        Where the results are again grouped by country.
+
+        Incoming data are rows of raw bigquery results.
+        We assume the rows are ordered by ASC date!
+        For example:
+            {
+                date: { value: '2022-12-01' },
+                model: 'Apple iPhone',
+                country: 'US',
+                device: 'Desktop',
+                rowcount: 1,
+                beaconcount: 9
+            },
+
+        We need to do multiple things:
+        1. Group by extra dimension (groupbyFieldName) + devicetype + date to find out the total row and beacon counts of that dimension+device+date
+            --> (so we can have a percentage for each individual value for that date, since not all dates have an equal amount of beacons)
+        2. Transform the output to the expected format (same as the one HTTPArchive uses)
+            (see processSingleMetricPerDevicetype for more details on that)
+    */
+
+    // TODO: this could benefit from a more generic implementation allowing any sequence of groupings (i.e., treating date and devicetype as a normal grouping as well)
+    //       For now, I decided to keep them split for clarity and to wait until clear patterns emerge before deciding to refactor
+
+    // Note: conceptually, I could use subsequent .filter() calls to get only the individual sets I need, but this would mean iterating over the data multiple times and creating new arrays each time
+    //  the approach below is more annoying to program, but should be more efficient (looping over data just once to setup the main data structure, then once to transform to output format)
+
+
+    // 1. Aggregate counts per groupbyFieldName + device + date so we can calculate percentages later 
+    //      the output is grouped by date first, so keep that structure here  (so  dateCounts = Map<DATESTRING, Map<DEVICESTRING, Map<groupbyFieldName, Counts>>>)
+    const dateCounts = new Map();
+
+    for( const row of rows ) {
+        const date = row.date.value;
+        const device = row.device;
+        const groupbyDimension = row[ groupbyFieldName ]; // e.g., "country"
+
+        let dateEntry = dateCounts.get( date );
+        if ( !dateEntry ){
+            dateEntry = new Map();
+            dateCounts.set( date, dateEntry );
+        }
+
+        let deviceEntry = dateEntry.get( device );
+        if ( !deviceEntry ) {
+            deviceEntry = new Map();
+            dateEntry.set( device, deviceEntry );
+        }
+
+        let groupbyEntry = deviceEntry.get( groupbyDimension );
+        if ( !groupbyEntry ) {
+            groupbyEntry = {
+                rowCount: row.rowcount,
+                beaconCount: row.beaconcount
+            };
+            deviceEntry.set( groupbyDimension, groupbyEntry );
+        }
+        else {
+            groupbyEntry.rowCount += row.rowcount;
+            groupbyEntry.beaconCount += row.beaconcount;
+        }
+    }
+
+    // 2. Transform to output format + calculate percentages
+    const output = [];
+    for( const row of rows ) {
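+        /*
+        From
+            {
+                date: { value: '2022-12-01' },
+                model: 'Apple iPhone',
+                country: 'US',
+                device: 'Desktop',
+                rowcount: 1,
+                beaconcount: 9
+            },
+        To (shown here for metricFieldName "model" and groupbyFieldName "country"; the percent value is illustrative)
+            {
+                "client": "desktop",
+                "country": "US",
+                "date": "2022_12_01",
+                "model": "Apple iPhone",
+                "percent": "12.5",
+                "timestamp": "1669852800000"
+            },
+        */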
+
+        const datapoint = {};
+
+        if ( row.device )
+            datapoint.client = row.device.toLowerCase();
+        else
+            datapoint.client = "unknown";
+
+        const groupbyValue = row[ groupbyFieldName ];
+
+        if ( groupbyValue ) {
+            datapoint[ groupbyFieldName.toLowerCase() ] = groupbyValue;
+        }
+        else {
+            datapoint[ groupbyFieldName.toLowerCase() ] = "unknown";
+        }
+
+        datapoint.date = row.date.value.replaceAll("-", "_"); // "2022-12-01" to "2022_12_01"
+
+        datapoint[metricFieldName] = row[metricFieldName]; // e.g., .protocol, .useragent, .model, etc.
+
+        const dimensionCount = dateCounts.get( row.date.value ).get( row.device ).get( groupbyValue );
+
+        const percent = ((row.beaconcount / dimensionCount.beaconCount) * 100);
+        datapoint.percent = percent.toFixed(1);
+
+        // highcharts uses raw timestamps, so pre-calculate them
+        datapoint.timestamp = "" + (new Date( row.date.value ).getTime());
+
+        if( percent > 0.1 ) {
+            // skip entries at or below 0.1 percent, compared before rounding (this covers everything that would print as "0.0"; especially in high-cardinality queries, these often account for many megabytes of output data)
+            output.push( datapoint );
+        }
     }
 
     return output;
@@ -429,6 +566,7 @@ function processCWVperUseragent(data) {
 module.exports = {
     processSingleMetricGlobal,
     processSingleMetricPerDevicetype,
+    processGroupedMetricPerDevicetype,
     processHistogramPerDevicetype,
     processCWVperUseragent
 }