1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

added CDX Loader simulator

This commit is contained in:
Ivan Velev 2022-02-04 17:59:56 -08:00
parent 2ec6db1f99
commit 9276466736
5 changed files with 309 additions and 1 deletions

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,197 @@
[
{
"urlkey": "com,example)/",
"timestamp": "20130729195151",
"url": "http://test@example.com/",
"mime": "warc/revisit",
"status": "-",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "591",
"offset": "355",
"filename": "example-url-agnostic-revisit.warc.gz",
"source": "pywb:url-agnost-example.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/",
"timestamp": "20140127171200",
"url": "http://example.com",
"mime": "text/html",
"status": "200",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "1046",
"offset": "334",
"filename": "dupes.warc.gz",
"source": "pywb:dupes.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/",
"timestamp": "20140127171251",
"url": "http://example.com",
"mime": "warc/revisit",
"status": "-",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "553",
"offset": "11875",
"filename": "dupes.warc.gz",
"source": "pywb:dupes.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/?example=1",
"timestamp": "20140103030321",
"url": "http://example.com?example=1",
"mime": "text/html",
"status": "200",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "1043",
"offset": "333",
"filename": "example.warc.gz",
"source": "pywb:example.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/?example=1",
"timestamp": "20140103030341",
"url": "http://example.com?example=1",
"mime": "warc/revisit",
"status": "-",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "553",
"offset": "1864",
"filename": "example.warc.gz",
"source": "pywb:example.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/?example=2",
"timestamp": "20140103030321",
"url": "http://example.com?example=2",
"mime": "text/html",
"status": "200",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "1987",
"offset": "0",
"filename": "example-extra.warc",
"source": "pywb:example-extra.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/?example=2",
"timestamp": "20140603030341",
"url": "http://example.com?example=2",
"mime": "warc/revisit",
"status": "-",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "504",
"offset": "2701",
"filename": "example-extra.warc",
"source": "pywb:example-extra.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/?example=2",
"timestamp": "20140603030351",
"url": "http://example.com?example=2",
"mime": "warc/revisit",
"status": "-",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36B",
"redirect": "-",
"robotflags": "-",
"length": "504",
"offset": "2701",
"filename": "example-extra.warc",
"source": "pywb:bad.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/?example=2",
"timestamp": "20140703030321",
"url": "http://example.com?example=2",
"mime": "text/html",
"status": "200",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "1987",
"offset": "0",
"filename": "non-existent.warc",
"source": "pywb:bad.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/?example=3",
"timestamp": "20140603030351",
"url": "http://example.com?example=3",
"mime": "warc/revisit",
"status": "-",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36B",
"redirect": "-",
"robotflags": "-",
"length": "504",
"offset": "2701",
"filename": "example-extra.warc",
"source": "pywb:bad.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example)/?example=3",
"timestamp": "20140703030321",
"url": "http://example.com?example=3",
"mime": "text/html",
"status": "200",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "1987",
"offset": "0",
"filename": "non-existent.warc",
"source": "pywb:bad.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example,test,arc)/",
"timestamp": "20140216050221",
"url": "http://example.com/",
"mime": "text/html",
"status": "200",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "1656",
"offset": "151",
"filename": "example.arc",
"source": "pywb:example-arc-test.cdx",
"source-coll": "pywb"
},
{
"urlkey": "com,example,test,gz,arc)/",
"timestamp": "20140216050221",
"url": "http://example.com/",
"mime": "text/html",
"status": "200",
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
"redirect": "-",
"robotflags": "-",
"length": "856",
"offset": "171",
"filename": "example.arc.gz",
"source": "pywb:example-arc-test.cdx",
"source-coll": "pywb"
}
]

View File

@ -0,0 +1,88 @@
/**
 * Returns the number of days in a calendar month.
 *
 * @param {number} y - Full year, e.g. 2024.
 * @param {number} mZeroIndex - Zero-based month (0 = January ... 11 = December).
 * @returns {number} Number of days in that month (28-31).
 */
const getMonthDays = (y, mZeroIndex) => {
  // Day 0 of the following month is the last day of the requested month.
  // Letting Date do the calendar arithmetic (instead of subtracting a fixed
  // 24 hours from the 1st of the next month) stays correct when the last day
  // of the month is a 23-hour daylight-saving day in the local time zone.
  return new Date(y, mZeroIndex + 1, 0).getDate();
};
// Options passed to CDXRecordFactory.make(), read once at module load from the
// 'cdx_simulate' localStorage entry. The entry is expected to contain a JSON
// object, e.g. {"count": 5000, "fetchTime": 2000}. When the entry is missing,
// is not valid JSON, is not an object, or localStorage is unavailable, an
// empty object is used so the factory falls back to its built-in defaults.
const simulateCdxOptions = (() => {
  try {
    if (typeof window === 'undefined' || !window.localStorage) {
      return {};
    }
    const stored = window.localStorage.getItem('cdx_simulate');
    const parsed = stored ? JSON.parse(stored) : null;
    const isPlainObject =
      parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch (err) {
    // Malformed JSON or blocked storage access must not break module loading.
    console.warn('Ignoring unusable "cdx_simulate" localStorage entry:', err);
    return {};
  }
})();
/**
 * Produces randomly generated capture records for exercising the CDX loader
 * without a real CDX server.
 */
class CDXRecordFactory {
  /**
   * Generates simulated captures of `url`, in chronological order, for every
   * day from January 1st of `yearStart` through December 31st of `yearEnd`.
   *
   * The number of captures per day is random, so `count` only scales the
   * output; the returned array does not contain exactly `count` records.
   * Generation is throttled with timers so that producing about `count`
   * records takes roughly `fetchTime` milliseconds.
   *
   * @param {string} url - URL stored on every generated record.
   * @param {Object} [opts] - Overrides for the defaults below.
   * @param {number} [opts.count=1000] - Rough target number of records.
   * @param {number} [opts.yearStart=2015] - First year to generate (inclusive).
   * @param {number} [opts.yearEnd=2022] - Last year to generate (inclusive).
   * @param {number} [opts.fetchTime=5000] - Simulated total load time in ms.
   * @returns {Promise<Array<{url: string, timestamp: string}>>} Records whose
   *   `timestamp` is a 14-digit YYYYMMDDhhmmss string.
   */
  async make(url, opts = {}) {
    const { count, yearStart, yearEnd, fetchTime } = {
      count: 1000,
      yearStart: 2015,
      yearEnd: 2022,
      fetchTime: 5 * 1000,
      ...opts,
    };
    const SECONDS_PER_DAY = 24 * 3600;
    const pad2 = (n) => String(n).padStart(2, '0');
    // Converts seconds since midnight into a valid hhmmss string.
    const toClock = (secs) =>
      pad2(Math.floor(secs / 3600)) + pad2(Math.floor((secs % 3600) / 60)) + pad2(secs % 60);

    const records = [];
    // Both boundary years are generated, so the span is inclusive. Using
    // yearEnd - yearStart would divide by zero for a single-year range.
    const yearCount = yearEnd - yearStart + 1;
    if (yearCount <= 0) {
      return records;
    }
    const avgPerMonth = count / yearCount / 12;
    // Exaggerate the per-day maximum: any day can hold up to a 10th of the
    // month's average captures.
    const maxPerDay = avgPerMonth / 10;

    // Timers cannot usefully wait for less than 1ms, so when the per-record
    // budget is below 1ms, wait 1ms once every N records instead.
    let avgTimePerRecord = fetchTime / count; // e.g. 1000 ms / 10,000
    let waitAtEveryNRecords = 1;
    if (avgTimePerRecord < 1) {
      waitAtEveryNRecords = Math.ceil(1 / avgTimePerRecord);
      avgTimePerRecord = 1;
    } else {
      avgTimePerRecord = Math.round(avgTimePerRecord);
    }

    let generated = 0;
    for (let y = yearStart; y <= yearEnd; y++) {
      for (let m = 1; m <= 12; m++) {
        const daysInMonth = getMonthDays(y, m - 1);
        for (let d = 1; d <= daysInMonth; d++) {
          const dayTimestampPrefix = `${y}${pad2(m)}${pad2(d)}`;
          // Random capture count for this day, from 0 up to (excluding) maxPerDay.
          const timesCount = Math.floor(Math.random() * maxPerDay);
          // A Set de-dupes captures that land on the same second.
          const secondsOfDay = new Set();
          for (let i = 0; i < timesCount; i++) {
            if (generated++ % waitAtEveryNRecords === 0) {
              await new Promise((resolve) => {
                setTimeout(resolve, avgTimePerRecord);
              });
            }
            secondsOfDay.add(Math.floor(Math.random() * SECONDS_PER_DAY));
          }
          // Numeric sort keeps the day's captures in chronological order.
          [...secondsOfDay]
            .sort((a, b) => a - b)
            .forEach((secs) => {
              records.push({ url, timestamp: dayTimestampPrefix + toClock(secs) });
            });
        }
      }
    }
    return records;
  }
}
/**
 * Stand-in for the CDX query Web Worker that emits generated records instead
 * of querying a server. It exposes the subset of the Worker interface defined
 * here: addEventListener('message', ...), postMessage() and terminate().
 */
export class CDXQueryWorkerSimulator {
  /**
   * @param {string} [workerPath] - Accepted only so the constructor can be
   *   called like `new Worker(path)`; it is not used.
   */
  constructor(workerPath) {
    // Registered 'message' listeners, in registration order.
    this.messageCb = [];
    this.recordFactory = new CDXRecordFactory();
  }

  /**
   * Registers a listener. Only the 'message' event type is supported; other
   * types, non-function callbacks and duplicate registrations are ignored.
   *
   * @param {string} type - Event type.
   * @param {function({data: Object}): void} cb - Invoked with objects shaped
   *   like a MessageEvent, i.e. carrying the payload under `data`.
   */
  addEventListener(type, cb) {
    if (type !== 'message' || typeof cb !== 'function') {
      return;
    }
    if (!this.messageCb.includes(cb)) {
      this.messageCb.push(cb);
    }
  }

  /**
   * Generates records for `queryUrl` and delivers one
   * `{type: 'cdxRecord', record}` message per record, followed by a single
   * `{type: 'finished'}` message. Calling this before any listener is
   * registered is harmless: the messages are simply not delivered.
   *
   * @param {Object} message
   * @param {string} message.type - Ignored by the simulator.
   * @param {string} message.queryUrl - URL stored on the generated records.
   * @returns {Promise<void>} Resolves once 'finished' has been dispatched.
   */
  async postMessage({type, queryUrl}) {
    const records = await this.recordFactory.make(queryUrl, simulateCdxOptions);
    const dispatch = (data) => {
      this.messageCb.forEach((listener) => listener({data}));
    };
    records.forEach((record) => dispatch({type: 'cdxRecord', record}));
    dispatch({type: 'finished'});
  }

  /**
   * No-op counterpart of Worker.terminate().
   *
   * @returns {boolean} Always true.
   */
  terminate() {
    return true;
  }
}

View File

@ -0,0 +1,14 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>CDX Simulator</title>
</head>
<body>
  <!--
    Manual smoke test for the CDX simulator. cdx-simulator.js is an ES module
    (it contains an `export`), so it has to be imported from a module script;
    loading it with a classic <script src> is a syntax error. Module scripts
    are typically blocked on file:// URLs, so open this page through an HTTP
    server. The collected records are logged to the console once the
    simulator reports that it has finished.
  -->
  <script type="module">
    import { CDXQueryWorkerSimulator } from './cdx-simulator.js';

    const worker = new CDXQueryWorkerSimulator();
    const records = [];
    worker.addEventListener('message', ({ data }) => {
      if (data.type === 'cdxRecord') {
        records.push(data.record);
      } else if (data.type === 'finished') {
        console.log(records);
      }
    });
    // The simulator ignores `type`; only `queryUrl` ends up on the records.
    worker.postMessage({ type: 'query', queryUrl: 'test.com' }).catch(console.error);
  </script>
</body>
</html>

View File

@ -1,6 +1,7 @@
import appData from "./App.vue";
import { PywbData } from "./model.js";
import { CDXQueryWorkerSimulator } from "./cdx-simulator/cdx-simulator";
import Vue from "vue/dist/vue.esm.browser";
@ -100,6 +101,8 @@ class CDXLoader {
}
async loadCDX(queryURL) {
// Use this to test CDX Loader
// const queryWorker = new CDXQueryWorkerSimulator(this.staticPrefix + "/queryWorker.js");
const queryWorker = new Worker(this.staticPrefix + "/queryWorker.js");
const p = new Promise((resolve) => {