mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
added CDX Loader simulator
This commit is contained in:
parent
2ec6db1f99
commit
9276466736
File diff suppressed because one or more lines are too long
197
pywb/vueui/src/cdx-simulator/cdx-record-sample.json
Normal file
197
pywb/vueui/src/cdx-simulator/cdx-record-sample.json
Normal file
@ -0,0 +1,197 @@
|
||||
[
|
||||
{
|
||||
"urlkey": "com,example)/",
|
||||
"timestamp": "20130729195151",
|
||||
"url": "http://test@example.com/",
|
||||
"mime": "warc/revisit",
|
||||
"status": "-",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "591",
|
||||
"offset": "355",
|
||||
"filename": "example-url-agnostic-revisit.warc.gz",
|
||||
"source": "pywb:url-agnost-example.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/",
|
||||
"timestamp": "20140127171200",
|
||||
"url": "http://example.com",
|
||||
"mime": "text/html",
|
||||
"status": "200",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "1046",
|
||||
"offset": "334",
|
||||
"filename": "dupes.warc.gz",
|
||||
"source": "pywb:dupes.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/",
|
||||
"timestamp": "20140127171251",
|
||||
"url": "http://example.com",
|
||||
"mime": "warc/revisit",
|
||||
"status": "-",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "553",
|
||||
"offset": "11875",
|
||||
"filename": "dupes.warc.gz",
|
||||
"source": "pywb:dupes.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/?example=1",
|
||||
"timestamp": "20140103030321",
|
||||
"url": "http://example.com?example=1",
|
||||
"mime": "text/html",
|
||||
"status": "200",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "1043",
|
||||
"offset": "333",
|
||||
"filename": "example.warc.gz",
|
||||
"source": "pywb:example.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/?example=1",
|
||||
"timestamp": "20140103030341",
|
||||
"url": "http://example.com?example=1",
|
||||
"mime": "warc/revisit",
|
||||
"status": "-",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "553",
|
||||
"offset": "1864",
|
||||
"filename": "example.warc.gz",
|
||||
"source": "pywb:example.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/?example=2",
|
||||
"timestamp": "20140103030321",
|
||||
"url": "http://example.com?example=2",
|
||||
"mime": "text/html",
|
||||
"status": "200",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "1987",
|
||||
"offset": "0",
|
||||
"filename": "example-extra.warc",
|
||||
"source": "pywb:example-extra.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/?example=2",
|
||||
"timestamp": "20140603030341",
|
||||
"url": "http://example.com?example=2",
|
||||
"mime": "warc/revisit",
|
||||
"status": "-",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "504",
|
||||
"offset": "2701",
|
||||
"filename": "example-extra.warc",
|
||||
"source": "pywb:example-extra.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/?example=2",
|
||||
"timestamp": "20140603030351",
|
||||
"url": "http://example.com?example=2",
|
||||
"mime": "warc/revisit",
|
||||
"status": "-",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36B",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "504",
|
||||
"offset": "2701",
|
||||
"filename": "example-extra.warc",
|
||||
"source": "pywb:bad.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/?example=2",
|
||||
"timestamp": "20140703030321",
|
||||
"url": "http://example.com?example=2",
|
||||
"mime": "text/html",
|
||||
"status": "200",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "1987",
|
||||
"offset": "0",
|
||||
"filename": "non-existent.warc",
|
||||
"source": "pywb:bad.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/?example=3",
|
||||
"timestamp": "20140603030351",
|
||||
"url": "http://example.com?example=3",
|
||||
"mime": "warc/revisit",
|
||||
"status": "-",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36B",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "504",
|
||||
"offset": "2701",
|
||||
"filename": "example-extra.warc",
|
||||
"source": "pywb:bad.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example)/?example=3",
|
||||
"timestamp": "20140703030321",
|
||||
"url": "http://example.com?example=3",
|
||||
"mime": "text/html",
|
||||
"status": "200",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "1987",
|
||||
"offset": "0",
|
||||
"filename": "non-existent.warc",
|
||||
"source": "pywb:bad.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example,test,arc)/",
|
||||
"timestamp": "20140216050221",
|
||||
"url": "http://example.com/",
|
||||
"mime": "text/html",
|
||||
"status": "200",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "1656",
|
||||
"offset": "151",
|
||||
"filename": "example.arc",
|
||||
"source": "pywb:example-arc-test.cdx",
|
||||
"source-coll": "pywb"
|
||||
},
|
||||
{
|
||||
"urlkey": "com,example,test,gz,arc)/",
|
||||
"timestamp": "20140216050221",
|
||||
"url": "http://example.com/",
|
||||
"mime": "text/html",
|
||||
"status": "200",
|
||||
"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
"redirect": "-",
|
||||
"robotflags": "-",
|
||||
"length": "856",
|
||||
"offset": "171",
|
||||
"filename": "example.arc.gz",
|
||||
"source": "pywb:example-arc-test.cdx",
|
||||
"source-coll": "pywb"
|
||||
}
|
||||
]
|
88
pywb/vueui/src/cdx-simulator/cdx-simulator.js
Normal file
88
pywb/vueui/src/cdx-simulator/cdx-simulator.js
Normal file
@ -0,0 +1,88 @@
|
||||
/**
 * Return the number of days in a calendar month.
 *
 * @param {number} y - Full year, e.g. 2020.
 * @param {number} mZeroIndex - Zero-based month index (0 = January, 11 = December).
 * @returns {number} Number of days in that month (28-31).
 */
const getMonthDays = (y, mZeroIndex) => {
  // Day 0 of the following month is the last day of the requested month.
  // This is pure calendar arithmetic, so unlike subtracting a fixed 24 hours
  // from local midnight it is not thrown off when a daylight-saving
  // transition falls on the last day of the month.
  return new Date(y, mZeroIndex + 1, 0).getDate();
};
|
||||
|
||||
/**
 * Parse the raw JSON string holding option overrides for
 * CDXRecordFactory.make().
 *
 * @param {?string} raw - Raw stored value, or null/empty when nothing is stored.
 * @returns {Object} The parsed options object, or {} when the value is
 *   missing, is not valid JSON, or is not a plain JSON object.
 */
const parseSimulateCdxOptions = (raw) => {
  if (!raw) {
    return {};
  }
  try {
    const parsed = JSON.parse(raw);
    const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch (err) {
    // A malformed value must not break the import of this module.
    console.warn('Ignoring invalid "cdx_simulate" value in localStorage:', err);
    return {};
  }
};

/**
 * Read the raw 'cdx_simulate' entry from localStorage.
 *
 * @returns {?string} The stored string, or null when it is absent or
 *   localStorage cannot be accessed.
 */
const readStoredSimulateCdxOptions = () => {
  try {
    return window.localStorage.getItem('cdx_simulate');
  } catch (err) {
    // Storage can be unavailable (blocked by the browser, or no `window`);
    // fall back to the defaults in that case.
    return null;
  }
};

// Option overrides for CDXRecordFactory.make(), read once from the
// 'cdx_simulate' localStorage entry when this module is loaded.
const simulateCdxOptions = parseSimulateCdxOptions(readStoredSimulateCdxOptions());
|
||||
|
||||
const SECONDS_PER_DAY = 3600 * 24;

// Left-pad a non-negative integer with zeros to two digits.
const pad2 = (n) => String(n).padStart(2, '0');

// Convert a number of seconds since midnight (0..86399) to "HHMMSS".
const secondsToHHMMSS = (secondsOfDay) => {
  const hours = Math.floor(secondsOfDay / 3600);
  const minutes = Math.floor((secondsOfDay % 3600) / 60);
  const seconds = secondsOfDay % 60;
  return pad2(hours) + pad2(minutes) + pad2(seconds);
};

// Resolve after `ms` milliseconds; used to simulate fetch latency.
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});

/**
 * Generates random CDX-like records ({url, timestamp}) for exercising the
 * CDX loader without a real backend.
 */
class CDXRecordFactory {
  constructor() {}

  /**
   * Build a chronologically ordered list of simulated capture records.
   *
   * For every day from January 1st of `yearStart` to December 31st of
   * `yearEnd` (inclusive) a random number of distinct capture times is drawn.
   * The total number of records is random and only loosely driven by `count`.
   * Generation is throttled with timers so it takes roughly `fetchTime`.
   *
   * @param {string} url - URL copied into every generated record.
   * @param {Object} [opts] - Overrides for the defaults below.
   * @param {number} [opts.count=1000] - Target used to scale captures per day.
   * @param {number} [opts.yearStart=2015] - First year (inclusive).
   * @param {number} [opts.yearEnd=2022] - Last year (inclusive).
   * @param {number} [opts.fetchTime=5000] - Approximate total time in ms.
   * @returns {Promise<Array<{url: string, timestamp: string}>>} Records with
   *   14-digit `YYYYMMDDHHMMSS` timestamps in ascending order.
   */
  async make(url, opts = {}) {
    const {count, yearStart, yearEnd, fetchTime} = {
      count: 1000,
      yearStart: 2015,
      yearEnd: 2022,
      fetchTime: 5 * 1000,
      ...opts,
    };

    const records = [];

    // The year loop below includes yearEnd, so the span is end - start + 1.
    // Using end - start would divide by zero for a single-year range.
    const yearCount = yearEnd - yearStart + 1;
    if (!(count > 0) || !(yearCount > 0)) {
      return records;
    }

    const avgPerMonth = count / yearCount / 12;
    // exaggerate max count per day: any day can hold up to a 10th of the month's captures
    const maxPerDay = avgPerMonth / 10;

    // Timers cannot wait less than ~1ms, so when the per-record budget is
    // below 1ms, wait 1ms once every N records instead.
    let avgTimePerRecord = fetchTime / count; // e.g. 1000 ms / 10,000
    let waitAtEveryNRecords = 1;
    if (avgTimePerRecord < 1) { // < 1ms per record
      waitAtEveryNRecords = Math.ceil(1 / avgTimePerRecord); // invert
      avgTimePerRecord = 1;
    } else { // >= 1ms per record
      avgTimePerRecord = Math.round(avgTimePerRecord);
    }

    let recordI = 0;
    for (let y = yearStart; y <= yearEnd; y++) {
      for (let m = 1; m <= 12; m++) {
        const daysInMonth = getMonthDays(y, m - 1);
        for (let d = 1; d <= daysInMonth; d++) {
          const dayTimestampPrefix = `${y}${pad2(m)}${pad2(d)}`;
          // random count between 0 and maxPerDay (exclusive)
          const timesCount = Math.floor(Math.random() * maxPerDay);

          // A Set de-dupes capture times that collide within the same day.
          const times = new Set();
          for (let i = 0; i < timesCount; i++) {
            if (recordI++ % waitAtEveryNRecords === 0) { // wait
              await sleep(avgTimePerRecord);
            }
            times.add(Math.floor(Math.random() * SECONDS_PER_DAY));
          }

          // Numeric sort: the default sort compares as strings ("10" < "9").
          const orderedTimes = [...times].sort((a, b) => a - b);
          for (const time of orderedTimes) {
            records.push({url, timestamp: dayTimestampPrefix + secondsToHHMMSS(time)});
          }
        }
      }
    }
    return records;
  }
}
|
||||
|
||||
/**
 * Stand-in for the query Web Worker: exposes the subset of the Worker
 * interface used here (addEventListener, postMessage, terminate) and answers
 * every posted query with records generated by CDXRecordFactory.
 */
export class CDXQueryWorkerSimulator {
  /**
   * @param {string} [workerPath] - Accepted so the constructor call matches
   *   `new Worker(path)`; not used by the simulator.
   */
  constructor(workerPath) {
    // Registered 'message' listeners, in registration order.
    this.messageCb = [];
    this.terminated = false;
    this.recordFactory = new CDXRecordFactory();
  }

  /**
   * Register a listener. Only 'message' events are ever emitted, so
   * listeners for other event types are ignored.
   *
   * @param {string} type - Event type.
   * @param {function({data: Object}): void} cb - Listener to call.
   */
  addEventListener(type, cb) {
    if (type === 'message' && typeof cb === 'function') {
      this.messageCb.push(cb);
    }
  }

  /**
   * Deliver a message event to every registered listener, unless the
   * simulator has been terminated.
   *
   * @param {Object} data - Payload exposed as `event.data`.
   */
  emitMessage(data) {
    if (this.terminated) {
      return;
    }
    for (const cb of this.messageCb) {
      cb({data});
    }
  }

  /**
   * Simulate a query: generate records for `queryUrl`, emit one
   * `{type: 'cdxRecord', record}` message per record and then a final
   * `{type: 'finished'}` message.
   *
   * @param {Object} message
   * @param {string} [message.type] - Ignored by the simulator.
   * @param {string} message.queryUrl - URL copied into the generated records.
   * @returns {Promise<void>} Resolves once all messages have been emitted.
   */
  async postMessage({type, queryUrl}) {
    const records = await this.recordFactory.make(queryUrl, simulateCdxOptions);
    for (const record of records) {
      this.emitMessage({type: 'cdxRecord', record});
    }
    this.emitMessage({type: 'finished'});
  }

  /**
   * Stop delivering messages, including those of a query still in progress.
   *
   * @returns {boolean} Always true.
   */
  terminate() {
    this.terminated = true;
    return true;
  }
}
|
14
pywb/vueui/src/cdx-simulator/test.html
Normal file
14
pywb/vueui/src/cdx-simulator/test.html
Normal file
@ -0,0 +1,14 @@
|
||||
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>CDX Simulator</title>
</head>
<body>
<!--
  Manual smoke test for the CDX simulator: open the browser console to see
  the generated records. cdx-simulator.js is an ES module (it contains
  `export`), so it has to be imported from a module script rather than loaded
  with a classic <script src>; only its exported CDXQueryWorkerSimulator is
  reachable from here. Module scripts generally need the page to be served
  over HTTP rather than opened from file://.
-->
<script type="module">
  import { CDXQueryWorkerSimulator } from "./cdx-simulator.js";

  const worker = new CDXQueryWorkerSimulator();
  let recordCount = 0;

  worker.addEventListener("message", (event) => {
    if (event.data.type === "cdxRecord") {
      recordCount++;
      console.log(event.data.record);
    } else if (event.data.type === "finished") {
      console.log(`finished: ${recordCount} records`);
    }
  });

  // The simulator ignores `type`; only `queryUrl` ends up in the records.
  await worker.postMessage({type: "query", queryUrl: "test.com"});
</script>
</body>
</html>
|
@ -1,6 +1,7 @@
|
||||
import appData from "./App.vue";
|
||||
|
||||
import { PywbData } from "./model.js";
|
||||
import { CDXQueryWorkerSimulator } from "./cdx-simulator/cdx-simulator";
|
||||
|
||||
import Vue from "vue/dist/vue.esm.browser";
|
||||
|
||||
@ -100,6 +101,8 @@ class CDXLoader {
|
||||
}
|
||||
|
||||
async loadCDX(queryURL) {
|
||||
// Use this to test CDX Loader
|
||||
// const queryWorker = new CDXQueryWorkerSimulator(this.staticPrefix + "/queryWorker.js");
|
||||
const queryWorker = new Worker(this.staticPrefix + "/queryWorker.js");
|
||||
|
||||
const p = new Promise((resolve) => {
|
||||
|
Loading…
x
Reference in New Issue
Block a user