mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
test_dedup_buckets_multiple
This commit is contained in:
parent
8c52bd8442
commit
51c4f6d622
@ -916,6 +916,71 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
||||
finally:
|
||||
fh.close()
|
||||
|
||||
def test_dedup_buckets_multiple(
        https_daemon, http_daemon, warcprox_, archiving_proxies,
        playback_proxies):
    """Archive one url with two dedup buckets and check where it was recorded.

    The request carries a ``Warcprox-Meta`` header with
    ``"dedup-buckets": {"bucket_1": "rw", "bucket_2": "ro"}``. After the
    fetch, the payload digest must be found in ``bucket_1`` and must NOT be
    found in ``bucket_2`` (presumably "rw" = read-write and "ro" = read-only;
    confirm against the warcprox dedup documentation). Finally the warc
    written under the ``test_dedup_buckets_multiple`` prefix is closed and
    read back to verify it contains exactly warcinfo, response and request
    records.

    All parameters are pytest fixtures; ``https_daemon`` and
    ``playback_proxies`` are not referenced in the body.
    """
    warc_prefix = "test_dedup_buckets_multiple"
    payload_digest = b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81'
    urls_before = warcprox_.proxy.running_stats.urls

    url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port)

    # archive url1
    headers = {
        "Warcprox-Meta": json.dumps({
            "warc-prefix": warc_prefix,
            "dedup-buckets": {"bucket_1": "rw", "bucket_2": "ro"},
        }),
    }
    response = requests.get(
        url1, proxies=archiving_proxies, verify=False, headers=headers)
    assert response.status_code == 200
    assert response.headers['warcprox-test-header'] == 'k!'
    assert response.content == b'I am the warcprox test payload! llllllllll!\n'

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

    # check url1 in dedup db bucket_1
    dedup_lookup = warcprox_.dedup_db.lookup(payload_digest, bucket="bucket_1")
    assert dedup_lookup
    assert dedup_lookup['url'] == url1.encode('ascii')
    assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id'])
    assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date'])

    # check url1 not in dedup db bucket_2
    dedup_lookup = warcprox_.dedup_db.lookup(payload_digest, bucket="bucket_2")
    assert dedup_lookup is None

    # close the warc; the final path must only appear once the writer is closed
    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers[warc_prefix]
    assert writer
    warc_path = os.path.join(writer.directory, writer.finalname)
    assert not os.path.exists(warc_path)
    writer.close()
    assert os.path.exists(warc_path)

    # read the warc
    fh = warctools.ArchiveRecord.open_archive(warc_path)
    record_iter = fh.read_records(limit=None, offsets=True)
    try:
        _, record, _ = next(record_iter)
        assert record.type == b'warcinfo'

        # url1 bucket_1
        _, record, _ = next(record_iter)
        assert record.type == b'response'
        assert record.url == url1.encode('ascii')
        # check for duplicate warc record headers
        assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
        assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
        _, record, _ = next(record_iter)
        assert record.type == b'request'

        # that's all folks: one trailing entry with no record, then exhaustion
        assert next(record_iter)[1] is None
        assert next(record_iter, None) is None

    finally:
        fh.close()
|
||||
|
||||
def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archiving_proxies):
|
||||
urls_before = warcprox_.proxy.running_stats.urls
|
||||
revisits_before = warcprox_.proxy.stats_db.value(
|
||||
|
Loading…
x
Reference in New Issue
Block a user