mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
new flag dedup_ok and warcprox-meta field dedup-ok, which can be used to prevent deduplication against particular entries in the rethinkdb big captures table
This commit is contained in:
parent
5bfdbc3d95
commit
4b505c524b
2
setup.py
2
setup.py
@ -51,7 +51,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.0b3.dev40',
|
||||
version='2.0b3.dev41',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@ -217,10 +217,6 @@ def https_daemon(request, cert):
|
||||
|
||||
return https_daemon
|
||||
|
||||
# @pytest.fixture(scope="module")
|
||||
# def options(request):
|
||||
# return warcprox.Options(base32=True)
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def captures_db(request, rethinkdb_servers, rethinkdb_big_table):
|
||||
captures_db = None
|
||||
@ -1172,6 +1168,79 @@ def test_method_filter(
|
||||
assert response.status_code == 404
|
||||
assert response.content == b'404 Not in Archive\n'
|
||||
|
||||
def test_dedup_ok_flag(
        https_daemon, http_daemon, warcprox_, archiving_proxies,
        rethinkdb_big_table):
    """Check that a capture archived with ``dedup-ok: False`` is never
    offered as a deduplication candidate.

    The same url is fetched twice through the archiving proxy into the
    ``test_dedup_ok_flag`` captures bucket: first with ``"dedup-ok": false``
    in the Warcprox-Meta header, then without that field. After the first
    fetch the dedup lookup must still return nothing; after the second it
    must return an entry. Finally the captures table is queried directly to
    confirm that both fetches were stored as ``response`` records (the second
    one was not written as a revisit) and that only the first row carries
    ``dedup_ok``.

    Skipped unless the rethinkdb big captures table is in use.
    """
    if not rethinkdb_big_table:
        # this feature is n/a unless using rethinkdb big table; report the
        # test as skipped rather than letting it silently count as a pass
        pytest.skip('dedup-ok only applies to the rethinkdb big captures table')

    bucket = 'test_dedup_ok_flag'
    payload_digest = b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079'
    # same sha1 as payload_digest, base32-encoded, which is how the captures
    # table index is keyed
    payload_sha1_base32 = 'FV7RGGA3SCRFNTS6L275N2OJQJXM5EDZ'
    url = 'http://localhost:{}/z/b'.format(http_daemon.server_port)

    def lookup_payload():
        # ask the dedup db whether it knows a usable capture of the payload
        return warcprox_.warc_writer_thread.dedup_db.lookup(
                payload_digest, bucket=bucket)

    def archive_url(request_meta):
        # fetch the url through the proxy with the given Warcprox-Meta,
        # verify the response, then wait for the warc writer to finish
        headers = {'Warcprox-Meta': json.dumps(request_meta)}
        response = requests.get(
                url, proxies=archiving_proxies, headers=headers, verify=False)
        assert response.status_code == 200
        assert response.headers['warcprox-test-header'] == 'z!'
        assert response.content == (
                b'I am the warcprox test payload! bbbbbbbbbb!\n')

        # sleep before and after polling so the writer thread has a chance
        # to pick the url up and to finish its post-write work
        time.sleep(0.5)
        while not warcprox_.warc_writer_thread.idle:
            time.sleep(0.5)
        time.sleep(0.5)

    # check not in dedup db
    assert lookup_payload() is None

    # archive with dedup_ok:False
    archive_url({'captures-bucket': bucket, 'dedup-ok': False})

    # check that dedup db doesn't give us anything for this
    assert lookup_payload() is None

    # archive without dedup_ok:False
    archive_url({'captures-bucket': bucket})

    # check that dedup db gives us something for this
    assert lookup_payload()

    # inspect what's in rethinkdb more closely
    rethink_captures = warcprox_.warc_writer_thread.dedup_db.captures_db
    results_iter = rethink_captures.r.table(rethink_captures.table).get_all(
            [payload_sha1_base32, 'response', bucket],
            index='sha1_warc_type').order_by('timestamp').run()
    results = list(results_iter)
    assert len(results) == 2
    first, second = results
    assert first.get('dedup_ok') is False
    assert 'dedup_ok' not in second
    assert first['url'] == url
    assert second['url'] == url
    assert first['warc_type'] == 'response'
    assert second['warc_type'] == 'response'  # not revisit
    assert first['filename'] == second['filename']
    assert first['offset'] < second['offset']
|
||||
|
||||
# Allow running this test module directly as a script; pytest.main() is
# called with no arguments.
if __name__ == '__main__':
    pytest.main()
|
||||
|
||||
|
@ -114,7 +114,10 @@ class RethinkCaptures:
|
||||
"digest type is %s but big captures table is indexed by "
|
||||
"sha1" % algo)
|
||||
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
||||
results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
|
||||
results_iter = self.r.table(self.table).get_all(
|
||||
[sha1base32, "response", bucket],
|
||||
index="sha1_warc_type").filter(
|
||||
self.r.row["dedup_ok"], default=True).run()
|
||||
results = list(results_iter)
|
||||
if len(results) > 0:
|
||||
if len(results) > 1:
|
||||
@ -176,11 +179,14 @@ class RethinkCaptures:
|
||||
# if any
|
||||
}
|
||||
|
||||
if (recorded_url.warcprox_meta and
|
||||
"captures-table-extra-fields" in recorded_url.warcprox_meta):
|
||||
extras = recorded_url.warcprox_meta["captures-table-extra-fields"]
|
||||
for extra_field in extras:
|
||||
entry[extra_field] = extras[extra_field]
|
||||
if recorded_url.warcprox_meta:
|
||||
if "dedup-ok" in recorded_url.warcprox_meta:
|
||||
entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"]
|
||||
if "captures-table-extra-fields" in recorded_url.warcprox_meta:
|
||||
extras = recorded_url.warcprox_meta[
|
||||
"captures-table-extra-fields"]
|
||||
for extra_field in extras:
|
||||
entry[extra_field] = extras[extra_field]
|
||||
|
||||
return entry
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user