mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
new flag dedup_ok and warcprox-meta field dedup-ok, which can be used to prevent deduplication against particular entries in the rethinkdb big captures table
This commit is contained in:
parent
5bfdbc3d95
commit
4b505c524b
2
setup.py
2
setup.py
@ -51,7 +51,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.0b3.dev40',
|
version='2.0b3.dev41',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -217,10 +217,6 @@ def https_daemon(request, cert):
|
|||||||
|
|
||||||
return https_daemon
|
return https_daemon
|
||||||
|
|
||||||
# @pytest.fixture(scope="module")
|
|
||||||
# def options(request):
|
|
||||||
# return warcprox.Options(base32=True)
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def captures_db(request, rethinkdb_servers, rethinkdb_big_table):
|
def captures_db(request, rethinkdb_servers, rethinkdb_big_table):
|
||||||
captures_db = None
|
captures_db = None
|
||||||
@ -1172,6 +1168,79 @@ def test_method_filter(
|
|||||||
assert response.status_code == 404
|
assert response.status_code == 404
|
||||||
assert response.content == b'404 Not in Archive\n'
|
assert response.content == b'404 Not in Archive\n'
|
||||||
|
|
||||||
|
def test_dedup_ok_flag(
        https_daemon, http_daemon, warcprox_, archiving_proxies,
        rethinkdb_big_table):
    """Check that `dedup-ok: False` keeps a capture out of deduplication.

    The same url is archived twice into the `test_dedup_ok_flag` bucket:
    first with `'dedup-ok': False` in the Warcprox-Meta header, then without
    it. After the first fetch the dedup db lookup must still return None;
    after the second it must return something. Finally the captures table is
    queried directly to confirm both fetches were stored as `response`
    records (i.e. the second one was not written as a revisit of the first).

    Returns early, without asserting anything, when the `rethinkdb_big_table`
    fixture is falsy, since the flag only applies to that backend.
    """
    if not rethinkdb_big_table:
        # this feature is n/a unless using rethinkdb big table
        return

    bucket = 'test_dedup_ok_flag'
    # The same payload digest in the two encodings this test needs: hex with
    # an algorithm prefix for the dedup db lookup, and bare base32 for the
    # captures table index.
    digest_key = b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079'
    sha1base32 = 'FV7RGGA3SCRFNTS6L275N2OJQJXM5EDZ'
    url = 'http://localhost:{}/z/b'.format(http_daemon.server_port)

    def dedup_lookup():
        # Ask the dedup db whether it knows the payload in our bucket.
        return warcprox_.warc_writer_thread.dedup_db.lookup(
                digest_key, bucket=bucket)

    def archive_and_wait(request_meta):
        # Fetch `url` through the archiving proxy with the given
        # Warcprox-Meta, verify the response, then block until the warc
        # writer thread reports idle.
        headers = {'Warcprox-Meta': json.dumps(request_meta)}
        response = requests.get(
                url, proxies=archiving_proxies, headers=headers, verify=False)
        assert response.status_code == 200
        assert response.headers['warcprox-test-header'] == 'z!'
        assert response.content == (
                b'I am the warcprox test payload! bbbbbbbbbb!\n')

        # NOTE(review): the sleeps on either side of the idle poll look like
        # padding around the writer thread picking up and finishing the
        # record -- confirm before tightening them.
        time.sleep(0.5)
        while not warcprox_.warc_writer_thread.idle:
            time.sleep(0.5)
        time.sleep(0.5)

    # check not in dedup db
    assert dedup_lookup() is None

    # archive with dedup_ok:False
    archive_and_wait({'captures-bucket': bucket, 'dedup-ok': False})

    # check that dedup db doesn't give us anything for this
    assert dedup_lookup() is None

    # archive without dedup_ok:False
    archive_and_wait({'captures-bucket': bucket})

    # check that dedup db gives us something for this
    assert dedup_lookup()

    # inspect what's in rethinkdb more closely
    rethink_captures = warcprox_.warc_writer_thread.dedup_db.captures_db
    results_iter = rethink_captures.r.table(rethink_captures.table).get_all(
            [sha1base32, 'response', bucket],
            index='sha1_warc_type').order_by('timestamp').run()
    results = list(results_iter)
    assert len(results) == 2
    first, second = results

    # the flag is stored only on the capture that asked for it
    assert first.get('dedup_ok') is False
    assert 'dedup_ok' not in second

    assert first['url'] == url
    assert second['url'] == url
    assert first['warc_type'] == 'response'
    assert second['warc_type'] == 'response'  # not revisit
    assert first['filename'] == second['filename']
    assert first['offset'] < second['offset']
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pytest.main()
|
pytest.main()
|
||||||
|
|
||||||
|
@ -114,7 +114,10 @@ class RethinkCaptures:
|
|||||||
"digest type is %s but big captures table is indexed by "
|
"digest type is %s but big captures table is indexed by "
|
||||||
"sha1" % algo)
|
"sha1" % algo)
|
||||||
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
||||||
results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
|
results_iter = self.r.table(self.table).get_all(
|
||||||
|
[sha1base32, "response", bucket],
|
||||||
|
index="sha1_warc_type").filter(
|
||||||
|
self.r.row["dedup_ok"], default=True).run()
|
||||||
results = list(results_iter)
|
results = list(results_iter)
|
||||||
if len(results) > 0:
|
if len(results) > 0:
|
||||||
if len(results) > 1:
|
if len(results) > 1:
|
||||||
@ -176,11 +179,14 @@ class RethinkCaptures:
|
|||||||
# if any
|
# if any
|
||||||
}
|
}
|
||||||
|
|
||||||
if (recorded_url.warcprox_meta and
|
if recorded_url.warcprox_meta:
|
||||||
"captures-table-extra-fields" in recorded_url.warcprox_meta):
|
if "dedup-ok" in recorded_url.warcprox_meta:
|
||||||
extras = recorded_url.warcprox_meta["captures-table-extra-fields"]
|
entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"]
|
||||||
for extra_field in extras:
|
if "captures-table-extra-fields" in recorded_url.warcprox_meta:
|
||||||
entry[extra_field] = extras[extra_field]
|
extras = recorded_url.warcprox_meta[
|
||||||
|
"captures-table-extra-fields"]
|
||||||
|
for extra_field in extras:
|
||||||
|
entry[extra_field] = extras[extra_field]
|
||||||
|
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user