From 4b505c524b28bb710e695537b0eed7045027737f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 13 Jan 2017 17:29:05 -0800 Subject: [PATCH] new flag dedup_ok and warcprox-meta field dedup-ok which can be used to prevent deduplication against particular entries in the rethinkdb big captures table --- setup.py | 2 +- tests/test_warcprox.py | 77 +++++++++++++++++++++++++++++++++++++++--- warcprox/bigtable.py | 18 ++++++---- 3 files changed, 86 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index 70a62a9..48675c0 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0b3.dev40', + version='2.0b3.dev41', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 8fba644..82f80fe 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -217,10 +217,6 @@ def https_daemon(request, cert): return https_daemon -# @pytest.fixture(scope="module") -# def options(request): -# return warcprox.Options(base32=True) - @pytest.fixture(scope="module") def captures_db(request, rethinkdb_servers, rethinkdb_big_table): captures_db = None @@ -1172,6 +1168,79 @@ def test_method_filter( assert response.status_code == 404 assert response.content == b'404 Not in Archive\n' +def test_dedup_ok_flag( + https_daemon, http_daemon, warcprox_, archiving_proxies, + rethinkdb_big_table): + if not rethinkdb_big_table: + # this feature is n/a unless using rethinkdb big table + return + + url = 'http://localhost:{}/z/b'.format(http_daemon.server_port) + + # check not in dedup db + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup( + b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', + bucket='test_dedup_ok_flag') + assert dedup_lookup is None + + # archive with dedup_ok:False + request_meta = {'captures-bucket':'test_dedup_ok_flag','dedup-ok':False} + headers = {'Warcprox-Meta': 
json.dumps(request_meta)} + response = requests.get( + url, proxies=archiving_proxies, headers=headers, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'z!' + assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' + + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # check that dedup db doesn't give us anything for this + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup( + b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', + bucket='test_dedup_ok_flag') + assert dedup_lookup is None + + # archive without dedup_ok:False + request_meta = {'captures-bucket':'test_dedup_ok_flag'} + headers = {'Warcprox-Meta': json.dumps(request_meta)} + response = requests.get( + url, proxies=archiving_proxies, headers=headers, verify=False) + + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'z!' + assert response.content == b'I am the warcprox test payload! 
bbbbbbbbbb!\n' + + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # check that dedup db gives us something for this + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup( + b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', + bucket='test_dedup_ok_flag') + assert dedup_lookup + + # inspect what's in rethinkdb more closely + rethink_captures = warcprox_.warc_writer_thread.dedup_db.captures_db + results_iter = rethink_captures.r.table(rethink_captures.table).get_all( + ['FV7RGGA3SCRFNTS6L275N2OJQJXM5EDZ', 'response', + 'test_dedup_ok_flag'], index='sha1_warc_type').order_by( + 'timestamp').run() + results = list(results_iter) + assert len(results) == 2 + assert results[0].get('dedup_ok') == False + assert not 'dedup_ok' in results[1] + assert results[0]['url'] == url + assert results[1]['url'] == url + assert results[0]['warc_type'] == 'response' + assert results[1]['warc_type'] == 'response' # not revisit + assert results[0]['filename'] == results[1]['filename'] + assert results[0]['offset'] < results[1]['offset'] + if __name__ == '__main__': pytest.main() diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index bc049f3..aeca018 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -114,7 +114,10 @@ class RethinkCaptures: "digest type is %s but big captures table is indexed by " "sha1" % algo) sha1base32 = base64.b32encode(raw_digest).decode("utf-8") - results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run() + results_iter = self.r.table(self.table).get_all( + [sha1base32, "response", bucket], + index="sha1_warc_type").filter( + self.r.row["dedup_ok"], default=True).run() results = list(results_iter) if len(results) > 0: if len(results) > 1: @@ -176,11 +179,14 @@ class RethinkCaptures: # if any } - if (recorded_url.warcprox_meta and - "captures-table-extra-fields" in recorded_url.warcprox_meta): - extras = 
recorded_url.warcprox_meta["captures-table-extra-fields"] - for extra_field in extras: - entry[extra_field] = extras[extra_field] + if recorded_url.warcprox_meta: + if "dedup-ok" in recorded_url.warcprox_meta: + entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"] + if "captures-table-extra-fields" in recorded_url.warcprox_meta: + extras = recorded_url.warcprox_meta[ + "captures-table-extra-fields"] + for extra_field in extras: + entry[extra_field] = extras[extra_field] return entry