From d702a98bbcc911f61a5c789b42b9ba5b278726c4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 4 Mar 2014 20:12:09 +0000 Subject: [PATCH] url-agnostic revisit testing! add sample warc and cdx for url-agnostic revisits add unit test and integration test resolvingloader: pass callback instead of full cdx server for use for loading cdx in case of url-agnostic revisit --- pywb/cdx/test/cdxserver_test.py | 6 +- pywb/core/handlers.py | 4 +- pywb/core/indexreader.py | 5 ++ pywb/core/pywb_init.py | 1 - pywb/core/replay_views.py | 5 +- pywb/warc/recordloader.py | 4 +- pywb/warc/resolvingloader.py | 22 +++---- pywb/warc/test/test_loading.py | 55 ++++++++++++++++-- sample_archive/cdx/url-agnost-example.cdx | 3 + .../warcs/example-url-agnostic-orig.warc.gz | Bin 0 -> 1354 bytes .../example-url-agnostic-revisit.warc.gz | Bin 0 -> 946 bytes tests/test_integration.py | 9 +++ 12 files changed, 90 insertions(+), 24 deletions(-) create mode 100644 sample_archive/cdx/url-agnost-example.cdx create mode 100644 sample_archive/warcs/example-url-agnostic-orig.warc.gz create mode 100644 sample_archive/warcs/example-url-agnostic-revisit.warc.gz diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 4009b287..bfb29d1f 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -51,6 +51,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit # Filter exact invert >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1') +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz @@ -61,6 +62,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit # Filter contains invert >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz @@ -127,8 +129,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex # CDX Server init ->>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw') ->>> pprint.pprint(x.next().items()) +>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 2, output = 'raw') +>>> y = x.next(); pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), ('timestamp', '20140127171200'), ('original', 'http://example.com'), diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 049888df..473632a1 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -42,7 +42,9 @@ class WBHandler(WbUrlHandler): return self.query_view.render_response(wbrequest, cdx_lines) with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: - return self.replay(wbrequest, cdx_lines) + return self.replay(wbrequest, + cdx_lines, + self.index_reader.cdx_load_callback(wbrequest)) def render_search_page(self, wbrequest): diff --git a/pywb/core/indexreader.py b/pywb/core/indexreader.py index 94fca422..25129bcb 100644 --- a/pywb/core/indexreader.py +++ b/pywb/core/indexreader.py @@ -44,6 +44,11 @@ class IndexReader(object): return self.cdx_server.load_cdx(**params) + def cdx_load_callback(self, wbrequest): + def load_cdx(params): + return self.load_cdx(wbrequest, params) + return load_cdx + def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100): if wburl.type == wburl.URL_QUERY: raise NotImplementedError('Url Query Not Yet Supported') diff --git a/pywb/core/pywb_init.py b/pywb/core/pywb_init.py index fbcfec95..263ff442 100644 --- a/pywb/core/pywb_init.py +++ b/pywb/core/pywb_init.py @@ -71,7 +71,6 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None): paths = config.get('archive_paths') resolving_loader = ResolvingLoader(paths=paths, - cdx_server=cdx_server, record_loader=record_loader) head_insert_view = load_template_file(config.get('head_insert_html'), diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index 07997396..cc3621fd 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -26,7 +26,7 @@ class ReplayView: self._reporter = reporter - def __call__(self, wbrequest, cdx_lines): + def __call__(self, wbrequest, cdx_lines, cdx_loader): last_e = None first = True @@ -42,7 +42,8 @@ class ReplayView: self._redirect_if_needed(wbrequest, cdx) first = False - (status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files) + (status_headers, stream) = (self.content_loader. + resolve_headers_and_payload(cdx, failed_files, cdx_loader)) # check and reject self-redirect self._reject_self_redirect(wbrequest, cdx, status_headers) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index fb3af38c..4acb491f 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -126,7 +126,7 @@ class ArcWarcRecordLoader: rec_headers = self.arc_parser.parse(stream, statusline) return 'arc', rec_headers except StatusAndHeadersParserException as se: - msg = 'Unknown archive format, first line: ' + se.statusline + msg = 'Unknown archive format, first line: ' + str(se.statusline) raise ArchiveLoadFailed(msg) @@ -148,7 +148,7 @@ class ARCHeadersParser: if len(parts) != len(headernames): msg = 'Wrong # of headers, expected arc headers {0}, Found {1}' msg = msg.format(headernames, parts) - raise StatusAndHeadersParserException(msg, headernames) + raise StatusAndHeadersParserException(msg, parts) headers = [] diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 6a44739d..134e1dc8 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -5,14 +5,11 @@ from pathresolvers import make_best_resolvers #================================================================= class ResolvingLoader: - def __init__(self, paths, record_loader=ArcWarcRecordLoader(), - cdx_server=None): - + def __init__(self, paths, record_loader=ArcWarcRecordLoader()): self.path_resolvers = make_best_resolvers(paths) self.record_loader = record_loader - self.cdx_server = cdx_server - def resolve_headers_and_payload(self, cdx, failed_files): + def resolve_headers_and_payload(self, cdx, failed_files, cdx_loader): """ Resolve headers and payload for a given capture In the simple case, headers and payload are in the same record. @@ -37,7 +34,8 @@ class ResolvingLoader: if cdx['mimetype'] == 'warc/revisit' and headers_record: payload_record = self._load_different_url_payload(cdx, headers_record, - failed_files) + failed_files, + cdx_loader) # single lookup cases # case 2: non-revisit @@ -121,7 +119,8 @@ class ResolvingLoader: raise ArchiveLoadFailed(msg, filename), None, last_traceback - def _load_different_url_payload(self, cdx, headers_record, failed_files): + def _load_different_url_payload(self, cdx, headers_record, + failed_files, cdx_loader): """ Handle the case where a duplicate of a capture with same digest exists at a different url. @@ -152,7 +151,8 @@ class ResolvingLoader: orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri, ref_target_date, - cdx['digest']) + cdx['digest'], + cdx_loader) for cdx in orig_cdx_lines: try: @@ -165,12 +165,12 @@ class ResolvingLoader: raise ArchiveLoadFailed('Original for revisit could not be loaded') - def load_cdx_for_dupe(self, url, timestamp, digest): + def load_cdx_for_dupe(self, url, timestamp, digest, cdx_loader): """ If a cdx_server is available, return response from server, otherwise empty list """ - if not self.cdx_server: + if not cdx_loader: return [] params = {'url': url, @@ -178,4 +178,4 @@ class ResolvingLoader: 'filter': 'digest:' + digest, 'output': 'cdxobject'} - return self.cdx_server.load_cdx(**params) + return cdx_loader(params) diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index 02ab54cb..8393f995 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -159,7 +159,36 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc +# Test Url Agnostic Revisit Resolving +# ============================================================================== +>>> load_from_cdx_test(URL_AGNOSTIC_ORIG_CDX) +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Content-Type', 'text/html; charset=UTF-8'), + ('Date', 'Tue, 02 Jul 2013 19:54:02 GMT'), + ('ETag', '"780602-4f6-4db31b2978ec0"'), + ('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('Content-Length', '1270'), + ('Connection', 'close')]) + + + +>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX) +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Content-Type', 'text/html; charset=UTF-8'), + ('Date', 'Mon, 29 Jul 2013 19:51:51 GMT'), + ('ETag', '"780602-4f6-4db31b2978ec0"'), + ('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('Content-Length', '1270'), + ('Connection', 'close')]) + + + # Error Handling +# ============================================================================== # Invalid WARC Offset >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz') @@ -167,7 +196,7 @@ Exception: ArchiveLoadFailed # Invalid ARC Offset ->>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz') +>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 example.arc.gz') Exception: ArchiveLoadFailed @@ -175,6 +204,7 @@ Exception: ArchiveLoadFailed >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz') Exception: ArchiveLoadFailed + """ import os @@ -188,29 +218,44 @@ from pywb.cdx.cdxobject import CDXObject from pywb import get_test_dir -#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/' +#============================================================================== test_warc_dir = get_test_dir() + 'warcs/' + +URL_AGNOSTIC_ORIG_CDX = 'org,iana,example)/ 20130702195402 http://example.iana.org/ \ +text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ +1001 353 example-url-agnostic-orig.warc.gz' + +URL_AGNOSTIC_REVISIT_CDX = 'com,example)/ 20130729195151 http://test@example.com/ \ +warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ +591 355 example-url-agnostic-revisit.warc.gz' + + +#============================================================================== def load_test_archive(test_file, offset, length): path = test_warc_dir + test_file testloader = ArcWarcRecordLoader() - archive = testloader.load(path, offset, length) archive = testloader.load(path, offset, length) pprint.pprint((archive.type, archive.rec_headers, archive.status_headers)) +#============================================================================== +def load_orig_cdx(self): + return [CDXObject(URL_AGNOSTIC_ORIG_CDX)] + +#============================================================================== def load_from_cdx_test(cdx): resolve_loader = ResolvingLoader(test_warc_dir) cdx = CDXObject(cdx) try: - (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None) + (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, load_orig_cdx) print headers sys.stdout.write(stream.readline()) sys.stdout.write(stream.readline()) - except Exception as e: + except ArchiveLoadFailed as e: print 'Exception: ' + e.__class__.__name__ if __name__ == "__main__": diff --git a/sample_archive/cdx/url-agnost-example.cdx b/sample_archive/cdx/url-agnost-example.cdx new file mode 100644 index 00000000..4d74aa9b --- /dev/null +++ b/sample_archive/cdx/url-agnost-example.cdx @@ -0,0 +1,3 @@ + CDX N b a m s k r M S V g +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz +org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz diff --git a/sample_archive/warcs/example-url-agnostic-orig.warc.gz b/sample_archive/warcs/example-url-agnostic-orig.warc.gz new file mode 100644 index 0000000000000000000000000000000000000000..987003732a24124cc5cbbbe0cd15a8028316a63f GIT binary patch literal 1354 zcmV-Q1-1GgiwFP!0000018tGtYJ)Ho#_t8+AukY3{L|Jj2GXvqFxJruh2BVvNeyTs zF}B;Deo zI3baXA#`0#d)abDyCn^^kvE9u6HG#I`(BJE&a|oFVMoBzXcQSoS(D!a;d-5JL9%Lh`mA{%mht-E89+lGyir zZ_vUzpG!&#AH0H0cb!&4;E3S)J^whr9t-GzV;rG1ilY_Iz{{sKlv!Ux)!MaU%W;!8h*(L8NrELn$(m07 zz2HT+L=Tl_NN^W>cK7Tq@G^d!SWe4s7%zH7=DQSq$WW#tk+>JqY=w$EKTdsEE0y|| zg>N)YBW!V+(3Z$nR=xF_DsAuDPMg?$V!Lx^;CDN|?S8E{%;+u>l#y$`LaBU^YwGwP z+`IY9%k25#%lNkcJe_^&e|?^P>U_Tc@br1o?u~2P>5PmS)4@p}2JM#Pby}{|YdOxK z-tdT_$QYSk`w-+=;^#SMerPW|>NzfPsN;}M$Q&|oyaDNYi_pVGuhZ=>M-5{l5`~GX z8K7wz@sKJm601o03SzlZS~Wa!v$wdzWTnz zd2QDxlZ!t1X;ayDMNS$TB$nyPZ$<8bI ze=_-gSe+AjLgqPRXLY}z`+*w!TzlaAK-n>f>KRyNi;jx55m?nc8bOho9#0))-N0BMMoIRF3v literal 0 HcmV?d00001 diff --git a/sample_archive/warcs/example-url-agnostic-revisit.warc.gz b/sample_archive/warcs/example-url-agnostic-revisit.warc.gz new file mode 100644 index 0000000000000000000000000000000000000000..3770ed0a8aceb2567092233be67b6daac53c2db3 GIT binary patch literal 946 zcmV;j15NxNiwFP!0000018tFAYlAQphVKRcL;gTCM*CqHgVL_7FxJruh2BWaM-6Bq zF}B-&{i4WrvjoE7<$XElIk{|3+c=3(7+ib>i?2=)@Sv6{EzKq#(UIXvtLcp9uG!=O0%E!GRZSP7X! zRB;S5nl4~Pvn9;4C1WxdDbM9|7;KGpLc4il+I5X$)G4FmslQaB;r*ZOP3W7eJw=v= zK^XK#I`8vIMd3qKkeRkMx{o{)Jil)~&acM;dgL_0XoXU|Omgt@=`3Z=*f5aBw$u?Y z@_-Q?>?Z|Gt7JxxhQ2Ta)XsGzjw@B^p^YjnBWdF?Ha+Fxd@S*LflzrDw)f*FW#iY~ zJ8r3TQO9K%bVJpsz802%gQu}tIIyv-R(IihS>gTs|NlvFb{%oY`vZXnsdnlC001A0 z2ng^e7E=HNb#iQ9VP|e{b963uVRB;tZIWMan=ll`-;wwZR-ZO$4%iU>xa+huNtbMm znx>@L`etm0FcBLW(>33ICMDh0P9T9U_a2|~<8DT)v4;k}>ul^B%nr36K#Rw+Db1-y zbs-E~ujT~ojH!v|8PS-};)(9-N(`9VdwuJK8as}C6hJ?QIL%N>hREIzeZ!I-Dn&V* zl!cWbpxIGGF7b7?*(}#LKSr~7Jzw6&59{S^c)z&3yB`P9sGprL;fQlBnudTh7@#B^ zU=$4yN>AciZOcl$xRk=wwq58)F;Kejeo@W3tX`kbYjp`}Ays2a21_oaebNfny-9vF zd7_|O$!t7MKqX{hc71&%wn8^BQv`f$wIpp@a>D#Pp$WneQG{T~ID#okQaDU`Q|z!#zbx4ib~6vfc#)r>h*a>JQ;h?zB&hNM|C62#X7r!iR(OvUMTqi z;1qmqt0Mq#+BWcIk-5$^qXhwHapFflhT%4XA