diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 4009b287..bfb29d1f 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -51,6 +51,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit # Filter exact invert >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1') +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz @@ -61,6 +62,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit # Filter contains invert >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz @@ -127,8 +129,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex # CDX Server init ->>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw') ->>> pprint.pprint(x.next().items()) +>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 2, output = 'raw') +>>> y = x.next(); pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), ('timestamp', '20140127171200'), ('original', 'http://example.com'), diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 049888df..473632a1 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -42,7 +42,9 @@ class WBHandler(WbUrlHandler): return self.query_view.render_response(wbrequest, cdx_lines) with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: - return self.replay(wbrequest, cdx_lines) + return self.replay(wbrequest, + cdx_lines, + self.index_reader.cdx_load_callback(wbrequest)) def render_search_page(self, wbrequest): diff --git a/pywb/core/indexreader.py b/pywb/core/indexreader.py index 94fca422..25129bcb 100644 --- a/pywb/core/indexreader.py +++ b/pywb/core/indexreader.py @@ -44,6 +44,11 @@ class IndexReader(object): return self.cdx_server.load_cdx(**params) + def cdx_load_callback(self, wbrequest): + def load_cdx(params): + return self.load_cdx(wbrequest, params) + return load_cdx + def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100): if wburl.type == wburl.URL_QUERY: raise NotImplementedError('Url Query Not Yet Supported') diff --git a/pywb/core/pywb_init.py b/pywb/core/pywb_init.py index fbcfec95..263ff442 100644 --- a/pywb/core/pywb_init.py +++ b/pywb/core/pywb_init.py @@ -71,7 +71,6 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None): paths = config.get('archive_paths') resolving_loader = ResolvingLoader(paths=paths, - cdx_server=cdx_server, record_loader=record_loader) head_insert_view = load_template_file(config.get('head_insert_html'), diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index 07997396..cc3621fd 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -26,7 +26,7 @@ class ReplayView: self._reporter = reporter - def __call__(self, wbrequest, cdx_lines): + def __call__(self, wbrequest, cdx_lines, cdx_loader): last_e = None first = True @@ -42,7 +42,8 @@ class ReplayView: self._redirect_if_needed(wbrequest, cdx) first = False - (status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files) + (status_headers, stream) = (self.content_loader. + resolve_headers_and_payload(cdx, failed_files, cdx_loader)) # check and reject self-redirect self._reject_self_redirect(wbrequest, cdx, status_headers) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index fb3af38c..4acb491f 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -126,7 +126,7 @@ class ArcWarcRecordLoader: rec_headers = self.arc_parser.parse(stream, statusline) return 'arc', rec_headers except StatusAndHeadersParserException as se: - msg = 'Unknown archive format, first line: ' + se.statusline + msg = 'Unknown archive format, first line: ' + str(se.statusline) raise ArchiveLoadFailed(msg) @@ -148,7 +148,7 @@ class ARCHeadersParser: if len(parts) != len(headernames): msg = 'Wrong # of headers, expected arc headers {0}, Found {1}' msg = msg.format(headernames, parts) - raise StatusAndHeadersParserException(msg, headernames) + raise StatusAndHeadersParserException(msg, parts) headers = [] diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 6a44739d..134e1dc8 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -5,14 +5,11 @@ from pathresolvers import make_best_resolvers #================================================================= class ResolvingLoader: - def __init__(self, paths, record_loader=ArcWarcRecordLoader(), - cdx_server=None): - + def __init__(self, paths, record_loader=ArcWarcRecordLoader()): self.path_resolvers = make_best_resolvers(paths) self.record_loader = record_loader - self.cdx_server = cdx_server - def resolve_headers_and_payload(self, cdx, failed_files): + def resolve_headers_and_payload(self, cdx, failed_files, cdx_loader): """ Resolve headers and payload for a given capture In the simple case, headers and payload are in the same record. @@ -37,7 +34,8 @@ class ResolvingLoader: if cdx['mimetype'] == 'warc/revisit' and headers_record: payload_record = self._load_different_url_payload(cdx, headers_record, - failed_files) + failed_files, + cdx_loader) # single lookup cases # case 2: non-revisit @@ -121,7 +119,8 @@ class ResolvingLoader: raise ArchiveLoadFailed(msg, filename), None, last_traceback - def _load_different_url_payload(self, cdx, headers_record, failed_files): + def _load_different_url_payload(self, cdx, headers_record, + failed_files, cdx_loader): """ Handle the case where a duplicate of a capture with same digest exists at a different url. @@ -152,7 +151,8 @@ class ResolvingLoader: orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri, ref_target_date, - cdx['digest']) + cdx['digest'], + cdx_loader) for cdx in orig_cdx_lines: try: @@ -165,12 +165,12 @@ class ResolvingLoader: raise ArchiveLoadFailed('Original for revisit could not be loaded') - def load_cdx_for_dupe(self, url, timestamp, digest): + def load_cdx_for_dupe(self, url, timestamp, digest, cdx_loader): """ If a cdx_server is available, return response from server, otherwise empty list """ - if not self.cdx_server: + if not cdx_loader: return [] params = {'url': url, @@ -178,4 +178,4 @@ class ResolvingLoader: 'filter': 'digest:' + digest, 'output': 'cdxobject'} - return self.cdx_server.load_cdx(**params) + return cdx_loader(params) diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index 02ab54cb..8393f995 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -159,7 +159,36 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc +# Test Url Agnostic Revisit Resolving +# ============================================================================== +>>> load_from_cdx_test(URL_AGNOSTIC_ORIG_CDX) +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Content-Type', 'text/html; charset=UTF-8'), + ('Date', 'Tue, 02 Jul 2013 19:54:02 GMT'), + ('ETag', '"780602-4f6-4db31b2978ec0"'), + ('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('Content-Length', '1270'), + ('Connection', 'close')]) + + + +>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX) +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Content-Type', 'text/html; charset=UTF-8'), + ('Date', 'Mon, 29 Jul 2013 19:51:51 GMT'), + ('ETag', '"780602-4f6-4db31b2978ec0"'), + ('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('Content-Length', '1270'), + ('Connection', 'close')]) + + + # Error Handling +# ============================================================================== # Invalid WARC Offset >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz') @@ -167,7 +196,7 @@ Exception: ArchiveLoadFailed # Invalid ARC Offset ->>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz') +>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 example.arc.gz') Exception: ArchiveLoadFailed @@ -175,6 +204,7 @@ Exception: ArchiveLoadFailed >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz') Exception: ArchiveLoadFailed + """ import os @@ -188,29 +218,44 @@ from pywb.cdx.cdxobject import CDXObject from pywb import get_test_dir -#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/' +#============================================================================== test_warc_dir = get_test_dir() + 'warcs/' + +URL_AGNOSTIC_ORIG_CDX = 'org,iana,example)/ 20130702195402 http://example.iana.org/ \ +text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ +1001 353 example-url-agnostic-orig.warc.gz' + +URL_AGNOSTIC_REVISIT_CDX = 'com,example)/ 20130729195151 http://test@example.com/ \ +warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ +591 355 example-url-agnostic-revisit.warc.gz' + + +#============================================================================== def load_test_archive(test_file, offset, length): path = test_warc_dir + test_file testloader = ArcWarcRecordLoader() - archive = testloader.load(path, offset, length) archive = testloader.load(path, offset, length) pprint.pprint((archive.type, archive.rec_headers, archive.status_headers)) +#============================================================================== +def load_orig_cdx(self): + return [CDXObject(URL_AGNOSTIC_ORIG_CDX)] + +#============================================================================== def load_from_cdx_test(cdx): resolve_loader = ResolvingLoader(test_warc_dir) cdx = CDXObject(cdx) try: - (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None) + (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, load_orig_cdx) print headers sys.stdout.write(stream.readline()) sys.stdout.write(stream.readline()) - except Exception as e: + except ArchiveLoadFailed as e: print 'Exception: ' + e.__class__.__name__ if __name__ == "__main__": diff --git a/sample_archive/cdx/url-agnost-example.cdx b/sample_archive/cdx/url-agnost-example.cdx new file mode 100644 index 00000000..4d74aa9b --- /dev/null +++ b/sample_archive/cdx/url-agnost-example.cdx @@ -0,0 +1,3 @@ + CDX N b a m s k r M S V g +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz +org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz diff --git a/sample_archive/warcs/example-url-agnostic-orig.warc.gz b/sample_archive/warcs/example-url-agnostic-orig.warc.gz new file mode 100644 index 00000000..98700373 Binary files /dev/null and b/sample_archive/warcs/example-url-agnostic-orig.warc.gz differ diff --git a/sample_archive/warcs/example-url-agnostic-revisit.warc.gz b/sample_archive/warcs/example-url-agnostic-revisit.warc.gz new file mode 100644 index 00000000..3770ed0a Binary files /dev/null and b/sample_archive/warcs/example-url-agnostic-revisit.warc.gz differ diff --git a/tests/test_integration.py b/tests/test_integration.py index 6e539c31..c9cd5c68 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -84,6 +84,15 @@ class TestWb: assert 'wb.js' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body + + def test_replay_url_agnostic_revisit(self): + resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/') + self._assert_basic_html(resp) + + assert 'Mon, Jul 29 2013 19:51:51' in resp.body + assert 'wb.js' in resp.body + assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body + def test_replay_identity_1(self): resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')