url-agnostic revisit testing!

Add sample WARC and CDX for url-agnostic revisits. Add a unit test and an integration test. ResolvingLoader: pass a callback instead of the full cdx server, to be used for loading the cdx in case of a url-agnostic revisit.
2025-03-15 00:03:28 +01:00 · 2014-03-04 20:12:09 +00:00 · 2014-03-04 20:12:09 +00:00 · d702a98bbc
commit d702a98bbc
parent cf5aaf5de4
12 changed files with 90 additions and 24 deletions
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@ -51,6 +51,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit

 # Filter exact invert
 >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
+com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

@ -61,6 +62,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit

 # Filter contains invert
 >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
+com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

@ -127,8 +129,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex


 # CDX Server init
->>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
->>> pprint.pprint(x.next().items())
+>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 2, output = 'raw')
+>>> y = x.next(); pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
 ('timestamp', '20140127171200'),
 ('original', 'http://example.com'),
--- a/pywb/core/handlers.py
+++ b/pywb/core/handlers.py
@ -42,7 +42,9 @@ class WBHandler(WbUrlHandler):
            return self.query_view.render_response(wbrequest, cdx_lines)

        with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
-            return self.replay(wbrequest, cdx_lines)
+            return self.replay(wbrequest,
+                               cdx_lines,
+                               self.index_reader.cdx_load_callback(wbrequest))


    def render_search_page(self, wbrequest):
--- a/pywb/core/indexreader.py
+++ b/pywb/core/indexreader.py
@ -44,6 +44,11 @@ class IndexReader(object):

        return self.cdx_server.load_cdx(**params)

+    def cdx_load_callback(self, wbrequest):
+        def load_cdx(params):
+            return self.load_cdx(wbrequest, params)
+        return load_cdx
+
    def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
        if wburl.type == wburl.URL_QUERY:
            raise NotImplementedError('Url Query Not Yet Supported')
--- a/pywb/core/pywb_init.py
+++ b/pywb/core/pywb_init.py
@ -71,7 +71,6 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
    paths = config.get('archive_paths')

    resolving_loader = ResolvingLoader(paths=paths,
-                                       cdx_server=cdx_server,
                                       record_loader=record_loader)

    head_insert_view = load_template_file(config.get('head_insert_html'),
--- a/pywb/core/replay_views.py
+++ b/pywb/core/replay_views.py
@ -26,7 +26,7 @@ class ReplayView:
        self._reporter = reporter


-    def __call__(self, wbrequest, cdx_lines):
+    def __call__(self, wbrequest, cdx_lines, cdx_loader):
        last_e = None
        first = True

@ -42,7 +42,8 @@ class ReplayView:
                    self._redirect_if_needed(wbrequest, cdx)
                    first = False

-                (status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files)
+                (status_headers, stream) = (self.content_loader.
+                                            resolve_headers_and_payload(cdx, failed_files, cdx_loader))

                # check and reject self-redirect
                self._reject_self_redirect(wbrequest, cdx, status_headers)
--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@ -126,7 +126,7 @@ class ArcWarcRecordLoader:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return 'arc', rec_headers
        except StatusAndHeadersParserException as se:
-            msg = 'Unknown archive format, first line: ' + se.statusline
+            msg = 'Unknown archive format, first line: ' + str(se.statusline)
            raise ArchiveLoadFailed(msg)


@ -148,7 +148,7 @@ class ARCHeadersParser:
        if len(parts) != len(headernames):
            msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            msg = msg.format(headernames, parts)
-            raise StatusAndHeadersParserException(msg, headernames)
+            raise StatusAndHeadersParserException(msg, parts)

        headers = []

--- a/pywb/warc/resolvingloader.py
+++ b/pywb/warc/resolvingloader.py
@ -5,14 +5,11 @@ from pathresolvers import make_best_resolvers

 #=================================================================
 class ResolvingLoader:
-    def __init__(self, paths, record_loader=ArcWarcRecordLoader(),
-                 cdx_server=None):
-
+    def __init__(self, paths, record_loader=ArcWarcRecordLoader()):
        self.path_resolvers = make_best_resolvers(paths)
        self.record_loader = record_loader
-        self.cdx_server = cdx_server

-    def resolve_headers_and_payload(self, cdx, failed_files):
+    def resolve_headers_and_payload(self, cdx, failed_files, cdx_loader):
        """
        Resolve headers and payload for a given capture
        In the simple case, headers and payload are in the same record.
@ -37,7 +34,8 @@ class ResolvingLoader:
        if cdx['mimetype'] == 'warc/revisit' and headers_record:
            payload_record = self._load_different_url_payload(cdx,
                                                              headers_record,
-                                                              failed_files)
+                                                              failed_files,
+                                                              cdx_loader)

        # single lookup cases
        # case 2: non-revisit
@ -121,7 +119,8 @@ class ResolvingLoader:

        raise ArchiveLoadFailed(msg, filename), None, last_traceback

-    def _load_different_url_payload(self, cdx, headers_record, failed_files):
+    def _load_different_url_payload(self, cdx, headers_record,
+                                    failed_files, cdx_loader):
        """
        Handle the case where a duplicate of a capture with same digest
        exists at a different url.
@ -152,7 +151,8 @@ class ResolvingLoader:

        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
-                                                cdx['digest'])
+                                                cdx['digest'],
+                                                cdx_loader)

        for cdx in orig_cdx_lines:
            try:
@ -165,12 +165,12 @@ class ResolvingLoader:

        raise ArchiveLoadFailed('Original for revisit could not be loaded')

-    def load_cdx_for_dupe(self, url, timestamp, digest):
+    def load_cdx_for_dupe(self, url, timestamp, digest, cdx_loader):
        """
        If a cdx_server is available, return response from server,
        otherwise empty list
        """
-        if not self.cdx_server:
+        if not cdx_loader:
            return []

        params = {'url': url,
@ -178,4 +178,4 @@ class ResolvingLoader:
                  'filter': 'digest:' + digest,
                  'output': 'cdxobject'}

-        return self.cdx_server.load_cdx(**params)
+        return cdx_loader(params)
--- a/pywb/warc/test/test_loading.py
+++ b/pywb/warc/test/test_loading.py
@ -159,7 +159,36 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
 <!doctype html>
 <html>

+# Test Url Agnostic Revisit Resolving
+# ==============================================================================
+>>> load_from_cdx_test(URL_AGNOSTIC_ORIG_CDX)
+StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+  ('Content-Type', 'text/html; charset=UTF-8'),
+  ('Date', 'Tue, 02 Jul 2013 19:54:02 GMT'),
+  ('ETag', '"780602-4f6-4db31b2978ec0"'),
+  ('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
+  ('Server', 'ECS (sjc/4FCE)'),
+  ('X-Cache', 'HIT'),
+  ('Content-Length', '1270'),
+  ('Connection', 'close')])
+<!doctype html>
+<html>
+
+>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX)
+StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+  ('Content-Type', 'text/html; charset=UTF-8'),
+  ('Date', 'Mon, 29 Jul 2013 19:51:51 GMT'),
+  ('ETag', '"780602-4f6-4db31b2978ec0"'),
+  ('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
+  ('Server', 'ECS (sjc/4FCE)'),
+  ('X-Cache', 'HIT'),
+  ('Content-Length', '1270'),
+  ('Connection', 'close')])
+<!doctype html>
+<html>
+
 # Error Handling
+# ==============================================================================

 # Invalid WARC Offset
 >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
@ -167,7 +196,7 @@ Exception: ArchiveLoadFailed


 # Invalid ARC Offset
->>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
+>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 example.arc.gz')
 Exception: ArchiveLoadFailed


@ -175,6 +204,7 @@ Exception: ArchiveLoadFailed
 >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
 Exception: ArchiveLoadFailed

+
 """

 import os
@ -188,29 +218,44 @@ from pywb.cdx.cdxobject import CDXObject

 from pywb import get_test_dir

-#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
+#==============================================================================
 test_warc_dir = get_test_dir() + 'warcs/'

+
+URL_AGNOSTIC_ORIG_CDX = 'org,iana,example)/ 20130702195402 http://example.iana.org/ \
+text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
+1001 353 example-url-agnostic-orig.warc.gz'
+
+URL_AGNOSTIC_REVISIT_CDX = 'com,example)/ 20130729195151 http://test@example.com/ \
+warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
+591 355 example-url-agnostic-revisit.warc.gz'
+
+
+#==============================================================================
 def load_test_archive(test_file, offset, length):
    path = test_warc_dir + test_file

    testloader = ArcWarcRecordLoader()

-    archive = testloader.load(path, offset, length)
    archive = testloader.load(path, offset, length)

    pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))


+#==============================================================================
+def load_orig_cdx(self):
+    return [CDXObject(URL_AGNOSTIC_ORIG_CDX)]
+
+#==============================================================================
 def load_from_cdx_test(cdx):
    resolve_loader = ResolvingLoader(test_warc_dir)
    cdx = CDXObject(cdx)
    try:
-        (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
+        (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, load_orig_cdx)
        print headers
        sys.stdout.write(stream.readline())
        sys.stdout.write(stream.readline())
-    except Exception as e:
+    except ArchiveLoadFailed as e:
        print 'Exception: ' + e.__class__.__name__

 if __name__ == "__main__":
--- a/sample_archive/cdx/url-agnost-example.cdx
+++ b/sample_archive/cdx/url-agnost-example.cdx
@ -0,0 +1,3 @@
+ CDX N b a m s k r M S V g
+com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
+org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
--- a/sample_archive/warcs/example-url-agnostic-orig.warc.gz
+++ b/sample_archive/warcs/example-url-agnostic-orig.warc.gz
--- a/sample_archive/warcs/example-url-agnostic-revisit.warc.gz
+++ b/sample_archive/warcs/example-url-agnostic-revisit.warc.gz
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -84,6 +84,15 @@ class TestWb:
        assert 'wb.js' in resp.body
        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body

+
+    def test_replay_url_agnostic_revisit(self):
+        resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')
+        self._assert_basic_html(resp)
+
+        assert 'Mon, Jul 29 2013 19:51:51' in resp.body
+        assert 'wb.js' in resp.body
+        assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body
+
    def test_replay_identity_1(self):
        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
        #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')