cdx: fix creation and add test for non-surt cdx (pywb-nosurt/ test)

archiveindexer: -u option to generate non-surt cdx. tests: full test coverage for cdxdomainspecific (fuzzy and custom canonicalization)
2025-03-15 00:03:28 +01:00 · 2014-05-16 21:16:50 -07:00 · 2014-05-16 21:16:50 -07:00 · 7d236af7d7
commit 7d236af7d7
parent 8758e60590
7 changed files with 82 additions and 17 deletions
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@ -25,7 +25,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
                    ds_rules_file=ds_rules_file)

    if not surt_ordered:
-        for rule in rules:
+        for rule in rules.rules:
            rule.unsurt()

    if rules:
@ -36,7 +36,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
                    ds_rules_file=ds_rules_file)

    if not surt_ordered:
-        for rule in rules:
+        for rule in rules.rules:
            rule.unsurt()

    if rules:
@ -108,11 +108,12 @@ class FuzzyQuery:
        params.update({'url': url,
                       'matchType': 'prefix',
                       'filter': filter_})
-        try:
+
+        if 'reverse' in params:
            del params['reverse']
+
+        if 'closest' in params:
            del params['closest']
-        except KeyError:
-            pass

        return params

@ -141,7 +142,7 @@ class CDXDomainSpecificRule(BaseRule):
        """
        self.url_prefix = map(unsurt, self.url_prefix)
        if self.regex:
-            self.regex = unsurt(self.regex)
+            self.regex = re.compile(unsurt(self.regex.pattern))

        if self.replace:
            self.replace = unsurt(self.replace)
--- a/pywb/cdx/test/test_cdxserver.py
+++ b/pywb/cdx/test/test_cdxserver.py
@ -128,6 +128,36 @@ def test_fuzzy_match():
    assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL,
                           ds_rules_file=DEFAULT_RULES_FILE))

+def test_fuzzy_no_match_1():
+    # no match, no fuzzy
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://notfound.example.com/',
+                            output='cdxobject',
+                            reverse=True,
+                            allowFuzzy=True)
+
+def test_fuzzy_no_match_2():
+    # fuzzy rule, but no actual match
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://notfound.example.com/?_=1234',
+                            closest='2014',
+                            reverse=True,
+                            output='cdxobject',
+                            allowFuzzy=True)
+
+def test2_fuzzy_no_match_3():
+    # special fuzzy rule, matches prefix test.example.example.,
+    # but doesn't match rule regex
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://test.example.example/',
+                            allowFuzzy=True)
+
 def assert_error(func, exception):
    with raises(exception):
        func(CDXServer(CDX_SERVER_URL))
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@ -60,3 +60,4 @@ rules:
      fuzzy_lookup:
        match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
        filter: '=urlkey:{0}'
+        replace: '?'
--- a/pywb/warc/archiveindexer.py
+++ b/pywb/warc/archiveindexer.py
@ -1,9 +1,9 @@
 from pywb.utils.timeutils import iso_date_to_timestamp
 from pywb.utils.bufferedreaders import DecompressingBufferedReader
+from pywb.utils.canonicalize import canonicalize

 from recordloader import ArcWarcRecordLoader

-import surt
 import hashlib
 import base64

@ -22,12 +22,13 @@ class ArchiveIndexer(object):
    if necessary
    """
    def __init__(self, fileobj, filename,
-                 out=sys.stdout, sort=False, writer=None):
+                 out=sys.stdout, sort=False, writer=None, surt_ordered=True):
        self.fh = fileobj
        self.filename = filename
        self.loader = ArcWarcRecordLoader()
        self.offset = 0
        self.known_format = None
+        self.surt_ordered = surt_ordered

        if writer:
            self.writer = writer
@ -179,7 +180,9 @@ class ArchiveIndexer(object):
        if not digest:
            digest = '-'

-        return [surt.surt(url),
+        key = canonicalize(url, self.surt_ordered)
+
+        return [key,
                timestamp,
                url,
                mime,
@ -211,7 +214,9 @@ class ArchiveIndexer(object):
        mime = record.rec_headers.get_header('content-type')
        mime = self._extract_mime(mime)

-        return [surt.surt(url),
+        key = canonicalize(url, self.surt_ordered)
+
+        return [key,
                timestamp,
                url,
                mime,
@ -318,7 +323,7 @@ def iter_file_or_dir(inputs):
                yield os.path.join(input_, filename), filename


-def index_to_file(inputs, output, sort):
+def index_to_file(inputs, output, sort, surt_ordered):
    if output == '-':
        outfile = sys.stdout
    else:
@ -337,7 +342,8 @@ def index_to_file(inputs, output, sort):
            with open(fullpath, 'r') as infile:
                ArchiveIndexer(fileobj=infile,
                               filename=filename,
-                               writer=writer).make_index()
+                               writer=writer,
+                               surt_ordered=surt_ordered).make_index()
    finally:
        writer.end_all()
        if infile:
@ -357,7 +363,7 @@ def cdx_filename(filename):
    return remove_ext(filename) + '.cdx'


-def index_to_dir(inputs, output, sort):
+def index_to_dir(inputs, output, sort, surt_ordered):
    for fullpath, filename in iter_file_or_dir(inputs):

        outpath = cdx_filename(filename)
@ -368,7 +374,8 @@ def index_to_dir(inputs, output, sort):
                ArchiveIndexer(fileobj=infile,
                               filename=filename,
                               sort=sort,
-                               out=outfile).make_index()
+                               out=outfile,
+                               surt_ordered=surt_ordered).make_index()


 def main(args=None):
@ -393,6 +400,12 @@ Some examples:

    sort_help = """
 sort the output to each file before writing to create a total ordering
+"""
+
+    unsurt_help = """
+Convert SURT (Sort-friendly URI Reordering Transform) back to regular
+urls for the cdx key. Default is to use SURT keys.
+Not-recommended for new cdx, use only for backwards-compatibility.
 """

    output_help = """output file or directory.
@ -409,15 +422,22 @@ sort the output to each file before writing to create a total ordering
                            epilog=epilog,
                            formatter_class=RawTextHelpFormatter)

-    parser.add_argument('-s', '--sort', action='store_true', help=sort_help)
+    parser.add_argument('-s', '--sort',
+                        action='store_true',
+                        help=sort_help)
+
+    parser.add_argument('-u', '--unsurt',
+                        action='store_true',
+                        help=unsurt_help)
+
    parser.add_argument('output', help=output_help)
    parser.add_argument('inputs', nargs='+', help=input_help)

    cmd = parser.parse_args(args=args)
    if cmd.output != '-' and os.path.isdir(cmd.output):
-        index_to_dir(cmd.inputs, cmd.output, cmd.sort)
+        index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
    else:
-        index_to_file(cmd.inputs, cmd.output, cmd.sort)
+        index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)


 if __name__ == '__main__':
--- a/sample_archive/cdx/example-non-surt.cdx
+++ b/sample_archive/cdx/example-non-surt.cdx
@ -0,0 +1,4 @@
+ CDX N b a m s k r M S V g
+example.com/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+example.com/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+iana.org/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
--- a/tests/test_config.yaml
+++ b/tests/test_config.yaml
@ -15,6 +15,8 @@ collections:
    # ex with filtering: filter CDX lines by filename starting with 'dupe'
    pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}

+    pywb-nosurt: {'index_paths': './sample_archive/cdx/example-non-surt.cdx', 'surt_ordered': False}
+

 # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
 # SURT keys are recommended for future indices, but non-SURT cdxs
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -94,6 +94,13 @@ class TestWb:
        assert 'wb.js' in resp.body
        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body

+    def test_replay_non_surt(self):
+        resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
+        self._assert_basic_html(resp)
+
+        #assert 'Mon, Jan 27 2014 17:12:38' in resp.body
+        assert 'wb.js' in resp.body
+        #assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body

    def test_replay_url_agnostic_revisit(self):
        resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')