From 7d236af7d7762574a7a3e14ed6c5cb0c1f950a38 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 16 May 2014 21:16:50 -0700 Subject: [PATCH] cdx: fix creation and add test for non-surt cdx (pywb-nosurt/ test) archiveindexer: -u option to generate non-surt cdx tests: full test coverage for cdxdomainspecific (fuzzy and custom canon) --- pywb/cdx/cdxdomainspecific.py | 13 ++++---- pywb/cdx/test/test_cdxserver.py | 30 ++++++++++++++++++ pywb/rules.yaml | 1 + pywb/warc/archiveindexer.py | 42 ++++++++++++++++++------- sample_archive/cdx/example-non-surt.cdx | 4 +++ tests/test_config.yaml | 2 ++ tests/test_integration.py | 7 +++++ 7 files changed, 82 insertions(+), 17 deletions(-) create mode 100644 sample_archive/cdx/example-non-surt.cdx diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 83b4d1ee..fd830c17 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -25,7 +25,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): ds_rules_file=ds_rules_file) if not surt_ordered: - for rule in rules: + for rule in rules.rules: rule.unsurt() if rules: @@ -36,7 +36,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): ds_rules_file=ds_rules_file) if not surt_ordered: - for rule in rules: + for rule in rules.rules: rule.unsurt() if rules: @@ -108,11 +108,12 @@ class FuzzyQuery: params.update({'url': url, 'matchType': 'prefix', 'filter': filter_}) - try: + + if 'reverse' in params: del params['reverse'] + + if 'closest' in params: del params['closest'] - except KeyError: - pass return params @@ -141,7 +142,7 @@ class CDXDomainSpecificRule(BaseRule): """ self.url_prefix = map(unsurt, self.url_prefix) if self.regex: - self.regex = unsurt(self.regex) + self.regex = re.compile(unsurt(self.regex.pattern)) if self.replace: self.replace = unsurt(self.replace) diff --git a/pywb/cdx/test/test_cdxserver.py b/pywb/cdx/test/test_cdxserver.py index 3e4cdf3e..f90ef8aa 100644 --- 
a/pywb/cdx/test/test_cdxserver.py +++ b/pywb/cdx/test/test_cdxserver.py @@ -128,6 +128,36 @@ def test_fuzzy_match(): assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL, ds_rules_file=DEFAULT_RULES_FILE)) +def test_fuzzy_no_match_1(): + # no match, no fuzzy + with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) + with raises(NotFoundException): + server.load_cdx(url='http://notfound.example.com/', + output='cdxobject', + reverse=True, + allowFuzzy=True) + +def test_fuzzy_no_match_2(): + # fuzzy rule, but no actual match + with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) + with raises(NotFoundException): + server.load_cdx(url='http://notfound.example.com/?_=1234', + closest='2014', + reverse=True, + output='cdxobject', + allowFuzzy=True) + +def test2_fuzzy_no_match_3(): + # special fuzzy rule, matches prefix test.example.example., + # but doesn't match rule regex + with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) + with raises(NotFoundException): + server.load_cdx(url='http://test.example.example/', + allowFuzzy=True) + def assert_error(func, exception): with raises(exception): func(CDXServer(CDX_SERVER_URL)) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 563a8a28..04327c92 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -60,3 +60,4 @@ rules: fuzzy_lookup: match: '(.*)[&?](?:_|uncache)=[\d]+[&]?' filter: '=urlkey:{0}' + replace: '?' 
diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py index 2247ced4..df7eef66 100644 --- a/pywb/warc/archiveindexer.py +++ b/pywb/warc/archiveindexer.py @@ -1,9 +1,9 @@ from pywb.utils.timeutils import iso_date_to_timestamp from pywb.utils.bufferedreaders import DecompressingBufferedReader +from pywb.utils.canonicalize import canonicalize from recordloader import ArcWarcRecordLoader -import surt import hashlib import base64 @@ -22,12 +22,13 @@ class ArchiveIndexer(object): if necessary """ def __init__(self, fileobj, filename, - out=sys.stdout, sort=False, writer=None): + out=sys.stdout, sort=False, writer=None, surt_ordered=True): self.fh = fileobj self.filename = filename self.loader = ArcWarcRecordLoader() self.offset = 0 self.known_format = None + self.surt_ordered = surt_ordered if writer: self.writer = writer @@ -179,7 +180,9 @@ class ArchiveIndexer(object): if not digest: digest = '-' - return [surt.surt(url), + key = canonicalize(url, self.surt_ordered) + + return [key, timestamp, url, mime, @@ -211,7 +214,9 @@ class ArchiveIndexer(object): mime = record.rec_headers.get_header('content-type') mime = self._extract_mime(mime) - return [surt.surt(url), + key = canonicalize(url, self.surt_ordered) + + return [key, timestamp, url, mime, @@ -318,7 +323,7 @@ def iter_file_or_dir(inputs): yield os.path.join(input_, filename), filename -def index_to_file(inputs, output, sort): +def index_to_file(inputs, output, sort, surt_ordered): if output == '-': outfile = sys.stdout else: @@ -337,7 +342,8 @@ def index_to_file(inputs, output, sort): with open(fullpath, 'r') as infile: ArchiveIndexer(fileobj=infile, filename=filename, - writer=writer).make_index() + writer=writer, + surt_ordered=surt_ordered).make_index() finally: writer.end_all() if infile: @@ -357,7 +363,7 @@ def cdx_filename(filename): return remove_ext(filename) + '.cdx' -def index_to_dir(inputs, output, sort): +def index_to_dir(inputs, output, sort, surt_ordered): for fullpath, filename in 
iter_file_or_dir(inputs): outpath = cdx_filename(filename) @@ -368,7 +374,8 @@ def index_to_dir(inputs, output, sort): ArchiveIndexer(fileobj=infile, filename=filename, sort=sort, - out=outfile).make_index() + out=outfile, + surt_ordered=surt_ordered).make_index() def main(args=None): @@ -393,6 +400,12 @@ Some examples: sort_help = """ sort the output to each file before writing to create a total ordering +""" + + unsurt_help = """ +Convert SURT (Sort-friendly URI Reordering Transform) back to regular +urls for the cdx key. Default is to use SURT keys. +Not-recommended for new cdx, use only for backwards-compatibility. """ output_help = """output file or directory. @@ -409,15 +422,22 @@ sort the output to each file before writing to create a total ordering epilog=epilog, formatter_class=RawTextHelpFormatter) - parser.add_argument('-s', '--sort', action='store_true', help=sort_help) + parser.add_argument('-s', '--sort', + action='store_true', + help=sort_help) + + parser.add_argument('-u', '--unsurt', + action='store_true', + help=unsurt_help) + parser.add_argument('output', help=output_help) parser.add_argument('inputs', nargs='+', help=input_help) cmd = parser.parse_args(args=args) if cmd.output != '-' and os.path.isdir(cmd.output): - index_to_dir(cmd.inputs, cmd.output, cmd.sort) + index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt) else: - index_to_file(cmd.inputs, cmd.output, cmd.sort) + index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt) if __name__ == '__main__': diff --git a/sample_archive/cdx/example-non-surt.cdx b/sample_archive/cdx/example-non-surt.cdx new file mode 100644 index 00000000..4cded58d --- /dev/null +++ b/sample_archive/cdx/example-non-surt.cdx @@ -0,0 +1,4 @@ + CDX N b a m s k r M S V g +example.com/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +example.com/?example=1 20140103030341 http://example.com?example=1 warc/revisit - 
B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz +iana.org/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz diff --git a/tests/test_config.yaml b/tests/test_config.yaml index 2d748083..653c4506 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -15,6 +15,8 @@ collections: # ex with filtering: filter CDX lines by filename starting with 'dupe' pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']} + pywb-nosurt: {'index_paths': './sample_archive/cdx/example-non-surt.cdx', 'surt_ordered': False} + # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # SURT keys are recommended for future indices, but non-SURT cdxs diff --git a/tests/test_integration.py b/tests/test_integration.py index 10ed3724..27272674 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -94,6 +94,13 @@ class TestWb: assert 'wb.js' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body + def test_replay_non_surt(self): + resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1') + self._assert_basic_html(resp) + + #assert 'Mon, Jan 27 2014 17:12:38' in resp.body + assert 'wb.js' in resp.body + #assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body def test_replay_url_agnostic_revisit(self): resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')