From fe1c32c8f7d78cb38804abb85aa62f84aab88ccd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 19 Mar 2015 11:20:40 -0700 Subject: [PATCH] cdxj: support loading cdxj (#76) cdx obj: allow alt field names to be used (eg. mime, mimetype, m) (status/statuscode/s) in querying and reading cdx cdx minimal: (#75) now implies cdxj to avoid more formats minimal includes digest always and mime when warc/revisit tests for cdxj loading indexing optimization: reuse same entry obj for records of same type --- config.yaml | 4 +- pywb/cdx/cdxobject.py | 88 +++++++++++++++++++++++++++------ pywb/cdx/cdxops.py | 11 +++-- pywb/cdx/cdxserver.py | 2 +- pywb/cdx/cdxsource.py | 5 +- pywb/cdx/test/test_cdxops.py | 20 +++++++- pywb/warc/archiveiterator.py | 54 +++++++++++--------- pywb/warc/cdxindexer.py | 56 ++++++++------------- pywb/warc/test/test_indexing.py | 17 +++---- setup.py | 2 + tests/test_config.yaml | 2 + tests/test_integration.py | 8 +++ 12 files changed, 174 insertions(+), 95 deletions(-) diff --git a/config.yaml b/config.yaml index de1610bd..36ce435f 100644 --- a/config.yaml +++ b/config.yaml @@ -107,8 +107,8 @@ enable_http_proxy: true # List of route names: # : # default route static/default for pywb defaults -static_routes: - static/default: pywb/static/ +#static_routes: +# static/default: pywb/static/ # enable cdx server api for querying cdx directly (experimental) enable_cdx_api: true diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 89a40be0..60a0eb8b 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -10,6 +10,26 @@ from urlparse import parse_qs from pywb.utils.wbexception import WbException +from json import loads as json_decode + + +#================================================================= +URLKEY = 'urlkey' +TIMESTAMP = 'timestamp' +ORIGINAL = 'original' +MIMETYPE = 'mimetype' +STATUSCODE = 'statuscode' +DIGEST = 'digest' +REDIRECT = 'redirect' +ROBOTFLAGS = 'robotflags' +LENGTH = 'length' +OFFSET = 'offset' +FILENAME = 'filename' + +ORIG_LENGTH = 'orig.length' +ORIG_OFFSET = 'orig.offset' +ORIG_FILENAME = 'orig.filename' + #================================================================= class CDXException(WbException): @@ -24,28 +44,53 @@ class CDXObject(OrderedDict): """ CDX_FORMATS = [ # Public CDX Format - ["urlkey", "timestamp", "original", "mimetype", "statuscode", - "digest", "length"], + [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE, + DIGEST, LENGTH], # CDX 11 Format - ["urlkey", "timestamp", "original", "mimetype", "statuscode", - "digest", "redirect", "robotflags", "length", "offset", "filename"], + [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE, + DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME], # CDX 9 Format - ["urlkey", "timestamp", "original", "mimetype", "statuscode", - "digest", "redirect", "offset", "filename"], + [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE, + DIGEST, REDIRECT, OFFSET, FILENAME], # CDX 11 Format + 3 revisit resolve fields - ["urlkey", "timestamp", "original", "mimetype", "statuscode", - "digest", "redirect", "robotflags", "length", "offset", "filename", - "orig.length", "orig.offset", "orig.filename"], + [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE, + DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME, + ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME], # CDX 9 Format + 3 revisit resolve fields - ["urlkey", "timestamp", "original", "mimetype", "statuscode", - "digest", "redirect", "offset", "filename", - "orig.length", "orig.offset", "orig.filename"] + [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE, + DIGEST, REDIRECT, OFFSET, FILENAME, + ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME], ] + + CDX_ALT_FIELDS = { + 'u': ORIGINAL, + 'url': ORIGINAL, + + 'status': STATUSCODE, + 's': STATUSCODE, + + 'mime': MIMETYPE, + 'm': MIMETYPE, + + 'l': LENGTH, + 's': LENGTH, + + 'o': OFFSET, + + 'd': DIGEST, + + 't': TIMESTAMP, + + 'k': URLKEY, + + 'f': FILENAME + } + def __init__(self, cdxline=''): OrderedDict.__init__(self) @@ -56,7 +101,20 @@ class CDXObject(OrderedDict): self.cdxline = cdxline return - fields = cdxline.split(' ') + fields = cdxline.split(' ' , 2) + # Check for CDX JSON + if fields[-1].startswith('{'): + self[URLKEY] = fields[0] + self[TIMESTAMP] = fields[1] + json_fields = json_decode(fields[-1]) + for n, v in json_fields.iteritems(): + n = self.CDX_ALT_FIELDS.get(n, n) + self[n] = str(v) + self.cdxline = cdxline + return + + more_fields = fields.pop().split(' ') + fields.extend(more_fields) cdxformat = None for i in self.CDX_FORMATS: @@ -80,8 +138,8 @@ class CDXObject(OrderedDict): def is_revisit(self): """return ``True`` if this record is a revisit record.""" - return (self['mimetype'] == 'warc/revisit' or - self['filename'] == '-') + return (self.get(MIMETYPE) == 'warc/revisit' or + self.get(FILENAME) == '-') def to_text(self, fields=None): """ diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 4aa4fc17..0bf58c42 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -172,6 +172,8 @@ def cdx_filter(cdx_iter, filter_strings): # apply filter to cdx[field] else: self.field = parts[0] + self.field = CDXObject.CDX_ALT_FIELDS.get(self.field, + self.field) string = parts[1] # make regex if regex mode @@ -181,7 +183,10 @@ def cdx_filter(cdx_iter, filter_strings): self.filter_str = string def __call__(self, cdx): - val = cdx[self.field] if self.field else str(cdx) + if not self.field: + val = str(cdx) + else: + val = cdx.get(self.field, '') matched = self.compare_func(val) @@ -280,8 +285,8 @@ def cdx_resolve_revisits(cdx_iter): if original_cdx and is_revisit: fill_orig = lambda field: original_cdx[field] # Transfer mimetype and statuscode - cdx['mimetype'] = original_cdx['mimetype'] - cdx['statuscode'] = original_cdx['statuscode'] + cdx['mimetype'] = original_cdx.get('mimetype', 'none') + cdx['statuscode'] = original_cdx.get('statuscode', 'none') else: fill_orig = lambda field: '-' diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 1ecec407..0de3e325 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -167,7 +167,7 @@ class CDXServer(BaseCDXServer): if filename.startswith('redis://'): return RedisCDXSource(filename, config) - if filename.endswith('.cdx'): + if filename.endswith(('.cdx', '.cdxj')): return CDXFile(filename) if filename.endswith(('.summary', '.idx')): diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index e3174ab1..7eabdbed 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -29,13 +29,10 @@ class CDXFile(CDXSource): def load_cdx(self, query): def do_open(): - try: - source = open(self.filename, 'rb') + with open(self.filename, 'rb') as source: gen = iter_range(source, query.key, query.end_key) for line in gen: yield line - finally: - source.close() return do_open() #return iter_range(do_open(), query.key, query.end_key) diff --git a/pywb/cdx/test/test_cdxops.py b/pywb/cdx/test/test_cdxops.py index 86c2fce8..88ea0b58 100644 --- a/pywb/cdx/test/test_cdxops.py +++ b/pywb/cdx/test/test_cdxops.py @@ -48,6 +48,19 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200') org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz +# Filter Alt field name +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200') +org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz + +# Filter -- no field specified, match regex on entire line +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625') +org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz + +# Filter -- no such field, no matches +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200') +Traceback (most recent call last): +NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css + # Filter exact >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1') com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz @@ -82,7 +95,6 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/ org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz - # Sort by closest timestamp + field select output >>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10) 20140126200826 @@ -138,6 +150,12 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_ >>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True) org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - + +# Resolve Revisit -- cdxj minimal +#>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True) + + + """ #================================================================= diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 7a88ec44..c72eae62 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -197,6 +197,9 @@ class ArchiveIterator(object): class ArchiveIndexEntryMixin(object): MIME_RE = re.compile('[; ]') + def reset_entry(self): + self['key'] = '' + def extract_mime(self, mime, def_mime='unk'): """ Utility function to extract mimetype only from a full content type, removing charset settings @@ -215,11 +218,12 @@ class ArchiveIndexEntryMixin(object): self['status'] = '-' def set_rec_info(self, offset, length, digest): - self['length'] = str(length) - self['offset'] = str(offset) if digest: self['digest'] = digest + self['length'] = str(length) + self['offset'] = str(offset) + def merge_request_data(self, other, options): surt_ordered = options.get('surt_ordered', True) @@ -248,12 +252,21 @@ class ArchiveIndexEntryMixin(object): class DefaultRecordIter(object): def __init__(self, **options): self.options = options + self.entry_cache = {} - def _create_index_entry(self): - if self.options.get('cdxj'): - return OrderedArchiveIndexEntry() - else: - return ArchiveIndexEntry() + def _create_index_entry(self, rec_type): + try: + entry = self.entry_cache[rec_type] + entry.reset_entry() + except: + if self.options.get('cdxj'): + entry = OrderedArchiveIndexEntry() + else: + entry = ArchiveIndexEntry() + + self.entry_cache[rec_type] = entry + + return entry def create_record_iter(self, arcv_iter): append_post = self.options.get('append_post') @@ -295,8 +308,7 @@ class DefaultRecordIter(object): compute_digest = False - if (not minimal and - entry.get('digest', '-') == '-' and + if (entry.get('digest', '-') == '-' and record.rec_type not in ('revisit', 'request', 'warcinfo')): compute_digest = True @@ -312,7 +324,6 @@ class DefaultRecordIter(object): entry['_post_query'] = post_query - #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest)) arcv_iter.read_to_end(record, compute_digest) entry.set_rec_info(*arcv_iter.member_info) entry.record = record @@ -355,7 +366,7 @@ class DefaultRecordIter(object): """ Parse warc record """ - entry = self._create_index_entry() + entry = self._create_index_entry(record.rec_type) if record.rec_type == 'warcinfo': entry['url'] = record.rec_headers.get_header('WARC-Filename') @@ -369,12 +380,11 @@ class DefaultRecordIter(object): entry['timestamp'] = iso_date_to_timestamp(record.rec_headers. get_header('WARC-Date')) - if self.options.get('minimal'): - return entry - # mime if record.rec_type == 'revisit': entry['mime'] = 'warc/revisit' + elif self.options.get('minimal'): + entry['mime'] = '-' else: def_mime = '-' if record.rec_type == 'request' else 'unk' entry.extract_mime(record.status_headers. @@ -382,7 +392,7 @@ class DefaultRecordIter(object): def_mime) # status -- only for response records (by convention): - if record.rec_type == 'response': + if record.rec_type == 'response' and not self.options.get('minimal'): entry.extract_status(record.status_headers) else: entry['status'] = '-' @@ -414,7 +424,7 @@ class DefaultRecordIter(object): # replace nulls url = url.replace('\x00', '%00') - entry = self._create_index_entry() + entry = self._create_index_entry(record.rec_type) entry['url'] = url # timestamp @@ -422,14 +432,12 @@ class DefaultRecordIter(object): if len(entry['timestamp']) > 14: entry['timestamp'] = entry['timestamp'][:14] - if self.options.get('minimal'): - return entry + if not self.options.get('minimal'): + # mime + entry.extract_mime(record.rec_headers.get_header('content-type')) - # mime - entry.extract_mime(record.rec_headers.get_header('content-type')) - - # status - entry.extract_status(record.status_headers) + # status + entry.extract_status(record.status_headers) # digest entry['digest'] = '-' diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 7bbe4942..4d7c5837 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -16,6 +16,7 @@ from io import BytesIO from archiveiterator import DefaultRecordIter + #================================================================= class BaseCDXWriter(object): def __init__(self, out): @@ -68,26 +69,6 @@ class CDXJ(object): out.write('\n') -#================================================================= -class CDX06(object): - def _write_header(self): - self.out.write(' CDX N b a S V g\n') - - def write_cdx_line(self, out, entry, filename): - out.write(entry['key']) - out.write(' ') - out.write(entry['timestamp']) - out.write(' ') - out.write(entry['url']) - out.write(' ') - out.write(entry['length']) - out.write(' ') - out.write(entry['offset']) - out.write(' ') - out.write(filename) - out.write('\n') - - #================================================================= class CDX09(object): def _write_header(self): @@ -201,6 +182,8 @@ def cdx_filename(filename): #================================================================= def get_cdx_writer_cls(options): writer_cls = options.get('writer_cls') + if options.get('minimal'): + options['cdxj'] = True if writer_cls: if not options.get('writer_add_mixin'): @@ -212,8 +195,6 @@ def get_cdx_writer_cls(options): if options.get('cdxj'): format_mixin = CDXJ - elif options.get('cdx06') or options.get('minimal'): - format_mixin = CDX06 elif options.get('cdx09'): format_mixin = CDX09 else: @@ -311,14 +292,20 @@ Not-recommended for new cdx, use only for backwards-compatibility. cdx09_help = """ Use older 9-field cdx format, default is 11-cdx field """ - minimal_help = """ -Use a minimal 6-field cdx format, outputing only the basic fields -needed to identiyfy record: -canonicalized url, timestamp, original url, archive offset, archive length -and archive filename. + minimal_json_help = """ +CDX JSON output, but with minimal fields only, available w/o parsing +http record. The fields are: +canonicalized url, timestamp, original url, digest, archive offset, archive length +and archive filename. mimetype is included to indicate warc/revisit only. This option skips record parsing and will not work with POST append (-p) option +""" + + json_help = """ +Output CDX JSON format per line, with url timestamp first, followed by json dict +for all other fields: +url timestamp { ... } """ output_help = """output file or directory. @@ -370,15 +357,13 @@ if input is a directory""" action='store_true', help=cdx09_help) - group.add_argument('-6', '--cdx06', - action='store_true') - group.add_argument('-j', '--cdxj', - action='store_true') - - parser.add_argument('-m', '--minimal', action='store_true', - help=minimal_help) + help=json_help) + + parser.add_argument('-mj', '--minimal-cdxj', + action='store_true', + help=minimal_json_help) parser.add_argument('output', nargs='?', default='-', help=output_help) parser.add_argument('inputs', nargs='+', help=input_help) @@ -392,9 +377,8 @@ if input is a directory""" append_post=cmd.postappend, recurse=cmd.recurse, cdx09=cmd.cdx09, - cdx06=cmd.cdx06, cdxj=cmd.cdxj, - minimal=cmd.minimal) + minimal=cmd.minimal_cdxj) if __name__ == '__main__': diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index 6d1d33f7..25498760 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -8,18 +8,11 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz -# warc.gz -- minimal cdx ->>> print_cdx_index('example.warc.gz', minimal=True) - CDX N b a S V g -com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz -com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz -org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz - # warc.gz -- minimal CDXJ >>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True) -com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"} -com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"} -org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} +org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"} # warc.gz -- parse all >>> print_cdx_index('example.warc.gz', include_all=True) @@ -63,6 +56,10 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ >>> print_cdx_index('example.arc.gz', cdxj=True) com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"} +# arc.gz -- minimal + json +>>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True) +com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"} + # arc >>> print_cdx_index('example.arc') CDX N b a m s k r M S V g diff --git a/setup.py b/setup.py index b0e89c60..4eca13cd 100755 --- a/setup.py +++ b/setup.py @@ -60,6 +60,8 @@ setup( }, data_files=[ ('sample_archive/cdx', glob.glob('sample_archive/cdx/*')), + ('sample_archive/cdxj', glob.glob('sample_archive/cdxj/*')), + ('sample_archive/non-surt-cdx', glob.glob('sample_archive/non-surt-cdx/*')), ('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')), ('sample_archive/warcs', glob.glob('sample_archive/warcs/*')), ('sample_archive/text_content', diff --git a/tests/test_config.yaml b/tests/test_config.yaml index fde10382..d7533951 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -46,6 +46,8 @@ collections: index_paths: ./sample_archive/cdx/ redir_to_exact: false + pywb-cdxj: + index_paths: ./sample_archive/cdxj/ # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ diff --git a/tests/test_integration.py b/tests/test_integration.py index 1b04b0de..0612004b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -124,6 +124,14 @@ class TestWb: assert 'wb.js' in resp.body assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body + def test_replay_cdxj(self): + resp = self.testapp.get('/pywb-cdxj/20140103030321/http://example.com?example=1') + self._assert_basic_html(resp) + + assert '"20140103030321"' in resp.body + assert 'wb.js' in resp.body + assert '/pywb-cdxj/20140103030321/http://www.iana.org/domains/example' in resp.body + def test_zero_len_revisit(self): resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2') self._assert_basic_html(resp)