
cdx: fix creation and add test for non-surt cdx (pywb-nonsurt/ test)

archiveindexer: -u option to generate non-surt cdx
tests: full test coverage for cdxdomainspecific (fuzzy and custom canon)
Ilya Kreymer 2014-05-16 21:16:50 -07:00
parent 8758e60590
commit 7d236af7d7
7 changed files with 82 additions and 17 deletions
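
Note (not part of the diff): since main() takes an explicit args list, the new -u / --unsurt option added below can be exercised directly from Python. A rough sketch only -- the module path (pywb.warc.archiveindexer) and the sample WARC path are assumptions, not stated in this commit:

# Hypothetical usage sketch of the new -u / --unsurt flag.
from pywb.warc.archiveindexer import main

# roughly what a command-line run with "-s -u <output> <input>" would do:
main(['-s', '-u',
      './example-non-surt.cdx',
      './sample_archive/warcs/example.warc.gz'])

If the flag works as described, the resulting cdx should be keyed by plain urls (example.com/...) rather than SURT keys (com,example)/...).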

View File

@@ -25,7 +25,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
                                    ds_rules_file=ds_rules_file)
 
     if not surt_ordered:
-        for rule in rules:
+        for rule in rules.rules:
             rule.unsurt()
 
     if rules:
@@ -36,7 +36,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
                                    ds_rules_file=ds_rules_file)
 
     if not surt_ordered:
-        for rule in rules:
+        for rule in rules.rules:
             rule.unsurt()
 
     if rules:
@@ -108,11 +108,12 @@ class FuzzyQuery:
         params.update({'url': url,
                        'matchType': 'prefix',
                        'filter': filter_})
 
-        try:
-            del params['reverse']
-            del params['closest']
-        except KeyError:
-            pass
+        if 'reverse' in params:
+            del params['reverse']
+
+        if 'closest' in params:
+            del params['closest']
 
         return params
@@ -141,7 +142,7 @@ class CDXDomainSpecificRule(BaseRule):
         """
         self.url_prefix = map(unsurt, self.url_prefix)
         if self.regex:
-            self.regex = unsurt(self.regex)
+            self.regex = re.compile(unsurt(self.regex.pattern))
         if self.replace:
             self.replace = unsurt(self.replace)
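
A note on the CDXDomainSpecificRule change above: the rule regex arrives already compiled, so unsurt() has to be applied to its pattern text and the result recompiled. A self-contained sketch of the idea only -- unsurt_sketch below is a crude stand-in for pywb.utils.canonicalize.unsurt, not the real function:

import re

def unsurt_sketch(text):
    # Illustrative stand-in for pywb's unsurt(): turn a 'com,example)' style
    # prefix back into 'example.com'. The real function handles more cases.
    host, sep, rest = text.partition(')')
    if not sep:
        return text
    host = host.rstrip('\\')  # drop the regex escape before ')'
    return '.'.join(reversed(host.split(','))) + rest

rule_regex = re.compile(r'com,example,test\)/path/.*')

# before: unsurt() was handed the compiled pattern object itself
# after:  unsurt the pattern *string*, then recompile
rule_regex = re.compile(unsurt_sketch(rule_regex.pattern))
print(rule_regex.pattern)  # -> test.example.com/path/.*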

View File

@@ -128,6 +128,36 @@ def test_fuzzy_match():
     assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL,
                                            ds_rules_file=DEFAULT_RULES_FILE))
 
+def test_fuzzy_no_match_1():
+    # no match, no fuzzy
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://notfound.example.com/',
+                            output='cdxobject',
+                            reverse=True,
+                            allowFuzzy=True)
+
+def test_fuzzy_no_match_2():
+    # fuzzy rule, but no actual match
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://notfound.example.com/?_=1234',
+                            closest='2014',
+                            reverse=True,
+                            output='cdxobject',
+                            allowFuzzy=True)
+
+def test2_fuzzy_no_match_3():
+    # special fuzzy rule, matches prefix test.example.example.,
+    # but doesn't match rule regex
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://test.example.example/',
+                            allowFuzzy=True)
+
 def assert_error(func, exception):
     with raises(exception):
         func(CDXServer(CDX_SERVER_URL))

View File

@@ -60,3 +60,4 @@ rules:
     fuzzy_lookup:
         match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
         filter: '=urlkey:{0}'
+        replace: '?'
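
The added replace: '?' pairs with the existing match/filter settings. Very roughly -- this is only a sketch, not the actual FuzzyQuery code, and the urlkey is hypothetical -- a failed exact lookup for a cache-busted key could be retried like this:

import re

# Sketch only: the real logic lives in FuzzyQuery / CDXDomainSpecificRule.
match_rx = re.compile(r'(.*)[&?](?:_|uncache)=[\d]+[&]?')
filter_tmpl = '=urlkey:{0}'
replace = '?'

urlkey = 'example.com/ajax/data.json?_=1400000000000'  # hypothetical key

m = match_rx.search(urlkey)
if m:
    # the capture group (everything before the cache-buster) becomes the filter
    filter_ = filter_tmpl.format(m.group(1))   # '=urlkey:example.com/ajax/data.json'

    # 'replace' marks where the retried lookup url is truncated for a prefix query
    inx = urlkey.find(replace)
    prefix_url = urlkey[:inx + len(replace)]   # 'example.com/ajax/data.json?'
    params = {'url': prefix_url, 'matchType': 'prefix', 'filter': filter_}
    print(params)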

View File

@@ -1,9 +1,9 @@
 from pywb.utils.timeutils import iso_date_to_timestamp
 from pywb.utils.bufferedreaders import DecompressingBufferedReader
+from pywb.utils.canonicalize import canonicalize
 
 from recordloader import ArcWarcRecordLoader
 
-import surt
 import hashlib
 import base64
@@ -22,12 +22,13 @@ class ArchiveIndexer(object):
     if necessary
     """
     def __init__(self, fileobj, filename,
-                 out=sys.stdout, sort=False, writer=None):
+                 out=sys.stdout, sort=False, writer=None, surt_ordered=True):
         self.fh = fileobj
         self.filename = filename
         self.loader = ArcWarcRecordLoader()
         self.offset = 0
         self.known_format = None
+        self.surt_ordered = surt_ordered
 
         if writer:
             self.writer = writer
@@ -179,7 +180,9 @@ class ArchiveIndexer(object):
         if not digest:
             digest = '-'
 
-        return [surt.surt(url),
+        key = canonicalize(url, self.surt_ordered)
+
+        return [key,
                 timestamp,
                 url,
                 mime,
@@ -211,7 +214,9 @@ class ArchiveIndexer(object):
         mime = record.rec_headers.get_header('content-type')
         mime = self._extract_mime(mime)
 
-        return [surt.surt(url),
+        key = canonicalize(url, self.surt_ordered)
+
+        return [key,
                 timestamp,
                 url,
                 mime,
@@ -318,7 +323,7 @@ def iter_file_or_dir(inputs):
             yield os.path.join(input_, filename), filename
 
 
-def index_to_file(inputs, output, sort):
+def index_to_file(inputs, output, sort, surt_ordered):
     if output == '-':
         outfile = sys.stdout
     else:
@@ -337,7 +342,8 @@ def index_to_file(inputs, output, sort, surt_ordered):
             with open(fullpath, 'r') as infile:
                 ArchiveIndexer(fileobj=infile,
                                filename=filename,
-                               writer=writer).make_index()
+                               writer=writer,
+                               surt_ordered=surt_ordered).make_index()
     finally:
         writer.end_all()
         if infile:
@@ -357,7 +363,7 @@ def cdx_filename(filename):
     return remove_ext(filename) + '.cdx'
 
 
-def index_to_dir(inputs, output, sort):
+def index_to_dir(inputs, output, sort, surt_ordered):
     for fullpath, filename in iter_file_or_dir(inputs):
 
         outpath = cdx_filename(filename)
@@ -368,7 +374,8 @@ def index_to_dir(inputs, output, sort, surt_ordered):
             ArchiveIndexer(fileobj=infile,
                            filename=filename,
                            sort=sort,
-                           out=outfile).make_index()
+                           out=outfile,
+                           surt_ordered=surt_ordered).make_index()
 
 
 def main(args=None):
@@ -393,6 +400,12 @@ Some examples:
 
 sort_help = """
 sort the output to each file before writing to create a total ordering
+"""
+
+unsurt_help = """
+Convert SURT (Sort-friendly URI Reordering Transform) back to regular
+urls for the cdx key. Default is to use SURT keys.
+Not-recommended for new cdx, use only for backwards-compatibility.
 """
 
 output_help = """output file or directory.
@@ -409,15 +422,22 @@ sort the output to each file before writing to create a total ordering
                                      epilog=epilog,
                                      formatter_class=RawTextHelpFormatter)
 
-    parser.add_argument('-s', '--sort', action='store_true', help=sort_help)
+    parser.add_argument('-s', '--sort',
+                        action='store_true',
+                        help=sort_help)
+
+    parser.add_argument('-u', '--unsurt',
+                        action='store_true',
+                        help=unsurt_help)
+
     parser.add_argument('output', help=output_help)
     parser.add_argument('inputs', nargs='+', help=input_help)
 
     cmd = parser.parse_args(args=args)
     if cmd.output != '-' and os.path.isdir(cmd.output):
-        index_to_dir(cmd.inputs, cmd.output, cmd.sort)
+        index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
     else:
-        index_to_file(cmd.inputs, cmd.output, cmd.sort)
+        index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
 
 
 if __name__ == '__main__':
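
The key change in this file is routing cdx keys through canonicalize(url, self.surt_ordered) instead of surt.surt(url). A small hedged example of the two key styles -- the call signature comes from the diff above, the non-SURT output matches the new example-non-surt.cdx below, and the SURT output shown is the conventional expected form:

from pywb.utils.canonicalize import canonicalize

url = 'http://example.com?example=1'

# default SURT-ordered key (surt_ordered=True)
print(canonicalize(url, True))    # expected: com,example)/?example=1

# non-SURT key, as produced when indexing with the new -u / --unsurt option
print(canonicalize(url, False))   # expected: example.com/?example=1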

View File

@@ -0,0 +1,4 @@
+ CDX N b a m s k r M S V g
+example.com/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+example.com/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+iana.org/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
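
The new sample index uses the 11-field CDX format declared by its header line. A minimal reader sketch, not pywb's own parser, with field names taken from the common CDX letter legend rather than anything in this commit:

# Minimal sketch of reading the 11-field cdx above. Field names follow the
# usual CDX legend: N=urlkey b=timestamp a=original m=mime s=status
# k=digest r=redirect M=meta S=length V=offset g=filename
FIELDS = ['urlkey', 'timestamp', 'original', 'mimetype', 'statuscode',
          'digest', 'redirect', 'meta', 'length', 'offset', 'filename']

def read_cdx(path):
    with open(path) as fh:
        for line in fh:
            if line.lstrip().startswith('CDX'):   # skip the header line
                continue
            yield dict(zip(FIELDS, line.split()))

# e.g. for entry in read_cdx('./sample_archive/cdx/example-non-surt.cdx'):
#          print(entry['urlkey'], entry['offset'], entry['filename'])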

View File

@@ -15,6 +15,8 @@ collections:
     # ex with filtering: filter CDX lines by filename starting with 'dupe'
     pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
 
+    pywb-nosurt: {'index_paths': './sample_archive/cdx/example-non-surt.cdx', 'surt_ordered': False}
+
 
 # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
 # SURT keys are recommended for future indices, but non-SURT cdxs

View File

@@ -94,6 +94,13 @@ class TestWb:
         assert 'wb.js' in resp.body
         assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
 
+    def test_replay_non_surt(self):
+        resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
+        self._assert_basic_html(resp)
+
+        #assert 'Mon, Jan 27 2014 17:12:38' in resp.body
+        assert 'wb.js' in resp.body
+        #assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
+
     def test_replay_url_agnostic_revisit(self):
         resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')