1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx: fix creation and add test for non-surt cdx (pywb-nosurt/ test)

archiveindexer: -u option to generate non-surt cdx
tests: full test coverage for cdxdomainspecific (fuzzy and custom canon)
This commit is contained in:
Ilya Kreymer 2014-05-16 21:16:50 -07:00
parent 8758e60590
commit 7d236af7d7
7 changed files with 82 additions and 17 deletions

View File

@ -25,7 +25,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
ds_rules_file=ds_rules_file)
if not surt_ordered:
for rule in rules:
for rule in rules.rules:
rule.unsurt()
if rules:
@ -36,7 +36,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
ds_rules_file=ds_rules_file)
if not surt_ordered:
for rule in rules:
for rule in rules.rules:
rule.unsurt()
if rules:
@ -108,11 +108,12 @@ class FuzzyQuery:
params.update({'url': url,
'matchType': 'prefix',
'filter': filter_})
try:
if 'reverse' in params:
del params['reverse']
if 'closest' in params:
del params['closest']
except KeyError:
pass
return params
@ -141,7 +142,7 @@ class CDXDomainSpecificRule(BaseRule):
"""
self.url_prefix = map(unsurt, self.url_prefix)
if self.regex:
self.regex = unsurt(self.regex)
self.regex = re.compile(unsurt(self.regex.pattern))
if self.replace:
self.replace = unsurt(self.replace)

View File

@ -128,6 +128,36 @@ def test_fuzzy_match():
assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL,
ds_rules_file=DEFAULT_RULES_FILE))
def test_fuzzy_no_match_1():
    """Expect NotFoundException for a url with no match and no fuzzy rule.

    The query is issued with ``allowFuzzy=True`` and ``reverse=True``
    against a CDXServer built from the test cdx dir and the default
    rules file, with ``urllib2.urlopen`` patched to ``mock_urlopen``.
    """
    query = dict(url='http://notfound.example.com/',
                 output='cdxobject',
                 reverse=True,
                 allowFuzzy=True)

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)

        # no match, no fuzzy
        with raises(NotFoundException):
            cdx_server.load_cdx(**query)
def test_fuzzy_no_match_2():
    """Expect NotFoundException when a fuzzy rule applies but finds nothing.

    The url carries a ``_=1234`` query arg; per the original comment a
    fuzzy rule exists for it, yet no capture actually matches.
    ``closest`` and ``reverse`` are both supplied alongside
    ``allowFuzzy=True``.
    """
    query = dict(url='http://notfound.example.com/?_=1234',
                 closest='2014',
                 reverse=True,
                 output='cdxobject',
                 allowFuzzy=True)

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)

        # fuzzy rule, but no actual match
        with raises(NotFoundException):
            cdx_server.load_cdx(**query)
def test2_fuzzy_no_match_3():
    """Expect NotFoundException when only the rule prefix matches.

    Per the original comment: a special fuzzy rule matches the prefix
    ``test.example.example.``, but the url does not match that rule's
    regex, so the lookup must still fail.
    """
    # NOTE(review): the `test2_` prefix differs from the sibling
    # `test_fuzzy_no_match_*` tests -- confirm this naming is intentional.
    target_url = 'http://test.example.example/'

    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        cdx_server = CDXServer([TEST_CDX_DIR],
                               ds_rules_file=DEFAULT_RULES_FILE)

        with raises(NotFoundException):
            cdx_server.load_cdx(url=target_url, allowFuzzy=True)
def assert_error(func, exception):
with raises(exception):
func(CDXServer(CDX_SERVER_URL))

View File

@ -60,3 +60,4 @@ rules:
fuzzy_lookup:
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
filter: '=urlkey:{0}'
replace: '?'

View File

@ -1,9 +1,9 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.canonicalize import canonicalize
from recordloader import ArcWarcRecordLoader
import surt
import hashlib
import base64
@ -22,12 +22,13 @@ class ArchiveIndexer(object):
if necessary
"""
def __init__(self, fileobj, filename,
out=sys.stdout, sort=False, writer=None):
out=sys.stdout, sort=False, writer=None, surt_ordered=True):
self.fh = fileobj
self.filename = filename
self.loader = ArcWarcRecordLoader()
self.offset = 0
self.known_format = None
self.surt_ordered = surt_ordered
if writer:
self.writer = writer
@ -179,7 +180,9 @@ class ArchiveIndexer(object):
if not digest:
digest = '-'
return [surt.surt(url),
key = canonicalize(url, self.surt_ordered)
return [key,
timestamp,
url,
mime,
@ -211,7 +214,9 @@ class ArchiveIndexer(object):
mime = record.rec_headers.get_header('content-type')
mime = self._extract_mime(mime)
return [surt.surt(url),
key = canonicalize(url, self.surt_ordered)
return [key,
timestamp,
url,
mime,
@ -318,7 +323,7 @@ def iter_file_or_dir(inputs):
yield os.path.join(input_, filename), filename
def index_to_file(inputs, output, sort):
def index_to_file(inputs, output, sort, surt_ordered):
if output == '-':
outfile = sys.stdout
else:
@ -337,7 +342,8 @@ def index_to_file(inputs, output, sort):
with open(fullpath, 'r') as infile:
ArchiveIndexer(fileobj=infile,
filename=filename,
writer=writer).make_index()
writer=writer,
surt_ordered=surt_ordered).make_index()
finally:
writer.end_all()
if infile:
@ -357,7 +363,7 @@ def cdx_filename(filename):
return remove_ext(filename) + '.cdx'
def index_to_dir(inputs, output, sort):
def index_to_dir(inputs, output, sort, surt_ordered):
for fullpath, filename in iter_file_or_dir(inputs):
outpath = cdx_filename(filename)
@ -368,7 +374,8 @@ def index_to_dir(inputs, output, sort):
ArchiveIndexer(fileobj=infile,
filename=filename,
sort=sort,
out=outfile).make_index()
out=outfile,
surt_ordered=surt_ordered).make_index()
def main(args=None):
@ -393,6 +400,12 @@ Some examples:
sort_help = """
sort the output to each file before writing to create a total ordering
"""
unsurt_help = """
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
urls for the cdx key. Default is to use SURT keys.
Not-recommended for new cdx, use only for backwards-compatibility.
"""
output_help = """output file or directory.
@ -409,15 +422,22 @@ sort the output to each file before writing to create a total ordering
epilog=epilog,
formatter_class=RawTextHelpFormatter)
parser.add_argument('-s', '--sort', action='store_true', help=sort_help)
parser.add_argument('-s', '--sort',
action='store_true',
help=sort_help)
parser.add_argument('-u', '--unsurt',
action='store_true',
help=unsurt_help)
parser.add_argument('output', help=output_help)
parser.add_argument('inputs', nargs='+', help=input_help)
cmd = parser.parse_args(args=args)
if cmd.output != '-' and os.path.isdir(cmd.output):
index_to_dir(cmd.inputs, cmd.output, cmd.sort)
index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
else:
index_to_file(cmd.inputs, cmd.output, cmd.sort)
index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
if __name__ == '__main__':

View File

@ -0,0 +1,4 @@
CDX N b a m s k r M S V g
example.com/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
example.com/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
iana.org/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz

View File

@ -15,6 +15,8 @@ collections:
# ex with filtering: filter CDX lines by filename starting with 'dupe'
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
pywb-nosurt: {'index_paths': './sample_archive/cdx/example-non-surt.cdx', 'surt_ordered': False}
# indicates whether cdx files are sorted by SURT keys -- e.g.: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs

View File

@ -94,6 +94,13 @@ class TestWb:
assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
def test_replay_non_surt(self):
    """Replay a capture through the ``pywb-nosurt`` collection.

    Fetches the 20140103030321 capture of ``http://example.com?example=1``
    and checks that the response passes the basic html assertions and
    references ``wb.js``.
    """
    replay_path = '/pywb-nosurt/20140103030321/http://example.com?example=1'
    response = self.testapp.get(replay_path)

    self._assert_basic_html(response)
    assert 'wb.js' in response.body
def test_replay_url_agnostic_revisit(self):
resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')