mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdx: fix creation and add test for non-surt cdx (pywb-nosurt/ test)
archiveindexer: -u option to generate non-surt cdx
tests: full test coverage for cdxdomainspecific (fuzzy and custom canon)
This commit is contained in:
parent
8758e60590
commit
7d236af7d7
@ -25,7 +25,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||
ds_rules_file=ds_rules_file)
|
||||
|
||||
if not surt_ordered:
|
||||
for rule in rules:
|
||||
for rule in rules.rules:
|
||||
rule.unsurt()
|
||||
|
||||
if rules:
|
||||
@ -36,7 +36,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||
ds_rules_file=ds_rules_file)
|
||||
|
||||
if not surt_ordered:
|
||||
for rule in rules:
|
||||
for rule in rules.rules:
|
||||
rule.unsurt()
|
||||
|
||||
if rules:
|
||||
@ -108,11 +108,12 @@ class FuzzyQuery:
|
||||
params.update({'url': url,
|
||||
'matchType': 'prefix',
|
||||
'filter': filter_})
|
||||
try:
|
||||
|
||||
if 'reverse' in params:
|
||||
del params['reverse']
|
||||
|
||||
if 'closest' in params:
|
||||
del params['closest']
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
return params
|
||||
|
||||
@ -141,7 +142,7 @@ class CDXDomainSpecificRule(BaseRule):
|
||||
"""
|
||||
self.url_prefix = map(unsurt, self.url_prefix)
|
||||
if self.regex:
|
||||
self.regex = unsurt(self.regex)
|
||||
self.regex = re.compile(unsurt(self.regex.pattern))
|
||||
|
||||
if self.replace:
|
||||
self.replace = unsurt(self.replace)
|
||||
|
@ -128,6 +128,36 @@ def test_fuzzy_match():
|
||||
assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL,
|
||||
ds_rules_file=DEFAULT_RULES_FILE))
|
||||
|
||||
def test_fuzzy_no_match_1():
    """Expect NotFoundException when querying http://notfound.example.com/.

    The query is issued with ``reverse=True`` and ``allowFuzzy=True`` against
    a CDXServer built from TEST_CDX_DIR and DEFAULT_RULES_FILE, while
    ``pywb.cdx.cdxsource.urllib2.urlopen`` is replaced by ``mock_urlopen``.
    """
    # no match, no fuzzy
    # NOTE(review): indentation was lost in this view; the nesting of the
    # ``with`` blocks below is reconstructed -- confirm against the repository.
    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(url='http://notfound.example.com/',
                            output='cdxobject',
                            reverse=True,
                            allowFuzzy=True)
|
||||
|
||||
def test_fuzzy_no_match_2():
    """Expect NotFoundException for http://notfound.example.com/?_=1234.

    Same setup as the plain not-found case, but the url carries a ``_=1234``
    query argument and the lookup also passes ``closest='2014'`` together
    with ``reverse=True`` and ``allowFuzzy=True``.
    """
    # fuzzy rule, but no actual match
    # NOTE(review): indentation was lost in this view; the nesting of the
    # ``with`` blocks below is reconstructed -- confirm against the repository.
    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(url='http://notfound.example.com/?_=1234',
                            closest='2014',
                            reverse=True,
                            output='cdxobject',
                            allowFuzzy=True)
|
||||
|
||||
def test2_fuzzy_no_match_3():
    """Expect NotFoundException for http://test.example.example/.

    Only ``allowFuzzy=True`` is passed besides the url; the server is built
    from TEST_CDX_DIR and DEFAULT_RULES_FILE with ``urlopen`` mocked.
    """
    # special fuzzy rule, matches prefix test.example.example.,
    # but doesn't match rule regex
    # NOTE(review): the ``test2_`` prefix differs from the sibling
    # ``test_fuzzy_no_match_*`` names -- confirm whether this is intentional.
    # NOTE(review): indentation was lost in this view; the nesting of the
    # ``with`` blocks below is reconstructed -- confirm against the repository.
    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(url='http://test.example.example/',
                            allowFuzzy=True)
|
||||
|
||||
def assert_error(func, exception):
|
||||
with raises(exception):
|
||||
func(CDXServer(CDX_SERVER_URL))
|
||||
|
@ -60,3 +60,4 @@ rules:
|
||||
fuzzy_lookup:
|
||||
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
||||
filter: '=urlkey:{0}'
|
||||
replace: '?'
|
||||
|
@ -1,9 +1,9 @@
|
||||
from pywb.utils.timeutils import iso_date_to_timestamp
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
from recordloader import ArcWarcRecordLoader
|
||||
|
||||
import surt
|
||||
import hashlib
|
||||
import base64
|
||||
|
||||
@ -22,12 +22,13 @@ class ArchiveIndexer(object):
|
||||
if necessary
|
||||
"""
|
||||
def __init__(self, fileobj, filename,
|
||||
out=sys.stdout, sort=False, writer=None):
|
||||
out=sys.stdout, sort=False, writer=None, surt_ordered=True):
|
||||
self.fh = fileobj
|
||||
self.filename = filename
|
||||
self.loader = ArcWarcRecordLoader()
|
||||
self.offset = 0
|
||||
self.known_format = None
|
||||
self.surt_ordered = surt_ordered
|
||||
|
||||
if writer:
|
||||
self.writer = writer
|
||||
@ -179,7 +180,9 @@ class ArchiveIndexer(object):
|
||||
if not digest:
|
||||
digest = '-'
|
||||
|
||||
return [surt.surt(url),
|
||||
key = canonicalize(url, self.surt_ordered)
|
||||
|
||||
return [key,
|
||||
timestamp,
|
||||
url,
|
||||
mime,
|
||||
@ -211,7 +214,9 @@ class ArchiveIndexer(object):
|
||||
mime = record.rec_headers.get_header('content-type')
|
||||
mime = self._extract_mime(mime)
|
||||
|
||||
return [surt.surt(url),
|
||||
key = canonicalize(url, self.surt_ordered)
|
||||
|
||||
return [key,
|
||||
timestamp,
|
||||
url,
|
||||
mime,
|
||||
@ -318,7 +323,7 @@ def iter_file_or_dir(inputs):
|
||||
yield os.path.join(input_, filename), filename
|
||||
|
||||
|
||||
def index_to_file(inputs, output, sort):
|
||||
def index_to_file(inputs, output, sort, surt_ordered):
|
||||
if output == '-':
|
||||
outfile = sys.stdout
|
||||
else:
|
||||
@ -337,7 +342,8 @@ def index_to_file(inputs, output, sort):
|
||||
with open(fullpath, 'r') as infile:
|
||||
ArchiveIndexer(fileobj=infile,
|
||||
filename=filename,
|
||||
writer=writer).make_index()
|
||||
writer=writer,
|
||||
surt_ordered=surt_ordered).make_index()
|
||||
finally:
|
||||
writer.end_all()
|
||||
if infile:
|
||||
@ -357,7 +363,7 @@ def cdx_filename(filename):
|
||||
return remove_ext(filename) + '.cdx'
|
||||
|
||||
|
||||
def index_to_dir(inputs, output, sort):
|
||||
def index_to_dir(inputs, output, sort, surt_ordered):
|
||||
for fullpath, filename in iter_file_or_dir(inputs):
|
||||
|
||||
outpath = cdx_filename(filename)
|
||||
@ -368,7 +374,8 @@ def index_to_dir(inputs, output, sort):
|
||||
ArchiveIndexer(fileobj=infile,
|
||||
filename=filename,
|
||||
sort=sort,
|
||||
out=outfile).make_index()
|
||||
out=outfile,
|
||||
surt_ordered=surt_ordered).make_index()
|
||||
|
||||
|
||||
def main(args=None):
|
||||
@ -393,6 +400,12 @@ Some examples:
|
||||
|
||||
sort_help = """
|
||||
sort the output to each file before writing to create a total ordering
|
||||
"""
|
||||
|
||||
unsurt_help = """
|
||||
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
|
||||
urls for the cdx key. Default is to use SURT keys.
|
||||
Not-recommended for new cdx, use only for backwards-compatibility.
|
||||
"""
|
||||
|
||||
output_help = """output file or directory.
|
||||
@ -409,15 +422,22 @@ sort the output to each file before writing to create a total ordering
|
||||
epilog=epilog,
|
||||
formatter_class=RawTextHelpFormatter)
|
||||
|
||||
parser.add_argument('-s', '--sort', action='store_true', help=sort_help)
|
||||
parser.add_argument('-s', '--sort',
|
||||
action='store_true',
|
||||
help=sort_help)
|
||||
|
||||
parser.add_argument('-u', '--unsurt',
|
||||
action='store_true',
|
||||
help=unsurt_help)
|
||||
|
||||
parser.add_argument('output', help=output_help)
|
||||
parser.add_argument('inputs', nargs='+', help=input_help)
|
||||
|
||||
cmd = parser.parse_args(args=args)
|
||||
if cmd.output != '-' and os.path.isdir(cmd.output):
|
||||
index_to_dir(cmd.inputs, cmd.output, cmd.sort)
|
||||
index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
|
||||
else:
|
||||
index_to_file(cmd.inputs, cmd.output, cmd.sort)
|
||||
index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
4
sample_archive/cdx/example-non-surt.cdx
Normal file
4
sample_archive/cdx/example-non-surt.cdx
Normal file
@ -0,0 +1,4 @@
|
||||
CDX N b a m s k r M S V g
|
||||
example.com/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
example.com/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
iana.org/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
@ -15,6 +15,8 @@ collections:
|
||||
# ex with filtering: filter CDX lines by filename starting with 'dupe'
|
||||
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
|
||||
|
||||
pywb-nosurt: {'index_paths': './sample_archive/cdx/example-non-surt.cdx', 'surt_ordered': False}
|
||||
|
||||
|
||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||
|
@ -94,6 +94,13 @@ class TestWb:
|
||||
assert 'wb.js' in resp.body
|
||||
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
|
||||
|
||||
def test_replay_non_surt(self):
    # Requests the 20140103030321 capture of http://example.com?example=1
    # through the /pywb-nosurt/ route and checks that the response passes
    # the basic html assertions and references 'wb.js'.
    resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
    self._assert_basic_html(resp)

    # NOTE(review): the two commented-out asserts below mention a 2014-01-27
    # iana.org capture under the /pywb/ prefix, which does not match the url
    # requested here; they look carried over from another test -- update them
    # for this capture or remove them.
    #assert 'Mon, Jan 27 2014 17:12:38' in resp.body
    assert 'wb.js' in resp.body
    #assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
|
||||
|
||||
def test_replay_url_agnostic_revisit(self):
|
||||
resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')
|
||||
|
Loading…
x
Reference in New Issue
Block a user