Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-15 00:03:28 +01:00)
cdx: fix creation and add test for non-surt cdx (pywb-nonsurt/ test)

archiveindexer: -u option to generate non-surt cdx
tests: full test coverage for cdxdomainspecific (fuzzy and custom canon)
This commit is contained in:
parent 8758e60590
commit 7d236af7d7
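For context, a minimal sketch (not part of this commit) of what the surt_ordered switch changes in the generated cdx key, assuming the pywb package from this commit is importable. The non-SURT key matches the sample example-non-surt.cdx file added below; the SURT form is shown as an approximation.

from pywb.utils.canonicalize import canonicalize   # import added to the indexer below

url = 'http://example.com?example=1'

# SURT-ordered key (the default) -- approximately 'com,example)/?example=1'
print(canonicalize(url, True))

# non-SURT key, as produced with the new -u option -- 'example.com/?example=1',
# matching the sample_archive/cdx/example-non-surt.cdx file added in this commit
print(canonicalize(url, False))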
@@ -25,7 +25,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
                                       ds_rules_file=ds_rules_file)
 
     if not surt_ordered:
-        for rule in rules:
+        for rule in rules.rules:
             rule.unsurt()
 
     if rules:
@@ -36,7 +36,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
                                       ds_rules_file=ds_rules_file)
 
     if not surt_ordered:
-        for rule in rules:
+        for rule in rules.rules:
             rule.unsurt()
 
     if rules:
@@ -108,11 +108,12 @@ class FuzzyQuery:
         params.update({'url': url,
                        'matchType': 'prefix',
                        'filter': filter_})
-        try:
+
+        if 'reverse' in params:
             del params['reverse']
+
+        if 'closest' in params:
             del params['closest']
-        except KeyError:
-            pass
 
         return params
 
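The change above replaces a try/except around both deletes with explicit membership checks; with the old code, a missing 'reverse' key raised KeyError before 'closest' was ever removed. A minimal standalone illustration (plain Python, not pywb code):

def strip_old(params):
    # pre-fix behaviour: a missing 'reverse' key aborts before 'closest' is deleted
    try:
        del params['reverse']
        del params['closest']
    except KeyError:
        pass
    return params

def strip_new(params):
    # post-fix behaviour: each key is removed independently if present
    if 'reverse' in params:
        del params['reverse']
    if 'closest' in params:
        del params['closest']
    return params

print(strip_old({'closest': '2014', 'url': 'x'}))   # 'closest' survives
print(strip_new({'closest': '2014', 'url': 'x'}))   # only 'url' remains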
@@ -141,7 +142,7 @@ class CDXDomainSpecificRule(BaseRule):
         """
         self.url_prefix = map(unsurt, self.url_prefix)
         if self.regex:
-            self.regex = unsurt(self.regex)
+            self.regex = re.compile(unsurt(self.regex.pattern))
 
         if self.replace:
             self.replace = unsurt(self.replace)
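The old line handed a compiled pattern object to unsurt(), which operates on strings; the fix takes the pattern's source string via .pattern, unsurts it, and recompiles. A small sketch of the distinction (unsurt itself is not shown in this diff, so it is left out here):

import re

# a compiled pattern has no usable string form except its .pattern attribute
rx = re.compile(r'com,example,\)/.*')
print(type(rx))      # compiled pattern object, not a str
print(rx.pattern)    # 'com,example,\\)/.*' -- the string unsurt() is now given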
@@ -128,6 +128,36 @@ def test_fuzzy_match():
     assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL,
                            ds_rules_file=DEFAULT_RULES_FILE))
 
+def test_fuzzy_no_match_1():
+    # no match, no fuzzy
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://notfound.example.com/',
+                            output='cdxobject',
+                            reverse=True,
+                            allowFuzzy=True)
+
+def test_fuzzy_no_match_2():
+    # fuzzy rule, but no actual match
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://notfound.example.com/?_=1234',
+                            closest='2014',
+                            reverse=True,
+                            output='cdxobject',
+                            allowFuzzy=True)
+
+def test2_fuzzy_no_match_3():
+    # special fuzzy rule, matches prefix test.example.example.,
+    # but doesn't match rule regex
+    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
+        with raises(NotFoundException):
+            server.load_cdx(url='http://test.example.example/',
+                            allowFuzzy=True)
+
 def assert_error(func, exception):
     with raises(exception):
         func(CDXServer(CDX_SERVER_URL))
@@ -60,3 +60,4 @@ rules:
     fuzzy_lookup:
         match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
         filter: '=urlkey:{0}'
+        replace: '?'
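A quick standalone check (not pywb code) of what the 'match' expression above captures for a cache-busted URL like the one used in the new tests; how pywb applies the captured group and the new replace: '?' option internally is not shown here.

import re

fuzzy_match = re.compile(r'(.*)[&?](?:_|uncache)=[\d]+[&]?')

m = fuzzy_match.match('http://notfound.example.com/?_=1234')
print(m.group(1))    # 'http://notfound.example.com/'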
@@ -1,9 +1,9 @@
 from pywb.utils.timeutils import iso_date_to_timestamp
 from pywb.utils.bufferedreaders import DecompressingBufferedReader
+from pywb.utils.canonicalize import canonicalize
 
 from recordloader import ArcWarcRecordLoader
 
-import surt
 import hashlib
 import base64
 
@@ -22,12 +22,13 @@ class ArchiveIndexer(object):
     if necessary
     """
     def __init__(self, fileobj, filename,
-                 out=sys.stdout, sort=False, writer=None):
+                 out=sys.stdout, sort=False, writer=None, surt_ordered=True):
         self.fh = fileobj
         self.filename = filename
         self.loader = ArcWarcRecordLoader()
         self.offset = 0
         self.known_format = None
+        self.surt_ordered = surt_ordered
 
         if writer:
             self.writer = writer
@@ -179,7 +180,9 @@ class ArchiveIndexer(object):
         if not digest:
             digest = '-'
 
-        return [surt.surt(url),
+        key = canonicalize(url, self.surt_ordered)
+
+        return [key,
                 timestamp,
                 url,
                 mime,
@@ -211,7 +214,9 @@ class ArchiveIndexer(object):
         mime = record.rec_headers.get_header('content-type')
         mime = self._extract_mime(mime)
 
-        return [surt.surt(url),
+        key = canonicalize(url, self.surt_ordered)
+
+        return [key,
                 timestamp,
                 url,
                 mime,
@@ -318,7 +323,7 @@ def iter_file_or_dir(inputs):
             yield os.path.join(input_, filename), filename
 
 
-def index_to_file(inputs, output, sort):
+def index_to_file(inputs, output, sort, surt_ordered):
     if output == '-':
         outfile = sys.stdout
     else:
@@ -337,7 +342,8 @@ def index_to_file(inputs, output, sort):
             with open(fullpath, 'r') as infile:
                 ArchiveIndexer(fileobj=infile,
                                filename=filename,
-                               writer=writer).make_index()
+                               writer=writer,
+                               surt_ordered=surt_ordered).make_index()
     finally:
         writer.end_all()
         if infile:
@@ -357,7 +363,7 @@ def cdx_filename(filename):
     return remove_ext(filename) + '.cdx'
 
 
-def index_to_dir(inputs, output, sort):
+def index_to_dir(inputs, output, sort, surt_ordered):
     for fullpath, filename in iter_file_or_dir(inputs):
 
         outpath = cdx_filename(filename)
@@ -368,7 +374,8 @@ def index_to_dir(inputs, output, sort):
             ArchiveIndexer(fileobj=infile,
                            filename=filename,
                            sort=sort,
-                           out=outfile).make_index()
+                           out=outfile,
+                           surt_ordered=surt_ordered).make_index()
 
 
 def main(args=None):
@@ -393,6 +400,12 @@ Some examples:
 
 sort_help = """
 sort the output to each file before writing to create a total ordering
+"""
+
+unsurt_help = """
+Convert SURT (Sort-friendly URI Reordering Transform) back to regular
+urls for the cdx key. Default is to use SURT keys.
+Not-recommended for new cdx, use only for backwards-compatibility.
 """
 
 output_help = """output file or directory.
@@ -409,15 +422,22 @@ sort the output to each file before writing to create a total ordering
                             epilog=epilog,
                             formatter_class=RawTextHelpFormatter)
 
-    parser.add_argument('-s', '--sort', action='store_true', help=sort_help)
+    parser.add_argument('-s', '--sort',
+                        action='store_true',
+                        help=sort_help)
+
+    parser.add_argument('-u', '--unsurt',
+                        action='store_true',
+                        help=unsurt_help)
+
     parser.add_argument('output', help=output_help)
     parser.add_argument('inputs', nargs='+', help=input_help)
 
     cmd = parser.parse_args(args=args)
     if cmd.output != '-' and os.path.isdir(cmd.output):
-        index_to_dir(cmd.inputs, cmd.output, cmd.sort)
+        index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
     else:
-        index_to_file(cmd.inputs, cmd.output, cmd.sort)
+        index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
 
 
 if __name__ == '__main__':
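A sketch of exercising the new -u flag through main(args), which parse_args(args=args) above supports. The import path and sample WARC path are assumptions (neither appears in this diff); adjust them to the actual module and archive locations.

from pywb.warc.archiveindexer import main   # assumed module path

# sorted (-s), non-SURT (-u) cdx: positional args are output then inputs
main(['-s', '-u',
      './sample_archive/cdx/example-non-surt.cdx',
      './sample_archive/warcs/example.warc.gz'])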
sample_archive/cdx/example-non-surt.cdx (new file, 4 lines)
@@ -0,0 +1,4 @@
+ CDX N b a m s k r M S V g
+example.com/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+example.com/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+iana.org/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
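To make the new sample file concrete, a minimal sketch (not pywb's own parser) that splits the first data line above into the 11 fields of the ' CDX N b a m s k r M S V g' header; the field names used here are descriptive stand-ins, not necessarily pywb's internal names.

fields = ['urlkey', 'timestamp', 'original', 'mimetype', 'statuscode',
          'digest', 'redirect', 'metaflags', 'length', 'offset', 'filename']

line = ('example.com/?example=1 20140103030321 http://example.com?example=1 '
        'text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')

print(dict(zip(fields, line.split())))
# the urlkey is the plain 'example.com/?example=1', not the SURT 'com,example)/?example=1'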
@@ -15,6 +15,8 @@ collections:
     # ex with filtering: filter CDX lines by filename starting with 'dupe'
     pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
 
+    pywb-nosurt: {'index_paths': './sample_archive/cdx/example-non-surt.cdx', 'surt_ordered': False}
+
 
 # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
 # SURT keys are recommended for future indices, but non-SURT cdxs
@@ -94,6 +94,13 @@ class TestWb:
         assert 'wb.js' in resp.body
         assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
 
+    def test_replay_non_surt(self):
+        resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
+        self._assert_basic_html(resp)
+
+        #assert 'Mon, Jan 27 2014 17:12:38' in resp.body
+        assert 'wb.js' in resp.body
+        #assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
 
     def test_replay_url_agnostic_revisit(self):
         resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')
|
Loading…
x
Reference in New Issue
Block a user