mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
url-agnostic revisit testing!
Add a sample WARC and CDX for url-agnostic revisits; add a unit test and an integration test. resolvingloader: pass a callback instead of the full cdx server, to be used for loading the cdx in case of a url-agnostic revisit.
This commit is contained in:
parent
cf5aaf5de4
commit
d702a98bbc
@ -51,6 +51,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
|
||||
|
||||
# Filter exact invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
@ -61,6 +62,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
|
||||
|
||||
# Filter contains invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
@ -127,8 +129,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
|
||||
|
||||
# CDX Server init
|
||||
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 2, output = 'raw')
|
||||
>>> y = x.next(); pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20140127171200'),
|
||||
('original', 'http://example.com'),
|
||||
|
@ -42,7 +42,9 @@ class WBHandler(WbUrlHandler):
|
||||
return self.query_view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||
return self.replay(wbrequest, cdx_lines)
|
||||
return self.replay(wbrequest,
|
||||
cdx_lines,
|
||||
self.index_reader.cdx_load_callback(wbrequest))
|
||||
|
||||
|
||||
def render_search_page(self, wbrequest):
|
||||
|
@ -44,6 +44,11 @@ class IndexReader(object):
|
||||
|
||||
return self.cdx_server.load_cdx(**params)
|
||||
|
||||
def cdx_load_callback(self, wbrequest):
|
||||
def load_cdx(params):
|
||||
return self.load_cdx(wbrequest, params)
|
||||
return load_cdx
|
||||
|
||||
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
|
||||
if wburl.type == wburl.URL_QUERY:
|
||||
raise NotImplementedError('Url Query Not Yet Supported')
|
||||
|
@ -71,7 +71,6 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
|
||||
paths = config.get('archive_paths')
|
||||
|
||||
resolving_loader = ResolvingLoader(paths=paths,
|
||||
cdx_server=cdx_server,
|
||||
record_loader=record_loader)
|
||||
|
||||
head_insert_view = load_template_file(config.get('head_insert_html'),
|
||||
|
@ -26,7 +26,7 @@ class ReplayView:
|
||||
self._reporter = reporter
|
||||
|
||||
|
||||
def __call__(self, wbrequest, cdx_lines):
|
||||
def __call__(self, wbrequest, cdx_lines, cdx_loader):
|
||||
last_e = None
|
||||
first = True
|
||||
|
||||
@ -42,7 +42,8 @@ class ReplayView:
|
||||
self._redirect_if_needed(wbrequest, cdx)
|
||||
first = False
|
||||
|
||||
(status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files)
|
||||
(status_headers, stream) = (self.content_loader.
|
||||
resolve_headers_and_payload(cdx, failed_files, cdx_loader))
|
||||
|
||||
# check and reject self-redirect
|
||||
self._reject_self_redirect(wbrequest, cdx, status_headers)
|
||||
|
@ -126,7 +126,7 @@ class ArcWarcRecordLoader:
|
||||
rec_headers = self.arc_parser.parse(stream, statusline)
|
||||
return 'arc', rec_headers
|
||||
except StatusAndHeadersParserException as se:
|
||||
msg = 'Unknown archive format, first line: ' + se.statusline
|
||||
msg = 'Unknown archive format, first line: ' + str(se.statusline)
|
||||
raise ArchiveLoadFailed(msg)
|
||||
|
||||
|
||||
@ -148,7 +148,7 @@ class ARCHeadersParser:
|
||||
if len(parts) != len(headernames):
|
||||
msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
|
||||
msg = msg.format(headernames, parts)
|
||||
raise StatusAndHeadersParserException(msg, headernames)
|
||||
raise StatusAndHeadersParserException(msg, parts)
|
||||
|
||||
headers = []
|
||||
|
||||
|
@ -5,14 +5,11 @@ from pathresolvers import make_best_resolvers
|
||||
|
||||
#=================================================================
|
||||
class ResolvingLoader:
|
||||
def __init__(self, paths, record_loader=ArcWarcRecordLoader(),
|
||||
cdx_server=None):
|
||||
|
||||
def __init__(self, paths, record_loader=ArcWarcRecordLoader()):
|
||||
self.path_resolvers = make_best_resolvers(paths)
|
||||
self.record_loader = record_loader
|
||||
self.cdx_server = cdx_server
|
||||
|
||||
def resolve_headers_and_payload(self, cdx, failed_files):
|
||||
def resolve_headers_and_payload(self, cdx, failed_files, cdx_loader):
|
||||
"""
|
||||
Resolve headers and payload for a given capture
|
||||
In the simple case, headers and payload are in the same record.
|
||||
@ -37,7 +34,8 @@ class ResolvingLoader:
|
||||
if cdx['mimetype'] == 'warc/revisit' and headers_record:
|
||||
payload_record = self._load_different_url_payload(cdx,
|
||||
headers_record,
|
||||
failed_files)
|
||||
failed_files,
|
||||
cdx_loader)
|
||||
|
||||
# single lookup cases
|
||||
# case 2: non-revisit
|
||||
@ -121,7 +119,8 @@ class ResolvingLoader:
|
||||
|
||||
raise ArchiveLoadFailed(msg, filename), None, last_traceback
|
||||
|
||||
def _load_different_url_payload(self, cdx, headers_record, failed_files):
|
||||
def _load_different_url_payload(self, cdx, headers_record,
|
||||
failed_files, cdx_loader):
|
||||
"""
|
||||
Handle the case where a duplicate of a capture with same digest
|
||||
exists at a different url.
|
||||
@ -152,7 +151,8 @@ class ResolvingLoader:
|
||||
|
||||
orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
|
||||
ref_target_date,
|
||||
cdx['digest'])
|
||||
cdx['digest'],
|
||||
cdx_loader)
|
||||
|
||||
for cdx in orig_cdx_lines:
|
||||
try:
|
||||
@ -165,12 +165,12 @@ class ResolvingLoader:
|
||||
|
||||
raise ArchiveLoadFailed('Original for revisit could not be loaded')
|
||||
|
||||
def load_cdx_for_dupe(self, url, timestamp, digest):
|
||||
def load_cdx_for_dupe(self, url, timestamp, digest, cdx_loader):
|
||||
"""
|
||||
If a cdx_server is available, return response from server,
|
||||
otherwise empty list
|
||||
"""
|
||||
if not self.cdx_server:
|
||||
if not cdx_loader:
|
||||
return []
|
||||
|
||||
params = {'url': url,
|
||||
@ -178,4 +178,4 @@ class ResolvingLoader:
|
||||
'filter': 'digest:' + digest,
|
||||
'output': 'cdxobject'}
|
||||
|
||||
return self.cdx_server.load_cdx(**params)
|
||||
return cdx_loader(params)
|
||||
|
@ -159,7 +159,36 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
|
||||
<!doctype html>
|
||||
<html>
|
||||
|
||||
# Test Url Agnostic Revisit Resolving
|
||||
# ==============================================================================
|
||||
>>> load_from_cdx_test(URL_AGNOSTIC_ORIG_CDX)
|
||||
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
||||
('Content-Type', 'text/html; charset=UTF-8'),
|
||||
('Date', 'Tue, 02 Jul 2013 19:54:02 GMT'),
|
||||
('ETag', '"780602-4f6-4db31b2978ec0"'),
|
||||
('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
|
||||
('Server', 'ECS (sjc/4FCE)'),
|
||||
('X-Cache', 'HIT'),
|
||||
('Content-Length', '1270'),
|
||||
('Connection', 'close')])
|
||||
<!doctype html>
|
||||
<html>
|
||||
|
||||
>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX)
|
||||
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
||||
('Content-Type', 'text/html; charset=UTF-8'),
|
||||
('Date', 'Mon, 29 Jul 2013 19:51:51 GMT'),
|
||||
('ETag', '"780602-4f6-4db31b2978ec0"'),
|
||||
('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
|
||||
('Server', 'ECS (sjc/4FCE)'),
|
||||
('X-Cache', 'HIT'),
|
||||
('Content-Length', '1270'),
|
||||
('Connection', 'close')])
|
||||
<!doctype html>
|
||||
<html>
|
||||
|
||||
# Error Handling
|
||||
# ==============================================================================
|
||||
|
||||
# Invalid WARC Offset
|
||||
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
|
||||
@ -167,7 +196,7 @@ Exception: ArchiveLoadFailed
|
||||
|
||||
|
||||
# Invalid ARC Offset
|
||||
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
|
||||
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 example.arc.gz')
|
||||
Exception: ArchiveLoadFailed
|
||||
|
||||
|
||||
@ -175,6 +204,7 @@ Exception: ArchiveLoadFailed
|
||||
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
|
||||
Exception: ArchiveLoadFailed
|
||||
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
@ -188,29 +218,44 @@ from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
|
||||
#==============================================================================
|
||||
test_warc_dir = get_test_dir() + 'warcs/'
|
||||
|
||||
|
||||
URL_AGNOSTIC_ORIG_CDX = 'org,iana,example)/ 20130702195402 http://example.iana.org/ \
|
||||
text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
||||
1001 353 example-url-agnostic-orig.warc.gz'
|
||||
|
||||
URL_AGNOSTIC_REVISIT_CDX = 'com,example)/ 20130729195151 http://test@example.com/ \
|
||||
warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
||||
591 355 example-url-agnostic-revisit.warc.gz'
|
||||
|
||||
|
||||
#==============================================================================
|
||||
def load_test_archive(test_file, offset, length):
|
||||
path = test_warc_dir + test_file
|
||||
|
||||
testloader = ArcWarcRecordLoader()
|
||||
|
||||
archive = testloader.load(path, offset, length)
|
||||
archive = testloader.load(path, offset, length)
|
||||
|
||||
pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
|
||||
|
||||
|
||||
#==============================================================================
|
||||
def load_orig_cdx(self):
|
||||
return [CDXObject(URL_AGNOSTIC_ORIG_CDX)]
|
||||
|
||||
#==============================================================================
|
||||
def load_from_cdx_test(cdx):
|
||||
resolve_loader = ResolvingLoader(test_warc_dir)
|
||||
cdx = CDXObject(cdx)
|
||||
try:
|
||||
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
|
||||
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, load_orig_cdx)
|
||||
print headers
|
||||
sys.stdout.write(stream.readline())
|
||||
sys.stdout.write(stream.readline())
|
||||
except Exception as e:
|
||||
except ArchiveLoadFailed as e:
|
||||
print 'Exception: ' + e.__class__.__name__
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
3
sample_archive/cdx/url-agnost-example.cdx
Normal file
3
sample_archive/cdx/url-agnost-example.cdx
Normal file
@ -0,0 +1,3 @@
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
BIN
sample_archive/warcs/example-url-agnostic-orig.warc.gz
Normal file
BIN
sample_archive/warcs/example-url-agnostic-orig.warc.gz
Normal file
Binary file not shown.
BIN
sample_archive/warcs/example-url-agnostic-revisit.warc.gz
Normal file
BIN
sample_archive/warcs/example-url-agnostic-revisit.warc.gz
Normal file
Binary file not shown.
@ -84,6 +84,15 @@ class TestWb:
|
||||
assert 'wb.js' in resp.body
|
||||
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
|
||||
|
||||
|
||||
def test_replay_url_agnostic_revisit(self):
|
||||
resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')
|
||||
self._assert_basic_html(resp)
|
||||
|
||||
assert 'Mon, Jul 29 2013 19:51:51' in resp.body
|
||||
assert 'wb.js' in resp.body
|
||||
assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body
|
||||
|
||||
def test_replay_identity_1(self):
|
||||
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
|
||||
#resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
|
||||
|
Loading…
x
Reference in New Issue
Block a user