1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

url-agnostic revisit testing!

add sample warc and cdx for url-agnostic revisits
add unit test and integration test
resolvingloader: pass a callback instead of the full cdx server,
used to load cdx in case of a url-agnostic revisit
This commit is contained in:
Ilya Kreymer 2014-03-04 20:12:09 +00:00
parent cf5aaf5de4
commit d702a98bbc
12 changed files with 90 additions and 24 deletions

View File

@ -51,6 +51,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
# Filter exact invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
@ -61,6 +62,7 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
@ -127,8 +129,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
>>> pprint.pprint(x.next().items())
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 2, output = 'raw')
>>> y = x.next(); pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),

View File

@ -42,7 +42,9 @@ class WBHandler(WbUrlHandler):
return self.query_view.render_response(wbrequest, cdx_lines)
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines)
return self.replay(wbrequest,
cdx_lines,
self.index_reader.cdx_load_callback(wbrequest))
def render_search_page(self, wbrequest):

View File

@ -44,6 +44,11 @@ class IndexReader(object):
return self.cdx_server.load_cdx(**params)
# Build a one-argument cdx loading callback bound to the given request.
# The returned closure takes a single `params` argument and forwards it,
# together with the captured `wbrequest`, to self.load_cdx, returning
# whatever self.load_cdx returns.
def cdx_load_callback(self, wbrequest):
def load_cdx(params):
return self.load_cdx(wbrequest, params)
return load_cdx
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported')

View File

@ -71,7 +71,6 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
paths = config.get('archive_paths')
resolving_loader = ResolvingLoader(paths=paths,
cdx_server=cdx_server,
record_loader=record_loader)
head_insert_view = load_template_file(config.get('head_insert_html'),

View File

@ -26,7 +26,7 @@ class ReplayView:
self._reporter = reporter
def __call__(self, wbrequest, cdx_lines):
def __call__(self, wbrequest, cdx_lines, cdx_loader):
last_e = None
first = True
@ -42,7 +42,8 @@ class ReplayView:
self._redirect_if_needed(wbrequest, cdx)
first = False
(status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files)
(status_headers, stream) = (self.content_loader.
resolve_headers_and_payload(cdx, failed_files, cdx_loader))
# check and reject self-redirect
self._reject_self_redirect(wbrequest, cdx, status_headers)

View File

@ -126,7 +126,7 @@ class ArcWarcRecordLoader:
rec_headers = self.arc_parser.parse(stream, statusline)
return 'arc', rec_headers
except StatusAndHeadersParserException as se:
msg = 'Unknown archive format, first line: ' + se.statusline
msg = 'Unknown archive format, first line: ' + str(se.statusline)
raise ArchiveLoadFailed(msg)
@ -148,7 +148,7 @@ class ARCHeadersParser:
if len(parts) != len(headernames):
msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
msg = msg.format(headernames, parts)
raise StatusAndHeadersParserException(msg, headernames)
raise StatusAndHeadersParserException(msg, parts)
headers = []

View File

@ -5,14 +5,11 @@ from pathresolvers import make_best_resolvers
#=================================================================
class ResolvingLoader:
def __init__(self, paths, record_loader=ArcWarcRecordLoader(),
cdx_server=None):
def __init__(self, paths, record_loader=ArcWarcRecordLoader()):
self.path_resolvers = make_best_resolvers(paths)
self.record_loader = record_loader
self.cdx_server = cdx_server
def resolve_headers_and_payload(self, cdx, failed_files):
def resolve_headers_and_payload(self, cdx, failed_files, cdx_loader):
"""
Resolve headers and payload for a given capture
In the simple case, headers and payload are in the same record.
@ -37,7 +34,8 @@ class ResolvingLoader:
if cdx['mimetype'] == 'warc/revisit' and headers_record:
payload_record = self._load_different_url_payload(cdx,
headers_record,
failed_files)
failed_files,
cdx_loader)
# single lookup cases
# case 2: non-revisit
@ -121,7 +119,8 @@ class ResolvingLoader:
raise ArchiveLoadFailed(msg, filename), None, last_traceback
def _load_different_url_payload(self, cdx, headers_record, failed_files):
def _load_different_url_payload(self, cdx, headers_record,
failed_files, cdx_loader):
"""
Handle the case where a duplicate of a capture with same digest
exists at a different url.
@ -152,7 +151,8 @@ class ResolvingLoader:
orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
ref_target_date,
cdx['digest'])
cdx['digest'],
cdx_loader)
for cdx in orig_cdx_lines:
try:
@ -165,12 +165,12 @@ class ResolvingLoader:
raise ArchiveLoadFailed('Original for revisit could not be loaded')
def load_cdx_for_dupe(self, url, timestamp, digest):
def load_cdx_for_dupe(self, url, timestamp, digest, cdx_loader):
"""
If a cdx_loader callback is provided, return the cdx lines it loads
for the given params, otherwise an empty list
"""
if not self.cdx_server:
if not cdx_loader:
return []
params = {'url': url,
@ -178,4 +178,4 @@ class ResolvingLoader:
'filter': 'digest:' + digest,
'output': 'cdxobject'}
return self.cdx_server.load_cdx(**params)
return cdx_loader(params)

View File

@ -159,7 +159,36 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
<!doctype html>
<html>
# Test Url Agnostic Revisit Resolving
# ==============================================================================
>>> load_from_cdx_test(URL_AGNOSTIC_ORIG_CDX)
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Content-Type', 'text/html; charset=UTF-8'),
('Date', 'Tue, 02 Jul 2013 19:54:02 GMT'),
('ETag', '"780602-4f6-4db31b2978ec0"'),
('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>
>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX)
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Content-Type', 'text/html; charset=UTF-8'),
('Date', 'Mon, 29 Jul 2013 19:51:51 GMT'),
('ETag', '"780602-4f6-4db31b2978ec0"'),
('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>
# Error Handling
# ==============================================================================
# Invalid WARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
@ -167,7 +196,7 @@ Exception: ArchiveLoadFailed
# Invalid ARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 example.arc.gz')
Exception: ArchiveLoadFailed
@ -175,6 +204,7 @@ Exception: ArchiveLoadFailed
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
Exception: ArchiveLoadFailed
"""
import os
@ -188,29 +218,44 @@ from pywb.cdx.cdxobject import CDXObject
from pywb import get_test_dir
#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
#==============================================================================
# Directory holding the sample archives used by the tests in this module.
test_warc_dir = get_test_dir() + 'warcs/'
# Sample cdx line for the original (text/html, status 200) capture at
# http://example.iana.org/, stored in example-url-agnostic-orig.warc.gz.
URL_AGNOSTIC_ORIG_CDX = 'org,iana,example)/ 20130702195402 http://example.iana.org/ \
text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
1001 353 example-url-agnostic-orig.warc.gz'
# Sample cdx line for a warc/revisit capture at a different url
# (http://test@example.com/) that carries the same digest as the original
# line above, stored in example-url-agnostic-revisit.warc.gz.
URL_AGNOSTIC_REVISIT_CDX = 'com,example)/ 20130729195151 http://test@example.com/ \
warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
591 355 example-url-agnostic-revisit.warc.gz'
#==============================================================================
# Doctest helper: load one record from a sample archive and pretty-print
# the tuple (archive.type, archive.rec_headers, archive.status_headers).
#   test_file -- file name, appended to test_warc_dir to form the path
#   offset, length -- passed through unchanged to ArcWarcRecordLoader.load
def load_test_archive(test_file, offset, length):
path = test_warc_dir + test_file
testloader = ArcWarcRecordLoader()
# NOTE(review): the load statement appears twice in this view; this looks
# like the before/after pair of a whitespace-only change in the diff rather
# than an intentional double load -- confirm against the real file.
archive = testloader.load(path, offset, length)
archive = testloader.load(path, offset, length)
pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
#==============================================================================
# Stub cdx loader for the tests: ignores its argument and always returns a
# one-element list containing the CDXObject parsed from URL_AGNOSTIC_ORIG_CDX.
# NOTE(review): the parameter is named `self`, but load_from_cdx_test passes
# this function as the cdx_loader callback, so the argument it receives is
# presumably the cdx query params dict -- confirm and consider renaming.
def load_orig_cdx(self):
return [CDXObject(URL_AGNOSTIC_ORIG_CDX)]
#==============================================================================
# Doctest helper: parse a raw cdx line into a CDXObject, resolve its headers
# and payload through a ResolvingLoader rooted at test_warc_dir, then print
# the headers and write the first two lines read from the payload stream.
# A caught exception is reported as 'Exception: <class name>' instead of
# propagating, so the doctests can match on the printed text.
def load_from_cdx_test(cdx):
resolve_loader = ResolvingLoader(test_warc_dir)
cdx = CDXObject(cdx)
try:
# NOTE(review): two forms of the call appear back to back; they look like the
# before/after lines of the diff. The second form additionally passes
# load_orig_cdx as the cdx_loader argument -- confirm which one is current.
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, load_orig_cdx)
print headers
sys.stdout.write(stream.readline())
sys.stdout.write(stream.readline())
# NOTE(review): likewise two except clauses appear here, apparently the
# before/after of narrowing the handler from Exception to ArchiveLoadFailed.
except Exception as e:
except ArchiveLoadFailed as e:
print 'Exception: ' + e.__class__.__name__
if __name__ == "__main__":

View File

@ -0,0 +1,3 @@
CDX N b a m s k r M S V g
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz

Binary file not shown.

View File

@ -84,6 +84,15 @@ class TestWb:
assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
# Integration test: request the replay of http://www.example.com/ at
# timestamp 20130729195151 through the test app and check the response.
# After the shared self._assert_basic_html check, the body must contain the
# capture date string, a reference to wb.js, and a link rewritten under the
# /pywb/20130729195151/ prefix.
# NOTE(review): the expected iana.org link presumably comes from the original
# payload resolved for the url-agnostic revisit record -- confirm.
def test_replay_url_agnostic_revisit(self):
resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')
self._assert_basic_html(resp)
assert 'Mon, Jul 29 2013 19:51:51' in resp.body
assert 'wb.js' in resp.body
assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body
def test_replay_identity_1(self):
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
#resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')