mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

fix typos in remotecdxserver, url-agnostic dedup

when raising new exception, pass traceback of original also!
Ilya Kreymer 2014-02-17 14:52:13 -08:00
parent 158b490453
commit 28187b34d3
5 changed files with 35 additions and 25 deletions

View File

@@ -86,7 +86,7 @@ class RemoteCDXServer(object):
             raise Exception('Invalid remote cdx source: ' + str(source))

     def load_cdx(self, **params):
-        remote_iter = remote.load_cdx(**params)
+        remote_iter = self.source.load_cdx(params)

         # if need raw, convert to raw format here
         if params.get('output') == 'raw':
             return (CDXObject(cdx) for cdx in remote_iter)
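For context, a minimal standalone sketch of the pattern the corrected line restores (the class and attribute names below are illustrative stand-ins, not pywb's actual module layout): the server delegates the query to its configured source and lazily wraps each result line only when raw output is requested.

class FakeCDXObject(dict):
    # stand-in for pywb.cdx.cdxobject.CDXObject, used only in this sketch
    pass

class SimpleRemoteCDXServer(object):
    def __init__(self, source):
        # the fix above: load_cdx must go through self.source,
        # not an undefined local name
        self.source = source

    def load_cdx(self, **params):
        remote_iter = self.source.load_cdx(params)
        # if raw output is requested, convert each line lazily
        if params.get('output') == 'raw':
            return (FakeCDXObject(cdx) for cdx in remote_iter)
        return remote_iter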

View File

@@ -11,6 +11,13 @@ from pywb.cdx.cdxobject import CDXObject

 #=================================================================
 class IndexReader(object):
+    """
+    Main interface for reading index (currently only CDX) from a
+    source server (currenlt a cdx server)
+
+    Creates an appropriate query based on wbrequest type info
+    """
     def __init__(self, config):
         self.cdx_server = create_cdx_server(config)

View File

@@ -1,8 +1,8 @@
 import handlers
-import indexreader
 import archivalrouter
 import config_utils
 import proxy
+from indexreader import IndexReader

 import os
 import yaml
@@ -52,10 +52,10 @@ def pywb_config_manual(passed_config = {}):
     for name, value in collections.iteritems():
         if isinstance(value, str):
             route_config = config
-            cdx_server = indexreader.IndexReader(value)
+            cdx_server = IndexReader(value)
         else:
             route_config = DictChain(value, config)
-            cdx_server = indexreader.IndexReader(route_config)
+            cdx_server = IndexReader(route_config)

         wb_handler = config_utils.create_wb_handler(

View File

@@ -94,6 +94,7 @@ class ResolvingLoader:
         any_found = False
         last_exc = None
+        last_traceback = None

         for resolver in self.path_resolvers:
             possible_paths = resolver(filename)
@@ -105,17 +106,20 @@ class ResolvingLoader:
                 except Exception as ue:
                     last_exc = ue
+                    import sys
+                    last_traceback = sys.exc_info()[2]

         # Unsuccessful if reached here
         if failed_files:
             failed_files.append(filename)

         if last_exc:
-            msg = str(last_exc.__class__.__name__)
+            #msg = str(last_exc.__class__.__name__)
+            msg = str(last_exc)
         else:
             msg = 'Archive File Not Found'

-        raise ArchiveLoadFailed(msg, filename)
+        raise ArchiveLoadFailed(msg, filename), None, last_traceback

     def _load_different_url_payload(self, cdx, headers_record, failed_files):
         """
@@ -147,12 +151,13 @@ class ResolvingLoader:
             ref_target_date = iso_date_to_timestamp(ref_target_date)

         orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
-                                                ref_target_date, digest)
+                                                ref_target_date,
+                                                cdx['digest'])

         for cdx in orig_cdx_lines:
             try:
-                payload_record = self._load_and_resolve(cdx, False,
+                payload_record = self._resolve_path_load(cdx, False,
                                                          failed_files)
                 return payload_record
             except ArchiveLoadFailed as e:
@@ -160,7 +165,7 @@ class ResolvingLoader:
         raise ArchiveLoadFailed('Original for revisit could not be loaded')

-    def load_cdx_for_dupe(url, timestamp, digest):
+    def load_cdx_for_dupe(self, url, timestamp, digest):
         """
         If a cdx_server is available, return response from server,
         otherwise empty list
@@ -169,8 +174,8 @@ class ResolvingLoader:
             return []

         params = {'url': url,
-                  'closest': closest,
+                  'closest': timestamp,
                   'filter': 'digest:' + digest,
                   'output': 'raw'}

-        return self.cdx_server.load_cdx(params)
+        return self.cdx_server.load_cdx(**params)
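A compact sketch of the url-agnostic dedup lookup the corrected method performs, assuming only that the cdx server exposes load_cdx(**params) as the diff shows: the revisit record's digest and timestamp are used to ask the index for the original capture.

def find_original_capture(cdx_server, url, timestamp, digest):
    # no index server configured: nothing to resolve against
    if not cdx_server:
        return []

    params = {'url': url,
              'closest': timestamp,          # prefer captures near the revisit
              'filter': 'digest:' + digest,  # same payload digest as revisit
              'output': 'raw'}

    # note the keyword expansion: load_cdx takes **params, not a dict
    return cdx_server.load_cdx(**params)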

View File

@@ -145,19 +145,17 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
 # Invalid WARC Offset
 >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
-Traceback (most recent call last):
-ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+Exception: ArchiveLoadFailed

 # Invalid ARC Offset
 >>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
-Traceback (most recent call last):
-ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+Exception: ArchiveLoadFailed

 # Error Expected with revisit -- invalid offset on original
 >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
-Traceback (most recent call last):
-ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+Exception: ArchiveLoadFailed
 """
@@ -189,11 +187,11 @@ def load_test_archive(test_file, offset, length):
 def load_from_cdx_test(cdx):
     resolve_loader = ResolvingLoader(test_warc_dir)
     cdx = CDXObject(cdx)
-    (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
-    print headers
-    sys.stdout.write(stream.readline())
-    sys.stdout.write(stream.readline())
+    try:
+        (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
+        print headers
+        sys.stdout.write(stream.readline())
+        sys.stdout.write(stream.readline())
+    except Exception as e:
+        print 'Exception: ' + e.__class__.__name__
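The try/except wrapper above keeps the doctest output stable: instead of a full traceback, whose text varies with file paths and parser details, the test prints only the exception class name. A small illustration of the same idea, independent of pywb's test fixtures (the helper name is hypothetical):

import sys

def run_and_report(load_func, cdx):
    # on failure, print one deterministic line that a doctest can match,
    # instead of a traceback whose wording differs between environments
    try:
        headers, stream = load_func(cdx)
        print headers
        sys.stdout.write(stream.readline())
    except Exception as e:
        print 'Exception: ' + e.__class__.__name__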