1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

fix typos in remotecdxserver, url-agnostic dedup

when raising new exception, pass traceback of original also!
This commit is contained in:
Ilya Kreymer 2014-02-17 14:52:13 -08:00
parent 158b490453
commit 28187b34d3
5 changed files with 35 additions and 25 deletions

View File

@ -86,7 +86,7 @@ class RemoteCDXServer(object):
raise Exception('Invalid remote cdx source: ' + str(source))
def load_cdx(self, **params):
remote_iter = remote.load_cdx(**params)
remote_iter = self.source.load_cdx(params)
# if need raw, convert to raw format here
if params.get('output') == 'raw':
return (CDXObject(cdx) for cdx in remote_iter)

View File

@ -11,6 +11,13 @@ from pywb.cdx.cdxobject import CDXObject
#=================================================================
class IndexReader(object):
"""
Main interface for reading index (currently only CDX) from a
source server (currently a cdx server)
Creates an appropriate query based on wbrequest type info
"""
def __init__(self, config):
self.cdx_server = create_cdx_server(config)

View File

@ -1,8 +1,8 @@
import handlers
import indexreader
import archivalrouter
import config_utils
import proxy
from indexreader import IndexReader
import os
import yaml
@ -52,10 +52,10 @@ def pywb_config_manual(passed_config = {}):
for name, value in collections.iteritems():
if isinstance(value, str):
route_config = config
cdx_server = indexreader.IndexReader(value)
cdx_server = IndexReader(value)
else:
route_config = DictChain(value, config)
cdx_server = indexreader.IndexReader(route_config)
cdx_server = IndexReader(route_config)
wb_handler = config_utils.create_wb_handler(

View File

@ -94,6 +94,7 @@ class ResolvingLoader:
any_found = False
last_exc = None
last_traceback = None
for resolver in self.path_resolvers:
possible_paths = resolver(filename)
@ -105,17 +106,20 @@ class ResolvingLoader:
except Exception as ue:
last_exc = ue
import sys
last_traceback = sys.exc_info()[2]
# Unsuccessful if reached here
if failed_files:
failed_files.append(filename)
if last_exc:
msg = str(last_exc.__class__.__name__)
#msg = str(last_exc.__class__.__name__)
msg = str(last_exc)
else:
msg = 'Archive File Not Found'
raise ArchiveLoadFailed(msg, filename)
raise ArchiveLoadFailed(msg, filename), None, last_traceback
def _load_different_url_payload(self, cdx, headers_record, failed_files):
"""
@ -147,12 +151,13 @@ class ResolvingLoader:
ref_target_date = iso_date_to_timestamp(ref_target_date)
orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
ref_target_date, digest)
ref_target_date,
cdx['digest'])
for cdx in orig_cdx_lines:
try:
payload_record = self._load_and_resolve(cdx, False,
failed_files)
payload_record = self._resolve_path_load(cdx, False,
failed_files)
return payload_record
except ArchiveLoadFailed as e:
@ -160,7 +165,7 @@ class ResolvingLoader:
raise ArchiveLoadFailed('Original for revisit could not be loaded')
def load_cdx_for_dupe(url, timestamp, digest):
def load_cdx_for_dupe(self, url, timestamp, digest):
"""
If a cdx_server is available, return response from server,
otherwise empty list
@ -169,8 +174,8 @@ class ResolvingLoader:
return []
params = {'url': url,
'closest': closest,
'closest': timestamp,
'filter': 'digest:' + digest,
'output': 'raw'}
return self.cdx_server.load_cdx(params)
return self.cdx_server.load_cdx(**params)

View File

@ -145,19 +145,17 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
# Invalid WARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
Exception: ArchiveLoadFailed
# Invalid ARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
Exception: ArchiveLoadFailed
# Error Expected with revisit -- invalid offset on original
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
Exception: ArchiveLoadFailed
"""
@ -189,11 +187,11 @@ def load_test_archive(test_file, offset, length):
def load_from_cdx_test(cdx):
resolve_loader = ResolvingLoader(test_warc_dir)
cdx = CDXObject(cdx)
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
print headers
sys.stdout.write(stream.readline())
sys.stdout.write(stream.readline())
try:
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
print headers
sys.stdout.write(stream.readline())
sys.stdout.write(stream.readline())
except Exception as e:
print 'Exception: ' + e.__class__.__name__