Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-15 00:03:28 +01:00)
fix typos in remotecdxserver, url-agnostic dedup
when raising new exception, pass traceback of original also!
This commit is contained in:
parent 158b490453
commit 28187b34d3
@@ -86,7 +86,7 @@ class RemoteCDXServer(object):
             raise Exception('Invalid remote cdx source: ' + str(source))

     def load_cdx(self, **params):
-        remote_iter = remote.load_cdx(**params)
+        remote_iter = self.source.load_cdx(params)
         # if need raw, convert to raw format here
         if params.get('output') == 'raw':
             return (CDXObject(cdx) for cdx in remote_iter)
@@ -11,6 +11,13 @@ from pywb.cdx.cdxobject import CDXObject

 #=================================================================
 class IndexReader(object):
+    """
+    Main interface for reading index (currently only CDX) from a
+    source server (currenlt a cdx server)
+
+    Creates an appropriate query based on wbrequest type info
+    """
+
     def __init__(self, config):
         self.cdx_server = create_cdx_server(config)

@@ -1,8 +1,8 @@
 import handlers
-import indexreader
 import archivalrouter
 import config_utils
 import proxy
+from indexreader import IndexReader

 import os
 import yaml
@@ -52,10 +52,10 @@ def pywb_config_manual(passed_config = {}):
     for name, value in collections.iteritems():
         if isinstance(value, str):
             route_config = config
-            cdx_server = indexreader.IndexReader(value)
+            cdx_server = IndexReader(value)
         else:
             route_config = DictChain(value, config)
-            cdx_server = indexreader.IndexReader(route_config)
+            cdx_server = IndexReader(route_config)


         wb_handler = config_utils.create_wb_handler(
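As context for the hunk above, a hedged sketch of the two collection-value shapes pywb_config_manual now passes to IndexReader; the paths and the index_paths key below are illustrative placeholders, not values from this commit:

from indexreader import IndexReader  # import form introduced above

# A collection value may be a plain string (for example a CDX directory or a
# remote cdx server URL), which is handed to IndexReader unchanged...
string_coll = IndexReader('./sample_archive/cdx/')

# ...or a per-collection dict, which pywb_config_manual layers over the global
# config via DictChain before constructing the IndexReader.
dict_coll = IndexReader({'index_paths': './sample_archive/cdx/'})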
@@ -94,6 +94,7 @@ class ResolvingLoader:

         any_found = False
         last_exc = None
+        last_traceback = None
         for resolver in self.path_resolvers:
             possible_paths = resolver(filename)

@@ -105,17 +106,20 @@ class ResolvingLoader:

             except Exception as ue:
                 last_exc = ue
+                import sys
+                last_traceback = sys.exc_info()[2]

         # Unsuccessful if reached here
         if failed_files:
             failed_files.append(filename)

         if last_exc:
-            msg = str(last_exc.__class__.__name__)
+            #msg = str(last_exc.__class__.__name__)
+            msg = str(last_exc)
         else:
             msg = 'Archive File Not Found'

-        raise ArchiveLoadFailed(msg, filename)
+        raise ArchiveLoadFailed(msg, filename), None, last_traceback

     def _load_different_url_payload(self, cdx, headers_record, failed_files):
         """
@@ -147,12 +151,13 @@ class ResolvingLoader:
         ref_target_date = iso_date_to_timestamp(ref_target_date)

         orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
-                                                ref_target_date, digest)
+                                                ref_target_date,
+                                                cdx['digest'])

         for cdx in orig_cdx_lines:
             try:
-                payload_record = self._load_and_resolve(cdx, False,
-                                                        failed_files)
+                payload_record = self._resolve_path_load(cdx, False,
+                                                         failed_files)
                 return payload_record

             except ArchiveLoadFailed as e:
@@ -160,7 +165,7 @@ class ResolvingLoader:

         raise ArchiveLoadFailed('Original for revisit could not be loaded')

-    def load_cdx_for_dupe(url, timestamp, digest):
+    def load_cdx_for_dupe(self, url, timestamp, digest):
         """
         If a cdx_server is available, return response from server,
         otherwise empty list
@@ -169,8 +174,8 @@ class ResolvingLoader:
             return []

         params = {'url': url,
-                  'closest': closest,
+                  'closest': timestamp,
                   'filter': 'digest:' + digest,
                   'output': 'raw'}

-        return self.cdx_server.load_cdx(params)
+        return self.cdx_server.load_cdx(**params)
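To make the url-agnostic dedup lookup above concrete: the original capture behind a revisit record is located by querying the CDX index for the referenced target URI and date (typically taken from the revisit's WARC-Refers-To-* metadata, as the ref_target_uri/ref_target_date handling above suggests), restricted to entries with the same payload digest. A sketch of such a query dict, reusing the URL, timestamp, and digest that appear in the doctests below:

params = {'url': 'http://example.com?example=1',               # URI the revisit refers to
          'closest': '20140103030321',                         # referenced capture date as a 14-digit timestamp
          'filter': 'digest:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',  # only entries with the matching payload digest
          'output': 'raw'}                                     # raw CDXObject output (see the RemoteCDXServer hunk)

# orig_cdx_lines = self.cdx_server.load_cdx(**params)          # keyword-argument call, per the fix above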
@@ -145,19 +145,17 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc

 # Invalid WARC Offset
 >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
-Traceback (most recent call last):
-ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+Exception: ArchiveLoadFailed


 # Invalid ARC Offset
 >>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
-Traceback (most recent call last):
-ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+Exception: ArchiveLoadFailed


 # Error Expected with revisit -- invalid offset on original
 >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
-Traceback (most recent call last):
-ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+Exception: ArchiveLoadFailed

 """
@@ -189,11 +187,11 @@ def load_test_archive(test_file, offset, length):
 def load_from_cdx_test(cdx):
     resolve_loader = ResolvingLoader(test_warc_dir)
     cdx = CDXObject(cdx)
-    (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
-    print headers
-    sys.stdout.write(stream.readline())
-    sys.stdout.write(stream.readline())
+    try:
+        (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
+        print headers
+        sys.stdout.write(stream.readline())
+        sys.stdout.write(stream.readline())
+    except Exception as e:
+        print 'Exception: ' + e.__class__.__name__