mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
fix typos in remotecdxserver, url-agnostic dedup
when raising new exception, pass traceback of original also!
This commit is contained in:
parent
158b490453
commit
28187b34d3
@ -86,7 +86,7 @@ class RemoteCDXServer(object):
|
|||||||
raise Exception('Invalid remote cdx source: ' + str(source))
|
raise Exception('Invalid remote cdx source: ' + str(source))
|
||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
remote_iter = remote.load_cdx(**params)
|
remote_iter = self.source.load_cdx(params)
|
||||||
# if need raw, convert to raw format here
|
# if need raw, convert to raw format here
|
||||||
if params.get('output') == 'raw':
|
if params.get('output') == 'raw':
|
||||||
return (CDXObject(cdx) for cdx in remote_iter)
|
return (CDXObject(cdx) for cdx in remote_iter)
|
||||||
|
@ -11,6 +11,13 @@ from pywb.cdx.cdxobject import CDXObject
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class IndexReader(object):
|
class IndexReader(object):
|
||||||
|
"""
|
||||||
|
Main interface for reading index (currently only CDX) from a
|
||||||
|
source server (currently a cdx server)
|
||||||
|
|
||||||
|
Creates an appropriate query based on wbrequest type info
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
self.cdx_server = create_cdx_server(config)
|
self.cdx_server = create_cdx_server(config)
|
||||||
|
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
import handlers
|
import handlers
|
||||||
import indexreader
|
|
||||||
import archivalrouter
|
import archivalrouter
|
||||||
import config_utils
|
import config_utils
|
||||||
import proxy
|
import proxy
|
||||||
|
from indexreader import IndexReader
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import yaml
|
import yaml
|
||||||
@ -52,10 +52,10 @@ def pywb_config_manual(passed_config = {}):
|
|||||||
for name, value in collections.iteritems():
|
for name, value in collections.iteritems():
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
route_config = config
|
route_config = config
|
||||||
cdx_server = indexreader.IndexReader(value)
|
cdx_server = IndexReader(value)
|
||||||
else:
|
else:
|
||||||
route_config = DictChain(value, config)
|
route_config = DictChain(value, config)
|
||||||
cdx_server = indexreader.IndexReader(route_config)
|
cdx_server = IndexReader(route_config)
|
||||||
|
|
||||||
|
|
||||||
wb_handler = config_utils.create_wb_handler(
|
wb_handler = config_utils.create_wb_handler(
|
||||||
|
@ -94,6 +94,7 @@ class ResolvingLoader:
|
|||||||
|
|
||||||
any_found = False
|
any_found = False
|
||||||
last_exc = None
|
last_exc = None
|
||||||
|
last_traceback = None
|
||||||
for resolver in self.path_resolvers:
|
for resolver in self.path_resolvers:
|
||||||
possible_paths = resolver(filename)
|
possible_paths = resolver(filename)
|
||||||
|
|
||||||
@ -105,17 +106,20 @@ class ResolvingLoader:
|
|||||||
|
|
||||||
except Exception as ue:
|
except Exception as ue:
|
||||||
last_exc = ue
|
last_exc = ue
|
||||||
|
import sys
|
||||||
|
last_traceback = sys.exc_info()[2]
|
||||||
|
|
||||||
# Unsuccessful if reached here
|
# Unsuccessful if reached here
|
||||||
if failed_files:
|
if failed_files:
|
||||||
failed_files.append(filename)
|
failed_files.append(filename)
|
||||||
|
|
||||||
if last_exc:
|
if last_exc:
|
||||||
msg = str(last_exc.__class__.__name__)
|
#msg = str(last_exc.__class__.__name__)
|
||||||
|
msg = str(last_exc)
|
||||||
else:
|
else:
|
||||||
msg = 'Archive File Not Found'
|
msg = 'Archive File Not Found'
|
||||||
|
|
||||||
raise ArchiveLoadFailed(msg, filename)
|
raise ArchiveLoadFailed(msg, filename), None, last_traceback
|
||||||
|
|
||||||
def _load_different_url_payload(self, cdx, headers_record, failed_files):
|
def _load_different_url_payload(self, cdx, headers_record, failed_files):
|
||||||
"""
|
"""
|
||||||
@ -147,12 +151,13 @@ class ResolvingLoader:
|
|||||||
ref_target_date = iso_date_to_timestamp(ref_target_date)
|
ref_target_date = iso_date_to_timestamp(ref_target_date)
|
||||||
|
|
||||||
orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
|
orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
|
||||||
ref_target_date, digest)
|
ref_target_date,
|
||||||
|
cdx['digest'])
|
||||||
|
|
||||||
for cdx in orig_cdx_lines:
|
for cdx in orig_cdx_lines:
|
||||||
try:
|
try:
|
||||||
payload_record = self._load_and_resolve(cdx, False,
|
payload_record = self._resolve_path_load(cdx, False,
|
||||||
failed_files)
|
failed_files)
|
||||||
return payload_record
|
return payload_record
|
||||||
|
|
||||||
except ArchiveLoadFailed as e:
|
except ArchiveLoadFailed as e:
|
||||||
@ -160,7 +165,7 @@ class ResolvingLoader:
|
|||||||
|
|
||||||
raise ArchiveLoadFailed('Original for revisit could not be loaded')
|
raise ArchiveLoadFailed('Original for revisit could not be loaded')
|
||||||
|
|
||||||
def load_cdx_for_dupe(url, timestamp, digest):
|
def load_cdx_for_dupe(self, url, timestamp, digest):
|
||||||
"""
|
"""
|
||||||
If a cdx_server is available, return response from server,
|
If a cdx_server is available, return response from server,
|
||||||
otherwise empty list
|
otherwise empty list
|
||||||
@ -169,8 +174,8 @@ class ResolvingLoader:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
params = {'url': url,
|
params = {'url': url,
|
||||||
'closest': closest,
|
'closest': timestamp,
|
||||||
'filter': 'digest:' + digest,
|
'filter': 'digest:' + digest,
|
||||||
'output': 'raw'}
|
'output': 'raw'}
|
||||||
|
|
||||||
return self.cdx_server.load_cdx(params)
|
return self.cdx_server.load_cdx(**params)
|
||||||
|
@ -145,19 +145,17 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
|
|||||||
|
|
||||||
# Invalid WARC Offset
|
# Invalid WARC Offset
|
||||||
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
|
||||||
Traceback (most recent call last):
|
Exception: ArchiveLoadFailed
|
||||||
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
|
|
||||||
|
|
||||||
# Invalid ARC Offset
|
# Invalid ARC Offset
|
||||||
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
|
||||||
Traceback (most recent call last):
|
Exception: ArchiveLoadFailed
|
||||||
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
|
|
||||||
|
|
||||||
|
|
||||||
# Error Expected with revisit -- invalid offset on original
|
# Error Expected with revisit -- invalid offset on original
|
||||||
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
|
||||||
Traceback (most recent call last):
|
Exception: ArchiveLoadFailed
|
||||||
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -189,11 +187,11 @@ def load_test_archive(test_file, offset, length):
|
|||||||
def load_from_cdx_test(cdx):
|
def load_from_cdx_test(cdx):
|
||||||
resolve_loader = ResolvingLoader(test_warc_dir)
|
resolve_loader = ResolvingLoader(test_warc_dir)
|
||||||
cdx = CDXObject(cdx)
|
cdx = CDXObject(cdx)
|
||||||
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
|
try:
|
||||||
print headers
|
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
|
||||||
sys.stdout.write(stream.readline())
|
print headers
|
||||||
sys.stdout.write(stream.readline())
|
sys.stdout.write(stream.readline())
|
||||||
|
sys.stdout.write(stream.readline())
|
||||||
|
except Exception as e:
|
||||||
|
print 'Exception: ' + e.__class__.__name__
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user