From 28187b34d3536d2c762ee9d1261d9d7e02179d7e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Feb 2014 14:52:13 -0800 Subject: [PATCH] fix typos in remotecdxserver, url-agnostic dedup when raising new exception, pass traceback of original also! --- pywb/cdx/cdxserver.py | 2 +- pywb/indexreader.py | 7 +++++++ pywb/pywb_init.py | 6 +++--- pywb/warc/resolvingloader.py | 21 +++++++++++++-------- pywb/warc/test/test_loading.py | 24 +++++++++++------------- 5 files changed, 35 insertions(+), 25 deletions(-) diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 4fad5ff0..2beef250 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -86,7 +86,7 @@ class RemoteCDXServer(object): raise Exception('Invalid remote cdx source: ' + str(source)) def load_cdx(self, **params): - remote_iter = remote.load_cdx(**params) + remote_iter = self.source.load_cdx(params) # if need raw, convert to raw format here if params.get('output') == 'raw': return (CDXObject(cdx) for cdx in remote_iter) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 7472e762..aaf60705 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -11,6 +11,13 @@ from pywb.cdx.cdxobject import CDXObject #================================================================= class IndexReader(object): + """ + Main interface for reading index (currently only CDX) from a + source server (currently a cdx server) + + Creates an appropriate query based on wbrequest type info + """ + def __init__(self, config): self.cdx_server = create_cdx_server(config) diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index b88c7d72..a6d0500b 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -1,8 +1,8 @@ import handlers -import indexreader import archivalrouter import config_utils import proxy +from indexreader import IndexReader import os import yaml @@ -52,10 +52,10 @@ def pywb_config_manual(passed_config = {}): for name, value in collections.iteritems(): if isinstance(value, str): 
route_config = config - cdx_server = indexreader.IndexReader(value) + cdx_server = IndexReader(value) else: route_config = DictChain(value, config) - cdx_server = indexreader.IndexReader(route_config) + cdx_server = IndexReader(route_config) wb_handler = config_utils.create_wb_handler( diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index c4ed557f..041024e7 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -94,6 +94,7 @@ class ResolvingLoader: any_found = False last_exc = None + last_traceback = None for resolver in self.path_resolvers: possible_paths = resolver(filename) @@ -105,17 +106,20 @@ class ResolvingLoader: except Exception as ue: last_exc = ue + import sys + last_traceback = sys.exc_info()[2] # Unsuccessful if reached here if failed_files: failed_files.append(filename) if last_exc: - msg = str(last_exc.__class__.__name__) + #msg = str(last_exc.__class__.__name__) + msg = str(last_exc) else: msg = 'Archive File Not Found' - raise ArchiveLoadFailed(msg, filename) + raise ArchiveLoadFailed(msg, filename), None, last_traceback def _load_different_url_payload(self, cdx, headers_record, failed_files): """ @@ -147,12 +151,13 @@ class ResolvingLoader: ref_target_date = iso_date_to_timestamp(ref_target_date) orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri, - ref_target_date, digest) + ref_target_date, + cdx['digest']) for cdx in orig_cdx_lines: try: - payload_record = self._load_and_resolve(cdx, False, - failed_files) + payload_record = self._resolve_path_load(cdx, False, + failed_files) return payload_record except ArchiveLoadFailed as e: @@ -160,7 +165,7 @@ class ResolvingLoader: raise ArchiveLoadFailed('Original for revisit could not be loaded') - def load_cdx_for_dupe(url, timestamp, digest): + def load_cdx_for_dupe(self, url, timestamp, digest): """ If a cdx_server is available, return response from server, otherwise empty list @@ -169,8 +174,8 @@ class ResolvingLoader: return [] params = {'url': 
url, - 'closest': closest, + 'closest': timestamp, 'filter': 'digest:' + digest, 'output': 'raw'} - return self.cdx_server.load_cdx(params) + return self.cdx_server.load_cdx(**params) diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index d95aaba5..e1a40950 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -145,19 +145,17 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc # Invalid WARC Offset >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz') -Traceback (most recent call last): -ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException +Exception: ArchiveLoadFailed + # Invalid ARC Offset >>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz') -Traceback (most recent call last): -ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException +Exception: ArchiveLoadFailed # Error Expected with revisit -- invalid offset on original >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz') -Traceback (most recent call last): -ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException +Exception: ArchiveLoadFailed """ @@ -189,11 +187,11 @@ def load_test_archive(test_file, offset, length): def load_from_cdx_test(cdx): resolve_loader = ResolvingLoader(test_warc_dir) cdx = CDXObject(cdx) - (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None) - print headers - sys.stdout.write(stream.readline()) - sys.stdout.write(stream.readline()) - - - + try: + (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None) + print headers + 
sys.stdout.write(stream.readline()) + sys.stdout.write(stream.readline()) + except Exception as e: + print 'Exception: ' + e.__class__.__name__