diff --git a/config.yaml b/config.yaml index 319fe1fe..b81e555e 100644 --- a/config.yaml +++ b/config.yaml @@ -83,7 +83,7 @@ archive_paths: ./sample_archive/warcs/ # : # default route static/default for pywb defaults static_routes: - static/default: static/ + static/default: pywb/static/ # ==== New / Experimental Settings ==== # Not yet production ready -- used primarily for testing diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 5340e5ba..00000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -addopts=--cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/ diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 182de73a..a3e2f077 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -3,8 +3,11 @@ import mimetypes import time from pywb.utils.wbexception import NotFoundException +from pywb.utils.loaders import BlockLoader + from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse + from views import TextCapturesView @@ -38,7 +41,8 @@ class WBHandler(WbUrlHandler): if wbrequest.wb_url.mod == 'cdx_': return self.text_query_view.render_response(wbrequest, cdx_lines) - if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY): + if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or + (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)): return self.query_view.render_response(wbrequest, cdx_lines) with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: @@ -46,14 +50,12 @@ class WBHandler(WbUrlHandler): cdx_lines, self.index_reader.cdx_load_callback(wbrequest)) - def render_search_page(self, wbrequest): if self.search_view: - return self.search_view.render_response(wbrequest = wbrequest) + return self.search_view.render_response(wbrequest=wbrequest) else: return WbResponse.text_response('No Lookup Url Specified') - def __str__(self): return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) @@ -62,7 +64,7 @@ class WBHandler(WbUrlHandler): # Static Content Handler #================================================================= class StaticHandler(BaseHandler): - def __init__(self, static_path, pkg = 'pywb'): + def __init__(self, static_path, pkg='pywb'): mimetypes.init() self.static_path = static_path @@ -72,10 +74,11 @@ class StaticHandler(BaseHandler): full_path = self.static_path + wbrequest.wb_url_str try: - if full_path.startswith('.') or full_path.startswith('file://'): - data = open(full_path, 'rb') - else: - data = pkgutil.get_data(self.pkg, full_path) + #if full_path.startswith('.') or full_path.startswith('file://'): + # data = open(full_path, 'rb') + #else: + # data = pkgutil.get_data(self.pkg, full_path) + data = BlockLoader().load(full_path) if 'wsgi.file_wrapper' in wbrequest.env: reader = wbrequest.env['wsgi.file_wrapper'](data) @@ -84,10 +87,11 @@ class StaticHandler(BaseHandler): content_type, _ = mimetypes.guess_type(full_path) - return WbResponse.text_stream(data, content_type = content_type) + return WbResponse.text_stream(data, content_type=content_type) except IOError: - raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str) + raise NotFoundException('Static File Not Found: ' + + wbrequest.wb_url_str) def __str__(self): return 'Static files from ' + self.static_path diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index 18feb45c..2718d428 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -48,7 +48,6 @@ class ReplayView: else: self.response_class = WbResponse - def __call__(self, wbrequest, cdx_lines, cdx_loader): last_e = None first = True @@ -56,11 +55,15 @@ class ReplayView: # List of already failed w/arcs failed_files = [] + response = None + # Iterate over the cdx until find one that works - # The cdx should already be sorted in closest-to-timestamp order (from the cdx server) + # The cdx should already be sorted in + # closest-to-timestamp order (from the cdx server) for cdx in cdx_lines: try: - # optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data + # optimize: can detect if redirect is needed just from the cdx, + # no need to load w/arc data if requiring exact match if first: redir_response = self._redirect_if_needed(wbrequest, cdx) if redir_response: @@ -68,42 +71,10 @@ class ReplayView: first = False - (status_headers, stream) = (self.content_loader. - resolve_headers_and_payload(cdx, failed_files, cdx_loader)) - - # check and reject self-redirect - self._reject_self_redirect(wbrequest, cdx, status_headers) - - # check if redir is needed - redir_response = self._redirect_if_needed(wbrequest, cdx) - if redir_response: - return redir_response - - # one more check for referrer-based self-redirect - self._reject_referrer_self_redirect(wbrequest) - - response = None - - # if Content-Length for payload is present, ensure we don't read past it - content_length = status_headers.get_header('content-length') - if content_length: - stream = LimitReader.wrap_stream(stream, content_length) - - if self.content_rewriter and wbrequest.wb_url.mod != 'id_': - response = self.rewrite_content(wbrequest, cdx, status_headers, stream) - else: - (status_headers, stream) = self.sanitize_content(status_headers, stream) - #status_headers.remove_header('content-length') - - response_iter = self.stream_to_iter(stream) - response = WbResponse(status_headers, response_iter) - - # notify reporter callback, if any - if self._reporter: - self._reporter(wbrequest, cdx, response) - - return response - + response = self.replay_capture(wbrequest, + cdx, + cdx_loader, + failed_files) except (CaptureException, ArchiveLoadFailed) as ce: import traceback @@ -111,10 +82,57 @@ class ReplayView: last_e = ce pass + if response: + return response + if last_e: raise last_e else: - raise WbException('No Content Loaded for: ' + wbrequest.wb_url) + raise WbException('No Content Loaded for: ' + + str(wbrequest.wb_url)) + + def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): + (status_headers, stream) = (self.content_loader. + resolve_headers_and_payload(cdx, + failed_files, + cdx_loader)) + + # check and reject self-redirect + self._reject_self_redirect(wbrequest, cdx, status_headers) + + # check if redir is needed + redir_response = self._redirect_if_needed(wbrequest, cdx) + if redir_response: + return redir_response + + # one more check for referrer-based self-redirect + self._reject_referrer_self_redirect(wbrequest) + + response = None + + # if Content-Length for payload is present, + # ensure we don't read past it + content_length = status_headers.get_header('content-length') + if content_length: + stream = LimitReader.wrap_stream(stream, content_length) + + if self.content_rewriter and wbrequest.wb_url.mod != 'id_': + + response = self.rewrite_content(wbrequest, + cdx, + status_headers, + stream) + else: + (status_headers, stream) = self.sanitize_content(status_headers, + stream) + response_iter = self.stream_to_iter(stream) + response = WbResponse(status_headers, response_iter) + + # notify reporter callback, if any + if self._reporter: + self._reporter(wbrequest, cdx, response) + + return response @staticmethod def stream_to_iter(stream): @@ -181,7 +199,6 @@ class ReplayView: wbrequest=wbrequest, cdx=cdx) - # Buffer rewrite iterator and return a response from a string def buffered_response(self, status_headers, iterator): out = BytesIO() @@ -194,7 +211,8 @@ class ReplayView: content = out.getvalue() content_length_str = str(len(content)) - status_headers.headers.append(('Content-Length', content_length_str)) + status_headers.headers.append(('Content-Length', + content_length_str)) out.close() return content @@ -203,7 +221,9 @@ class ReplayView: if wbrequest.is_proxy: return None - redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate + # todo: generalize this? + redir_needed = (hasattr(wbrequest, 'is_timegate') and + wbrequest.is_timegate) if not redir_needed and self.redir_to_exact: redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp) @@ -211,7 +231,9 @@ class ReplayView: if not redir_needed: return None - new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original']) + new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], + cdx['original']) + status_headers = StatusAndHeaders('302 Internal Redirect', [('Location', new_url)]) @@ -219,7 +241,6 @@ class ReplayView: return self.response_class(status_headers, wbrequest=wbrequest) - def _reject_self_redirect(self, wbrequest, cdx, status_headers): """ Check if response is a 3xx redirect to the same url @@ -237,18 +258,22 @@ class ReplayView: request_url = wbrequest.wb_url.url.lower() location_url = status_headers.get_header('Location') if not location_url: - return + return location_url = location_url.lower() - if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)): + if (ReplayView.strip_scheme(request_url) == + ReplayView.strip_scheme(location_url)): raise CaptureException('Self Redirect: ' + str(cdx)) def _reject_referrer_self_redirect(self, wbrequest): """ Perform final check for referrer based self-redirect. - This method should be called after verifying request timestamp matches capture. - if referrer is same as current url, reject this response and try another capture + This method should be called after verifying that + the request timestamp == capture timestamp + + If referrer is same as current url, + reject this response and try another capture. """ if not wbrequest.referrer: return @@ -258,23 +283,27 @@ class ReplayView: wbrequest.rel_prefix + str(wbrequest.wb_url)) if (ReplayView.strip_scheme(request_url) == - ReplayView.strip_scheme(wbrequest.referrer)): - raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) - + ReplayView.strip_scheme(wbrequest.referrer)): + raise CaptureException('Self Redirect via Referrer: ' + + str(wbrequest.wb_url)) @staticmethod def strip_scheme(url): """ - >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com') + >>> ReplayView.strip_scheme('https://example.com') ==\ + ReplayView.strip_scheme('http://example.com') True - >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com') + >>> ReplayView.strip_scheme('https://example.com') ==\ + ReplayView.strip_scheme('http:/example.com') True - >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com') + >>> ReplayView.strip_scheme('https://example.com') ==\ + ReplayView.strip_scheme('example.com') True - >>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com') + >>> ReplayView.strip_scheme('about://example.com') ==\ + ReplayView.strip_scheme('example.com') True """ m = ReplayView.STRIP_SCHEME.match(url) @@ -287,6 +316,7 @@ class ReplayView: else: return url + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/setup.py b/setup.py index 198d50bc..512f6cc0 100755 --- a/setup.py +++ b/setup.py @@ -25,10 +25,9 @@ class PyTest(TestCommand): def run_tests(self): import pytest import sys - # cmdline opts moved to pytest.ini - #cmdline = ' --cov-config .coveragerc --cov pywb' - #cmdline += ' -v --doctest-module ./pywb/ tests/' - errcode = pytest.main('') + cmdline = ' --cov-config .coveragerc --cov pywb' + cmdline += ' -v --doctest-module ./pywb/ tests/' + errcode = pytest.main(cmdline) sys.exit(errcode) setup( diff --git a/tests/test_config.yaml b/tests/test_config.yaml index 9d854a67..ac6307f0 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -78,7 +78,7 @@ absoulte_paths: true # List of route names: # : static_routes: - static/test/route: static/ + static/test/route: pywb/static/ # Enable simple http proxy mode enable_http_proxy: true diff --git a/tests/test_config_memento.yaml b/tests/test_config_memento.yaml index 3dfd6535..c17dabd9 100644 --- a/tests/test_config_memento.yaml +++ b/tests/test_config_memento.yaml @@ -1,94 +1,20 @@ -# pywb config file +# pywb memento test config file # ======================================== -# -# Settings for each collection + +# minimal settings for memento http api testing collections: - # : - # collection will be accessed via / - # is a string or list of: - # - string or list of one or more local .cdx file - # - string or list of one or more local dirs with .cdx files - # - a string value indicating remote http cdx server pywb: ./sample_archive/cdx/ - # ex with filtering: filter CDX lines by filename starting with 'dupe' - pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']} - - -# indicate if cdx files are sorted by SURT keys -- eg: com,example)/ -# SURT keys are recommended for future indices, but non-SURT cdxs -# are also supported -# -# * Set to true if cdxs start with surts: com,example)/ -# * Set to false if cdx start with urls: example.com)/ -surt_ordered: true - -# list of paths prefixes for pywb look to 'resolve' WARC and ARC filenames -# in the cdx to their absolute path -# -# if path is: -# * local dir, use path as prefix -# * local file, lookup prefix in tab-delimited sorted index -# * http:// path, use path as remote prefix -# * redis:// path, use redis to lookup full path for w: as key - -archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/'] - -# ==== Optional UI: HTML/Jinja2 Templates ==== - -# template for insert into replayed html content -head_insert_html: ui/head_insert.html - -# template to for 'calendar' query, -# eg, a listing of captures in response to a ../*/ -# -# may be a simple listing or a more complex 'calendar' UI -# if omitted, will list raw cdx in plain text -query_html: ui/query.html - -# template for search page, which is displayed when no search url is entered -# in a collection -search_html: ui/search.html - -# template for home page. -# if no other route is set, this will be rendered at /, /index.htm and /index.html -home_html: ui/index.html - - -# error page temlpate for may formatting error message and details -# if omitted, a text response is returned -error_html: ui/error.html - -# ==== Other Paths ==== - -# list of host names that pywb will be running from to detect -# 'fallthrough' requests based on referrer -# -# eg: an incorrect request for http://localhost:8080/image.gif with a referrer -# of http://localhost:8080/pywb/index.html, pywb can correctly redirect -# to http://localhost:8080/pywb/image.gif -# - -#hostpaths: ['http://localhost:8080'] - -# Rewrite urls with absolute paths instead of relative -absoulte_paths: true - -# List of route names: -# : -static_routes: - static/test/route: static/ +archive_paths: ['./sample_archive/warcs/'] +# Test memento +enable_memento: true # Enable simple http proxy mode enable_http_proxy: true -# enable cdx server api for querying cdx directly (experimental) +# enable cdx server api for timemap enable_cdx_api: true -# test different port -port: 9000 -#with memento -enable_memento: True