refactoring: clean up handlers and replay_views for pep8

use BlockLoader().load for StaticHandler static file resolving; update static paths to point to pywb/static instead of static
2025-03-15 00:03:28 +01:00 · 2014-03-14 18:17:22 -07:00 · 2014-03-14 18:17:22 -07:00 · 6461af030b
commit 6461af030b
parent a69d565af5
7 changed files with 113 additions and 156 deletions
--- a/config.yaml
+++ b/config.yaml
@ -83,7 +83,7 @@ archive_paths: ./sample_archive/warcs/
 # <route>: <package or file path>
 # default route static/default for pywb defaults
 static_routes:
-          static/default: static/
+          static/default: pywb/static/
 # ==== New / Experimental Settings ====
 # Not yet production ready -- used primarily for testing
--- a/pytest.ini
+++ b/pytest.ini
@ -1,2 +0,0 @@
 [pytest]
 addopts=--cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/
--- a/pywb/core/handlers.py
+++ b/pywb/core/handlers.py
@ -3,8 +3,11 @@ import mimetypes
 import time
 from pywb.utils.wbexception import NotFoundException
 from pywb.utils.loaders import BlockLoader
 from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
 from pywb.framework.wbrequestresponse import WbResponse
 from views import TextCapturesView
@ -38,7 +41,8 @@ class WBHandler(WbUrlHandler):
        if wbrequest.wb_url.mod == 'cdx_':
            return self.text_query_view.render_response(wbrequest, cdx_lines)
-        if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
+        if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or
            (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)):
            return self.query_view.render_response(wbrequest, cdx_lines)
        with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
@ -46,14 +50,12 @@ class WBHandler(WbUrlHandler):
                               cdx_lines,
                               self.index_reader.cdx_load_callback(wbrequest))
    def render_search_page(self, wbrequest):
        if self.search_view:
-            return self.search_view.render_response(wbrequest = wbrequest)
+            return self.search_view.render_response(wbrequest=wbrequest)
        else:
            return WbResponse.text_response('No Lookup Url Specified')
    def __str__(self):
        return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
@ -62,7 +64,7 @@ class WBHandler(WbUrlHandler):
 # Static Content Handler
 #=================================================================
 class StaticHandler(BaseHandler):
-    def __init__(self, static_path, pkg = 'pywb'):
+    def __init__(self, static_path, pkg='pywb'):
        mimetypes.init()
        self.static_path = static_path
@ -72,10 +74,11 @@ class StaticHandler(BaseHandler):
        full_path = self.static_path + wbrequest.wb_url_str
        try:
-            if full_path.startswith('.') or full_path.startswith('file://'):
+            #if full_path.startswith('.') or full_path.startswith('file://'):
-                data = open(full_path, 'rb')
+            #    data = open(full_path, 'rb')
-            else:
+            #else:
-                data = pkgutil.get_data(self.pkg, full_path)
+            #    data = pkgutil.get_data(self.pkg, full_path)
            data = BlockLoader().load(full_path)
            if 'wsgi.file_wrapper' in wbrequest.env:
                reader = wbrequest.env['wsgi.file_wrapper'](data)
@ -84,10 +87,11 @@ class StaticHandler(BaseHandler):
            content_type, _ = mimetypes.guess_type(full_path)
-            return WbResponse.text_stream(data, content_type = content_type)
+            return WbResponse.text_stream(data, content_type=content_type)
        except IOError:
-            raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
+            raise NotFoundException('Static File Not Found: ' +
                                    wbrequest.wb_url_str)
    def __str__(self):
        return 'Static files from ' + self.static_path
--- a/pywb/core/replay_views.py
+++ b/pywb/core/replay_views.py
@ -48,7 +48,6 @@ class ReplayView:
        else:
            self.response_class = WbResponse
    def __call__(self, wbrequest, cdx_lines, cdx_loader):
        last_e = None
        first = True
@ -56,11 +55,15 @@ class ReplayView:
        # List of already failed w/arcs
        failed_files = []
        response = None
        # Iterate over the cdx until find one that works
-        # The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
+        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
-                # optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data
+                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
@ -68,42 +71,10 @@ class ReplayView:
                    first = False
-                (status_headers, stream) = (self.content_loader.
+                response = self.replay_capture(wbrequest,
-                                            resolve_headers_and_payload(cdx, failed_files, cdx_loader))
+                                               cdx,
-
+                                               cdx_loader,
-                # check and reject self-redirect
+                                               failed_files)
                self._reject_self_redirect(wbrequest, cdx, status_headers)
                # check if redir is needed
                redir_response = self._redirect_if_needed(wbrequest, cdx)
                if redir_response:
                    return redir_response
                # one more check for referrer-based self-redirect
                self._reject_referrer_self_redirect(wbrequest)
                response = None
                # if Content-Length for payload is present, ensure we don't read past it
                content_length = status_headers.get_header('content-length')
                if content_length:
                    stream = LimitReader.wrap_stream(stream, content_length)
                if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
                    response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
                else:
                    (status_headers, stream) = self.sanitize_content(status_headers, stream)
                    #status_headers.remove_header('content-length')
                    response_iter = self.stream_to_iter(stream)
                    response = WbResponse(status_headers, response_iter)
                # notify reporter callback, if any
                if self._reporter:
                    self._reporter(wbrequest, cdx, response)
                return response
            except (CaptureException, ArchiveLoadFailed) as ce:
                import traceback
@ -111,10 +82,57 @@ class ReplayView:
                last_e = ce
                pass
            if response:
                return response
        if last_e:
            raise last_e
        else:
-            raise WbException('No Content Loaded for: ' + wbrequest.wb_url)
+            raise WbException('No Content Loaded for: ' +
                              str(wbrequest.wb_url))
    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        (status_headers, stream) = (self.content_loader.
                                    resolve_headers_and_payload(cdx,
                                                                failed_files,
                                                                cdx_loader))
        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)
        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response
        # one more check for referrer-based self-redirect
        self._reject_referrer_self_redirect(wbrequest)
        response = None
        # if Content-Length for payload is present,
        # ensure we don't read past it
        content_length = status_headers.get_header('content-length')
        if content_length:
            stream = LimitReader.wrap_stream(stream, content_length)
        if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
            response = self.rewrite_content(wbrequest,
                                            cdx,
                                            status_headers,
                                            stream)
        else:
            (status_headers, stream) = self.sanitize_content(status_headers,
                                                             stream)
            response_iter = self.stream_to_iter(stream)
            response = WbResponse(status_headers, response_iter)
        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)
        return response
    @staticmethod
    def stream_to_iter(stream):
@ -181,7 +199,6 @@ class ReplayView:
                                   wbrequest=wbrequest,
                                   cdx=cdx)
    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator):
        out = BytesIO()
@ -194,7 +211,8 @@ class ReplayView:
            content = out.getvalue()
            content_length_str = str(len(content))
-            status_headers.headers.append(('Content-Length', content_length_str))
+            status_headers.headers.append(('Content-Length',
                                           content_length_str))
            out.close()
        return content
@ -203,7 +221,9 @@ class ReplayView:
        if wbrequest.is_proxy:
            return None
-        redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate
+        # todo: generalize this?
        redir_needed = (hasattr(wbrequest, 'is_timegate') and
                        wbrequest.is_timegate)
        if not redir_needed and self.redir_to_exact:
            redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
@ -211,7 +231,9 @@ class ReplayView:
        if not redir_needed:
            return None
-        new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
+        new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
                                                          cdx['original'])
        status_headers = StatusAndHeaders('302 Internal Redirect',
                                          [('Location', new_url)])
@ -219,7 +241,6 @@ class ReplayView:
        return self.response_class(status_headers,
                                   wbrequest=wbrequest)
    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
@ -237,18 +258,22 @@ class ReplayView:
        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header('Location')
        if not location_url:
-           return
+            return
        location_url = location_url.lower()
-        if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)):
+        if (ReplayView.strip_scheme(request_url) ==
             ReplayView.strip_scheme(location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))
    def _reject_referrer_self_redirect(self, wbrequest):
        """
        Perform final check for referrer based self-redirect.
-        This method should be called after verifying request timestamp matches capture.
+        This method should be called after verifying that
-        if referrer is same as current url, reject this response and try another capture
+        the request timestamp == capture timestamp
        If referrer is same as current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return
@ -258,23 +283,27 @@ class ReplayView:
                       wbrequest.rel_prefix + str(wbrequest.wb_url))
        if (ReplayView.strip_scheme(request_url) ==
-            ReplayView.strip_scheme(wbrequest.referrer)):
+             ReplayView.strip_scheme(wbrequest.referrer)):
-            raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
+            raise CaptureException('Self Redirect via Referrer: ' +
-
+                                   str(wbrequest.wb_url))
    @staticmethod
    def strip_scheme(url):
        """
-        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com')
+        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('http://example.com')
        True
-        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com')
+        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('http:/example.com')
        True
-        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com')
+        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('example.com')
        True
-        >>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com')
+        >>> ReplayView.strip_scheme('about://example.com') ==\
            ReplayView.strip_scheme('example.com')
        True
        """
        m = ReplayView.STRIP_SCHEME.match(url)
@ -287,6 +316,7 @@ class ReplayView:
        else:
            return url
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/setup.py
+++ b/setup.py
@ -25,10 +25,9 @@ class PyTest(TestCommand):
    def run_tests(self):
        import pytest
        import sys
-        # cmdline opts moved to pytest.ini
+        cmdline = ' --cov-config .coveragerc --cov pywb'
-        #cmdline = ' --cov-config .coveragerc --cov pywb'
+        cmdline += ' -v --doctest-module ./pywb/ tests/'
-        #cmdline += ' -v --doctest-module ./pywb/ tests/'
+        errcode = pytest.main(cmdline)
        errcode = pytest.main('')
        sys.exit(errcode)
 setup(
--- a/tests/test_config.yaml
+++ b/tests/test_config.yaml
@ -78,7 +78,7 @@ absoulte_paths: true
 # List of route names:
 # <route>: <package or file path>
 static_routes:
-          static/test/route: static/
+          static/test/route: pywb/static/
 # Enable simple http proxy mode
 enable_http_proxy: true
--- a/tests/test_config_memento.yaml
+++ b/tests/test_config_memento.yaml
@ -1,94 +1,20 @@
-# pywb config file
+# pywb memento test config file
 # ========================================
-#
+
-# Settings for each collection
+# minimal settings for memento http api testing
 collections:
    # <name>: <cdx_path>
    # collection will be accessed via /<name>
    # <cdx_path> is a string or list of:
    #  - string or list of one or more local .cdx file
    #  - string or list of one or more local dirs with .cdx files
    #  - a string value indicating remote http cdx server
    pywb: ./sample_archive/cdx/
-    # ex with filtering: filter CDX lines by filename starting with 'dupe'
+archive_paths: ['./sample_archive/warcs/']
    pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
 # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
 # SURT keys are recommended for future indices, but non-SURT cdxs
 # are also supported
 #
 #   * Set to true if cdxs start with surts: com,example)/
 #   * Set to false if cdx start with urls: example.com)/
 surt_ordered: true
 # list of path prefixes that pywb uses to 'resolve' WARC and ARC filenames
 # in the cdx to their absolute path
 #
 # if path is:
 #   * local dir, use path as prefix
 #   * local file, lookup prefix in tab-delimited sorted index
 #   * http:// path, use path as remote prefix
 #   * redis:// path, use redis to lookup full path for w:<warc> as key
 archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/']
 # ==== Optional UI: HTML/Jinja2 Templates ====
 # template for <head> insert into replayed html content
 head_insert_html: ui/head_insert.html
 # template for the 'calendar' query,
 # eg, a listing of captures  in response to a ../*/<url>
 #
 # may be a simple listing or a more complex 'calendar' UI
 # if omitted, will list raw cdx in plain text
 query_html: ui/query.html
 # template for search page, which is displayed when no search url is entered
 # in a collection
 search_html: ui/search.html
 # template for home page.
 # if no other route is set, this will be rendered at /, /index.htm and /index.html
 home_html: ui/index.html
 # error page template for formatting error messages and details
 # if omitted, a text response is returned
 error_html: ui/error.html
 # ==== Other Paths ====
 # list of host names that pywb will be running from to detect
 # 'fallthrough' requests based on referrer
 #
 # eg: an incorrect request for http://localhost:8080/image.gif with a referrer
 # of http://localhost:8080/pywb/index.html, pywb can correctly redirect
 # to http://localhost:8080/pywb/image.gif
 #
 #hostpaths: ['http://localhost:8080']
 # Rewrite urls with absolute paths instead of relative
 absoulte_paths: true
 # List of route names:
 # <route>: <package or file path>
 static_routes:
          static/test/route: static/
 # Test memento
 enable_memento: true
 # Enable simple http proxy mode
 enable_http_proxy: true
-# enable cdx server api for querying cdx directly (experimental)
+# enable cdx server api for timemap
 enable_cdx_api: true
 # test different port
 port: 9000
 #with memento
 enable_memento: True
		`@ -1,2 +0,0 @@`
			`[pytest]`
			`addopts=--cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/`