refactoring: clean up handlers and replay_views for pep8

use BlockLoader().load for StaticHandler static file resolving; update static paths to point to pywb/static instead of static
2025-03-15 00:03:28 +01:00 · 2014-03-14 18:17:22 -07:00 · 2014-03-14 18:17:22 -07:00 · 6461af030b
commit 6461af030b
parent a69d565af5
7 changed files with 113 additions and 156 deletions
--- a/config.yaml
+++ b/config.yaml
@ -83,7 +83,7 @@ archive_paths: ./sample_archive/warcs/
 # <route>: <package or file path>
 # default route static/default for pywb defaults
 static_routes:
-          static/default: static/
+          static/default: pywb/static/

 # ==== New / Experimental Settings ====
 # Not yet production ready -- used primarily for testing
--- a/pytest.ini
+++ b/pytest.ini
@ -1,2 +0,0 @@
-[pytest]
-addopts=--cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/
--- a/pywb/core/handlers.py
+++ b/pywb/core/handlers.py
@ -3,8 +3,11 @@ import mimetypes
 import time

 from pywb.utils.wbexception import NotFoundException
+from pywb.utils.loaders import BlockLoader
+
 from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
 from pywb.framework.wbrequestresponse import WbResponse
+
 from views import TextCapturesView


@ -38,7 +41,8 @@ class WBHandler(WbUrlHandler):
        if wbrequest.wb_url.mod == 'cdx_':
            return self.text_query_view.render_response(wbrequest, cdx_lines)

-        if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
+        if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or
+            (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)):
            return self.query_view.render_response(wbrequest, cdx_lines)

        with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
@ -46,14 +50,12 @@ class WBHandler(WbUrlHandler):
                               cdx_lines,
                               self.index_reader.cdx_load_callback(wbrequest))

-
    def render_search_page(self, wbrequest):
        if self.search_view:
            return self.search_view.render_response(wbrequest=wbrequest)
        else:
            return WbResponse.text_response('No Lookup Url Specified')

-
    def __str__(self):
        return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)

@ -72,10 +74,11 @@ class StaticHandler(BaseHandler):
        full_path = self.static_path + wbrequest.wb_url_str

        try:
-            if full_path.startswith('.') or full_path.startswith('file://'):
-                data = open(full_path, 'rb')
-            else:
-                data = pkgutil.get_data(self.pkg, full_path)
+            #if full_path.startswith('.') or full_path.startswith('file://'):
+            #    data = open(full_path, 'rb')
+            #else:
+            #    data = pkgutil.get_data(self.pkg, full_path)
+            data = BlockLoader().load(full_path)

            if 'wsgi.file_wrapper' in wbrequest.env:
                reader = wbrequest.env['wsgi.file_wrapper'](data)
@ -87,7 +90,8 @@ class StaticHandler(BaseHandler):
            return WbResponse.text_stream(data, content_type=content_type)

        except IOError:
-            raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
+            raise NotFoundException('Static File Not Found: ' +
+                                    wbrequest.wb_url_str)

    def __str__(self):
        return 'Static files from ' + self.static_path
--- a/pywb/core/replay_views.py
+++ b/pywb/core/replay_views.py
@ -48,7 +48,6 @@ class ReplayView:
        else:
            self.response_class = WbResponse

-
    def __call__(self, wbrequest, cdx_lines, cdx_loader):
        last_e = None
        first = True
@ -56,11 +55,15 @@ class ReplayView:
        # List of already failed w/arcs
        failed_files = []

+        response = None
+
        # Iterate over the cdx until find one that works
-        # The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
+        # The cdx should already be sorted in
+        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
-                # optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data
+                # optimize: can detect if redirect is needed just from the cdx,
+                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
@ -68,8 +71,31 @@ class ReplayView:

                    first = False

+                response = self.replay_capture(wbrequest,
+                                               cdx,
+                                               cdx_loader,
+                                               failed_files)
+
+            except (CaptureException, ArchiveLoadFailed) as ce:
+                import traceback
+                traceback.print_exc()
+                last_e = ce
+                pass
+
+            if response:
+                return response
+
+        if last_e:
+            raise last_e
+        else:
+            raise WbException('No Content Loaded for: ' +
+                              str(wbrequest.wb_url))
+
+    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        (status_headers, stream) = (self.content_loader.
-                                            resolve_headers_and_payload(cdx, failed_files, cdx_loader))
+                                    resolve_headers_and_payload(cdx,
+                                                                failed_files,
+                                                                cdx_loader))

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)
@ -84,17 +110,21 @@ class ReplayView:

        response = None

-                # if Content-Length for payload is present, ensure we don't read past it
+        # if Content-Length for payload is present,
+        # ensure we don't read past it
        content_length = status_headers.get_header('content-length')
        if content_length:
            stream = LimitReader.wrap_stream(stream, content_length)

        if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
-                    response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
-                else:
-                    (status_headers, stream) = self.sanitize_content(status_headers, stream)
-                    #status_headers.remove_header('content-length')

+            response = self.rewrite_content(wbrequest,
+                                            cdx,
+                                            status_headers,
+                                            stream)
+        else:
+            (status_headers, stream) = self.sanitize_content(status_headers,
+                                                             stream)
            response_iter = self.stream_to_iter(stream)
            response = WbResponse(status_headers, response_iter)

@ -104,18 +134,6 @@ class ReplayView:

        return response

-
-            except (CaptureException, ArchiveLoadFailed) as ce:
-                import traceback
-                traceback.print_exc()
-                last_e = ce
-                pass
-
-        if last_e:
-            raise last_e
-        else:
-            raise WbException('No Content Loaded for: ' + wbrequest.wb_url)
-
    @staticmethod
    def stream_to_iter(stream):
        try:
@ -181,7 +199,6 @@ class ReplayView:
                                   wbrequest=wbrequest,
                                   cdx=cdx)

-
    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator):
        out = BytesIO()
@ -194,7 +211,8 @@ class ReplayView:
            content = out.getvalue()

            content_length_str = str(len(content))
-            status_headers.headers.append(('Content-Length', content_length_str))
+            status_headers.headers.append(('Content-Length',
+                                           content_length_str))
            out.close()

        return content
@ -203,7 +221,9 @@ class ReplayView:
        if wbrequest.is_proxy:
            return None

-        redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate
+        # todo: generalize this?
+        redir_needed = (hasattr(wbrequest, 'is_timegate') and
+                        wbrequest.is_timegate)

        if not redir_needed and self.redir_to_exact:
            redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
@ -211,7 +231,9 @@ class ReplayView:
        if not redir_needed:
            return None

-        new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
+        new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
+                                                          cdx['original'])
+
        status_headers = StatusAndHeaders('302 Internal Redirect',
                                          [('Location', new_url)])

@ -219,7 +241,6 @@ class ReplayView:
        return self.response_class(status_headers,
                                   wbrequest=wbrequest)

-
    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
@ -241,14 +262,18 @@ class ReplayView:

        location_url = location_url.lower()

-        if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)):
+        if (ReplayView.strip_scheme(request_url) ==
+             ReplayView.strip_scheme(location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))

    def _reject_referrer_self_redirect(self, wbrequest):
        """
        Perform final check for referrer based self-redirect.
-        This method should be called after verifying request timestamp matches capture.
-        if referrer is same as current url, reject this response and try another capture
+        This method should be called after verifying that
+        the request timestamp == capture timestamp
+
+        If referrer is same as current url,
+        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return
@ -259,22 +284,26 @@ class ReplayView:

        if (ReplayView.strip_scheme(request_url) ==
             ReplayView.strip_scheme(wbrequest.referrer)):
-            raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
-
+            raise CaptureException('Self Redirect via Referrer: ' +
+                                   str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme(url):
        """
-        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com')
+        >>> ReplayView.strip_scheme('https://example.com') ==\
+            ReplayView.strip_scheme('http://example.com')
        True

-        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com')
+        >>> ReplayView.strip_scheme('https://example.com') ==\
+            ReplayView.strip_scheme('http:/example.com')
        True

-        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com')
+        >>> ReplayView.strip_scheme('https://example.com') ==\
+            ReplayView.strip_scheme('example.com')
        True

-        >>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com')
+        >>> ReplayView.strip_scheme('about://example.com') ==\
+            ReplayView.strip_scheme('example.com')
        True
        """
        m = ReplayView.STRIP_SCHEME.match(url)
@ -287,6 +316,7 @@ class ReplayView:
        else:
            return url

+
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/setup.py
+++ b/setup.py
@ -25,10 +25,9 @@ class PyTest(TestCommand):
    def run_tests(self):
        import pytest
        import sys
-        # cmdline opts moved to pytest.ini
-        #cmdline = ' --cov-config .coveragerc --cov pywb'
-        #cmdline += ' -v --doctest-module ./pywb/ tests/'
-        errcode = pytest.main('')
+        cmdline = ' --cov-config .coveragerc --cov pywb'
+        cmdline += ' -v --doctest-module ./pywb/ tests/'
+        errcode = pytest.main(cmdline)
        sys.exit(errcode)

 setup(
--- a/tests/test_config.yaml
+++ b/tests/test_config.yaml
@ -78,7 +78,7 @@ absoulte_paths: true
 # List of route names:
 # <route>: <package or file path>
 static_routes:
-          static/test/route: static/
+          static/test/route: pywb/static/

 # Enable simple http proxy mode
 enable_http_proxy: true
--- a/tests/test_config_memento.yaml
+++ b/tests/test_config_memento.yaml
@ -1,94 +1,20 @@
-# pywb config file
+# pywb memento test config file
 # ========================================
-#
-# Settings for each collection
+
+# minimal settings for memento http api testing

 collections:
-    # <name>: <cdx_path>
-    # collection will be accessed via /<name>
-    # <cdx_path> is a string or list of:
-    #  - string or list of one or more local .cdx file
-    #  - string or list of one or more local dirs with .cdx files
-    #  - a string value indicating remote http cdx server
    pywb: ./sample_archive/cdx/

-    # ex with filtering: filter CDX lines by filename starting with 'dupe'
-    pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
-
-
-# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
-# SURT keys are recommended for future indices, but non-SURT cdxs
-# are also supported
-#
-#   * Set to true if cdxs start with surts: com,example)/
-#   * Set to false if cdx start with urls: example.com)/
-surt_ordered: true
-
-# list of paths prefixes for pywb look to 'resolve'  WARC and ARC filenames
-# in the cdx to their absolute path
-#
-# if path is:
-#   * local dir, use path as prefix
-#   * local file, lookup prefix in tab-delimited sorted index
-#   * http:// path, use path as remote prefix
-#   * redis:// path, use redis to lookup full path for w:<warc> as key
-
-archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/']
-
-# ==== Optional UI: HTML/Jinja2 Templates ====
-
-# template for <head> insert into replayed html content
-head_insert_html: ui/head_insert.html
-
-# template to for 'calendar' query,
-# eg, a listing of captures  in response to a ../*/<url>
-#
-# may be a simple listing or a more complex 'calendar' UI
-# if omitted, will list raw cdx in plain text
-query_html: ui/query.html
-
-# template for search page, which is displayed when no search url is entered
-# in a collection
-search_html: ui/search.html
-
-# template for home page.
-# if no other route is set, this will be rendered at /, /index.htm and /index.html
-home_html: ui/index.html
-
-
-# error page temlpate for may formatting error message and details
-# if omitted, a text response is returned
-error_html: ui/error.html
-
-# ==== Other Paths ====
-
-# list of host names that pywb will be running from to detect
-# 'fallthrough' requests based on referrer
-#
-# eg: an incorrect request for http://localhost:8080/image.gif with a referrer
-# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
-# to http://localhost:8080/pywb/image.gif
-#
-
-#hostpaths: ['http://localhost:8080']
-
-# Rewrite urls with absolute paths instead of relative
-absoulte_paths: true
-
-# List of route names:
-# <route>: <package or file path>
-static_routes:
-          static/test/route: static/
+archive_paths: ['./sample_archive/warcs/']

+# Test memento
+enable_memento: true

 # Enable simple http proxy mode
 enable_http_proxy: true

-# enable cdx server api for querying cdx directly (experimental)
+# enable cdx server api for timemap
 enable_cdx_api: true

-# test different port
-port: 9000

-#with memento
-enable_memento: True