mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactoring: clean up handlers and replay_views for pep8
use BlockLoader().load for StaticHandler static file resolving; update static paths to point to pywb/static instead of static
This commit is contained in:
parent
a69d565af5
commit
6461af030b
@ -83,7 +83,7 @@ archive_paths: ./sample_archive/warcs/
|
||||
# <route>: <package or file path>
|
||||
# default route static/default for pywb defaults
|
||||
static_routes:
|
||||
static/default: static/
|
||||
static/default: pywb/static/
|
||||
|
||||
# ==== New / Experimental Settings ====
|
||||
# Not yet production ready -- used primarily for testing
|
||||
|
@ -1,2 +0,0 @@
|
||||
[pytest]
|
||||
addopts=--cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/
|
@ -3,8 +3,11 @@ import mimetypes
|
||||
import time
|
||||
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.utils.loaders import BlockLoader
|
||||
|
||||
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
|
||||
from views import TextCapturesView
|
||||
|
||||
|
||||
@ -38,7 +41,8 @@ class WBHandler(WbUrlHandler):
|
||||
if wbrequest.wb_url.mod == 'cdx_':
|
||||
return self.text_query_view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
||||
if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or
|
||||
(wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)):
|
||||
return self.query_view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||
@ -46,14 +50,12 @@ class WBHandler(WbUrlHandler):
|
||||
cdx_lines,
|
||||
self.index_reader.cdx_load_callback(wbrequest))
|
||||
|
||||
|
||||
def render_search_page(self, wbrequest):
|
||||
if self.search_view:
|
||||
return self.search_view.render_response(wbrequest = wbrequest)
|
||||
return self.search_view.render_response(wbrequest=wbrequest)
|
||||
else:
|
||||
return WbResponse.text_response('No Lookup Url Specified')
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
|
||||
|
||||
@ -62,7 +64,7 @@ class WBHandler(WbUrlHandler):
|
||||
# Static Content Handler
|
||||
#=================================================================
|
||||
class StaticHandler(BaseHandler):
|
||||
def __init__(self, static_path, pkg = 'pywb'):
|
||||
def __init__(self, static_path, pkg='pywb'):
|
||||
mimetypes.init()
|
||||
|
||||
self.static_path = static_path
|
||||
@ -72,10 +74,11 @@ class StaticHandler(BaseHandler):
|
||||
full_path = self.static_path + wbrequest.wb_url_str
|
||||
|
||||
try:
|
||||
if full_path.startswith('.') or full_path.startswith('file://'):
|
||||
data = open(full_path, 'rb')
|
||||
else:
|
||||
data = pkgutil.get_data(self.pkg, full_path)
|
||||
#if full_path.startswith('.') or full_path.startswith('file://'):
|
||||
# data = open(full_path, 'rb')
|
||||
#else:
|
||||
# data = pkgutil.get_data(self.pkg, full_path)
|
||||
data = BlockLoader().load(full_path)
|
||||
|
||||
if 'wsgi.file_wrapper' in wbrequest.env:
|
||||
reader = wbrequest.env['wsgi.file_wrapper'](data)
|
||||
@ -84,10 +87,11 @@ class StaticHandler(BaseHandler):
|
||||
|
||||
content_type, _ = mimetypes.guess_type(full_path)
|
||||
|
||||
return WbResponse.text_stream(data, content_type = content_type)
|
||||
return WbResponse.text_stream(data, content_type=content_type)
|
||||
|
||||
except IOError:
|
||||
raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
|
||||
raise NotFoundException('Static File Not Found: ' +
|
||||
wbrequest.wb_url_str)
|
||||
|
||||
def __str__(self):
|
||||
return 'Static files from ' + self.static_path
|
||||
|
@ -48,7 +48,6 @@ class ReplayView:
|
||||
else:
|
||||
self.response_class = WbResponse
|
||||
|
||||
|
||||
def __call__(self, wbrequest, cdx_lines, cdx_loader):
|
||||
last_e = None
|
||||
first = True
|
||||
@ -56,11 +55,15 @@ class ReplayView:
|
||||
# List of already failed w/arcs
|
||||
failed_files = []
|
||||
|
||||
response = None
|
||||
|
||||
# Iterate over the cdx until we find one that works
|
||||
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
|
||||
# The cdx should already be sorted in
|
||||
# closest-to-timestamp order (from the cdx server)
|
||||
for cdx in cdx_lines:
|
||||
try:
|
||||
# optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data
|
||||
# optimize: can detect if redirect is needed just from the cdx,
|
||||
# no need to load w/arc data if requiring exact match
|
||||
if first:
|
||||
redir_response = self._redirect_if_needed(wbrequest, cdx)
|
||||
if redir_response:
|
||||
@ -68,42 +71,10 @@ class ReplayView:
|
||||
|
||||
first = False
|
||||
|
||||
(status_headers, stream) = (self.content_loader.
|
||||
resolve_headers_and_payload(cdx, failed_files, cdx_loader))
|
||||
|
||||
# check and reject self-redirect
|
||||
self._reject_self_redirect(wbrequest, cdx, status_headers)
|
||||
|
||||
# check if redir is needed
|
||||
redir_response = self._redirect_if_needed(wbrequest, cdx)
|
||||
if redir_response:
|
||||
return redir_response
|
||||
|
||||
# one more check for referrer-based self-redirect
|
||||
self._reject_referrer_self_redirect(wbrequest)
|
||||
|
||||
response = None
|
||||
|
||||
# if Content-Length for payload is present, ensure we don't read past it
|
||||
content_length = status_headers.get_header('content-length')
|
||||
if content_length:
|
||||
stream = LimitReader.wrap_stream(stream, content_length)
|
||||
|
||||
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
|
||||
response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
|
||||
else:
|
||||
(status_headers, stream) = self.sanitize_content(status_headers, stream)
|
||||
#status_headers.remove_header('content-length')
|
||||
|
||||
response_iter = self.stream_to_iter(stream)
|
||||
response = WbResponse(status_headers, response_iter)
|
||||
|
||||
# notify reporter callback, if any
|
||||
if self._reporter:
|
||||
self._reporter(wbrequest, cdx, response)
|
||||
|
||||
return response
|
||||
|
||||
response = self.replay_capture(wbrequest,
|
||||
cdx,
|
||||
cdx_loader,
|
||||
failed_files)
|
||||
|
||||
except (CaptureException, ArchiveLoadFailed) as ce:
|
||||
import traceback
|
||||
@ -111,10 +82,57 @@ class ReplayView:
|
||||
last_e = ce
|
||||
pass
|
||||
|
||||
if response:
|
||||
return response
|
||||
|
||||
if last_e:
|
||||
raise last_e
|
||||
else:
|
||||
raise WbException('No Content Loaded for: ' + wbrequest.wb_url)
|
||||
raise WbException('No Content Loaded for: ' +
|
||||
str(wbrequest.wb_url))
|
||||
|
||||
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
|
||||
(status_headers, stream) = (self.content_loader.
|
||||
resolve_headers_and_payload(cdx,
|
||||
failed_files,
|
||||
cdx_loader))
|
||||
|
||||
# check and reject self-redirect
|
||||
self._reject_self_redirect(wbrequest, cdx, status_headers)
|
||||
|
||||
# check if redir is needed
|
||||
redir_response = self._redirect_if_needed(wbrequest, cdx)
|
||||
if redir_response:
|
||||
return redir_response
|
||||
|
||||
# one more check for referrer-based self-redirect
|
||||
self._reject_referrer_self_redirect(wbrequest)
|
||||
|
||||
response = None
|
||||
|
||||
# if Content-Length for payload is present,
|
||||
# ensure we don't read past it
|
||||
content_length = status_headers.get_header('content-length')
|
||||
if content_length:
|
||||
stream = LimitReader.wrap_stream(stream, content_length)
|
||||
|
||||
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
|
||||
|
||||
response = self.rewrite_content(wbrequest,
|
||||
cdx,
|
||||
status_headers,
|
||||
stream)
|
||||
else:
|
||||
(status_headers, stream) = self.sanitize_content(status_headers,
|
||||
stream)
|
||||
response_iter = self.stream_to_iter(stream)
|
||||
response = WbResponse(status_headers, response_iter)
|
||||
|
||||
# notify reporter callback, if any
|
||||
if self._reporter:
|
||||
self._reporter(wbrequest, cdx, response)
|
||||
|
||||
return response
|
||||
|
||||
@staticmethod
|
||||
def stream_to_iter(stream):
|
||||
@ -181,7 +199,6 @@ class ReplayView:
|
||||
wbrequest=wbrequest,
|
||||
cdx=cdx)
|
||||
|
||||
|
||||
# Buffer rewrite iterator and return a response from a string
|
||||
def buffered_response(self, status_headers, iterator):
|
||||
out = BytesIO()
|
||||
@ -194,7 +211,8 @@ class ReplayView:
|
||||
content = out.getvalue()
|
||||
|
||||
content_length_str = str(len(content))
|
||||
status_headers.headers.append(('Content-Length', content_length_str))
|
||||
status_headers.headers.append(('Content-Length',
|
||||
content_length_str))
|
||||
out.close()
|
||||
|
||||
return content
|
||||
@ -203,7 +221,9 @@ class ReplayView:
|
||||
if wbrequest.is_proxy:
|
||||
return None
|
||||
|
||||
redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate
|
||||
# todo: generalize this?
|
||||
redir_needed = (hasattr(wbrequest, 'is_timegate') and
|
||||
wbrequest.is_timegate)
|
||||
|
||||
if not redir_needed and self.redir_to_exact:
|
||||
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
|
||||
@ -211,7 +231,9 @@ class ReplayView:
|
||||
if not redir_needed:
|
||||
return None
|
||||
|
||||
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
|
||||
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
|
||||
cdx['original'])
|
||||
|
||||
status_headers = StatusAndHeaders('302 Internal Redirect',
|
||||
[('Location', new_url)])
|
||||
|
||||
@ -219,7 +241,6 @@ class ReplayView:
|
||||
return self.response_class(status_headers,
|
||||
wbrequest=wbrequest)
|
||||
|
||||
|
||||
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
|
||||
"""
|
||||
Check if response is a 3xx redirect to the same url
|
||||
@ -237,18 +258,22 @@ class ReplayView:
|
||||
request_url = wbrequest.wb_url.url.lower()
|
||||
location_url = status_headers.get_header('Location')
|
||||
if not location_url:
|
||||
return
|
||||
return
|
||||
|
||||
location_url = location_url.lower()
|
||||
|
||||
if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)):
|
||||
if (ReplayView.strip_scheme(request_url) ==
|
||||
ReplayView.strip_scheme(location_url)):
|
||||
raise CaptureException('Self Redirect: ' + str(cdx))
|
||||
|
||||
def _reject_referrer_self_redirect(self, wbrequest):
|
||||
"""
|
||||
Perform final check for referrer based self-redirect.
|
||||
This method should be called after verifying request timestamp matches capture.
|
||||
if referrer is same as current url, reject this response and try another capture
|
||||
This method should be called after verifying that
|
||||
the request timestamp == capture timestamp
|
||||
|
||||
If referrer is same as current url,
|
||||
reject this response and try another capture.
|
||||
"""
|
||||
if not wbrequest.referrer:
|
||||
return
|
||||
@ -258,23 +283,27 @@ class ReplayView:
|
||||
wbrequest.rel_prefix + str(wbrequest.wb_url))
|
||||
|
||||
if (ReplayView.strip_scheme(request_url) ==
|
||||
ReplayView.strip_scheme(wbrequest.referrer)):
|
||||
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
|
||||
|
||||
ReplayView.strip_scheme(wbrequest.referrer)):
|
||||
raise CaptureException('Self Redirect via Referrer: ' +
|
||||
str(wbrequest.wb_url))
|
||||
|
||||
@staticmethod
|
||||
def strip_scheme(url):
|
||||
"""
|
||||
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com')
|
||||
>>> ReplayView.strip_scheme('https://example.com') ==\
|
||||
ReplayView.strip_scheme('http://example.com')
|
||||
True
|
||||
|
||||
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com')
|
||||
>>> ReplayView.strip_scheme('https://example.com') ==\
|
||||
ReplayView.strip_scheme('http:/example.com')
|
||||
True
|
||||
|
||||
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com')
|
||||
>>> ReplayView.strip_scheme('https://example.com') ==\
|
||||
ReplayView.strip_scheme('example.com')
|
||||
True
|
||||
|
||||
>>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com')
|
||||
>>> ReplayView.strip_scheme('about://example.com') ==\
|
||||
ReplayView.strip_scheme('example.com')
|
||||
True
|
||||
"""
|
||||
m = ReplayView.STRIP_SCHEME.match(url)
|
||||
@ -287,6 +316,7 @@ class ReplayView:
|
||||
else:
|
||||
return url
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
7
setup.py
7
setup.py
@ -25,10 +25,9 @@ class PyTest(TestCommand):
|
||||
def run_tests(self):
|
||||
import pytest
|
||||
import sys
|
||||
# cmdline opts moved to pytest.ini
|
||||
#cmdline = ' --cov-config .coveragerc --cov pywb'
|
||||
#cmdline += ' -v --doctest-module ./pywb/ tests/'
|
||||
errcode = pytest.main('')
|
||||
cmdline = ' --cov-config .coveragerc --cov pywb'
|
||||
cmdline += ' -v --doctest-module ./pywb/ tests/'
|
||||
errcode = pytest.main(cmdline)
|
||||
sys.exit(errcode)
|
||||
|
||||
setup(
|
||||
|
@ -78,7 +78,7 @@ absoulte_paths: true
|
||||
# List of route names:
|
||||
# <route>: <package or file path>
|
||||
static_routes:
|
||||
static/test/route: static/
|
||||
static/test/route: pywb/static/
|
||||
|
||||
# Enable simple http proxy mode
|
||||
enable_http_proxy: true
|
||||
|
@ -1,94 +1,20 @@
|
||||
# pywb config file
|
||||
# pywb memento test config file
|
||||
# ========================================
|
||||
#
|
||||
# Settings for each collection
|
||||
|
||||
# minimal settings for memento http api testing
|
||||
|
||||
collections:
|
||||
# <name>: <cdx_path>
|
||||
# collection will be accessed via /<name>
|
||||
# <cdx_path> is a string or list of:
|
||||
# - string or list of one or more local .cdx file
|
||||
# - string or list of one or more local dirs with .cdx files
|
||||
# - a string value indicating remote http cdx server
|
||||
pywb: ./sample_archive/cdx/
|
||||
|
||||
# ex with filtering: filter CDX lines by filename starting with 'dupe'
|
||||
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
|
||||
|
||||
|
||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||
# are also supported
|
||||
#
|
||||
# * Set to true if cdxs start with surts: com,example)/
|
||||
# * Set to false if cdx start with urls: example.com)/
|
||||
surt_ordered: true
|
||||
|
||||
# list of path prefixes pywb uses to 'resolve' WARC and ARC filenames
|
||||
# in the cdx to their absolute path
|
||||
#
|
||||
# if path is:
|
||||
# * local dir, use path as prefix
|
||||
# * local file, lookup prefix in tab-delimited sorted index
|
||||
# * http:// path, use path as remote prefix
|
||||
# * redis:// path, use redis to lookup full path for w:<warc> as key
|
||||
|
||||
archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/']
|
||||
|
||||
# ==== Optional UI: HTML/Jinja2 Templates ====
|
||||
|
||||
# template for <head> insert into replayed html content
|
||||
head_insert_html: ui/head_insert.html
|
||||
|
||||
# template for 'calendar' query,
|
||||
# eg, a listing of captures in response to a ../*/<url>
|
||||
#
|
||||
# may be a simple listing or a more complex 'calendar' UI
|
||||
# if omitted, will list raw cdx in plain text
|
||||
query_html: ui/query.html
|
||||
|
||||
# template for search page, which is displayed when no search url is entered
|
||||
# in a collection
|
||||
search_html: ui/search.html
|
||||
|
||||
# template for home page.
|
||||
# if no other route is set, this will be rendered at /, /index.htm and /index.html
|
||||
home_html: ui/index.html
|
||||
|
||||
|
||||
# error page template for formatting error messages and details
|
||||
# if omitted, a text response is returned
|
||||
error_html: ui/error.html
|
||||
|
||||
# ==== Other Paths ====
|
||||
|
||||
# list of host names that pywb will be running from to detect
|
||||
# 'fallthrough' requests based on referrer
|
||||
#
|
||||
# eg: an incorrect request for http://localhost:8080/image.gif with a referrer
|
||||
# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
|
||||
# to http://localhost:8080/pywb/image.gif
|
||||
#
|
||||
|
||||
#hostpaths: ['http://localhost:8080']
|
||||
|
||||
# Rewrite urls with absolute paths instead of relative
|
||||
absoulte_paths: true
|
||||
|
||||
# List of route names:
|
||||
# <route>: <package or file path>
|
||||
static_routes:
|
||||
static/test/route: static/
|
||||
archive_paths: ['./sample_archive/warcs/']
|
||||
|
||||
# Test memento
|
||||
enable_memento: true
|
||||
|
||||
# Enable simple http proxy mode
|
||||
enable_http_proxy: true
|
||||
|
||||
# enable cdx server api for querying cdx directly (experimental)
|
||||
# enable cdx server api for timemap
|
||||
enable_cdx_api: true
|
||||
|
||||
# test different port
|
||||
port: 9000
|
||||
|
||||
#with memento
|
||||
enable_memento: True
|
||||
|
Loading…
x
Reference in New Issue
Block a user