1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactoring: clean up handlers and replay_views for pep8

use BlockLoader().load for StaticHandler static file resolving
update static paths to point to pywb/static instead of static
This commit is contained in:
Ilya Kreymer 2014-03-14 18:17:22 -07:00
parent a69d565af5
commit 6461af030b
7 changed files with 113 additions and 156 deletions

View File

@ -83,7 +83,7 @@ archive_paths: ./sample_archive/warcs/
# <route>: <package or file path> # <route>: <package or file path>
# default route static/default for pywb defaults # default route static/default for pywb defaults
static_routes: static_routes:
static/default: static/ static/default: pywb/static/
# ==== New / Experimental Settings ==== # ==== New / Experimental Settings ====
# Not yet production ready -- used primarily for testing # Not yet production ready -- used primarily for testing

View File

@ -1,2 +0,0 @@
[pytest]
addopts=--cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/

View File

@ -3,8 +3,11 @@ import mimetypes
import time import time
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import BlockLoader
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from views import TextCapturesView from views import TextCapturesView
@ -38,7 +41,8 @@ class WBHandler(WbUrlHandler):
if wbrequest.wb_url.mod == 'cdx_': if wbrequest.wb_url.mod == 'cdx_':
return self.text_query_view.render_response(wbrequest, cdx_lines) return self.text_query_view.render_response(wbrequest, cdx_lines)
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY): if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or
(wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)):
return self.query_view.render_response(wbrequest, cdx_lines) return self.query_view.render_response(wbrequest, cdx_lines)
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
@ -46,14 +50,12 @@ class WBHandler(WbUrlHandler):
cdx_lines, cdx_lines,
self.index_reader.cdx_load_callback(wbrequest)) self.index_reader.cdx_load_callback(wbrequest))
def render_search_page(self, wbrequest): def render_search_page(self, wbrequest):
if self.search_view: if self.search_view:
return self.search_view.render_response(wbrequest = wbrequest) return self.search_view.render_response(wbrequest=wbrequest)
else: else:
return WbResponse.text_response('No Lookup Url Specified') return WbResponse.text_response('No Lookup Url Specified')
def __str__(self): def __str__(self):
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
@ -62,7 +64,7 @@ class WBHandler(WbUrlHandler):
# Static Content Handler # Static Content Handler
#================================================================= #=================================================================
class StaticHandler(BaseHandler): class StaticHandler(BaseHandler):
def __init__(self, static_path, pkg = 'pywb'): def __init__(self, static_path, pkg='pywb'):
mimetypes.init() mimetypes.init()
self.static_path = static_path self.static_path = static_path
@ -72,10 +74,11 @@ class StaticHandler(BaseHandler):
full_path = self.static_path + wbrequest.wb_url_str full_path = self.static_path + wbrequest.wb_url_str
try: try:
if full_path.startswith('.') or full_path.startswith('file://'): #if full_path.startswith('.') or full_path.startswith('file://'):
data = open(full_path, 'rb') # data = open(full_path, 'rb')
else: #else:
data = pkgutil.get_data(self.pkg, full_path) # data = pkgutil.get_data(self.pkg, full_path)
data = BlockLoader().load(full_path)
if 'wsgi.file_wrapper' in wbrequest.env: if 'wsgi.file_wrapper' in wbrequest.env:
reader = wbrequest.env['wsgi.file_wrapper'](data) reader = wbrequest.env['wsgi.file_wrapper'](data)
@ -84,10 +87,11 @@ class StaticHandler(BaseHandler):
content_type, _ = mimetypes.guess_type(full_path) content_type, _ = mimetypes.guess_type(full_path)
return WbResponse.text_stream(data, content_type = content_type) return WbResponse.text_stream(data, content_type=content_type)
except IOError: except IOError:
raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str) raise NotFoundException('Static File Not Found: ' +
wbrequest.wb_url_str)
def __str__(self): def __str__(self):
return 'Static files from ' + self.static_path return 'Static files from ' + self.static_path

View File

@ -48,7 +48,6 @@ class ReplayView:
else: else:
self.response_class = WbResponse self.response_class = WbResponse
def __call__(self, wbrequest, cdx_lines, cdx_loader): def __call__(self, wbrequest, cdx_lines, cdx_loader):
last_e = None last_e = None
first = True first = True
@ -56,11 +55,15 @@ class ReplayView:
# List of already failed w/arcs # List of already failed w/arcs
failed_files = [] failed_files = []
response = None
# Iterate over the cdx until find one that works # Iterate over the cdx until find one that works
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server) # The cdx should already be sorted in
# closest-to-timestamp order (from the cdx server)
for cdx in cdx_lines: for cdx in cdx_lines:
try: try:
# optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data # optimize: can detect if redirect is needed just from the cdx,
# no need to load w/arc data if requiring exact match
if first: if first:
redir_response = self._redirect_if_needed(wbrequest, cdx) redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response: if redir_response:
@ -68,42 +71,10 @@ class ReplayView:
first = False first = False
(status_headers, stream) = (self.content_loader. response = self.replay_capture(wbrequest,
resolve_headers_and_payload(cdx, failed_files, cdx_loader)) cdx,
cdx_loader,
# check and reject self-redirect failed_files)
self._reject_self_redirect(wbrequest, cdx, status_headers)
# check if redir is needed
redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response:
return redir_response
# one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest)
response = None
# if Content-Length for payload is present, ensure we don't read past it
content_length = status_headers.get_header('content-length')
if content_length:
stream = LimitReader.wrap_stream(stream, content_length)
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
else:
(status_headers, stream) = self.sanitize_content(status_headers, stream)
#status_headers.remove_header('content-length')
response_iter = self.stream_to_iter(stream)
response = WbResponse(status_headers, response_iter)
# notify reporter callback, if any
if self._reporter:
self._reporter(wbrequest, cdx, response)
return response
except (CaptureException, ArchiveLoadFailed) as ce: except (CaptureException, ArchiveLoadFailed) as ce:
import traceback import traceback
@ -111,10 +82,57 @@ class ReplayView:
last_e = ce last_e = ce
pass pass
if response:
return response
if last_e: if last_e:
raise last_e raise last_e
else: else:
raise WbException('No Content Loaded for: ' + wbrequest.wb_url) raise WbException('No Content Loaded for: ' +
str(wbrequest.wb_url))
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
    """
    Attempt to replay the single capture described by ``cdx``.

    The headers and payload stream are resolved through
    ``self.content_loader``; the capture is then checked for
    self-redirects, and a redirect response is returned instead of
    the content when ``_redirect_if_needed`` produces one.

    Otherwise the payload is either rewritten or sanitized and
    streamed as-is, depending on ``self.content_rewriter`` and the
    ``id_`` modifier of the requested url.

    Returns the redirect response or the replay response. Any
    exception raised by the loader or by the self-redirect checks
    propagates to the caller.
    """
    loader = self.content_loader
    status_headers, stream = loader.resolve_headers_and_payload(
        cdx, failed_files, cdx_loader)

    # a capture that redirects to itself is rejected (raises)
    self._reject_self_redirect(wbrequest, cdx, status_headers)

    # bail out early with a redirect if one is required
    redirect = self._redirect_if_needed(wbrequest, cdx)
    if redirect:
        return redirect

    # final self-redirect check, this time based on the referrer
    self._reject_referrer_self_redirect(wbrequest)

    # when the payload declares a Content-Length,
    # cap the stream so that reads do not go past it
    declared_length = status_headers.get_header('content-length')
    if declared_length:
        stream = LimitReader.wrap_stream(stream, declared_length)

    if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
        response = self.rewrite_content(wbrequest,
                                        cdx,
                                        status_headers,
                                        stream)
    else:
        status_headers, stream = self.sanitize_content(status_headers,
                                                       stream)
        response = WbResponse(status_headers,
                              self.stream_to_iter(stream))

    # let the reporter callback, if one is set, see the response
    if self._reporter:
        self._reporter(wbrequest, cdx, response)

    return response
@staticmethod @staticmethod
def stream_to_iter(stream): def stream_to_iter(stream):
@ -181,7 +199,6 @@ class ReplayView:
wbrequest=wbrequest, wbrequest=wbrequest,
cdx=cdx) cdx=cdx)
# Buffer rewrite iterator and return a response from a string # Buffer rewrite iterator and return a response from a string
def buffered_response(self, status_headers, iterator): def buffered_response(self, status_headers, iterator):
out = BytesIO() out = BytesIO()
@ -194,7 +211,8 @@ class ReplayView:
content = out.getvalue() content = out.getvalue()
content_length_str = str(len(content)) content_length_str = str(len(content))
status_headers.headers.append(('Content-Length', content_length_str)) status_headers.headers.append(('Content-Length',
content_length_str))
out.close() out.close()
return content return content
@ -203,7 +221,9 @@ class ReplayView:
if wbrequest.is_proxy: if wbrequest.is_proxy:
return None return None
redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate # todo: generalize this?
redir_needed = (hasattr(wbrequest, 'is_timegate') and
wbrequest.is_timegate)
if not redir_needed and self.redir_to_exact: if not redir_needed and self.redir_to_exact:
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp) redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
@ -211,7 +231,9 @@ class ReplayView:
if not redir_needed: if not redir_needed:
return None return None
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original']) new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
cdx['original'])
status_headers = StatusAndHeaders('302 Internal Redirect', status_headers = StatusAndHeaders('302 Internal Redirect',
[('Location', new_url)]) [('Location', new_url)])
@ -219,7 +241,6 @@ class ReplayView:
return self.response_class(status_headers, return self.response_class(status_headers,
wbrequest=wbrequest) wbrequest=wbrequest)
def _reject_self_redirect(self, wbrequest, cdx, status_headers): def _reject_self_redirect(self, wbrequest, cdx, status_headers):
""" """
Check if response is a 3xx redirect to the same url Check if response is a 3xx redirect to the same url
@ -237,18 +258,22 @@ class ReplayView:
request_url = wbrequest.wb_url.url.lower() request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location') location_url = status_headers.get_header('Location')
if not location_url: if not location_url:
return return
location_url = location_url.lower() location_url = location_url.lower()
if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)): if (ReplayView.strip_scheme(request_url) ==
ReplayView.strip_scheme(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx)) raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest): def _reject_referrer_self_redirect(self, wbrequest):
""" """
Perform final check for referrer based self-redirect. Perform final check for referrer based self-redirect.
This method should be called after verifying request timestamp matches capture. This method should be called after verifying that
if referrer is same as current url, reject this response and try another capture the request timestamp == capture timestamp
If referrer is same as current url,
reject this response and try another capture.
""" """
if not wbrequest.referrer: if not wbrequest.referrer:
return return
@ -258,23 +283,27 @@ class ReplayView:
wbrequest.rel_prefix + str(wbrequest.wb_url)) wbrequest.rel_prefix + str(wbrequest.wb_url))
if (ReplayView.strip_scheme(request_url) == if (ReplayView.strip_scheme(request_url) ==
ReplayView.strip_scheme(wbrequest.referrer)): ReplayView.strip_scheme(wbrequest.referrer)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) raise CaptureException('Self Redirect via Referrer: ' +
str(wbrequest.wb_url))
@staticmethod @staticmethod
def strip_scheme(url): def strip_scheme(url):
""" """
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com') >>> ReplayView.strip_scheme('https://example.com') ==\
ReplayView.strip_scheme('http://example.com')
True True
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com') >>> ReplayView.strip_scheme('https://example.com') ==\
ReplayView.strip_scheme('http:/example.com')
True True
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com') >>> ReplayView.strip_scheme('https://example.com') ==\
ReplayView.strip_scheme('example.com')
True True
>>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com') >>> ReplayView.strip_scheme('about://example.com') ==\
ReplayView.strip_scheme('example.com')
True True
""" """
m = ReplayView.STRIP_SCHEME.match(url) m = ReplayView.STRIP_SCHEME.match(url)
@ -287,6 +316,7 @@ class ReplayView:
else: else:
return url return url
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -25,10 +25,9 @@ class PyTest(TestCommand):
def run_tests(self): def run_tests(self):
import pytest import pytest
import sys import sys
# cmdline opts moved to pytest.ini cmdline = ' --cov-config .coveragerc --cov pywb'
#cmdline = ' --cov-config .coveragerc --cov pywb' cmdline += ' -v --doctest-module ./pywb/ tests/'
#cmdline += ' -v --doctest-module ./pywb/ tests/' errcode = pytest.main(cmdline)
errcode = pytest.main('')
sys.exit(errcode) sys.exit(errcode)
setup( setup(

View File

@ -78,7 +78,7 @@ absoulte_paths: true
# List of route names: # List of route names:
# <route>: <package or file path> # <route>: <package or file path>
static_routes: static_routes:
static/test/route: static/ static/test/route: pywb/static/
# Enable simple http proxy mode # Enable simple http proxy mode
enable_http_proxy: true enable_http_proxy: true

View File

@ -1,94 +1,20 @@
# pywb config file # pywb memento test config file
# ======================================== # ========================================
#
# Settings for each collection # minimal settings for memento http api testing
collections: collections:
# <name>: <cdx_path>
# collection will be accessed via /<name>
# <cdx_path> is a string or list of:
# - string or list of one or more local .cdx file
# - string or list of one or more local dirs with .cdx files
# - a string value indicating remote http cdx server
pywb: ./sample_archive/cdx/ pywb: ./sample_archive/cdx/
# ex with filtering: filter CDX lines by filename starting with 'dupe' archive_paths: ['./sample_archive/warcs/']
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported
#
# * Set to true if cdxs start with surts: com,example)/
# * Set to false if cdx start with urls: example.com)/
surt_ordered: true
# list of path prefixes that pywb uses to 'resolve' WARC and ARC filenames
# in the cdx to their absolute path
#
# if path is:
# * local dir, use path as prefix
# * local file, lookup prefix in tab-delimited sorted index
# * http:// path, use path as remote prefix
# * redis:// path, use redis to lookup full path for w:<warc> as key
archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/']
# ==== Optional UI: HTML/Jinja2 Templates ====
# template for <head> insert into replayed html content
head_insert_html: ui/head_insert.html
# template for 'calendar' query,
# eg, a listing of captures in response to a ../*/<url>
#
# may be a simple listing or a more complex 'calendar' UI
# if omitted, will list raw cdx in plain text
query_html: ui/query.html
# template for search page, which is displayed when no search url is entered
# in a collection
search_html: ui/search.html
# template for home page.
# if no other route is set, this will be rendered at /, /index.htm and /index.html
home_html: ui/index.html
# error page template for formatting the error message and details
# if omitted, a text response is returned
error_html: ui/error.html
# ==== Other Paths ====
# list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer
#
# eg: an incorrect request for http://localhost:8080/image.gif with a referrer
# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
# to http://localhost:8080/pywb/image.gif
#
#hostpaths: ['http://localhost:8080']
# Rewrite urls with absolute paths instead of relative
absoulte_paths: true
# List of route names:
# <route>: <package or file path>
static_routes:
static/test/route: static/
# Test memento
enable_memento: true
# Enable simple http proxy mode # Enable simple http proxy mode
enable_http_proxy: true enable_http_proxy: true
# enable cdx server api for querying cdx directly (experimental) # enable cdx server api for timemap
enable_cdx_api: true enable_cdx_api: true
# test different port
port: 9000
#with memento
enable_memento: True