1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactoring: clean up handlers and replay_views for pep8

use BlockLoader().load for StaticHandler static file resolving
update static paths to point to pywb/static instead of static
This commit is contained in:
Ilya Kreymer 2014-03-14 18:17:22 -07:00
parent a69d565af5
commit 6461af030b
7 changed files with 113 additions and 156 deletions

View File

@ -83,7 +83,7 @@ archive_paths: ./sample_archive/warcs/
# <route>: <package or file path>
# default route static/default for pywb defaults
static_routes:
static/default: static/
static/default: pywb/static/
# ==== New / Experimental Settings ====
# Not yet production ready -- used primarily for testing

View File

@ -1,2 +0,0 @@
[pytest]
addopts=--cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/

View File

@ -3,8 +3,11 @@ import mimetypes
import time
from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import BlockLoader
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from views import TextCapturesView
@ -38,7 +41,8 @@ class WBHandler(WbUrlHandler):
if wbrequest.wb_url.mod == 'cdx_':
return self.text_query_view.render_response(wbrequest, cdx_lines)
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or
(wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)):
return self.query_view.render_response(wbrequest, cdx_lines)
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
@ -46,14 +50,12 @@ class WBHandler(WbUrlHandler):
cdx_lines,
self.index_reader.cdx_load_callback(wbrequest))
def render_search_page(self, wbrequest):
if self.search_view:
return self.search_view.render_response(wbrequest=wbrequest)
else:
return WbResponse.text_response('No Lookup Url Specified')
def __str__(self):
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
@ -72,10 +74,11 @@ class StaticHandler(BaseHandler):
full_path = self.static_path + wbrequest.wb_url_str
try:
if full_path.startswith('.') or full_path.startswith('file://'):
data = open(full_path, 'rb')
else:
data = pkgutil.get_data(self.pkg, full_path)
#if full_path.startswith('.') or full_path.startswith('file://'):
# data = open(full_path, 'rb')
#else:
# data = pkgutil.get_data(self.pkg, full_path)
data = BlockLoader().load(full_path)
if 'wsgi.file_wrapper' in wbrequest.env:
reader = wbrequest.env['wsgi.file_wrapper'](data)
@ -87,7 +90,8 @@ class StaticHandler(BaseHandler):
return WbResponse.text_stream(data, content_type=content_type)
except IOError:
raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
raise NotFoundException('Static File Not Found: ' +
wbrequest.wb_url_str)
def __str__(self):
return 'Static files from ' + self.static_path

View File

@ -48,7 +48,6 @@ class ReplayView:
else:
self.response_class = WbResponse
def __call__(self, wbrequest, cdx_lines, cdx_loader):
last_e = None
first = True
@ -56,11 +55,15 @@ class ReplayView:
# List of already failed w/arcs
failed_files = []
response = None
# Iterate over the cdx until we find one that works
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
# The cdx should already be sorted in
# closest-to-timestamp order (from the cdx server)
for cdx in cdx_lines:
try:
# optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data
# optimize: can detect if redirect is needed just from the cdx,
# no need to load w/arc data if requiring exact match
if first:
redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response:
@ -68,8 +71,31 @@ class ReplayView:
first = False
response = self.replay_capture(wbrequest,
cdx,
cdx_loader,
failed_files)
except (CaptureException, ArchiveLoadFailed) as ce:
import traceback
traceback.print_exc()
last_e = ce
pass
if response:
return response
if last_e:
raise last_e
else:
raise WbException('No Content Loaded for: ' +
str(wbrequest.wb_url))
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
(status_headers, stream) = (self.content_loader.
resolve_headers_and_payload(cdx, failed_files, cdx_loader))
resolve_headers_and_payload(cdx,
failed_files,
cdx_loader))
# check and reject self-redirect
self._reject_self_redirect(wbrequest, cdx, status_headers)
@ -84,17 +110,21 @@ class ReplayView:
response = None
# if Content-Length for payload is present, ensure we don't read past it
# if Content-Length for payload is present,
# ensure we don't read past it
content_length = status_headers.get_header('content-length')
if content_length:
stream = LimitReader.wrap_stream(stream, content_length)
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
else:
(status_headers, stream) = self.sanitize_content(status_headers, stream)
#status_headers.remove_header('content-length')
response = self.rewrite_content(wbrequest,
cdx,
status_headers,
stream)
else:
(status_headers, stream) = self.sanitize_content(status_headers,
stream)
response_iter = self.stream_to_iter(stream)
response = WbResponse(status_headers, response_iter)
@ -104,18 +134,6 @@ class ReplayView:
return response
except (CaptureException, ArchiveLoadFailed) as ce:
import traceback
traceback.print_exc()
last_e = ce
pass
if last_e:
raise last_e
else:
raise WbException('No Content Loaded for: ' + wbrequest.wb_url)
@staticmethod
def stream_to_iter(stream):
try:
@ -181,7 +199,6 @@ class ReplayView:
wbrequest=wbrequest,
cdx=cdx)
# Buffer rewrite iterator and return a response from a string
def buffered_response(self, status_headers, iterator):
out = BytesIO()
@ -194,7 +211,8 @@ class ReplayView:
content = out.getvalue()
content_length_str = str(len(content))
status_headers.headers.append(('Content-Length', content_length_str))
status_headers.headers.append(('Content-Length',
content_length_str))
out.close()
return content
@ -203,7 +221,9 @@ class ReplayView:
if wbrequest.is_proxy:
return None
redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate
# todo: generalize this?
redir_needed = (hasattr(wbrequest, 'is_timegate') and
wbrequest.is_timegate)
if not redir_needed and self.redir_to_exact:
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
@ -211,7 +231,9 @@ class ReplayView:
if not redir_needed:
return None
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
cdx['original'])
status_headers = StatusAndHeaders('302 Internal Redirect',
[('Location', new_url)])
@ -219,7 +241,6 @@ class ReplayView:
return self.response_class(status_headers,
wbrequest=wbrequest)
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
"""
Check if response is a 3xx redirect to the same url
@ -241,14 +262,18 @@ class ReplayView:
location_url = location_url.lower()
if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)):
if (ReplayView.strip_scheme(request_url) ==
ReplayView.strip_scheme(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest):
"""
Perform final check for referrer based self-redirect.
This method should be called after verifying request timestamp matches capture.
if referrer is same as current url, reject this response and try another capture
This method should be called after verifying that
the request timestamp == capture timestamp
If referrer is same as current url,
reject this response and try another capture.
"""
if not wbrequest.referrer:
return
@ -259,22 +284,26 @@ class ReplayView:
if (ReplayView.strip_scheme(request_url) ==
ReplayView.strip_scheme(wbrequest.referrer)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
raise CaptureException('Self Redirect via Referrer: ' +
str(wbrequest.wb_url))
@staticmethod
def strip_scheme(url):
"""
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com')
>>> ReplayView.strip_scheme('https://example.com') ==\
ReplayView.strip_scheme('http://example.com')
True
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com')
>>> ReplayView.strip_scheme('https://example.com') ==\
ReplayView.strip_scheme('http:/example.com')
True
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com')
>>> ReplayView.strip_scheme('https://example.com') ==\
ReplayView.strip_scheme('example.com')
True
>>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com')
>>> ReplayView.strip_scheme('about://example.com') ==\
ReplayView.strip_scheme('example.com')
True
"""
m = ReplayView.STRIP_SCHEME.match(url)
@ -287,6 +316,7 @@ class ReplayView:
else:
return url
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -25,10 +25,9 @@ class PyTest(TestCommand):
def run_tests(self):
import pytest
import sys
# cmdline opts moved to pytest.ini
#cmdline = ' --cov-config .coveragerc --cov pywb'
#cmdline += ' -v --doctest-module ./pywb/ tests/'
errcode = pytest.main('')
cmdline = ' --cov-config .coveragerc --cov pywb'
cmdline += ' -v --doctest-module ./pywb/ tests/'
errcode = pytest.main(cmdline)
sys.exit(errcode)
setup(

View File

@ -78,7 +78,7 @@ absoulte_paths: true
# List of route names:
# <route>: <package or file path>
static_routes:
static/test/route: static/
static/test/route: pywb/static/
# Enable simple http proxy mode
enable_http_proxy: true

View File

@ -1,94 +1,20 @@
# pywb config file
# pywb memento test config file
# ========================================
#
# Settings for each collection
# minimal settings for memento http api testing
collections:
# <name>: <cdx_path>
# collection will be accessed via /<name>
# <cdx_path> is a string or list of:
# - string or list of one or more local .cdx file
# - string or list of one or more local dirs with .cdx files
# - a string value indicating remote http cdx server
pywb: ./sample_archive/cdx/
# ex with filtering: filter CDX lines by filename starting with 'dupe'
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported
#
# * Set to true if cdxs start with surts: com,example)/
# * Set to false if cdx start with urls: example.com)/
surt_ordered: true
# list of paths prefixes for pywb look to 'resolve' WARC and ARC filenames
# in the cdx to their absolute path
#
# if path is:
# * local dir, use path as prefix
# * local file, lookup prefix in tab-delimited sorted index
# * http:// path, use path as remote prefix
# * redis:// path, use redis to lookup full path for w:<warc> as key
archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/']
# ==== Optional UI: HTML/Jinja2 Templates ====
# template for <head> insert into replayed html content
head_insert_html: ui/head_insert.html
# template for 'calendar' query,
# eg, a listing of captures in response to a ../*/<url>
#
# may be a simple listing or a more complex 'calendar' UI
# if omitted, will list raw cdx in plain text
query_html: ui/query.html
# template for search page, which is displayed when no search url is entered
# in a collection
search_html: ui/search.html
# template for home page.
# if no other route is set, this will be rendered at /, /index.htm and /index.html
home_html: ui/index.html
# error page template for formatting error message and details
# if omitted, a text response is returned
error_html: ui/error.html
# ==== Other Paths ====
# list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer
#
# eg: an incorrect request for http://localhost:8080/image.gif with a referrer
# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
# to http://localhost:8080/pywb/image.gif
#
#hostpaths: ['http://localhost:8080']
# Rewrite urls with absolute paths instead of relative
absoulte_paths: true
# List of route names:
# <route>: <package or file path>
static_routes:
static/test/route: static/
archive_paths: ['./sample_archive/warcs/']
# Test memento
enable_memento: true
# Enable simple http proxy mode
enable_http_proxy: true
# enable cdx server api for querying cdx directly (experimental)
# enable cdx server api for timemap
enable_cdx_api: true
# test different port
port: 9000
#with memento
enable_memento: True