1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'develop' into https-proxy

This commit is contained in:
Ilya Kreymer 2014-08-04 13:00:25 -07:00
commit 492aaa4a01
8 changed files with 131 additions and 138 deletions

View File

@ -18,11 +18,15 @@ from pywb.utils.bufferedreaders import ChunkedDataReader
#=================================================================
class RewriteContent:
def __init__(self, ds_rules_file=None, defmod=''):
def __init__(self, ds_rules_file=None, is_framed_replay=False):
self.ruleset = RuleSet(RewriteRules, 'rewrite',
default_rule_config={},
ds_rules_file=ds_rules_file)
self.defmod = defmod
if is_framed_replay:
self.defmod = 'mp_'
else:
self.defmod = ''
def sanitize_content(self, status_headers, stream):
# remove transfer encoding chunked and wrap in a dechunking stream

View File

@ -21,8 +21,8 @@ from rewrite_content import RewriteContent
#=================================================================
class LiveRewriter(object):
def __init__(self, defmod='', default_proxy=None):
self.rewriter = RewriteContent(defmod=defmod)
def __init__(self, is_framed_replay=False, default_proxy=None):
self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)
self.default_proxy = default_proxy
if self.default_proxy:
logging.debug('Live Rewrite via proxy ' + self.default_proxy)
@ -73,7 +73,7 @@ class LiveRewriter(object):
def fetch_http(self, url,
env=None,
req_headers={},
req_headers=None,
follow_redirects=False,
proxies=None):
@ -84,6 +84,9 @@ class LiveRewriter(object):
proxies = {'http': self.default_proxy,
'https': self.default_proxy}
if not req_headers:
req_headers = {}
if env is not None:
method = env['REQUEST_METHOD'].upper()
input_ = env['wsgi.input']

View File

@ -2,6 +2,8 @@ import pkgutil
import mimetypes
import time
from datetime import datetime
from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import BlockLoader
@ -11,8 +13,9 @@ from pywb.framework.wbrequestresponse import WbResponse
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from views import J2TemplateView, add_env_globals
from views import J2TemplateView
from replay_views import ReplayView
from pywb.utils.timeutils import datetime_to_timestamp
#=================================================================
@ -26,6 +29,15 @@ class SearchPageWbUrlHandler(WbUrlHandler):
create_template(config.get('search_html'),
'Search Page'))
self.is_frame_mode = config.get('framed_replay', False)
if self.is_frame_mode:
html = config.get('frame_insert_html', 'ui/frame_insert.html')
self.frame_insert_view = (J2TemplateView.
create_template(html, 'Frame Insert'))
else:
self.frame_insert_view = None
def render_search_page(self, wbrequest, **kwargs):
if self.search_view:
return self.search_view.render_response(wbrequest=wbrequest,
@ -34,6 +46,38 @@ class SearchPageWbUrlHandler(WbUrlHandler):
else:
return WbResponse.text_response('No Lookup Url Specified')
def __call__(self, wbrequest):
# root search page
if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest)
# render top level frame if in frame mode
# (not supported in proxy mode)
if (self.is_frame_mode and wbrequest.wb_url and
not wbrequest.wb_url.is_query() and
not wbrequest.wb_url.mod and
not wbrequest.options['is_proxy']):
params = self.get_top_frame_params(wbrequest)
return self.frame_insert_view.render_response(**params)
return self.handle_request(wbrequest)
def get_top_frame_params(self, wbrequest):
    """Build the keyword arguments used to render the top-level frame.

    The timestamp is taken from the requested wb_url when it has one;
    otherwise the current UTC time is converted with
    datetime_to_timestamp().  The embedded url is the requested wb_url
    rendered with the 'mp_' modifier.
    """
    wb_url = wbrequest.wb_url

    timestamp = wb_url.timestamp
    if not timestamp:
        timestamp = datetime_to_timestamp(datetime.utcnow())

    return {
        'embed_url': wb_url.to_str(mod='mp_'),
        'wbrequest': wbrequest,
        'timestamp': timestamp,
        'url': wb_url.url,
        'content_type': 'text/html',
    }
#=================================================================
# Standard WB Handler
@ -52,10 +96,6 @@ class WBHandler(SearchPageWbUrlHandler):
resolving_loader = ResolvingLoader(paths=paths,
record_loader=record_loader)
template_globals = config.get('template_globals')
if template_globals:
add_env_globals(template_globals)
self.replay = ReplayView(resolving_loader, config)
self.fallback_handler = None
@ -65,13 +105,9 @@ class WBHandler(SearchPageWbUrlHandler):
if self.fallback_name:
self.fallback_handler = handler_dict.get(self.fallback_name)
def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest)
def handle_request(self, wbrequest):
try:
with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
response = self.index_reader.load_for_request(wbrequest)
response = self.handle_query(wbrequest)
except NotFoundException as nfe:
return self.handle_not_found(wbrequest, nfe)
@ -81,11 +117,13 @@ class WBHandler(SearchPageWbUrlHandler):
cdx_lines, cdx_callback = response
return self.handle_replay(wbrequest, cdx_lines, cdx_callback)
def handle_query(self, wbrequest):
    """Run the index lookup for this request.

    Returns whatever self.index_reader.load_for_request() returns.
    """
    reader = self.index_reader
    return reader.load_for_request(wbrequest)
def handle_replay(self, wbrequest, cdx_lines, cdx_callback):
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest,
cdx_lines,
cdx_callback)
return self.replay.render_content(wbrequest,
cdx_lines,
cdx_callback)
def handle_not_found(self, wbrequest, nfe):
if (not self.fallback_handler or
@ -154,19 +192,3 @@ class DebugEchoEnvHandler(BaseHandler): # pragma: no cover
class DebugEchoHandler(BaseHandler):  # pragma: no cover
    """Debug handler that echoes str(wbrequest) back as a text response."""

    def __call__(self, wbrequest):
        request_text = str(wbrequest)
        return WbResponse.text_response(request_text)
#=================================================================
class PerfTimer:
    """Context manager that records how long its block took to run.

    On exit, the elapsed time (end - start, converted with str()) is
    stored in ``perfdict`` under ``name``.  When ``perfdict`` is None
    nothing is recorded, so timing can be switched off by the caller.

    :param perfdict: dict receiving the timing, or None to disable
    :param name: key under which the elapsed time is stored
    """

    def __init__(self, perfdict, name):
        self.perfdict = perfdict
        self.name = name

    def _now(self):
        """Return the current reading of the best available timer."""
        # time.clock() was removed in Python 3.8; keep using it where it
        # still exists and fall back to perf_counter()/time() elsewhere.
        timer = (getattr(time, 'clock', None) or
                 getattr(time, 'perf_counter', time.time))
        return timer()

    def __enter__(self):
        self.start = self._now()
        return self

    def __exit__(self, *args):
        self.end = self._now()
        if self.perfdict is not None:
            self.perfdict[self.name] = str(self.end - self.start)

View File

@ -2,9 +2,11 @@ from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route
from handlers import StaticHandler, SearchPageWbUrlHandler
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from replay_views import RewriteLiveView
from handlers import StaticHandler, SearchPageWbUrlHandler
from views import HeadInsertView
from pywb.utils.wbexception import WbException
@ -19,20 +21,44 @@ class LiveResourceException(WbException):
class RewriteHandler(SearchPageWbUrlHandler):
def __init__(self, config):
super(RewriteHandler, self).__init__(config)
self.rewrite_view = RewriteLiveView(config)
def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest)
default_proxy = config.get('proxyhostport')
self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode,
default_proxy=default_proxy)
self.head_insert_view = HeadInsertView.init_from_config(config)
def handle_request(self, wbrequest):
try:
return self.rewrite_view(wbrequest)
return self.render_content(wbrequest)
except Exception as exc:
url = wbrequest.wb_url.url
msg = 'Could not load the url from the live web: ' + url
raise LiveResourceException(msg=msg, url=url)
def _live_request_headers(self, wbrequest):
return {}
def render_content(self, wbrequest):
    """Fetch the requested url through the live rewriter and respond.

    If the request carries a referrer wburl, its plain url is stored in
    the WSGI environ under 'REL_REFERER' before fetching.  The result of
    fetch_request() is unpacked into _make_response().
    """
    insert_func = self.head_insert_view.create_insert_func(wbrequest)
    headers = self._live_request_headers(wbrequest)

    referrer = wbrequest.extract_referrer_wburl_str()
    if referrer:
        wbrequest.env['REL_REFERER'] = WbUrl(referrer).url

    fetched = self.rewriter.fetch_request(wbrequest.wb_url,
                                          wbrequest.urlrewriter,
                                          head_insert_func=insert_func,
                                          req_headers=headers,
                                          env=wbrequest.env)

    return self._make_response(wbrequest, *fetched)
def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
    """Wrap the fetched status/headers and body generator in a WbResponse.

    wbrequest and is_rewritten are accepted but not used here.
    """
    response = WbResponse(status_headers, gen)
    return response
def __str__(self):
return 'Live Web Rewrite Handler'

View File

@ -6,7 +6,7 @@ from pywb.framework.wbrequestresponse import WbRequest
from pywb.framework.memento import MementoRequest
from pywb.framework.basehandlers import BaseHandler
from views import J2TemplateView
from views import J2TemplateView, add_env_globals
from views import J2HtmlCapturesView, HeadInsertView
from live_rewrite_handler import RewriteHandler
@ -74,7 +74,10 @@ def create_wb_handler(query_handler, config):
#=================================================================
def create_live_handler(config):
live_handler = RewriteHandler(config)
wb_handler_class = config.get('wb_handler_class', RewriteHandler)
live_handler = wb_handler_class(config)
return live_handler
@ -95,9 +98,12 @@ def init_collection(route_config):
create_template(route_config.get('query_html'),
'Captures Page'))
server_cls = route_config.get('server_cls')
query_handler = QueryHandler.init_from_config(route_config,
ds_rules_file,
html_view)
html_view,
server_cls)
return query_handler
@ -165,6 +171,11 @@ def create_wb_router(passed_config={}):
# store live and replay handlers
handler_dict = {}
# setup template globals
template_globals = config.get('template_globals')
if template_globals:
add_env_globals(template_globals)
for name, value in collections.iteritems():
if isinstance(value, BaseHandler):
handler_dict[name] = value

View File

@ -1,19 +1,14 @@
import re
import datetime
from io import BytesIO
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import LimitReader
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.warc.recordloader import ArchiveLoadFailed
from views import J2TemplateView, add_env_globals
@ -32,92 +27,16 @@ class CaptureException(WbException):
#=================================================================
class BaseContentView(object):
    """Base view that optionally wraps content in a top-level frame.

    Reads 'framed_replay', 'head_insert_view' / 'head_insert_html' and,
    in frame mode only, 'frame_insert_view' / 'frame_insert_html' from
    the config.  Subclasses provide render_content().
    """

    def __init__(self, config):
        self.is_frame_mode = config.get('framed_replay', False)
        self._mp_mod = 'mp_' if self.is_frame_mode else ''

        # head insert: use the configured view, else build from a template
        head_view = config.get('head_insert_view')
        if not head_view:
            head_html = config.get('head_insert_html',
                                   'ui/head_insert.html')
            head_view = HeadInsertView.create_template(head_html,
                                                       'Head Insert')
        self.head_insert_view = head_view

        # frame insert: only resolved when running in frame mode
        frame_view = None
        if self.is_frame_mode:
            frame_view = config.get('frame_insert_view')
            if not frame_view:
                frame_html = config.get('frame_insert_html',
                                        'ui/frame_insert.html')
                frame_view = J2TemplateView.create_template(frame_html,
                                                            'Frame Insert')
        self.frame_insert_view = frame_view

    def __call__(self, wbrequest, *args):
        # render top level frame if in frame mode
        # (not supported in proxy mode)
        if self.is_frame_mode:
            wb_url = wbrequest.wb_url
            if wb_url and not (wb_url.mod or wbrequest.options['is_proxy']):
                now = datetime.datetime.utcnow()
                frame_params = dict(
                    embed_url=wb_url.to_str(mod=self._mp_mod),
                    wbrequest=wbrequest,
                    timestamp=datetime_to_timestamp(now),
                    url=wb_url.url,
                    content_type='text/html')
                return self.frame_insert_view.render_response(**frame_params)

        return self.render_content(wbrequest, *args)
#=================================================================
class RewriteLiveView(BaseContentView):
    """Content view that fetches and rewrites urls from the live web.

    The optional 'proxyhostport' config value is passed to LiveRewriter
    as its default proxy.
    """

    def __init__(self, config):
        super(RewriteLiveView, self).__init__(config)

        proxy = config.get('proxyhostport')
        self.rewriter = LiveRewriter(defmod=self._mp_mod,
                                     default_proxy=proxy)

    def render_content(self, wbrequest, *args):
        insert_func = self.head_insert_view.create_insert_func(wbrequest)

        # expose the referrer's plain url to the rewriter via the environ
        referrer = wbrequest.extract_referrer_wburl_str()
        if referrer:
            wbrequest.env['REL_REFERER'] = WbUrl(referrer).url

        fetched = self.rewriter.fetch_request(wbrequest.wb_url,
                                              wbrequest.urlrewriter,
                                              head_insert_func=insert_func,
                                              env=wbrequest.env)

        # is_rewritten is part of the result but not needed here
        status_headers, gen, is_rewritten = fetched
        return WbResponse(status_headers, gen)
#=================================================================
class ReplayView(BaseContentView):
class ReplayView(object):
STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')
def __init__(self, content_loader, config):
super(ReplayView, self).__init__(config)
self.content_loader = content_loader
self.content_rewriter = RewriteContent(defmod=self._mp_mod)
framed = config.get('framed_replay')
self.content_rewriter = RewriteContent(is_framed_replay=framed)
self.head_insert_view = HeadInsertView.init_from_config(config)
self.buffer_response = config.get('buffer_response', True)
@ -131,12 +50,12 @@ class ReplayView(BaseContentView):
self._reporter = config.get('reporter')
def render_content(self, wbrequest, *args):
def render_content(self, wbrequest, cdx_lines, cdx_loader):
last_e = None
first = True
cdx_lines = args[0]
cdx_loader = args[1]
#cdx_lines = args[0]
#cdx_loader = args[1]
# List of already failed w/arcs
failed_files = []

View File

@ -141,6 +141,14 @@ class HeadInsertView(J2TemplateView):
return J2TemplateView.create_template(filename, desc,
HeadInsertView)
@staticmethod
def init_from_config(config):
    """Return the head insert view described by the config.

    A truthy 'head_insert_view' entry is returned as-is.  Otherwise a
    view is created from the 'head_insert_html' template path, which
    defaults to 'ui/head_insert.html'.
    """
    configured = config.get('head_insert_view')
    if configured:
        return configured

    template_path = config.get('head_insert_html', 'ui/head_insert.html')
    return HeadInsertView.create_template(template_path, 'Head Insert')
#=================================================================
# query views

View File

@ -310,7 +310,7 @@ class TestWb:
def test_excluded_content(self):
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
assert resp.status_int == 403
assert 'Excluded' in resp.body
@ -414,7 +414,7 @@ class TestWb:
def test_error(self):
resp = self.testapp.get('/pywb/?abc', status = 400)
resp = self.testapp.get('/pywb/mp_/?abc', status = 400)
assert resp.status_int == 400
assert 'Invalid Url: http://?abc' in resp.body