mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'develop' into https-proxy
This commit is contained in:
commit
492aaa4a01
@ -18,11 +18,15 @@ from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
|
||||
#=================================================================
|
||||
class RewriteContent:
|
||||
def __init__(self, ds_rules_file=None, defmod=''):
|
||||
def __init__(self, ds_rules_file=None, is_framed_replay=False):
|
||||
self.ruleset = RuleSet(RewriteRules, 'rewrite',
|
||||
default_rule_config={},
|
||||
ds_rules_file=ds_rules_file)
|
||||
self.defmod = defmod
|
||||
|
||||
if is_framed_replay:
|
||||
self.defmod = 'mp_'
|
||||
else:
|
||||
self.defmod = ''
|
||||
|
||||
def sanitize_content(self, status_headers, stream):
|
||||
# remove transfer encoding chunked and wrap in a dechunking stream
|
||||
|
@ -21,8 +21,8 @@ from rewrite_content import RewriteContent
|
||||
|
||||
#=================================================================
|
||||
class LiveRewriter(object):
|
||||
def __init__(self, defmod='', default_proxy=None):
|
||||
self.rewriter = RewriteContent(defmod=defmod)
|
||||
def __init__(self, is_framed_replay=False, default_proxy=None):
|
||||
self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)
|
||||
self.default_proxy = default_proxy
|
||||
if self.default_proxy:
|
||||
logging.debug('Live Rewrite via proxy ' + self.default_proxy)
|
||||
@ -73,7 +73,7 @@ class LiveRewriter(object):
|
||||
|
||||
def fetch_http(self, url,
|
||||
env=None,
|
||||
req_headers={},
|
||||
req_headers=None,
|
||||
follow_redirects=False,
|
||||
proxies=None):
|
||||
|
||||
@ -84,6 +84,9 @@ class LiveRewriter(object):
|
||||
proxies = {'http': self.default_proxy,
|
||||
'https': self.default_proxy}
|
||||
|
||||
if not req_headers:
|
||||
req_headers = {}
|
||||
|
||||
if env is not None:
|
||||
method = env['REQUEST_METHOD'].upper()
|
||||
input_ = env['wsgi.input']
|
||||
|
@ -2,6 +2,8 @@ import pkgutil
|
||||
import mimetypes
|
||||
import time
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.utils.loaders import BlockLoader
|
||||
|
||||
@ -11,8 +13,9 @@ from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
from views import J2TemplateView, add_env_globals
|
||||
from views import J2TemplateView
|
||||
from replay_views import ReplayView
|
||||
from pywb.utils.timeutils import datetime_to_timestamp
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -26,6 +29,15 @@ class SearchPageWbUrlHandler(WbUrlHandler):
|
||||
create_template(config.get('search_html'),
|
||||
'Search Page'))
|
||||
|
||||
self.is_frame_mode = config.get('framed_replay', False)
|
||||
|
||||
if self.is_frame_mode:
|
||||
html = config.get('frame_insert_html', 'ui/frame_insert.html')
|
||||
self.frame_insert_view = (J2TemplateView.
|
||||
create_template(html, 'Frame Insert'))
|
||||
else:
|
||||
self.frame_insert_view = None
|
||||
|
||||
def render_search_page(self, wbrequest, **kwargs):
|
||||
if self.search_view:
|
||||
return self.search_view.render_response(wbrequest=wbrequest,
|
||||
@ -34,6 +46,38 @@ class SearchPageWbUrlHandler(WbUrlHandler):
|
||||
else:
|
||||
return WbResponse.text_response('No Lookup Url Specified')
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
# root search page
|
||||
if wbrequest.wb_url_str == '/':
|
||||
return self.render_search_page(wbrequest)
|
||||
|
||||
# render top level frame if in frame mode
|
||||
# (not supported in proxy mode)
|
||||
if (self.is_frame_mode and wbrequest.wb_url and
|
||||
not wbrequest.wb_url.is_query() and
|
||||
not wbrequest.wb_url.mod and
|
||||
not wbrequest.options['is_proxy']):
|
||||
|
||||
params = self.get_top_frame_params(wbrequest)
|
||||
|
||||
return self.frame_insert_view.render_response(**params)
|
||||
|
||||
return self.handle_request(wbrequest)
|
||||
|
||||
def get_top_frame_params(self, wbrequest):
|
||||
if wbrequest.wb_url.timestamp:
|
||||
timestamp = wbrequest.wb_url.timestamp
|
||||
else:
|
||||
timestamp = datetime_to_timestamp(datetime.utcnow())
|
||||
|
||||
embed_url = wbrequest.wb_url.to_str(mod='mp_')
|
||||
|
||||
return dict(embed_url=embed_url,
|
||||
wbrequest=wbrequest,
|
||||
timestamp=timestamp,
|
||||
url=wbrequest.wb_url.url,
|
||||
content_type='text/html')
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Standard WB Handler
|
||||
@ -52,10 +96,6 @@ class WBHandler(SearchPageWbUrlHandler):
|
||||
resolving_loader = ResolvingLoader(paths=paths,
|
||||
record_loader=record_loader)
|
||||
|
||||
template_globals = config.get('template_globals')
|
||||
if template_globals:
|
||||
add_env_globals(template_globals)
|
||||
|
||||
self.replay = ReplayView(resolving_loader, config)
|
||||
|
||||
self.fallback_handler = None
|
||||
@ -65,13 +105,9 @@ class WBHandler(SearchPageWbUrlHandler):
|
||||
if self.fallback_name:
|
||||
self.fallback_handler = handler_dict.get(self.fallback_name)
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
if wbrequest.wb_url_str == '/':
|
||||
return self.render_search_page(wbrequest)
|
||||
|
||||
def handle_request(self, wbrequest):
|
||||
try:
|
||||
with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
|
||||
response = self.index_reader.load_for_request(wbrequest)
|
||||
response = self.handle_query(wbrequest)
|
||||
except NotFoundException as nfe:
|
||||
return self.handle_not_found(wbrequest, nfe)
|
||||
|
||||
@ -81,11 +117,13 @@ class WBHandler(SearchPageWbUrlHandler):
|
||||
cdx_lines, cdx_callback = response
|
||||
return self.handle_replay(wbrequest, cdx_lines, cdx_callback)
|
||||
|
||||
def handle_query(self, wbrequest):
|
||||
return self.index_reader.load_for_request(wbrequest)
|
||||
|
||||
def handle_replay(self, wbrequest, cdx_lines, cdx_callback):
|
||||
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||
return self.replay(wbrequest,
|
||||
cdx_lines,
|
||||
cdx_callback)
|
||||
return self.replay.render_content(wbrequest,
|
||||
cdx_lines,
|
||||
cdx_callback)
|
||||
|
||||
def handle_not_found(self, wbrequest, nfe):
|
||||
if (not self.fallback_handler or
|
||||
@ -154,19 +192,3 @@ class DebugEchoEnvHandler(BaseHandler): # pragma: no cover
|
||||
class DebugEchoHandler(BaseHandler): # pragma: no cover
|
||||
def __call__(self, wbrequest):
|
||||
return WbResponse.text_response(str(wbrequest))
|
||||
|
||||
|
||||
#=================================================================
|
||||
class PerfTimer:
|
||||
def __init__(self, perfdict, name):
|
||||
self.perfdict = perfdict
|
||||
self.name = name
|
||||
|
||||
def __enter__(self):
|
||||
self.start = time.clock()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.end = time.clock()
|
||||
if self.perfdict is not None:
|
||||
self.perfdict[self.name] = str(self.end - self.start)
|
||||
|
@ -2,9 +2,11 @@ from pywb.framework.basehandlers import WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
|
||||
from handlers import StaticHandler, SearchPageWbUrlHandler
|
||||
from pywb.rewrite.rewrite_live import LiveRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
from replay_views import RewriteLiveView
|
||||
from handlers import StaticHandler, SearchPageWbUrlHandler
|
||||
from views import HeadInsertView
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
@ -19,20 +21,44 @@ class LiveResourceException(WbException):
|
||||
class RewriteHandler(SearchPageWbUrlHandler):
|
||||
def __init__(self, config):
|
||||
super(RewriteHandler, self).__init__(config)
|
||||
self.rewrite_view = RewriteLiveView(config)
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
if wbrequest.wb_url_str == '/':
|
||||
return self.render_search_page(wbrequest)
|
||||
default_proxy = config.get('proxyhostport')
|
||||
self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode,
|
||||
default_proxy=default_proxy)
|
||||
|
||||
self.head_insert_view = HeadInsertView.init_from_config(config)
|
||||
|
||||
def handle_request(self, wbrequest):
|
||||
try:
|
||||
return self.rewrite_view(wbrequest)
|
||||
return self.render_content(wbrequest)
|
||||
|
||||
except Exception as exc:
|
||||
url = wbrequest.wb_url.url
|
||||
msg = 'Could not load the url from the live web: ' + url
|
||||
raise LiveResourceException(msg=msg, url=url)
|
||||
|
||||
def _live_request_headers(self, wbrequest):
|
||||
return {}
|
||||
|
||||
def render_content(self, wbrequest):
|
||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
||||
req_headers = self._live_request_headers(wbrequest)
|
||||
|
||||
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
|
||||
if ref_wburl_str:
|
||||
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
|
||||
|
||||
wb_url = wbrequest.wb_url
|
||||
result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter,
|
||||
head_insert_func=head_insert_func,
|
||||
req_headers=req_headers,
|
||||
env=wbrequest.env)
|
||||
|
||||
return self._make_response(wbrequest, *result)
|
||||
|
||||
def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
|
||||
return WbResponse(status_headers, gen)
|
||||
|
||||
def __str__(self):
|
||||
return 'Live Web Rewrite Handler'
|
||||
|
||||
|
@ -6,7 +6,7 @@ from pywb.framework.wbrequestresponse import WbRequest
|
||||
from pywb.framework.memento import MementoRequest
|
||||
from pywb.framework.basehandlers import BaseHandler
|
||||
|
||||
from views import J2TemplateView
|
||||
from views import J2TemplateView, add_env_globals
|
||||
from views import J2HtmlCapturesView, HeadInsertView
|
||||
|
||||
from live_rewrite_handler import RewriteHandler
|
||||
@ -74,7 +74,10 @@ def create_wb_handler(query_handler, config):
|
||||
|
||||
#=================================================================
|
||||
def create_live_handler(config):
|
||||
live_handler = RewriteHandler(config)
|
||||
wb_handler_class = config.get('wb_handler_class', RewriteHandler)
|
||||
|
||||
live_handler = wb_handler_class(config)
|
||||
|
||||
return live_handler
|
||||
|
||||
|
||||
@ -95,9 +98,12 @@ def init_collection(route_config):
|
||||
create_template(route_config.get('query_html'),
|
||||
'Captures Page'))
|
||||
|
||||
server_cls = route_config.get('server_cls')
|
||||
|
||||
query_handler = QueryHandler.init_from_config(route_config,
|
||||
ds_rules_file,
|
||||
html_view)
|
||||
html_view,
|
||||
server_cls)
|
||||
|
||||
return query_handler
|
||||
|
||||
@ -165,6 +171,11 @@ def create_wb_router(passed_config={}):
|
||||
# store live and replay handlers
|
||||
handler_dict = {}
|
||||
|
||||
# setup template globals
|
||||
template_globals = config.get('template_globals')
|
||||
if template_globals:
|
||||
add_env_globals(template_globals)
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
if isinstance(value, BaseHandler):
|
||||
handler_dict[name] = value
|
||||
|
@ -1,19 +1,14 @@
|
||||
import re
|
||||
import datetime
|
||||
from io import BytesIO
|
||||
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.wbexception import WbException, NotFoundException
|
||||
from pywb.utils.loaders import LimitReader
|
||||
from pywb.utils.timeutils import datetime_to_timestamp
|
||||
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.memento import MementoResponse
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
from pywb.rewrite.rewrite_live import LiveRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
|
||||
from views import J2TemplateView, add_env_globals
|
||||
@ -32,92 +27,16 @@ class CaptureException(WbException):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BaseContentView(object):
|
||||
def __init__(self, config):
|
||||
self.is_frame_mode = config.get('framed_replay', False)
|
||||
|
||||
if self.is_frame_mode:
|
||||
self._mp_mod = 'mp_'
|
||||
else:
|
||||
self._mp_mod = ''
|
||||
|
||||
view = config.get('head_insert_view')
|
||||
if not view:
|
||||
head_insert = config.get('head_insert_html',
|
||||
'ui/head_insert.html')
|
||||
view = HeadInsertView.create_template(head_insert, 'Head Insert')
|
||||
|
||||
self.head_insert_view = view
|
||||
|
||||
if not self.is_frame_mode:
|
||||
self.frame_insert_view = None
|
||||
return
|
||||
|
||||
view = config.get('frame_insert_view')
|
||||
if not view:
|
||||
frame_insert = config.get('frame_insert_html',
|
||||
'ui/frame_insert.html')
|
||||
|
||||
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
|
||||
|
||||
self.frame_insert_view = view
|
||||
|
||||
def __call__(self, wbrequest, *args):
|
||||
# render top level frame if in frame mode
|
||||
# (not supported in proxy mode)
|
||||
if (self.is_frame_mode and wbrequest.wb_url and
|
||||
not wbrequest.wb_url.mod and
|
||||
not wbrequest.options['is_proxy']):
|
||||
|
||||
embed_url = wbrequest.wb_url.to_str(mod=self._mp_mod)
|
||||
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
|
||||
url = wbrequest.wb_url.url
|
||||
ctype = 'text/html'
|
||||
|
||||
return self.frame_insert_view.render_response(embed_url=embed_url,
|
||||
wbrequest=wbrequest,
|
||||
timestamp=timestamp,
|
||||
url=url,
|
||||
content_type=ctype)
|
||||
|
||||
return self.render_content(wbrequest, *args)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteLiveView(BaseContentView):
|
||||
def __init__(self, config):
|
||||
super(RewriteLiveView, self).__init__(config)
|
||||
|
||||
default_proxy = config.get('proxyhostport')
|
||||
self.rewriter = LiveRewriter(defmod=self._mp_mod,
|
||||
default_proxy=default_proxy)
|
||||
|
||||
def render_content(self, wbrequest, *args):
|
||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
||||
|
||||
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
|
||||
if ref_wburl_str:
|
||||
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
|
||||
|
||||
wb_url = wbrequest.wb_url
|
||||
result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter,
|
||||
head_insert_func=head_insert_func,
|
||||
env=wbrequest.env)
|
||||
|
||||
status_headers, gen, is_rewritten = result
|
||||
|
||||
return WbResponse(status_headers, gen)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ReplayView(BaseContentView):
|
||||
class ReplayView(object):
|
||||
STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')
|
||||
|
||||
def __init__(self, content_loader, config):
|
||||
super(ReplayView, self).__init__(config)
|
||||
|
||||
self.content_loader = content_loader
|
||||
self.content_rewriter = RewriteContent(defmod=self._mp_mod)
|
||||
|
||||
framed = config.get('framed_replay')
|
||||
self.content_rewriter = RewriteContent(is_framed_replay=framed)
|
||||
|
||||
self.head_insert_view = HeadInsertView.init_from_config(config)
|
||||
|
||||
self.buffer_response = config.get('buffer_response', True)
|
||||
|
||||
@ -131,12 +50,12 @@ class ReplayView(BaseContentView):
|
||||
|
||||
self._reporter = config.get('reporter')
|
||||
|
||||
def render_content(self, wbrequest, *args):
|
||||
def render_content(self, wbrequest, cdx_lines, cdx_loader):
|
||||
last_e = None
|
||||
first = True
|
||||
|
||||
cdx_lines = args[0]
|
||||
cdx_loader = args[1]
|
||||
#cdx_lines = args[0]
|
||||
#cdx_loader = args[1]
|
||||
|
||||
# List of already failed w/arcs
|
||||
failed_files = []
|
||||
|
@ -141,6 +141,14 @@ class HeadInsertView(J2TemplateView):
|
||||
return J2TemplateView.create_template(filename, desc,
|
||||
HeadInsertView)
|
||||
|
||||
@staticmethod
|
||||
def init_from_config(config):
|
||||
view = config.get('head_insert_view')
|
||||
if not view:
|
||||
html = config.get('head_insert_html', 'ui/head_insert.html')
|
||||
view = HeadInsertView.create_template(html, 'Head Insert')
|
||||
return view
|
||||
|
||||
|
||||
#=================================================================
|
||||
# query views
|
||||
|
@ -310,7 +310,7 @@ class TestWb:
|
||||
|
||||
|
||||
def test_excluded_content(self):
|
||||
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
|
||||
resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
|
||||
assert resp.status_int == 403
|
||||
assert 'Excluded' in resp.body
|
||||
|
||||
@ -414,7 +414,7 @@ class TestWb:
|
||||
|
||||
|
||||
def test_error(self):
|
||||
resp = self.testapp.get('/pywb/?abc', status = 400)
|
||||
resp = self.testapp.get('/pywb/mp_/?abc', status = 400)
|
||||
assert resp.status_int == 400
|
||||
assert 'Invalid Url: http://?abc' in resp.body
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user