1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

urlrewrite app: add bottle-based app, templateview separate from pywb webapp framework

This commit is contained in:
Ilya Kreymer 2016-03-27 17:34:45 -04:00
parent 017e9802f8
commit f12be3bc91
4 changed files with 423 additions and 86 deletions

View File

@ -2,8 +2,6 @@ from gevent.monkey import patch_all; patch_all()
import requests
from webagg.inputrequest import DirectWSGIInputRequest
from pywb.framework.archivalrouter import Route
from pywb.rewrite.rewrite_content import RewriteContent
@ -12,22 +10,22 @@ from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie
from pywb.cdx.cdxobject import CDXObject
from io import BytesIO
from six.moves.urllib.parse import quote, urlsplit
from six import iteritems
from rewriteinputreq import RewriteInputRequest
from six.moves.urllib.parse import quote
#=================================================================
# ============================================================================
class PlatformRoute(Route):
    """Route that exposes the matcher's named groups on the request."""

    def apply_filters(self, wbrequest, matcher):
        """Store ``matcher.groupdict()`` on ``wbrequest`` as ``matchdict``."""
        named_groups = matcher.groupdict()
        wbrequest.matchdict = named_groups
#=============================================================================
# ============================================================================
class PlatformHandler(RewriteHandler):
def __init__(self, config):
super(PlatformHandler, self).__init__(config)
@ -93,85 +91,6 @@ class PlatformHandler(RewriteHandler):
return self._make_response(wbrequest, *result)
#=============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
    """Input request that re-targets an incoming WSGI request at ``url``.

    Request headers are rebuilt from the WSGI environ so that host-specific
    values (Host, Origin, X-Forwarded-Proto) refer to the target url rather
    than to the server that received the request.
    """

    def __init__(self, env, urlkey, url, rewriter):
        """Remember the target url and split it once for later use.

        :param env: WSGI environ, passed on to the base class
        :param urlkey: key used to look up a rule in ``rewriter.ruleset``
        :param url: target url, split with ``urlsplit``
        :param rewriter: object exposing ``ruleset.get_first_match()``
        """
        super(RewriteInputRequest, self).__init__(env)
        self.urlkey = urlkey
        self.url = url
        self.rewriter = rewriter

        self.splits = urlsplit(self.url)

    def get_full_request_uri(self):
        """Return the target url's path, plus ``?query`` when one exists."""
        uri = self.splits.path
        if self.splits.query:
            uri += '?' + self.splits.query

        return uri

    def get_req_headers(self):
        """Build the request headers dict from the WSGI environ.

        Only ``HTTP_*``, ``CONTENT_LENGTH`` and ``CONTENT_TYPE`` entries are
        considered, and entries with an empty value are dropped.

        :return: dict mapping header name to header value
        """
        headers = {}

        has_cookies = False

        for name, value in iteritems(self.env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = self.splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (self.splits.scheme + '://' + self.splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # read the cookie from this request's environ; a bare
                # ``env`` is not defined in this scope
                cookie_val = extract_client_cookie(self.env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = self.splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                # e.g. HTTP_USER_AGENT -> User-Agent
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            else:
                # not a request header: skip
                value = None

            if value:
                headers[name] = value

        # a rewrite rule may still produce a cookie from an empty value
        if not has_cookies:
            value = self._req_cookie_rewrite('')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, value):
        """Apply the matching rule's ``req_cookie_rewrite`` entries to value.

        :param value: current Cookie header value (may be empty)
        :return: the rewritten value, or ``value`` unchanged when no rule
            matches ``self.urlkey`` or the rule has no cookie rewrites
        """
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # entry lacks 'rx' or 'replace': ignore it
                pass

        return value
if __name__ == "__main__":
from gevent.wsgi import WSGIServer
from pywb.apps.wayback import application

View File

@ -0,0 +1,85 @@
from webagg.inputrequest import DirectWSGIInputRequest
from pywb.utils.loaders import extract_client_cookie
from six import iteritems
from six.moves.urllib.parse import urlsplit
#=============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
    """Input request that re-targets an incoming WSGI request at ``url``.

    Request headers are rebuilt from the WSGI environ so that host-specific
    values (Host, Origin, X-Forwarded-Proto) refer to the target url rather
    than to the server that received the request.
    """

    def __init__(self, env, urlkey, url, rewriter):
        """Remember the target url and split it once for later use.

        :param env: WSGI environ, passed on to the base class
        :param urlkey: key used to look up a rule in ``rewriter.ruleset``
        :param url: target url, split with ``urlsplit``
        :param rewriter: object exposing ``ruleset.get_first_match()``
        """
        super(RewriteInputRequest, self).__init__(env)
        self.urlkey = urlkey
        self.url = url
        self.rewriter = rewriter

        self.splits = urlsplit(self.url)

    def get_full_request_uri(self):
        """Return the target url's path, plus ``?query`` when one exists."""
        uri = self.splits.path
        if self.splits.query:
            uri += '?' + self.splits.query

        return uri

    def get_req_headers(self):
        """Build the request headers dict from the WSGI environ.

        Only ``HTTP_*``, ``CONTENT_LENGTH`` and ``CONTENT_TYPE`` entries are
        considered, and entries with an empty value are dropped.

        :return: dict mapping header name to header value
        """
        headers = {}

        has_cookies = False

        for name, value in iteritems(self.env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = self.splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (self.splits.scheme + '://' + self.splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # read the cookie from this request's environ; a bare
                # ``env`` is not defined in this scope
                cookie_val = extract_client_cookie(self.env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = self.splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                # e.g. HTTP_USER_AGENT -> User-Agent
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            else:
                # not a request header: skip
                value = None

            if value:
                headers[name] = value

        # a rewrite rule may still produce a cookie from an empty value
        if not has_cookies:
            value = self._req_cookie_rewrite('')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, value):
        """Apply the matching rule's ``req_cookie_rewrite`` entries to value.

        :param value: current Cookie header value (may be empty)
        :return: the rewritten value, or ``value`` unchanged when no rule
            matches ``self.urlkey`` or the rule has no cookie rewrites
        """
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # entry lacks 'rx' or 'replace': ignore it
                pass

        return value

163
urlrewrite/rewriterapp.py Normal file
View File

@ -0,0 +1,163 @@
import requests
from bottle import request, response, HTTPError
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader
from rewriteinputreq import RewriteInputRequest
from templateview import JinjaEnv, HeadInsertView, TopFrameView
from io import BytesIO
# ============================================================================
class RewriterApp(object):
    """Bottle-based app that rewrites records fetched from an upstream url.

    Subclasses must override :meth:`get_upstream_url` and may override
    :meth:`_add_custom_params` to add extra fields to the cdx.
    """

    def __init__(self, framed_replay=False):
        """Create the record loader, content rewriter and insert views.

        :param framed_replay: when true, a request whose modifier equals
            ``frame_mod`` is answered with the top frame page
        """
        self.loader = ArcWarcRecordLoader()

        self.framed_replay = framed_replay
        self.frame_mod = ''
        self.replay_mod = 'mp_'

        frame_type = 'inverse' if framed_replay else False

        self.content_rewriter = RewriteContent(is_framed_replay=frame_type)

        self.jenv = JinjaEnv(globals={'static_path': 'static/__pywb'})
        self.head_insert_view = HeadInsertView(self.jenv, 'head_insert.html', 'banner.html')
        self.frame_insert_view = TopFrameView(self.jenv, 'frame_insert.html', 'banner.html')

    def render_content(self, wb_url, **kwargs):
        """Fetch the record for ``wb_url`` from upstream and rewrite it.

        :param wb_url: value parsed with ``WbUrl``
        :param kwargs: passed to :meth:`get_upstream_url` and
            :meth:`_add_custom_params`; also reported on error
        :return: the rendered top frame when framed replay applies,
            otherwise the content generator from ``rewrite_content``
        :raises HTTPError: when upstream answers with a status >= 400
        """
        wb_url = WbUrl(wb_url)
        #if wb_url.mod == 'vi_':
        #    return self._get_video_info(wbrequest)

        host_prefix = self.get_host_prefix()
        rel_prefix = self.get_rel_prefix()
        full_prefix = host_prefix + rel_prefix

        if self.framed_replay and wb_url.mod == self.frame_mod:
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        self.frame_mod,
                                                        self.replay_mod)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer()

        url = wb_url.url
        urlkey = canonicalize(url)

        inputreq = RewriteInputRequest(request.environ, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        # header values are sent as strings: recent versions of requests
        # reject non-string header values
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        upstream_url = self.get_upstream_url(url, closest, kwargs)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        if r.status_code >= 400:
            try:
                r.raw.close()
            except Exception:
                # best-effort cleanup; the HTTPError below is what matters
                pass

            data = dict(url=url, args=kwargs)
            raise HTTPError(r.status_code, exception=data)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        self._add_custom_params(cdx, kwargs)

        # no head insert for ajax requests
        if self.is_ajax():
            head_insert_func = None
        else:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wb_url,
                                                   full_prefix,
                                                   host_prefix,
                                                   request.environ,
                                                   self.framed_replay))

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        status_headers, gen, is_rw = result

        # copy the rewritten status and headers onto the bottle response
        response.status = int(status_headers.get_statuscode())

        for n, v in status_headers.headers:
            response.headers[n] = v

        return gen

    def get_host_prefix(self):
        """Return ``scheme://netloc`` of the current bottle request."""
        return request.urlparts.scheme + '://' + request.urlparts.netloc

    def get_rel_prefix(self):
        """Return the ``script_name`` of the current bottle request."""
        return request.script_name

    def get_full_prefix(self):
        """Return the host prefix followed by the relative prefix."""
        return self.get_host_prefix() + self.get_rel_prefix()

    def unrewrite_referrer(self):
        """Strip the full prefix from the request's ``HTTP_REFERER``.

        :return: True when the environ entry was updated, False when there
            is no referrer or it does not start with the full prefix
        """
        referrer = request.environ.get('HTTP_REFERER')
        if not referrer:
            return False

        full_prefix = self.get_full_prefix()

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            request.environ['HTTP_REFERER'] = referrer
            return True

        return False

    def is_ajax(self):
        """Return True when ``X-Requested-With`` is ``XMLHttpRequest``."""
        value = request.environ.get('HTTP_X_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        return False

    def get_upstream_url(self, url, closest, kwargs):
        """Return the upstream url to POST the reconstructed request to.

        Must be overridden by subclasses.

        :param url: target url being replayed
        :param closest: ``'now'`` or the requested timestamp
        :param kwargs: keyword arguments given to :meth:`render_content`
        :raises NotImplementedError: always, in this base class
        """
        # NotImplemented is a non-callable constant; the exception type is
        # NotImplementedError
        raise NotImplementedError()

    def _add_custom_params(self, cdx, kwargs):
        """Hook for subclasses to add fields to ``cdx``; no-op here."""
        pass

170
urlrewrite/templateview.py Normal file
View File

@ -0,0 +1,170 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec,
from pywb.utils.timeutils import timestamp_now
from six.moves.urllib.parse import urlsplit
from jinja2 import Environment
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
import json
import os
# ============================================================================
class FileOnlyPackageLoader(PackageLoader):
    """Package loader that looks templates up by file name only."""

    def get_source(self, env, template):
        """Drop any directory part of ``template``, then defer to the parent."""
        filename = os.path.basename(template)
        return super(FileOnlyPackageLoader, self).get_source(env, filename)
# ============================================================================
class RelEnvironment(Environment):
    """Environment whose join_path() allows relative template paths."""

    def join_path(self, template, parent):
        """Join ``template`` onto the directory that contains ``parent``."""
        parent_dir = os.path.dirname(parent)
        return os.path.join(parent_dir, template)
# ============================================================================
class JinjaEnv(object):
    """Wrapper that builds a jinja2 environment with custom filters.

    The resulting environment is available as ``self.jinja_env``.
    """

    def __init__(self, paths=('templates', '.', '/'),
                 packages=('pywb',),
                 globals=None,
                 overlay=None):
        """Create the environment from filesystem and package loaders.

        :param paths: directories searched through ``FileSystemLoader``
        :param packages: package names searched through
            ``FileOnlyPackageLoader``
        :param globals: optional dict merged into the environment globals
        :param overlay: optional object whose ``jinja_env`` is overlaid
            instead of creating a new ``RelEnvironment``
        """
        # defaults are tuples so they cannot be mutated across instances

        self._init_filters()

        loader = ChoiceLoader(self._make_loaders(paths, packages))

        if overlay:
            jinja_env = overlay.jinja_env.overlay(loader=loader, trim_blocks=True)
        else:
            jinja_env = RelEnvironment(loader=loader, trim_blocks=True)

        jinja_env.filters.update(self.filters)

        if globals:
            jinja_env.globals.update(globals)

        self.jinja_env = jinja_env

    def _make_loaders(self, paths, packages):
        """Return the loaders for ``paths`` followed by ``packages``."""
        # loaders for paths come first, then those for the packages
        loaders = [FileSystemLoader(path) for path in paths]
        loaders.extend(FileOnlyPackageLoader(package) for package in packages)
        return loaders

    def template_filter(self, param=None):
        """Return a decorator that registers a function in ``self.filters``.

        :param param: filter name; defaults to the function's ``__name__``
        """
        def deco(func):
            name = param or func.__name__
            self.filters[name] = func
            return func

        return deco

    def _init_filters(self):
        """Reset ``self.filters`` and register the built-in filters."""
        self.filters = {}

        @self.template_filter()
        def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
            # '%s' asks for seconds instead of a strftime pattern
            if format_ == '%s':
                return timestamp_to_sec(value)
            else:
                value = timestamp_to_datetime(value)
                return value.strftime(format_)

        @self.template_filter('urlsplit')
        def get_urlsplit(url):
            split = urlsplit(url)
            return split

        @self.template_filter()
        def tojson(obj):
            return json.dumps(obj)
# ============================================================================
class BaseInsertView(object):
    """Base view holding a jinja env wrapper and two template file names."""

    def __init__(self, jenv, insert_file, banner_file):
        """Store the env wrapper, insert template and banner template."""
        self.jenv, self.insert_file, self.banner_file = (jenv,
                                                         insert_file,
                                                         banner_file)

    def render_to_string(self, **kwargs):
        """Render the ``insert_file`` template with ``kwargs`` as params."""
        return self.jenv.jinja_env.get_template(self.insert_file).render(**kwargs)
# ============================================================================
class HeadInsertView(BaseInsertView):
    """View that produces the head-insert rendering callback."""

    def create_insert_func(self, wb_url,
                           wb_prefix,
                           host_prefix,
                           env,
                           is_framed,
                           coll='',
                           include_ts=True):
        """Return a ``(rule, cdx)`` callable rendering the insert template.

        The values computed here are captured once and reused on every call
        of the returned function.
        """
        # result is not used below; the call is kept as in the original
        url = wb_url.get_url()

        top_url = wb_prefix + wb_url.to_str(mod='')

        include_wombat = not wb_url.is_banner_only

        wbrequest = dict()
        wbrequest['host_prefix'] = host_prefix
        wbrequest['wb_prefix'] = wb_prefix
        wbrequest['wb_url'] = wb_url
        wbrequest['coll'] = coll
        wbrequest['env'] = env
        wbrequest['options'] = {'is_framed': is_framed}
        wbrequest['rewrite_opts'] = {}

        def make_head_insert(rule, cdx):
            params = dict(wbrequest=wbrequest,
                          cdx=cdx,
                          top_url=top_url,
                          include_ts=include_ts,
                          include_wombat=include_wombat,
                          banner_html=self.banner_file,
                          rule=rule)
            return self.render_to_string(**params)

        return make_head_insert
# ============================================================================
class TopFrameView(BaseInsertView):
    """View that renders the top frame page for framed replay."""

    def get_top_frame(self, wb_url,
                      wb_prefix,
                      host_prefix,
                      frame_mod,
                      replay_mod,
                      coll=''):
        """Render the frame template for ``wb_url`` and return the string.

        The embedded url is ``wb_url`` with ``replay_mod`` as modifier; when
        ``wb_url`` has no timestamp, ``timestamp_now()`` is used instead.
        """
        embed_url = wb_url.to_str(mod=replay_mod)

        timestamp = wb_url.timestamp if wb_url.timestamp else timestamp_now()

        options = {'frame_mod': frame_mod,
                   'replay_mod': replay_mod}

        wbrequest = dict(host_prefix=host_prefix,
                         wb_prefix=wb_prefix,
                         wb_url=wb_url,
                         coll=coll,
                         options=options)

        return self.render_to_string(embed_url=embed_url,
                                     wbrequest=wbrequest,
                                     timestamp=timestamp,
                                     url=wb_url.get_url(),
                                     banner_html=self.banner_file)