1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

urlrewrite app: add bottle-based app, templateview separate from pywb webapp framework

This commit is contained in:
Ilya Kreymer 2016-03-27 17:34:45 -04:00
parent 017e9802f8
commit f12be3bc91
4 changed files with 423 additions and 86 deletions

View File

@ -2,8 +2,6 @@ from gevent.monkey import patch_all; patch_all()
import requests import requests
from webagg.inputrequest import DirectWSGIInputRequest
from pywb.framework.archivalrouter import Route from pywb.framework.archivalrouter import Route
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
@ -12,22 +10,22 @@ from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from io import BytesIO from io import BytesIO
from six.moves.urllib.parse import quote, urlsplit from rewriteinputreq import RewriteInputRequest
from six import iteritems
from six.moves.urllib.parse import quote
#================================================================= # ============================================================================
class PlatformRoute(Route):
    """Route variant that copies the matcher's named groups onto the request."""

    def apply_filters(self, wbrequest, matcher):
        """Store ``matcher.groupdict()`` on ``wbrequest.matchdict``.

        :param wbrequest: request object that receives the ``matchdict``
            attribute.
        :param matcher: object exposing ``groupdict()`` (presumably the regex
            match produced when the route matched -- confirm against ``Route``).
        """
        named_groups = matcher.groupdict()
        wbrequest.matchdict = named_groups
#============================================================================= # ============================================================================
class PlatformHandler(RewriteHandler): class PlatformHandler(RewriteHandler):
def __init__(self, config): def __init__(self, config):
super(PlatformHandler, self).__init__(config) super(PlatformHandler, self).__init__(config)
@ -93,85 +91,6 @@ class PlatformHandler(RewriteHandler):
return self._make_response(wbrequest, *result) return self._make_response(wbrequest, *result)
#=============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
    """Input request that rewrites client headers to target the original url.

    Headers are read from the WSGI environ held by the base class
    (``self.env``) and adjusted so that host/origin/scheme related values
    refer to ``url`` instead of the rewriting server.
    """

    def __init__(self, env, urlkey, url, rewriter):
        """Create the request wrapper.

        :param env: WSGI environ, passed through to the base class.
        :param urlkey: key used to look up a rewrite rule in
            ``rewriter.ruleset``.
        :param url: target url; it is split once and cached in ``self.splits``.
        :param rewriter: object exposing ``ruleset.get_first_match(urlkey)``.
        """
        super(RewriteInputRequest, self).__init__(env)
        self.urlkey = urlkey
        self.url = url
        self.rewriter = rewriter

        self.splits = urlsplit(self.url)

    def get_full_request_uri(self):
        """Return the path of the target url, plus ``?query`` when present."""
        uri = self.splits.path
        if self.splits.query:
            uri += '?' + self.splits.query

        return uri

    def get_req_headers(self):
        """Build the request header dict from the WSGI environ.

        ``Host``, ``Origin`` and ``X-Forwarded-Proto`` are replaced with values
        derived from the target url, ``X-CSRFToken`` is replaced by the
        ``csrftoken`` client cookie when one is found, and ``Cookie`` is passed
        through the per-rule cookie rewrite.  Other ``HTTP_*`` entries and
        ``CONTENT_LENGTH`` / ``CONTENT_TYPE`` are converted to ``Title-Case``
        header names.  Entries with an empty value and all remaining environ
        keys are dropped.

        :return: dict mapping header name to header value.
        """
        headers = {}

        has_cookies = False

        for name, value in iteritems(self.env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = self.splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (self.splits.scheme + '://' + self.splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # Fixed: this used the undefined local ``env`` (NameError);
                # the environ lives on the instance.
                cookie_val = extract_client_cookie(self.env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = self.splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                # e.g. HTTP_ACCEPT_ENCODING -> Accept-Encoding
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            else:
                # not a header entry: skip it
                value = None

            if value:
                headers[name] = value

        # No Cookie header was sent: still give the rewrite rules a chance to
        # produce one from an empty value.
        if not has_cookies:
            value = self._req_cookie_rewrite('')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, value):
        """Apply the matching rule's ``req_cookie_rewrite`` entries to *value*.

        :param value: current cookie header value (may be ``''``).
        :return: the rewritten value, or *value* unchanged when no rule matches
            or the rule defines no cookie rewrites.
        """
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # entry lacks 'rx' or 'replace': deliberately skipped
                pass

        return value
if __name__ == "__main__": if __name__ == "__main__":
from gevent.wsgi import WSGIServer from gevent.wsgi import WSGIServer
from pywb.apps.wayback import application from pywb.apps.wayback import application

View File

@ -0,0 +1,85 @@
from webagg.inputrequest import DirectWSGIInputRequest
from pywb.utils.loaders import extract_client_cookie
from six import iteritems
from six.moves.urllib.parse import urlsplit
#=============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
    """Input request that rewrites client headers to target the original url.

    Headers are read from the WSGI environ held by the base class
    (``self.env``) and adjusted so that host/origin/scheme related values
    refer to ``url`` instead of the rewriting server.
    """

    def __init__(self, env, urlkey, url, rewriter):
        """Create the request wrapper.

        :param env: WSGI environ, passed through to the base class.
        :param urlkey: key used to look up a rewrite rule in
            ``rewriter.ruleset``.
        :param url: target url; it is split once and cached in ``self.splits``.
        :param rewriter: object exposing ``ruleset.get_first_match(urlkey)``.
        """
        super(RewriteInputRequest, self).__init__(env)
        self.urlkey = urlkey
        self.url = url
        self.rewriter = rewriter

        self.splits = urlsplit(self.url)

    def get_full_request_uri(self):
        """Return the path of the target url, plus ``?query`` when present."""
        uri = self.splits.path
        if self.splits.query:
            uri += '?' + self.splits.query

        return uri

    def get_req_headers(self):
        """Build the request header dict from the WSGI environ.

        ``Host``, ``Origin`` and ``X-Forwarded-Proto`` are replaced with values
        derived from the target url, ``X-CSRFToken`` is replaced by the
        ``csrftoken`` client cookie when one is found, and ``Cookie`` is passed
        through the per-rule cookie rewrite.  Other ``HTTP_*`` entries and
        ``CONTENT_LENGTH`` / ``CONTENT_TYPE`` are converted to ``Title-Case``
        header names.  Entries with an empty value and all remaining environ
        keys are dropped.

        :return: dict mapping header name to header value.
        """
        headers = {}

        has_cookies = False

        for name, value in iteritems(self.env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = self.splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (self.splits.scheme + '://' + self.splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # Fixed: this used the undefined local ``env`` (NameError);
                # the environ lives on the instance.
                cookie_val = extract_client_cookie(self.env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = self.splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                # e.g. HTTP_ACCEPT_ENCODING -> Accept-Encoding
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            else:
                # not a header entry: skip it
                value = None

            if value:
                headers[name] = value

        # No Cookie header was sent: still give the rewrite rules a chance to
        # produce one from an empty value.
        if not has_cookies:
            value = self._req_cookie_rewrite('')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, value):
        """Apply the matching rule's ``req_cookie_rewrite`` entries to *value*.

        :param value: current cookie header value (may be ``''``).
        :return: the rewritten value, or *value* unchanged when no rule matches
            or the rule defines no cookie rewrites.
        """
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # entry lacks 'rx' or 'replace': deliberately skipped
                pass

        return value

163
urlrewrite/rewriterapp.py Normal file
View File

@ -0,0 +1,163 @@
import requests
from bottle import request, response, HTTPError
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader
from rewriteinputreq import RewriteInputRequest
from templateview import JinjaEnv, HeadInsertView, TopFrameView
from io import BytesIO
# ============================================================================
class RewriterApp(object):
    """Bottle-based rewriting app: fetch a record upstream and rewrite it.

    Subclasses must implement :meth:`get_upstream_url` and may override
    :meth:`_add_custom_params`.
    """

    def __init__(self, framed_replay=False):
        """Set up the record loader, content rewriter and template views.

        :param framed_replay: when true, requests whose modifier equals
            ``self.frame_mod`` are answered with the top-frame template.
        """
        self.loader = ArcWarcRecordLoader()

        self.framed_replay = framed_replay
        self.frame_mod = ''
        self.replay_mod = 'mp_'

        frame_type = 'inverse' if framed_replay else False

        self.content_rewriter = RewriteContent(is_framed_replay=frame_type)

        self.jenv = JinjaEnv(globals={'static_path': 'static/__pywb'})
        self.head_insert_view = HeadInsertView(self.jenv,
                                               'head_insert.html',
                                               'banner.html')
        self.frame_insert_view = TopFrameView(self.jenv,
                                              'frame_insert.html',
                                              'banner.html')

    def render_content(self, wb_url, **kwargs):
        """Serve the rewritten content for *wb_url*.

        The client request is reconstructed, POSTed to the url returned by
        :meth:`get_upstream_url`, and the streamed response is parsed as a
        record and passed through the content rewriter.  Status and headers
        are copied onto the bottle ``response``.

        :param wb_url: url string parsed with ``WbUrl``.
        :param kwargs: extra arguments forwarded to :meth:`get_upstream_url`
            and :meth:`_add_custom_params`.
        :return: the top-frame html when framed replay applies, otherwise the
            rewritten content generator.
        :raises HTTPError: when the upstream response status is >= 400.
        """
        wb_url = WbUrl(wb_url)
        #if wb_url.mod == 'vi_':
        #    return self._get_video_info(wbrequest)

        host_prefix = self.get_host_prefix()
        rel_prefix = self.get_rel_prefix()
        full_prefix = host_prefix + rel_prefix

        if self.framed_replay and wb_url.mod == self.frame_mod:
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        self.frame_mod,
                                                        self.replay_mod)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        # Strip our own prefix from the Referer before the request is rebuilt
        self.unrewrite_referrer()

        url = wb_url.url
        urlkey = canonicalize(url)

        inputreq = RewriteInputRequest(request.environ, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        # Header values are passed as strings: requests expects str/bytes
        # header values and newer releases reject an int here.
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        upstream_url = self.get_upstream_url(url, closest, kwargs)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        if r.status_code >= 400:
            # Best-effort close of the streamed body; a failure to close must
            # not mask the upstream error reported below.
            try:
                r.raw.close()
            except Exception:
                pass

            data = dict(url=url, args=kwargs)
            raise HTTPError(r.status_code, exception=data)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        # NOTE(review): 'Memento-Datetime' may be absent, in which case None
        # is passed to http_date_to_timestamp -- confirm the helper copes.
        cdx['timestamp'] = http_date_to_timestamp(
            r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        self._add_custom_params(cdx, kwargs)

        # No head insert for ajax requests
        if self.is_ajax():
            head_insert_func = None
        else:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wb_url,
                                                   full_prefix,
                                                   host_prefix,
                                                   request.environ,
                                                   self.framed_replay))

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        status_headers, gen, is_rw = result

        response.status = int(status_headers.get_statuscode())

        for n, v in status_headers.headers:
            response.headers[n] = v

        return gen

    def get_host_prefix(self):
        """Return ``scheme://netloc`` of the current bottle request."""
        return request.urlparts.scheme + '://' + request.urlparts.netloc

    def get_rel_prefix(self):
        """Return the script name of the current bottle request."""
        return request.script_name

    def get_full_prefix(self):
        """Return the host prefix followed by the relative prefix."""
        return self.get_host_prefix() + self.get_rel_prefix()

    def unrewrite_referrer(self):
        """Strip this app's full prefix from the ``HTTP_REFERER`` environ entry.

        :return: ``True`` when the referrer started with the full prefix and
            was updated in ``request.environ``, ``False`` otherwise.
        """
        referrer = request.environ.get('HTTP_REFERER')
        if not referrer:
            return False

        full_prefix = self.get_full_prefix()

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            request.environ['HTTP_REFERER'] = referrer
            return True

        return False

    def is_ajax(self):
        """Return ``True`` when ``X-Requested-With`` is ``XMLHttpRequest``.

        The comparison is case-insensitive.
        """
        value = request.environ.get('HTTP_X_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        return False

    def get_upstream_url(self, url, closest, kwargs):
        """Return the upstream url to POST the request to; must be overridden.

        :param url: target url being replayed.
        :param closest: ``'now'`` or the timestamp from the request url.
        :param kwargs: extra arguments given to :meth:`render_content`.
        :raises NotImplementedError: always, in this base class.
        """
        # Fixed: ``NotImplemented`` is a non-callable constant, so the
        # original ``raise NotImplemented()`` produced a TypeError.
        raise NotImplementedError()

    def _add_custom_params(self, cdx, kwargs):
        """Hook for subclasses to add fields to *cdx*; no-op by default."""
        pass

170
urlrewrite/templateview.py Normal file
View File

@ -0,0 +1,170 @@
import json
import os

from jinja2 import Environment
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
from six.moves.urllib.parse import urlsplit

from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.utils.timeutils import timestamp_now
# ============================================================================
class FileOnlyPackageLoader(PackageLoader):
    """Package loader that looks templates up by file name only."""

    def get_source(self, env, template):
        """Load *template* using only its final path component.

        Any directory part of *template* is discarded before delegating to
        ``PackageLoader.get_source``.
        """
        file_name = os.path.basename(template)
        return super(FileOnlyPackageLoader, self).get_source(env, file_name)
# ============================================================================
class RelEnvironment(Environment):
    """Environment whose ``join_path()`` resolves templates relative to the parent."""

    def join_path(self, template, parent):
        """Return *template* joined onto the directory of *parent*."""
        parent_dir = os.path.dirname(parent)
        return os.path.join(parent_dir, template)
# ============================================================================
class JinjaEnv(object):
    """Build and hold a Jinja2 environment (``self.jinja_env``).

    The environment uses a ``ChoiceLoader`` over the given filesystem paths
    and packages, and is extended with the filters registered in
    :meth:`_init_filters` plus any supplied globals.
    """

    # Defaults are immutable tuples (previously lists) so a shared default
    # can never be mutated between calls.
    def __init__(self, paths=('templates', '.', '/'),
                 packages=('pywb',),
                 globals=None,
                 overlay=None):
        """Create the environment.

        :param paths: iterable of filesystem paths, one ``FileSystemLoader``
            each.
        :param packages: iterable of package names, one
            ``FileOnlyPackageLoader`` each.
        :param globals: optional dict merged into the environment globals.
        :param overlay: optional object with a ``jinja_env`` attribute; when
            given, the new environment is created via its ``overlay()``.
        """
        self._init_filters()

        loader = ChoiceLoader(self._make_loaders(paths, packages))

        if overlay:
            jinja_env = overlay.jinja_env.overlay(loader=loader,
                                                  trim_blocks=True)
        else:
            jinja_env = RelEnvironment(loader=loader, trim_blocks=True)

        jinja_env.filters.update(self.filters)
        if globals:
            jinja_env.globals.update(globals)
        self.jinja_env = jinja_env

    def _make_loaders(self, paths, packages):
        """Return path loaders followed by package loaders, in given order."""
        return ([FileSystemLoader(path) for path in paths] +
                [FileOnlyPackageLoader(package) for package in packages])

    def template_filter(self, param=None):
        """Return a decorator registering a function in ``self.filters``.

        :param param: filter name; defaults to the function's ``__name__``.
        """
        def deco(func):
            name = param or func.__name__
            self.filters[name] = func
            return func

        return deco

    def _init_filters(self):
        """Create ``self.filters`` and register the built-in template filters."""
        self.filters = {}

        @self.template_filter()
        def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
            # '%s' is handled specially: delegate to timestamp_to_sec()
            # instead of strftime().
            if format_ == '%s':
                return timestamp_to_sec(value)
            else:
                value = timestamp_to_datetime(value)
                return value.strftime(format_)

        @self.template_filter('urlsplit')
        def get_urlsplit(url):
            split = urlsplit(url)
            return split

        @self.template_filter()
        def tojson(obj):
            return json.dumps(obj)
# ============================================================================
class BaseInsertView(object):
    """Base view that renders ``insert_file`` through a ``JinjaEnv``-like object."""

    def __init__(self, jenv, insert_file, banner_file):
        """Remember the environment wrapper and the two template names.

        :param jenv: object exposing a ``jinja_env`` attribute.
        :param insert_file: name of the template rendered by this view.
        :param banner_file: banner template name, stored for subclasses.
        """
        self.jenv, self.insert_file, self.banner_file = (jenv,
                                                         insert_file,
                                                         banner_file)

    def render_to_string(self, **kwargs):
        """Render ``self.insert_file`` with *kwargs* and return the result."""
        environment = self.jenv.jinja_env
        return environment.get_template(self.insert_file).render(**kwargs)
# ============================================================================
class HeadInsertView(BaseInsertView):
    """View producing the callable that renders the head-insert template."""

    def create_insert_func(self, wb_url,
                           wb_prefix,
                           host_prefix,
                           env,
                           is_framed,
                           coll='',
                           include_ts=True):
        """Return a ``(rule, cdx)`` callable that renders the head insert.

        :param wb_url: url object providing ``get_url()``, ``to_str()`` and
            ``is_banner_only``.
        :param wb_prefix: prefix prepended to ``wb_url.to_str(mod='')`` to
            form the ``top_url`` template variable.
        :param host_prefix: exposed to the template as
            ``wbrequest['host_prefix']``.
        :param env: exposed to the template as ``wbrequest['env']``.
        :param is_framed: exposed as ``wbrequest['options']['is_framed']``.
        :param coll: exposed as ``wbrequest['coll']``.
        :param include_ts: passed through to the template unchanged.
        """
        # The result is not used below; the call is retained so behaviour
        # stays unchanged.
        wb_url.get_url()

        top_url = wb_prefix + wb_url.to_str(mod='')

        include_wombat = not wb_url.is_banner_only

        wbrequest = dict([('host_prefix', host_prefix),
                          ('wb_prefix', wb_prefix),
                          ('wb_url', wb_url),
                          ('coll', coll),
                          ('env', env),
                          ('options', {'is_framed': is_framed}),
                          ('rewrite_opts', {})])

        def make_head_insert(rule, cdx):
            # banner_file is read at call time, not when the closure is built
            template_args = dict(wbrequest=wbrequest,
                                 cdx=cdx,
                                 top_url=top_url,
                                 include_ts=include_ts,
                                 include_wombat=include_wombat,
                                 banner_html=self.banner_file,
                                 rule=rule)
            return self.render_to_string(**template_args)

        return make_head_insert
# ============================================================================
class TopFrameView(BaseInsertView):
    """View rendering the top-frame template used for framed replay."""

    def get_top_frame(self, wb_url,
                      wb_prefix,
                      host_prefix,
                      frame_mod,
                      replay_mod,
                      coll=''):
        """Render the top frame for *wb_url* and return it as a string.

        :param wb_url: url object providing ``to_str()``, ``timestamp`` and
            ``get_url()``.
        :param wb_prefix: exposed to the template as ``wbrequest['wb_prefix']``.
        :param host_prefix: exposed as ``wbrequest['host_prefix']``.
        :param frame_mod: exposed as ``wbrequest['options']['frame_mod']``.
        :param replay_mod: modifier used to build ``embed_url``; also exposed
            as ``wbrequest['options']['replay_mod']``.
        :param coll: exposed as ``wbrequest['coll']``.
        """
        embed_url = wb_url.to_str(mod=replay_mod)

        # Fall back to the current time when the url carries no timestamp
        timestamp = wb_url.timestamp if wb_url.timestamp else timestamp_now()

        options = {'frame_mod': frame_mod, 'replay_mod': replay_mod}
        wbrequest = {
            'host_prefix': host_prefix,
            'wb_prefix': wb_prefix,
            'wb_url': wb_url,
            'coll': coll,
            'options': options,
        }

        return self.render_to_string(embed_url=embed_url,
                                     wbrequest=wbrequest,
                                     timestamp=timestamp,
                                     url=wb_url.get_url(),
                                     banner_html=self.banner_file)