mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
urlrewrite app: add bottle-based app, templateview separate from pywb webapp framework
This commit is contained in:
parent
017e9802f8
commit
f12be3bc91
@ -2,8 +2,6 @@ from gevent.monkey import patch_all; patch_all()
|
||||
|
||||
import requests
|
||||
|
||||
from webagg.inputrequest import DirectWSGIInputRequest
|
||||
|
||||
from pywb.framework.archivalrouter import Route
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
@ -12,22 +10,22 @@ from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.webapp.live_rewrite_handler import RewriteHandler
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.timeutils import http_date_to_timestamp
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from six.moves.urllib.parse import quote, urlsplit
|
||||
from six import iteritems
|
||||
from rewriteinputreq import RewriteInputRequest
|
||||
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
|
||||
#=================================================================
|
||||
# ============================================================================
|
||||
class PlatformRoute(Route):
    """Route whose filter step records the matcher's named groups."""

    def apply_filters(self, wbrequest, matcher):
        """Attach ``matcher.groupdict()`` to the request as ``matchdict``."""
        named_groups = matcher.groupdict()
        wbrequest.matchdict = named_groups
|
||||
|
||||
|
||||
#=============================================================================
|
||||
# ============================================================================
|
||||
class PlatformHandler(RewriteHandler):
|
||||
def __init__(self, config):
|
||||
super(PlatformHandler, self).__init__(config)
|
||||
@ -93,85 +91,6 @@ class PlatformHandler(RewriteHandler):
|
||||
return self._make_response(wbrequest, *result)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class RewriteInputRequest(DirectWSGIInputRequest):
    """Input request whose URI and headers are rebuilt for the target url.

    Host-dependent headers are derived from ``url`` (split once into
    ``self.splits``) rather than copied from the incoming WSGI environ.
    """

    def __init__(self, env, urlkey, url, rewriter):
        """Store the request context.

        :param env: WSGI environ, forwarded to the base class
        :param urlkey: key passed to ``rewriter.ruleset.get_first_match()``
        :param url: target url of the request
        :param rewriter: object exposing a ``ruleset`` attribute
        """
        super(RewriteInputRequest, self).__init__(env)
        self.urlkey = urlkey
        self.url = url
        self.rewriter = rewriter

        self.splits = urlsplit(self.url)

    def get_full_request_uri(self):
        """Return the path of the target url, plus ``?query`` if present."""
        uri = self.splits.path
        if self.splits.query:
            uri += '?' + self.splits.query

        return uri

    def get_req_headers(self):
        """Build the request headers dict from the WSGI environ.

        ``Host``, ``Origin`` and ``X-Forwarded-Proto`` are replaced with
        values taken from the target url, ``X-CSRFToken`` is replaced by the
        ``csrftoken`` client cookie when one is found, and ``Cookie`` is
        passed through ``_req_cookie_rewrite()``. Other ``HTTP_*`` keys and
        ``CONTENT_LENGTH`` / ``CONTENT_TYPE`` are renamed to header form.
        All remaining environ keys, and any header with a falsy value,
        are dropped.
        """
        headers = {}

        has_cookies = False

        for name, value in iteritems(self.env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = self.splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (self.splits.scheme + '://' + self.splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # Fixed: previously referenced an undefined bare ``env``,
                # raising NameError whenever this header was present.
                cookie_val = extract_client_cookie(self.env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = self.splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                # e.g. HTTP_ACCEPT_ENCODING -> Accept-Encoding
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            else:
                # not a request header: skip
                value = None

            if value:
                headers[name] = value

        if not has_cookies:
            # Cookie rewrite rules still get a chance to add a cookie
            # when the client sent none.
            value = self._req_cookie_rewrite('')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, value):
        """Apply the matching rule's ``req_cookie_rewrite`` entries to ``value``.

        Returns ``value`` unchanged when no rule matches ``self.urlkey`` or
        the rule has no ``req_cookie_rewrite``. Entries missing the ``rx`` or
        ``replace`` key are skipped.
        """
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # incomplete rewrite entry: deliberately ignored
                pass

        return value
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from gevent.wsgi import WSGIServer
|
||||
from pywb.apps.wayback import application
|
||||
|
85
urlrewrite/rewriteinputreq.py
Normal file
85
urlrewrite/rewriteinputreq.py
Normal file
@ -0,0 +1,85 @@
|
||||
from webagg.inputrequest import DirectWSGIInputRequest
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
|
||||
from six import iteritems
|
||||
from six.moves.urllib.parse import urlsplit
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class RewriteInputRequest(DirectWSGIInputRequest):
    """Input request whose URI and headers are rebuilt for the target url.

    Host-dependent headers are derived from ``url`` (split once into
    ``self.splits``) rather than copied from the incoming WSGI environ.
    """

    def __init__(self, env, urlkey, url, rewriter):
        """Store the request context.

        :param env: WSGI environ, forwarded to the base class
        :param urlkey: key passed to ``rewriter.ruleset.get_first_match()``
        :param url: target url of the request
        :param rewriter: object exposing a ``ruleset`` attribute
        """
        super(RewriteInputRequest, self).__init__(env)
        self.urlkey = urlkey
        self.url = url
        self.rewriter = rewriter

        self.splits = urlsplit(self.url)

    def get_full_request_uri(self):
        """Return the path of the target url, plus ``?query`` if present."""
        uri = self.splits.path
        if self.splits.query:
            uri += '?' + self.splits.query

        return uri

    def get_req_headers(self):
        """Build the request headers dict from the WSGI environ.

        ``Host``, ``Origin`` and ``X-Forwarded-Proto`` are replaced with
        values taken from the target url, ``X-CSRFToken`` is replaced by the
        ``csrftoken`` client cookie when one is found, and ``Cookie`` is
        passed through ``_req_cookie_rewrite()``. Other ``HTTP_*`` keys and
        ``CONTENT_LENGTH`` / ``CONTENT_TYPE`` are renamed to header form.
        All remaining environ keys, and any header with a falsy value,
        are dropped.
        """
        headers = {}

        has_cookies = False

        for name, value in iteritems(self.env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = self.splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (self.splits.scheme + '://' + self.splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # Fixed: previously referenced an undefined bare ``env``,
                # raising NameError whenever this header was present.
                cookie_val = extract_client_cookie(self.env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = self.splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                # e.g. HTTP_ACCEPT_ENCODING -> Accept-Encoding
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            else:
                # not a request header: skip
                value = None

            if value:
                headers[name] = value

        if not has_cookies:
            # Cookie rewrite rules still get a chance to add a cookie
            # when the client sent none.
            value = self._req_cookie_rewrite('')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, value):
        """Apply the matching rule's ``req_cookie_rewrite`` entries to ``value``.

        Returns ``value`` unchanged when no rule matches ``self.urlkey`` or
        the rule has no ``req_cookie_rewrite``. Entries missing the ``rx`` or
        ``replace`` key are skipped.
        """
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # incomplete rewrite entry: deliberately ignored
                pass

        return value
|
||||
|
163
urlrewrite/rewriterapp.py
Normal file
163
urlrewrite/rewriterapp.py
Normal file
@ -0,0 +1,163 @@
|
||||
import requests
|
||||
|
||||
from bottle import request, response, HTTPError
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.timeutils import http_date_to_timestamp
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from rewriteinputreq import RewriteInputRequest
|
||||
from templateview import JinjaEnv, HeadInsertView, TopFrameView
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RewriterApp(object):
    """Bottle-based rewriting app: fetches a record upstream and rewrites it.

    Subclasses must implement ``get_upstream_url()`` and may override
    ``_add_custom_params()``. The methods read the current request from
    bottle's global ``request`` and write to the global ``response``.
    """

    def __init__(self, framed_replay=False):
        """Create the record loader, content rewriter and insert views.

        :param framed_replay: when true, urls whose modifier equals
            ``frame_mod`` are answered with the top frame page instead of
            rewritten content.
        """
        self.loader = ArcWarcRecordLoader()

        self.framed_replay = framed_replay
        self.frame_mod = ''
        self.replay_mod = 'mp_'

        frame_type = 'inverse' if framed_replay else False

        self.content_rewriter = RewriteContent(is_framed_replay=frame_type)

        self.jenv = JinjaEnv(globals={'static_path': 'static/__pywb'})
        self.head_insert_view = HeadInsertView(self.jenv, 'head_insert.html', 'banner.html')
        self.frame_insert_view = TopFrameView(self.jenv, 'frame_insert.html', 'banner.html')

    def render_content(self, wb_url, **kwargs):
        """Render the rewritten content (or the top frame) for ``wb_url``.

        :param wb_url: wayback-style url string, parsed with ``WbUrl``
        :param kwargs: extra route params, forwarded to
            ``get_upstream_url()`` and ``_add_custom_params()``
        :returns: the top frame html, or the rewritten content generator
        :raises HTTPError: when the upstream responds with status >= 400
        """
        wb_url = WbUrl(wb_url)
        # TODO: video info ('vi_' modifier) is not handled here yet
        #if wb_url.mod == 'vi_':
        #    return self._get_video_info(wbrequest)

        host_prefix = self.get_host_prefix()
        rel_prefix = self.get_rel_prefix()
        full_prefix = host_prefix + rel_prefix

        if self.framed_replay and wb_url.mod == self.frame_mod:
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        self.frame_mod,
                                                        self.replay_mod)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer()

        url = wb_url.url
        urlkey = canonicalize(url)

        inputreq = RewriteInputRequest(request.environ, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        # Header values are sent as strings: recent versions of requests
        # reject non-str/bytes header values.
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        upstream_url = self.get_upstream_url(url, closest, kwargs)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        if r.status_code >= 400:
            # Best-effort release of the streamed connection; a failure to
            # close must not mask the upstream error reported below.
            try:
                r.raw.close()
            except Exception:
                pass

            data = dict(url=url, args=kwargs)
            raise HTTPError(r.status_code, exception=data)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        # NOTE(review): assumes the upstream always sends Memento-Datetime;
        # behavior of http_date_to_timestamp(None) is not visible here.
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        self._add_custom_params(cdx, kwargs)

        if self.is_ajax():
            # no head insert for XMLHttpRequest responses
            head_insert_func = None
        else:
            head_insert_func = (self.head_insert_view.
                                    create_insert_func(wb_url,
                                                       full_prefix,
                                                       host_prefix,
                                                       request.environ,
                                                       self.framed_replay))

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        status_headers, gen, is_rw = result

        response.status = int(status_headers.get_statuscode())

        for n, v in status_headers.headers:
            response.headers[n] = v

        return gen

    def get_host_prefix(self):
        """Return ``scheme://netloc`` of the current bottle request."""
        return request.urlparts.scheme + '://' + request.urlparts.netloc

    def get_rel_prefix(self):
        """Return the script name of the current bottle request."""
        return request.script_name

    def get_full_prefix(self):
        """Return the host prefix followed by the relative prefix."""
        return self.get_host_prefix() + self.get_rel_prefix()

    def unrewrite_referrer(self):
        """Strip the app's full prefix from the request's ``HTTP_REFERER``.

        Updates ``request.environ`` in place.

        :returns: True if the referrer started with the full prefix and was
            rewritten, False otherwise (including when there is no referrer)
        """
        referrer = request.environ.get('HTTP_REFERER')
        if not referrer:
            return False

        full_prefix = self.get_full_prefix()

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            request.environ['HTTP_REFERER'] = referrer
            return True

        return False

    def is_ajax(self):
        """Return True if ``X-Requested-With`` is ``XMLHttpRequest`` (any case)."""
        value = request.environ.get('HTTP_X_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        return False

    def get_upstream_url(self, url, closest, kwargs):
        """Return the upstream url to POST the reconstructed request to.

        Must be implemented by subclasses.

        :param url: target url being replayed
        :param closest: ``'now'`` or the requested timestamp
        :param kwargs: dict of extra params given to ``render_content()``
        :raises NotImplementedError: always, in this base class
        """
        # Fixed: ``NotImplemented`` is a constant, not an exception class;
        # calling it raised TypeError instead of the intended error.
        raise NotImplementedError()

    def _add_custom_params(self, cdx, kwargs):
        """Hook for subclasses to add extra fields to ``cdx``; no-op here."""
        pass
|
170
urlrewrite/templateview.py
Normal file
170
urlrewrite/templateview.py
Normal file
@ -0,0 +1,170 @@
|
||||
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
from six.moves.urllib.parse import urlsplit
|
||||
|
||||
from jinja2 import Environment
|
||||
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class FileOnlyPackageLoader(PackageLoader):
    """Package loader that looks templates up by file name only."""

    def get_source(self, env, template):
        """Drop any directory part of ``template`` and defer to the parent."""
        filename = os.path.basename(template)
        parent_loader = super(FileOnlyPackageLoader, self)
        return parent_loader.get_source(env, filename)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RelEnvironment(Environment):
    """Environment whose join_path() resolves templates relative to the parent."""

    def join_path(self, template, parent):
        """Return ``template`` joined onto the directory of ``parent``."""
        parent_dir = os.path.dirname(parent)
        return os.path.join(parent_dir, template)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class JinjaEnv(object):
    """Wrapper that builds a jinja2 environment with loaders and filters.

    The resulting environment is available as ``self.jinja_env`` and the
    registered filter functions as ``self.filters``.
    """

    # Defaults are immutable tuples (previously lists) so they cannot be
    # shared and mutated across calls; they are only ever iterated.
    def __init__(self, paths=('templates', '.', '/'),
                 packages=('pywb',),
                 globals=None,
                 overlay=None):
        """Create the environment.

        :param paths: filesystem paths searched for templates, in order
        :param packages: package names searched after ``paths``
        :param globals: optional dict merged into the environment globals
        :param overlay: optional object with a ``jinja_env`` attribute; when
            given, the new environment is an overlay of that environment
            instead of a fresh ``RelEnvironment``
        """
        self._init_filters()

        loader = ChoiceLoader(self._make_loaders(paths, packages))

        if overlay:
            jinja_env = overlay.jinja_env.overlay(loader=loader, trim_blocks=True)
        else:
            jinja_env = RelEnvironment(loader=loader, trim_blocks=True)

        jinja_env.filters.update(self.filters)
        if globals:
            jinja_env.globals.update(globals)
        self.jinja_env = jinja_env

    def _make_loaders(self, paths, packages):
        """Return filesystem loaders for ``paths`` followed by package loaders."""
        # add loaders for paths
        loaders = [FileSystemLoader(path) for path in paths]

        # add loaders for all specified packages
        loaders.extend(FileOnlyPackageLoader(package) for package in packages)

        return loaders

    def template_filter(self, param=None):
        """Return a decorator registering a function in ``self.filters``.

        :param param: filter name; defaults to the function's ``__name__``
        """
        def deco(func):
            name = param or func.__name__
            self.filters[name] = func
            return func

        return deco

    def _init_filters(self):
        """Reset ``self.filters`` and register the built-in filters."""
        self.filters = {}

        @self.template_filter()
        def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
            # '%s' is treated as a request for seconds rather than being
            # passed to strftime()
            if format_ == '%s':
                return timestamp_to_sec(value)
            else:
                value = timestamp_to_datetime(value)
                return value.strftime(format_)

        @self.template_filter('urlsplit')
        def get_urlsplit(url):
            split = urlsplit(url)
            return split

        @self.template_filter()
        def tojson(obj):
            return json.dumps(obj)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class BaseInsertView(object):
    """Common base for views that render an insert template."""

    def __init__(self, jenv, insert_file, banner_file):
        """Remember the environment wrapper and the two template names."""
        self.jenv, self.insert_file, self.banner_file = (
            jenv, insert_file, banner_file)

    def render_to_string(self, **kwargs):
        """Render ``self.insert_file`` with ``kwargs`` and return the result."""
        environment = self.jenv.jinja_env
        return environment.get_template(self.insert_file).render(**kwargs)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class HeadInsertView(BaseInsertView):
    """Insert view that builds the head-insert rendering callback."""

    def create_insert_func(self, wb_url,
                           wb_prefix,
                           host_prefix,
                           env,
                           is_framed,
                           coll='',
                           include_ts=True):
        """Return a ``(rule, cdx)`` callable that renders the insert template.

        The request context passed to the template is built once here and
        shared by every call of the returned function.
        """
        # Result is not used; the call itself is kept from the original code.
        wb_url.get_url()

        top_url = wb_prefix + wb_url.to_str(mod='')
        include_wombat = not wb_url.is_banner_only

        wbrequest = dict(host_prefix=host_prefix,
                         wb_prefix=wb_prefix,
                         wb_url=wb_url,
                         coll=coll,
                         env=env,
                         options=dict(is_framed=is_framed),
                         rewrite_opts={})

        def make_head_insert(rule, cdx):
            template_args = dict(wbrequest=wbrequest,
                                 cdx=cdx,
                                 top_url=top_url,
                                 include_ts=include_ts,
                                 include_wombat=include_wombat,
                                 banner_html=self.banner_file,
                                 rule=rule)
            return self.render_to_string(**template_args)

        return make_head_insert
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TopFrameView(BaseInsertView):
    """Insert view that renders the top-level frame page."""

    def get_top_frame(self, wb_url,
                      wb_prefix,
                      host_prefix,
                      frame_mod,
                      replay_mod,
                      coll=''):
        """Render the frame template for ``wb_url`` and return the result.

        The embedded url is ``wb_url`` with the ``replay_mod`` modifier; the
        timestamp falls back to ``timestamp_now()`` when ``wb_url`` has none.
        """
        embed_url = wb_url.to_str(mod=replay_mod)

        timestamp = wb_url.timestamp or timestamp_now()

        options = {'frame_mod': frame_mod,
                   'replay_mod': replay_mod}

        wbrequest = {'host_prefix': host_prefix,
                     'wb_prefix': wb_prefix,
                     'wb_url': wb_url,
                     'coll': coll,
                     'options': options}

        return self.render_to_string(embed_url=embed_url,
                                     wbrequest=wbrequest,
                                     timestamp=timestamp,
                                     url=wb_url.get_url(),
                                     banner_html=self.banner_file)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user