diff --git a/urlrewrite/platformhandler.py b/urlrewrite/platformhandler.py index 5b29bacf..02e0c117 100644 --- a/urlrewrite/platformhandler.py +++ b/urlrewrite/platformhandler.py @@ -2,8 +2,6 @@ from gevent.monkey import patch_all; patch_all() import requests -from webagg.inputrequest import DirectWSGIInputRequest - from pywb.framework.archivalrouter import Route from pywb.rewrite.rewrite_content import RewriteContent @@ -12,22 +10,22 @@ from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.webapp.live_rewrite_handler import RewriteHandler from pywb.utils.canonicalize import canonicalize from pywb.utils.timeutils import http_date_to_timestamp -from pywb.utils.loaders import extract_client_cookie from pywb.cdx.cdxobject import CDXObject from io import BytesIO -from six.moves.urllib.parse import quote, urlsplit -from six import iteritems +from rewriteinputreq import RewriteInputRequest + +from six.moves.urllib.parse import quote -#================================================================= +# ============================================================================ class PlatformRoute(Route): def apply_filters(self, wbrequest, matcher): wbrequest.matchdict = matcher.groupdict() -#============================================================================= +# ============================================================================ class PlatformHandler(RewriteHandler): def __init__(self, config): super(PlatformHandler, self).__init__(config) @@ -93,85 +91,6 @@ class PlatformHandler(RewriteHandler): return self._make_response(wbrequest, *result) -#============================================================================= -class RewriteInputRequest(DirectWSGIInputRequest): - def __init__(self, env, urlkey, url, rewriter): - super(RewriteInputRequest, self).__init__(env) - self.urlkey = urlkey - self.url = url - self.rewriter = rewriter - - self.splits = urlsplit(self.url) - - def get_full_request_uri(self): - uri = self.splits.path - if 
# urlrewrite/rewriteinputreq.py
from webagg.inputrequest import DirectWSGIInputRequest
from pywb.utils.loaders import extract_client_cookie

from six import iteritems
from six.moves.urllib.parse import urlsplit


# ============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
    """Input request that re-targets incoming WSGI headers at ``url``.

    Headers that describe the serving host (Host, Origin,
    X-Forwarded-Proto) are replaced with values derived from ``url``, and
    the Cookie header is passed through the ``req_cookie_rewrite`` rules
    of the first ruleset entry matching ``urlkey``.
    """

    def __init__(self, env, urlkey, url, rewriter):
        """
        :param env: WSGI environ of the incoming request
        :param urlkey: key passed to ``rewriter.ruleset.get_first_match()``
        :param url: target url; split once and cached as ``self.splits``
        :param rewriter: object exposing a ``ruleset`` attribute
        """
        super(RewriteInputRequest, self).__init__(env)
        self.urlkey = urlkey
        self.url = url
        self.rewriter = rewriter

        self.splits = urlsplit(self.url)

    def get_full_request_uri(self):
        """Return the path of the target url, plus '?query' when present."""
        uri = self.splits.path
        if self.splits.query:
            uri += '?' + self.splits.query

        return uri

    def get_req_headers(self):
        """Build the outgoing request headers from the WSGI environ.

        :return: dict mapping header name to value; entries whose value is
                 empty are omitted
        """
        headers = {}

        has_cookies = False

        for name, value in iteritems(self.env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = self.splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (self.splits.scheme + '://' + self.splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # Prefer the token stored in the client's 'csrftoken' cookie.
                # Must read from self.env: there is no local 'env' in this
                # method, so the bare name raised NameError.
                cookie_val = extract_client_cookie(self.env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = self.splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                # HTTP_ACCEPT_ENCODING -> Accept-Encoding
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                # these two are not HTTP_-prefixed in the environ
                name = name.title().replace('_', '-')

            else:
                # not a request header (wsgi.*, SERVER_*, etc.)
                value = None

            if value:
                headers[name] = value

        # The rewrite rules may produce a cookie even when the client sent none
        if not has_cookies:
            value = self._req_cookie_rewrite('')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, value):
        """Apply the matching rule's ``req_cookie_rewrite`` entries to value.

        :param value: current Cookie header value ('' if none was sent)
        :return: the rewritten value, or ``value`` unchanged when no rule
                 matches or the rule defines no cookie rewrites
        """
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # entry lacks 'rx' or 'replace': skip it (best-effort)
                pass

        return value
# urlrewrite/rewriterapp.py
import requests

from bottle import request, response, HTTPError

from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter

from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie

from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader

from rewriteinputreq import RewriteInputRequest
from templateview import JinjaEnv, HeadInsertView, TopFrameView

from io import BytesIO


# ============================================================================
class RewriterApp(object):
    """Base replay app built on bottle's global ``request``/``response``.

    ``render_content()`` POSTs the reconstructed client request to an
    upstream url, parses the streamed response as a record, and returns the
    rewritten content. Subclasses must implement ``get_upstream_url()``.
    """

    def __init__(self, framed_replay=False):
        """
        :param framed_replay: when true, requests using ``frame_mod`` get the
                              top-frame page instead of replayed content
        """
        self.loader = ArcWarcRecordLoader()

        self.framed_replay = framed_replay
        self.frame_mod = ''
        self.replay_mod = 'mp_'

        frame_type = 'inverse' if framed_replay else False

        self.content_rewriter = RewriteContent(is_framed_replay=frame_type)

        self.jenv = JinjaEnv(globals={'static_path': 'static/__pywb'})
        self.head_insert_view = HeadInsertView(self.jenv,
                                               'head_insert.html',
                                               'banner.html')
        self.frame_insert_view = TopFrameView(self.jenv,
                                              'frame_insert.html',
                                              'banner.html')

    def render_content(self, wb_url, **kwargs):
        """Fetch ``wb_url`` from upstream and return the rewritten content.

        :param wb_url: string form of the request url, parsed with ``WbUrl``
        :param kwargs: passed to ``get_upstream_url()`` and
                       ``_add_custom_params()``
        :return: the rendered top-frame page, or the rewritten content
                 generator
        :raises HTTPError: when the upstream response status is >= 400
        """
        wb_url = WbUrl(wb_url)
        # TODO: the 'vi_' (video info) modifier is not handled here yet

        host_prefix = self.get_host_prefix()
        rel_prefix = self.get_rel_prefix()
        full_prefix = host_prefix + rel_prefix

        if self.framed_replay and wb_url.mod == self.frame_mod:
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        self.frame_mod,
                                                        self.replay_mod)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer()

        url = wb_url.url
        urlkey = canonicalize(url)

        inputreq = RewriteInputRequest(request.environ, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        # requests validates header values as str/bytes, so the length is
        # sent as a string rather than an int
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        upstream_url = self.get_upstream_url(url, closest, kwargs)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        if r.status_code >= 400:
            # best-effort release of the streamed connection before raising
            try:
                r.raw.close()
            except Exception:
                pass

            data = dict(url=url, args=kwargs)
            raise HTTPError(r.status_code, exception=data)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        # NOTE(review): the header may be absent (None) -- confirm that
        # http_date_to_timestamp() accepts None
        cdx['timestamp'] = http_date_to_timestamp(
            r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        self._add_custom_params(cdx, kwargs)

        # ajax responses get no head insert
        if self.is_ajax():
            head_insert_func = None
        else:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wb_url,
                                                   full_prefix,
                                                   host_prefix,
                                                   request.environ,
                                                   self.framed_replay))

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        status_headers, gen, is_rw = result

        response.status = int(status_headers.get_statuscode())

        for n, v in status_headers.headers:
            response.headers[n] = v

        return gen

    def get_host_prefix(self):
        """Return 'scheme://netloc' of the current request."""
        return request.urlparts.scheme + '://' + request.urlparts.netloc

    def get_rel_prefix(self):
        """Return the script name of the current request."""
        return request.script_name

    def get_full_prefix(self):
        """Return the host prefix followed by the relative prefix."""
        return self.get_host_prefix() + self.get_rel_prefix()

    def unrewrite_referrer(self):
        """Strip this app's prefix from the Referer header, in place.

        :return: True if ``HTTP_REFERER`` in the environ was modified
        """
        referrer = request.environ.get('HTTP_REFERER')
        if not referrer:
            return False

        full_prefix = self.get_full_prefix()

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            request.environ['HTTP_REFERER'] = referrer
            return True

        return False

    def is_ajax(self):
        """Return True if X-Requested-With is 'XMLHttpRequest' (any case)."""
        value = request.environ.get('HTTP_X_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        return False

    def get_upstream_url(self, url, closest, kwargs):
        """Return the upstream url to POST the request to; must be overridden.

        :param url: target url being replayed
        :param closest: 'now' for latest replay, else the requested timestamp
        :param kwargs: keyword arguments given to ``render_content()``
        :raises NotImplementedError: always, in this base class
        """
        # NotImplemented is a non-callable constant; the exception type is
        # NotImplementedError
        raise NotImplementedError()

    def _add_custom_params(self, cdx, kwargs):
        """Hook for subclasses to add fields to ``cdx``; no-op by default."""
        pass
# urlrewrite/templateview.py
# (the first import previously ended in a trailing comma -- a SyntaxError)
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.utils.timeutils import timestamp_now
from six.moves.urllib.parse import urlsplit

from jinja2 import Environment
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader

import json
import os


# ============================================================================
class FileOnlyPackageLoader(PackageLoader):
    """Package loader that looks templates up by file name only."""
    def get_source(self, env, template):
        # drop any directory component before delegating to PackageLoader
        file_ = os.path.basename(template)
        return super(FileOnlyPackageLoader, self).get_source(env, file_)


# ============================================================================
class RelEnvironment(Environment):
    """Override join_path() to enable relative template paths."""
    def join_path(self, template, parent):
        return os.path.join(os.path.dirname(parent), template)


# ============================================================================
class JinjaEnv(object):
    """Wrapper that builds a jinja2 environment with this module's filters.

    The resulting environment is available as ``self.jinja_env``.
    """

    def __init__(self, paths=('templates', '.', '/'),
                       packages=('pywb',),
                       globals=None,
                       overlay=None):
        """
        :param paths: directories searched, in order, via FileSystemLoader
        :param packages: packages searched, after paths, via
                         FileOnlyPackageLoader
        :param globals: optional dict merged into the environment globals
        :param overlay: optional object with a ``jinja_env`` attribute; when
                        given, the new environment is an overlay of it
        """
        # defaults are tuples (only iterated) to avoid shared mutable defaults

        self._init_filters()

        loader = ChoiceLoader(self._make_loaders(paths, packages))

        if overlay:
            jinja_env = overlay.jinja_env.overlay(loader=loader,
                                                  trim_blocks=True)
        else:
            jinja_env = RelEnvironment(loader=loader, trim_blocks=True)

        jinja_env.filters.update(self.filters)
        if globals:
            jinja_env.globals.update(globals)
        self.jinja_env = jinja_env

    def _make_loaders(self, paths, packages):
        """Return file-system loaders for paths, then package loaders."""
        loaders = [FileSystemLoader(path) for path in paths]
        loaders.extend(FileOnlyPackageLoader(package) for package in packages)
        return loaders

    def template_filter(self, param=None):
        """Decorator registering a function in ``self.filters``.

        :param param: filter name; defaults to the function's ``__name__``
        """
        def deco(func):
            name = param or func.__name__
            self.filters[name] = func
            return func

        return deco

    def _init_filters(self):
        """Create ``self.filters`` and register the built-in filters."""
        self.filters = {}

        @self.template_filter()
        def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
            # '%s' is special-cased to timestamp_to_sec() rather than being
            # passed to strftime()
            if format_ == '%s':
                return timestamp_to_sec(value)

            return timestamp_to_datetime(value).strftime(format_)

        @self.template_filter('urlsplit')
        def get_urlsplit(url):
            return urlsplit(url)

        @self.template_filter()
        def tojson(obj):
            return json.dumps(obj)


# ============================================================================
class BaseInsertView(object):
    """Base for views that render ``insert_file`` through a JinjaEnv."""

    def __init__(self, jenv, insert_file, banner_file):
        """
        :param jenv: JinjaEnv supplying ``jinja_env``
        :param insert_file: name of the template to render
        :param banner_file: banner template name, handed to the template as
                            ``banner_html``
        """
        self.jenv = jenv
        self.insert_file = insert_file
        self.banner_file = banner_file

    def render_to_string(self, **kwargs):
        """Render ``insert_file`` with kwargs as the template context."""
        template = self.jenv.jinja_env.get_template(self.insert_file)
        return template.render(**kwargs)


# ============================================================================
class HeadInsertView(BaseInsertView):
    """View producing the head-insert html for a replayed page."""

    def create_insert_func(self, wb_url,
                                 wb_prefix,
                                 host_prefix,
                                 env,
                                 is_framed,
                                 coll='',
                                 include_ts=True):
        """Return a ``func(rule, cdx)`` that renders the head insert.

        The request-level context is built once here; only ``rule`` and
        ``cdx`` vary per call of the returned function.
        """
        top_url = wb_prefix
        top_url += wb_url.to_str(mod='')

        include_wombat = not wb_url.is_banner_only

        wbrequest = {'host_prefix': host_prefix,
                     'wb_prefix': wb_prefix,
                     'wb_url': wb_url,
                     'coll': coll,
                     'env': env,
                     'options': {'is_framed': is_framed},
                     'rewrite_opts': {}
                    }

        def make_head_insert(rule, cdx):
            return (self.render_to_string(wbrequest=wbrequest,
                                          cdx=cdx,
                                          top_url=top_url,
                                          include_ts=include_ts,
                                          include_wombat=include_wombat,
                                          banner_html=self.banner_file,
                                          rule=rule))
        return make_head_insert


# ============================================================================
class TopFrameView(BaseInsertView):
    """View rendering the top-frame page used for framed replay."""

    def get_top_frame(self, wb_url,
                            wb_prefix,
                            host_prefix,
                            frame_mod,
                            replay_mod,
                            coll=''):
        """Render the top frame embedding ``wb_url`` with ``replay_mod``.

        :return: the rendered template as a string
        """
        embed_url = wb_url.to_str(mod=replay_mod)

        # fall back to the current time when the url has no timestamp
        timestamp = wb_url.timestamp or timestamp_now()

        wbrequest = {'host_prefix': host_prefix,
                     'wb_prefix': wb_prefix,
                     'wb_url': wb_url,
                     'coll': coll,

                     'options': {'frame_mod': frame_mod,
                                 'replay_mod': replay_mod},
                    }

        params = dict(embed_url=embed_url,
                      wbrequest=wbrequest,
                      timestamp=timestamp,
                      url=wb_url.get_url(),
                      banner_html=self.banner_file)

        return self.render_to_string(**params)