From 7deba42851be57d4b721ffa6eb64d173b31ace01 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 24 Mar 2016 16:33:03 -0400 Subject: [PATCH] add urlrewrite pywb-adapter PlatformHandler for using traditional pywb setup with webrecorder components recorder and webagg --- setup.py | 4 +- urlrewrite/__init__.py | 0 urlrewrite/platformhandler.py | 180 ++++++++++++++++++++++++++++++++++ 3 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 urlrewrite/__init__.py create mode 100644 urlrewrite/platformhandler.py diff --git a/setup.py b/setup.py index 7599878c..fa3083d8 100755 --- a/setup.py +++ b/setup.py @@ -31,12 +31,14 @@ setup( provides=[ 'webagg', 'recorder', + 'urlrewrite', + 'proxy', ], install_requires=[ 'pywb==0.30.0', ], dependency_links=[ - 'git+https://github.com/ikreymer/pywb.git@py3#egg=pywb-0.30.0-py3', + 'git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.30.0-develop', ], zip_safe=True, entry_points=""" diff --git a/urlrewrite/__init__.py b/urlrewrite/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/urlrewrite/platformhandler.py b/urlrewrite/platformhandler.py new file mode 100644 index 00000000..5b29bacf --- /dev/null +++ b/urlrewrite/platformhandler.py @@ -0,0 +1,180 @@ +from gevent.monkey import patch_all; patch_all() + +import requests + +from webagg.inputrequest import DirectWSGIInputRequest + +from pywb.framework.archivalrouter import Route + +from pywb.rewrite.rewrite_content import RewriteContent +from pywb.rewrite.wburl import WbUrl +from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.webapp.live_rewrite_handler import RewriteHandler +from pywb.utils.canonicalize import canonicalize +from pywb.utils.timeutils import http_date_to_timestamp +from pywb.utils.loaders import extract_client_cookie +from pywb.cdx.cdxobject import CDXObject + +from io import BytesIO + +from six.moves.urllib.parse import quote, urlsplit +from six import iteritems + + +#================================================================= +class PlatformRoute(Route): + def apply_filters(self, wbrequest, matcher): + wbrequest.matchdict = matcher.groupdict() + + +#============================================================================= +class PlatformHandler(RewriteHandler): + def __init__(self, config): + super(PlatformHandler, self).__init__(config) + self.upstream_url = config.get('upstream_url') + self.loader = ArcWarcRecordLoader() + + framed = config.get('framed_replay') + self.content_rewriter = RewriteContent(is_framed_replay=framed) + + def render_content(self, wbrequest): + if wbrequest.wb_url.mod == 'vi_': + return self._get_video_info(wbrequest) + + ref_wburl_str = wbrequest.extract_referrer_wburl_str() + if ref_wburl_str: + wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url + + urlkey = canonicalize(wbrequest.wb_url.url) + url = wbrequest.wb_url.url + + inputreq = RewriteInputRequest(wbrequest.env, urlkey, url, + self.content_rewriter) + + req_data = inputreq.reconstruct_request(url) + + headers = {'Content-Length': len(req_data), + 'Content-Type': 'application/request'} + + if wbrequest.wb_url.is_latest_replay(): + closest = 'now' + else: + closest = wbrequest.wb_url.timestamp + + upstream_url = self.upstream_url.format(url=quote(url), + closest=closest, + #coll=wbrequest.coll, + **wbrequest.matchdict) + + r = requests.post(upstream_url, + data=BytesIO(req_data), + headers=headers, + stream=True, + allow_redirects=False) + + r.raise_for_status() + + record = self.loader.parse_record_stream(r.raw) + + cdx = CDXObject() + cdx['urlkey'] = urlkey + cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime')) + cdx['url'] = url + + head_insert_func = self.head_insert_view.create_insert_func(wbrequest) + result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter, + record.status_headers, + record.stream, + head_insert_func, + urlkey, + cdx) + + status_headers, gen, is_rw = result + return self._make_response(wbrequest, *result) + + +#============================================================================= +class RewriteInputRequest(DirectWSGIInputRequest): + def __init__(self, env, urlkey, url, rewriter): + super(RewriteInputRequest, self).__init__(env) + self.urlkey = urlkey + self.url = url + self.rewriter = rewriter + + self.splits = urlsplit(self.url) + + def get_full_request_uri(self): + uri = self.splits.path + if self.splits.query: + uri += '?' + self.splits.query + + return uri + + def get_req_headers(self): + headers = {} + + has_cookies = False + + for name, value in iteritems(self.env): + if name == 'HTTP_HOST': + name = 'Host' + value = self.splits.netloc + + elif name == 'HTTP_ORIGIN': + name = 'Origin' + value = (self.splits.scheme + '://' + self.splits.netloc) + + elif name == 'HTTP_X_CSRFTOKEN': + name = 'X-CSRFToken' + cookie_val = extract_client_cookie(env, 'csrftoken') + if cookie_val: + value = cookie_val + + elif name == 'HTTP_X_FORWARDED_PROTO': + name = 'X-Forwarded-Proto' + value = self.splits.scheme + + elif name == 'HTTP_COOKIE': + name = 'Cookie' + value = self._req_cookie_rewrite(value) + has_cookies = True + + elif name.startswith('HTTP_'): + name = name[5:].title().replace('_', '-') + + elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'): + name = name.title().replace('_', '-') + + else: + value = None + + if value: + headers[name] = value + + if not has_cookies: + value = self._req_cookie_rewrite('') + if value: + headers['Cookie'] = value + + return headers + + def _req_cookie_rewrite(self, value): + rule = self.rewriter.ruleset.get_first_match(self.urlkey) + if not rule or not rule.req_cookie_rewrite: + return value + + for cr in rule.req_cookie_rewrite: + try: + value = cr['rx'].sub(cr['replace'], value) + except KeyError: + pass + + return value + + +if __name__ == "__main__": + from gevent.wsgi import WSGIServer + from pywb.apps.wayback import application + + server = WSGIServer(('', 8090), application) + server.serve_forever()