1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-23 22:52:25 +01:00
pywb/urlrewrite/rewriterapp.py
Ilya Kreymer 0370470e68 urlrewrite: http range: support skipping record for range requests not starting at 0-
and performing async request,
support converting unbounded 0- to non-ranged and back
2016-04-15 02:21:39 +00:00

270 lines
8.7 KiB
Python

import requests
from bottle import request, response, HTTPError
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader
from urlrewrite.rewriteinputreq import RewriteInputRequest
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView
from io import BytesIO
import gevent
# ============================================================================
class RewriterApp(object):
    """Fetch a record for a url from an upstream endpoint and rewrite it.

    The request/response objects used are bottle's global ``request`` and
    ``response``. Subclasses must implement ``get_upstream_url`` and may
    override ``_add_custom_params`` and ``get_top_frame_params``.
    """

    def __init__(self, framed_replay=False, jinja_env=None):
        """Set up the record loader, content rewriter and template views.

        :param framed_replay: if truthy, the content rewriter is created with
            ``is_framed_replay='inverse'`` and top-frame responses are enabled
            in ``handle_custom_response``.
        :param jinja_env: optional ``JinjaEnv``; a default one with a
            ``static_path`` global is created when omitted.
        """
        self.loader = ArcWarcRecordLoader()

        self.framed_replay = framed_replay
        self.frame_mod = ''
        self.replay_mod = 'mp_'

        frame_type = 'inverse' if framed_replay else False

        self.content_rewriter = RewriteContent(is_framed_replay=frame_type)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})

        self.jinja_env = jinja_env
        self.head_insert_view = HeadInsertView(self.jinja_env,
                                               'head_insert.html',
                                               'banner.html')
        self.frame_insert_view = TopFrameView(self.jinja_env,
                                              'frame_insert.html',
                                              'banner.html')

    def render_content(self, wb_url, **kwargs):
        """Load the record for ``wb_url`` from upstream and rewrite it.

        :param wb_url: url string, parsed with ``WbUrl``.
        :param kwargs: passed through to ``handle_custom_response``,
            ``_do_req`` / ``get_upstream_url`` and ``_add_custom_params``.
            When ``kwargs['type'] == 'record'``, Range requests get special
            handling (see inline comments).
        :returns: the custom response from ``handle_custom_response`` if it
            is not None, otherwise the content generator returned by
            ``rewrite_content``. Status and headers are set on bottle's
            global ``response``.
        :raises HTTPError: if the upstream responds with a status >= 400;
            the ``exception`` attribute carries a dict with ``url``,
            ``args`` and ``error`` keys.
        """
        wb_url = WbUrl(wb_url)
        #if wb_url.mod == 'vi_':
        #    return self._get_video_info(wbrequest)

        host_prefix = self.get_host_prefix()
        rel_prefix = self.get_rel_prefix()
        full_prefix = host_prefix + rel_prefix

        resp = self.handle_custom_response(wb_url, full_prefix,
                                           host_prefix, kwargs)
        if resp is not None:
            return resp

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer()

        url = wb_url.url
        urlkey = canonicalize(url)

        inputreq = RewriteInputRequest(request.environ, urlkey, url,
                                       self.content_rewriter)

        readd_range = False
        async_record_url = None

        if kwargs.get('type') == 'record':
            rangeres = inputreq.extract_range()

            if rangeres:
                mod_url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    url = mod_url
                    wb_url.url = mod_url
                    inputreq.url = mod_url

                    del request.environ['HTTP_RANGE']
                    readd_range = True
                else:
                    # any other range: the main request is sent with the
                    # skip flag set, and mod_url is requested separately
                    # in a greenlet below
                    async_record_url = mod_url

        r = self._do_req(inputreq, url, wb_url, kwargs,
                         async_record_url is not None)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
                r.raw.close()
            except Exception:
                # best-effort: still report the upstream status even if
                # the error body cannot be read
                pass

            # 'replace' so that a non UTF-8 error body can not raise here
            # and hide the upstream status
            error = error.decode('utf-8', 'replace') if error else ''

            data = dict(url=url, args=kwargs, error=error)
            raise HTTPError(r.status_code, exception=data)

        if async_record_url:
            # the background request is made without the Range header
            request.environ.pop('HTTP_RANGE', '')
            gevent.spawn(self._do_async_req,
                         inputreq,
                         async_record_url,
                         wb_url,
                         kwargs,
                         False)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(
            r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        self._add_custom_params(cdx, r.headers, kwargs)

        if readd_range:
            # the Range header was stripped above, so add the range back
            # onto the response headers, covering the full content length
            content_length = (record.status_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                record.status_headers.add_range(0, content_length,
                                                content_length)
            except (ValueError, TypeError):
                # missing or non-numeric Content-Length: leave headers as-is
                pass

        if self.is_ajax():
            head_insert_func = None
        else:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wb_url,
                                                   full_prefix,
                                                   host_prefix,
                                                   request.environ,
                                                   self.framed_replay))

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        status_headers, gen, is_rw = result

        response.status = int(status_headers.get_statuscode())

        for n, v in status_headers.headers:
            response.add_header(n, v)

        return gen

    def _do_async_req(self, *args):
        """Run ``_do_req(*args)`` and read the response body to the end.

        Spawned in a greenlet by ``render_content``. Errors are printed via
        ``traceback.print_exc()`` and not propagated; the raw stream is
        closed if a response was obtained.
        NOTE(review): presumably the body is drained so that the upstream
        can record the complete response -- confirm.
        """
        r = None
        try:
            r = self._do_req(*args)
            # read and discard until an empty chunk signals end of stream
            while r.raw.read(8192):
                pass
        except Exception:
            import traceback
            traceback.print_exc()
        finally:
            # r stays None if _do_req() itself failed
            if r is not None:
                try:
                    r.raw.close()
                except Exception:
                    pass

    def _do_req(self, inputreq, url, wb_url, kwargs, skip):
        """POST the reconstructed client request to the upstream url.

        :param inputreq: object providing ``reconstruct_request(url)``.
        :param url: url to reconstruct the request for.
        :param wb_url: ``WbUrl``; its timestamp is used as ``closest``
            unless ``is_latest_replay()`` is true, in which case ``'now'``.
        :param kwargs: passed through to ``get_upstream_url``.
        :param skip: if truthy, a ``Recorder-Skip: 1`` header is added.
        :returns: the streaming ``requests`` response.
        """
        req_data = inputreq.reconstruct_request(url)

        # header values are passed as strings, as requests expects
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if skip:
            headers['Recorder-Skip'] = '1'

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        upstream_url = self.get_upstream_url(url, wb_url, closest, kwargs)
        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        return r

    def get_host_prefix(self):
        """Return ``scheme://netloc`` of the current bottle request."""
        return request.urlparts.scheme + '://' + request.urlparts.netloc

    def get_rel_prefix(self):
        """Return the script name of the current bottle request."""
        return request.script_name

    def get_full_prefix(self):
        """Return the host prefix followed by the relative prefix."""
        return self.get_host_prefix() + self.get_rel_prefix()

    def unrewrite_referrer(self):
        """Replace a rewritten Referer header with the original url.

        If ``HTTP_REFERER`` starts with the full prefix, the prefix is
        stripped, the remainder is parsed as a ``WbUrl`` and its ``url`` is
        stored back into ``request.environ``.

        :returns: True if the referrer was replaced, False otherwise.
        """
        referrer = request.environ.get('HTTP_REFERER')
        if not referrer:
            return False

        full_prefix = self.get_full_prefix()

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            request.environ['HTTP_REFERER'] = WbUrl(referrer).url
            return True

        return False

    def is_ajax(self):
        """Return True if the request is marked as an XMLHttpRequest.

        Checks ``X-Requested-With`` first, then ``X-Pywb-Requested-With``,
        comparing case-insensitively against ``xmlhttprequest``.
        """
        value = (request.environ.get('HTTP_X_REQUESTED_WITH') or
                 request.environ.get('HTTP_X_PYWB_REQUESTED_WITH'))

        return bool(value) and value.lower() == 'xmlhttprequest'

    def get_upstream_url(self, url, wb_url, closest, kwargs):
        """Return the upstream url to POST to; must be overridden.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def _add_custom_params(self, cdx, headers, kwargs):
        """Hook for adding fields to ``cdx``; sets ``is_live`` to 'true'."""
        cdx['is_live'] = 'true'

    def get_top_frame_params(self, wb_url, kwargs):
        """Hook returning extra params for the top frame; None here."""
        return None

    def handle_custom_response(self, wb_url, full_prefix, host_prefix, kwargs):
        """Return the top-frame response for framed replay, if applicable.

        :returns: the result of ``frame_insert_view.get_top_frame`` when
            framed replay is enabled and ``wb_url.mod`` equals
            ``self.frame_mod``; otherwise None.
        """
        if self.framed_replay and wb_url.mod == self.frame_mod:
            extra_params = self.get_top_frame_params(wb_url, kwargs)
            return self.frame_insert_view.get_top_frame(
                wb_url,
                full_prefix,
                host_prefix,
                request.environ,
                self.frame_mod,
                self.replay_mod,
                coll='',
                extra_params=extra_params)

        return None