2016-03-27 17:34:45 -04:00
|
|
|
import requests
|
|
|
|
|
2017-05-08 19:17:09 -07:00
|
|
|
#from pywb.rewrite.rewrite_amf import RewriteAMFMixin
|
|
|
|
#from pywb.rewrite.rewrite_dash import RewriteDASHMixin
|
|
|
|
#from pywb.rewrite.rewrite_content import RewriteContent
|
2017-05-10 19:05:55 -07:00
|
|
|
from pywb.urlrewrite.rewriter import DefaultRewriter
|
2017-03-20 14:41:12 -07:00
|
|
|
|
2016-03-27 17:34:45 -04:00
|
|
|
from pywb.rewrite.wburl import WbUrl
|
2017-04-22 18:17:41 -07:00
|
|
|
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
|
2016-03-27 17:34:45 -04:00
|
|
|
|
2016-04-25 12:03:23 -07:00
|
|
|
from pywb.utils.wbexception import WbException
|
2016-03-27 17:34:45 -04:00
|
|
|
from pywb.utils.canonicalize import canonicalize
|
|
|
|
from pywb.utils.loaders import extract_client_cookie
|
2017-03-01 18:37:38 -08:00
|
|
|
|
|
|
|
from warcio.timeutils import http_date_to_timestamp
|
|
|
|
from warcio.bufferedreaders import BufferedReader
|
|
|
|
from warcio.recordloader import ArcWarcRecordLoader
|
2017-02-19 20:49:51 -08:00
|
|
|
|
|
|
|
from pywb.webagg.utils import BUFF_SIZE
|
2016-03-27 17:34:45 -04:00
|
|
|
|
|
|
|
from pywb.cdx.cdxobject import CDXObject
|
2016-04-25 12:03:23 -07:00
|
|
|
from pywb.framework.wbrequestresponse import WbResponse
|
|
|
|
|
2017-04-24 15:08:42 -07:00
|
|
|
from pywb.webagg.utils import MementoUtils
|
2017-02-27 19:07:51 -08:00
|
|
|
|
|
|
|
from werkzeug.http import HTTP_STATUS_CODES
|
2017-04-04 18:10:49 -07:00
|
|
|
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
2016-05-28 15:01:33 -07:00
|
|
|
|
2016-11-08 15:04:22 -08:00
|
|
|
from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest
|
|
|
|
from pywb.urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
2016-12-16 11:19:40 -08:00
|
|
|
|
2016-03-27 17:34:45 -04:00
|
|
|
|
|
|
|
from io import BytesIO
|
2016-12-08 13:44:11 -08:00
|
|
|
from copy import copy
|
2016-05-04 16:39:47 -07:00
|
|
|
|
2016-04-15 02:21:39 +00:00
|
|
|
import gevent
|
2016-04-25 12:03:23 -07:00
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
|
|
# ============================================================================
|
|
|
|
class UpstreamException(WbException):
    """WbException variant that additionally records a status code.

    :param status_code: status code stored on the instance as ``status_code``
    :param url: forwarded to ``WbException`` as ``url``
    :param details: forwarded to ``WbException`` as ``msg``
    """

    def __init__(self, status_code, url, details):
        # the status code is independent of the base-class state,
        # so it can be recorded before delegating to WbException
        self.status_code = status_code
        super(UpstreamException, self).__init__(msg=details, url=url)
|
2016-03-27 17:34:45 -04:00
|
|
|
|
|
|
|
|
2017-03-20 14:41:12 -07:00
|
|
|
# ============================================================================
|
2017-05-08 19:17:09 -07:00
|
|
|
#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
|
|
|
|
# pass
|
2017-03-20 14:41:12 -07:00
|
|
|
|
|
|
|
|
2016-03-27 17:34:45 -04:00
|
|
|
# ============================================================================
|
|
|
|
class RewriterApp(object):
    """Replay application: resolve a wayback-style url, fetch the matching
    record from an upstream service and return the rewritten content.

    ``render_content()`` is the main entry point. Subclasses are expected to
    populate ``self.paths`` (mapping of request ``type`` -> upstream url
    template) and may override the hook methods ``get_cookie_key()``,
    ``get_top_frame_params()``, ``get_query_params()``,
    ``process_query_cdx()`` and ``_add_custom_params()``.
    """

    # content type requested from the upstream for 'vi_' (video info) urls
    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    def __init__(self, framed_replay=False, jinja_env=None, config=None):
        """
        :param framed_replay: if true, replay content is served with the
            'mp_' modifier and urls with an empty modifier are treated as the
            outer frame
        :param jinja_env: optional ``JinjaEnv``; a default one is created
            when not provided
        :param config: optional dict; read for template overrides
            (``*_html`` keys) and ``enable_memento``
        """
        self.loader = ArcWarcRecordLoader()

        self.config = config or {}

        # request type -> upstream url template, see get_base_url()
        self.paths = {}

        self.framed_replay = framed_replay

        if framed_replay:
            self.frame_mod = ''
            self.replay_mod = 'mp_'
        else:
            self.frame_mod = None
            self.replay_mod = ''

        self.content_rw = DefaultRewriter('pkg://pywb/rules.yaml',
                                          self.replay_mod)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static'})

        self.jinja_env = jinja_env

        self.head_insert_view = HeadInsertView(self.jinja_env,
                                               self._html_templ('head_insert_html'),
                                               self._html_templ('banner_html'))

        self.frame_insert_view = TopFrameView(self.jinja_env,
                                              self._html_templ('frame_insert_html'),
                                              self._html_templ('banner_html'))

        self.error_view = BaseInsertView(self.jinja_env,
                                         self._html_templ('error_html'))

        self.not_found_view = BaseInsertView(self.jinja_env,
                                             self._html_templ('not_found_html'))

        self.query_view = BaseInsertView(self.jinja_env,
                                         self._html_templ('query_html'))

        # no cookie tracking by default; render_content() checks this
        self.cookie_tracker = None

        self.enable_memento = self.config.get('enable_memento')

    def _html_templ(self, name):
        """Return the template file name for config key ``name``.

        Uses the value from ``self.config`` if set, otherwise derives the
        default by replacing ``_html`` with ``.html``
        (eg. ``banner_html`` -> ``banner.html``).
        """
        value = self.config.get(name)
        if not value:
            value = name.replace('_html', '.html')
        return value

    def is_framed_replay(self, wb_url):
        """Return a truthy value if ``wb_url`` addresses the outer frame,
        ie. framed replay is on, the url modifier equals ``frame_mod``
        and the url is a replay url.
        """
        return (self.framed_replay and
                wb_url.mod == self.frame_mod and
                wb_url.is_replay())

    def render_content(self, wb_url, kwargs, environ):
        """Handle a single request and return a ``WbResponse``.

        :param wb_url: url in a form accepted by ``WbUrl()``
        :param kwargs: request params dict; ``kwargs['type']`` selects the
            upstream path and enables range handling for 'record'/'patch'
        :param environ: WSGI environ. NOTE: this dict is modified in place
            ('pywb_proxy_magic' may be set, 'HTTP_RANGE' may be removed and
            'HTTP_REFERER' may be un-rewritten)
        :raises UpstreamException: if the upstream responds with a
            status code >= 400
        """
        wb_url = WbUrl(wb_url)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        # query listing or outer frame are rendered directly from templates
        resp = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix, kwargs)
        if resp is not None:
            content_type = 'text/html'

            # if not replay outer frame, specify utf-8 charset
            if not self.is_framed_replay(wb_url):
                content_type += '; charset=utf-8'

            return WbResponse.text_response(resp, content_type=content_type)

        is_proxy = ('wsgiprox.proxy_host' in environ)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix)

            framed_replay = self.framed_replay

        # url with an empty path: redirect to the same url with path '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            scheme, netloc, path, query, frag = url_parts
            path = '/'
            url = urlunsplit((scheme, netloc, path, query, frag))
            return WbResponse.redir_response(urlrewriter.rewrite(url),
                                             '307 Temporary Redirect')

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                       self.content_rw)

        inputreq.include_post_query(wb_url.url)

        mod_url = None
        use_206 = False
        rangeres = None

        readd_range = False
        async_record_url = None

        if kwargs.get('type') in ('record', 'patch'):
            rangeres = inputreq.extract_range()

        if rangeres:
            mod_url, start, end, use_206 = rangeres

            # if bytes=0- Range request,
            # simply remove the range and still proxy
            if start == 0 and not end and use_206:
                wb_url.url = mod_url
                inputreq.url = mod_url

                del environ['HTTP_RANGE']
                readd_range = True
            else:
                # other ranges: full url is requested separately
                # in the background, see gevent.spawn() below
                async_record_url = mod_url

        # sets the 'Recorder-Skip' header on the upstream request
        skip = async_record_url is not None

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url,
                                                         urlrewriter,
                                                         cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
                r.raw.close()
            except Exception:
                # best-effort: error body is optional detail only
                pass

            if error:
                # 'replace' so that an undecodable body can not mask
                # the upstream error with a UnicodeDecodeError
                error = error.decode('utf-8', 'replace')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url,
                                    details=details)

        if async_record_url:
            environ.pop('HTTP_RANGE', '')
            new_wb_url = copy(wb_url)
            new_wb_url.url = async_record_url

            gevent.spawn(self._do_async_req,
                         inputreq,
                         new_wb_url,
                         kwargs,
                         False)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        cdx = CDXObject(r.headers.get('Webagg-Cdx').encode('utf-8'))

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        self._add_custom_params(cdx, r.headers, kwargs)

        # range header was stripped above: if a full '200' response came
        # back, add a range covering the full content length
        if readd_range and record.http_headers.get_statuscode() == '200':
            content_length = (record.http_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                record.http_headers.add_range(0, content_length,
                                              content_length)
            except (ValueError, TypeError):
                # missing or non-numeric Content-Length, leave as is
                pass

        is_ajax = self.is_ajax(environ)
        if is_ajax:
            # no head insert for ajax requests
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wb_url,
                                                   full_prefix,
                                                   host_prefix,
                                                   top_url,
                                                   environ,
                                                   framed_replay))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        result = self.content_rw(record, urlrewriter, cookie_rewriter,
                                 head_insert_func, cdx)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # status line consisting of a code only: add placeholder text
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(urlrewriter, full_prefix,
                                    memento_dt, status_headers)

            set_content_loc = True

        if set_content_loc:
            content_loc = urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                  url=cdx['url'])
            status_headers.headers.append(('Content-Location', content_loc))

        response = WbResponse(status_headers, gen)

        if is_proxy:
            response.status_headers.remove_header('Content-Security-Policy')
            response.status_headers.remove_header('X-Frame-Options')

        return response

    def _add_memento_links(self, urlrewriter, full_prefix, memento_dt, status_headers):
        """Append 'Memento-Datetime' and 'Link' (timegate + memento)
        headers to ``status_headers``.
        """
        wb_url = urlrewriter.wburl
        status_headers.headers.append(('Memento-Datetime', memento_dt))

        memento_url = full_prefix + str(wb_url)
        timegate_url = urlrewriter.get_new_url(timestamp='')

        link = [
            MementoUtils.make_link(timegate_url, 'timegate'),
            MementoUtils.make_memento_link(memento_url, 'memento', memento_dt),
        ]
        link_str = ', '.join(link)

        status_headers.headers.append(('Link', link_str))

    def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
        """Return ``full_prefix`` followed by ``wb_url`` rendered with an
        empty modifier. ``cdx`` and ``kwargs`` are unused here.
        """
        top_url = full_prefix
        top_url += wb_url.to_str(mod='')
        return top_url

    def _do_async_req(self, *args):
        """Run ``_do_req(*args)`` and read the response body to the end,
        discarding it. Used as a background (gevent) task.

        Errors are printed and not propagated; the raw stream is closed
        in all cases if a response was obtained.
        """
        # stays None if _do_req() itself fails, so that the cleanup
        # below does not reference an unbound name
        r = None
        try:
            r = self._do_req(*args)
            # read until EOF -- presumably so the upstream processes
            # the full response (TODO confirm)
            while r.raw.read(8192):
                pass

        except Exception:
            import traceback
            traceback.print_exc()

        finally:
            if r is not None:
                try:
                    r.raw.close()
                except Exception:
                    pass

    def handle_error(self, environ, ue):
        """Render a response for the exception ``ue``: the not-found page
        for status 404, the generic error page otherwise.
        """
        if ue.status_code == 404:
            return self._not_found_response(environ, ue.url)

        reason = HTTP_STATUS_CODES.get(ue.status_code, 'Unknown Error')
        status = str(ue.status_code) + ' ' + reason
        return self._error_response(environ, ue.url, ue.msg,
                                    status=status)

    def _not_found_response(self, environ, url):
        """Render the not-found template for ``url`` as a 404 response."""
        resp = self.not_found_view.render_to_string(environ, url=url)

        return WbResponse.text_response(resp, status='404 Not Found',
                                        content_type='text/html')

    def _error_response(self, environ, msg='', details='', status='404 Not Found'):
        """Render the error template with ``msg`` and ``details`` as a
        text/html response with the given ``status``.
        """
        resp = self.error_view.render_to_string(environ,
                                                err_msg=msg,
                                                err_details=details)

        return WbResponse.text_response(resp, status=status,
                                        content_type='text/html')

    def _do_req(self, inputreq, wb_url, kwargs, skip):
        """POST the reconstructed request to the upstream url and return
        the streaming ``requests`` response.

        :param skip: if true, the 'Recorder-Skip: 1' header is added
        """
        req_data = inputreq.reconstruct_request(wb_url.url)

        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if skip:
            headers['Recorder-Skip'] = '1'

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        params = {'url': wb_url.url,
                  'closest': closest}

        if wb_url.mod == 'vi_':
            params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)

        # stream=True: body is read later via r.raw by the caller
        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        return r

    def do_query(self, wb_url, kwargs):
        """Query the upstream index for ``wb_url`` and return the response
        text (requested with output=json).
        """
        params = {'url': wb_url.url,
                  'output': 'json',
                  'from': wb_url.timestamp,
                  'to': wb_url.end_timestamp}

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)
        # the index endpoint is derived from the resource endpoint
        upstream_url = upstream_url.replace('/resource/postreq', '/index')

        r = requests.get(upstream_url)

        return r.text

    def handle_query(self, environ, wb_url, kwargs):
        """Run an index query for ``wb_url`` and render the query template
        with the resulting cdx entries. Returns the rendered string.
        """
        res = self.do_query(wb_url, kwargs)

        def format_cdx(text):
            # one json object per non-empty line
            cdx_lines = text.rstrip().split('\n')
            for cdx in cdx_lines:
                if not cdx:
                    continue

                cdx = json.loads(cdx)
                self.process_query_cdx(cdx, wb_url, kwargs)
                yield cdx

        prefix = self.get_full_prefix(environ)

        params = dict(url=wb_url.url,
                      prefix=prefix,
                      cdx_lines=list(format_cdx(res)))

        extra_params = self.get_query_params(wb_url, kwargs)
        if extra_params:
            params.update(extra_params)

        return self.query_view.render_to_string(environ, **params)

    def process_query_cdx(self, cdx, wb_url, kwargs):
        """Hook called for each parsed cdx entry in handle_query().
        Does nothing by default.
        """
        return

    def get_query_params(self, wb_url, kwargs):
        """Hook returning extra template params (dict) for the query
        view, or None. Returns None by default.
        """
        return None

    def get_host_prefix(self, environ):
        """Return ``scheme://host[:port]`` for the current request.

        The host is taken from, in order: 'wsgiprox.proxy_host' (proxy),
        'HTTP_HOST', or 'SERVER_NAME' + 'SERVER_PORT', where the port is
        omitted if it is the default for the scheme (443 for https,
        80 otherwise).
        """
        scheme = environ['wsgi.url_scheme']

        # proxy host first, then the Host header
        host = environ.get('wsgiprox.proxy_host') or environ.get('HTTP_HOST')

        # if no host
        if not host:
            host = environ['SERVER_NAME']
            port = environ['SERVER_PORT']
            default_port = '443' if scheme == 'https' else '80'
            if port != default_port:
                host += ':' + port

        return scheme + '://' + host

    def get_rel_prefix(self, environ):
        """Return the 'SCRIPT_NAME' from ``environ`` with a trailing '/'.

        'SCRIPT_NAME' may be omitted from a WSGI environ when it is
        empty, so a missing key is treated as ''.
        """
        return environ.get('SCRIPT_NAME', '') + '/'

    def get_full_prefix(self, environ):
        """Return the host prefix followed by the relative prefix."""
        return self.get_host_prefix(environ) + self.get_rel_prefix(environ)

    def unrewrite_referrer(self, environ, full_prefix):
        """If the 'HTTP_REFERER' in ``environ`` starts with ``full_prefix``,
        replace it in place with the original url parsed from the
        remainder.

        :return: True if the referrer was replaced, False otherwise
        """
        referrer = environ.get('HTTP_REFERER')
        if not referrer:
            return False

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            environ['HTTP_REFERER'] = WbUrl(referrer).url
            return True

        return False

    def is_ajax(self, environ):
        """Return True if the 'X-Requested-With' header (or the
        'X-Pywb-Requested-With' fallback) equals 'XMLHttpRequest',
        compared case-insensitively.
        """
        value = (environ.get('HTTP_X_REQUESTED_WITH') or
                 environ.get('HTTP_X_PYWB_REQUESTED_WITH'))

        return bool(value) and value.lower() == 'xmlhttprequest'

    def get_base_url(self, wb_url, kwargs):
        """Return the upstream url template from ``self.paths`` selected by
        ``kwargs['type']``, formatted with ``kwargs``.

        :raises KeyError: if no path is registered for the type
        """
        req_type = kwargs.get('type')
        return self.paths[req_type].format(**kwargs)

    def get_upstream_url(self, wb_url, kwargs, params):
        """Return the base upstream url with ``params`` appended as a
        query string, using '&' if the base url already contains a '?'.
        """
        base_url = self.get_base_url(wb_url, kwargs)
        param_str = urlencode(params, True)
        if param_str:
            q_char = '&' if '?' in base_url else '?'
            base_url += q_char + param_str
        return base_url

    def get_cookie_key(self, kwargs):
        """Return the cookie tracker key for this request.

        Must be overridden when ``self.cookie_tracker`` is set.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    def _add_custom_params(self, cdx, headers, kwargs):
        """Hook to update ``cdx`` from the upstream response ``headers``.
        Does nothing by default.
        """
        pass

    def get_top_frame_params(self, wb_url, kwargs):
        """Hook returning extra params for the top frame template, or
        None. Returns None by default.
        """
        return None

    def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
        """Return rendered content for requests not served from a record:
        the query listing for query urls, or the outer frame for framed
        replay. Returns None if neither applies.
        """
        if wb_url.is_query():
            return self.handle_query(environ, wb_url, kwargs)

        if self.is_framed_replay(wb_url):
            extra_params = self.get_top_frame_params(wb_url, kwargs)
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        environ,
                                                        self.frame_mod,
                                                        self.replay_mod,
                                                        coll='',
                                                        extra_params=extra_params)

        return None
|