mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-31 03:04:12 +02:00
178 lines
5.9 KiB
Python
178 lines
5.9 KiB
Python
from pywb.framework.basehandlers import WbUrlHandler
|
|
from pywb.framework.wbrequestresponse import WbResponse
|
|
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
|
|
|
from pywb.rewrite.rewrite_live import LiveRewriter
|
|
from pywb.rewrite.wburl import WbUrl
|
|
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
|
|
|
|
from handlers import StaticHandler, SearchPageWbUrlHandler
|
|
from views import HeadInsertView
|
|
|
|
from pywb.utils.wbexception import WbException
|
|
|
|
import json
|
|
import requests
|
|
|
|
from rangecache import range_cache
|
|
|
|
|
|
#=================================================================
|
|
class LiveResourceException(WbException):
    """Error raised when a url could not be loaded from the live web."""

    def status(self):
        """Return the status string reported for this error."""
        return '400 Bad Live Resource'
|
|
|
|
|
|
#=================================================================
|
|
class RewriteHandler(SearchPageWbUrlHandler):
    """Handler that fetches a url from the live web and returns it rewritten.

    Requests whose wb_url modifier is ``vi_`` are answered with video
    metadata extracted via ``YoutubeDLWrapper`` instead of page content.
    """

    # Default ``Set-Cookie`` value template; ``{0}`` is replaced with the
    # ``timestamp`` entry of ``wbrequest.env['pywb.cdx']``.
    LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60'

    # Created on first use by get_video_info().
    youtubedl = None

    def __init__(self, config):
        """Set up the live rewriter from ``config``.

        Keys read here:
          ``proxyhostport`` -- optional proxy passed to ``LiveRewriter``
                               and used for the PUTMETA request.
          ``live-cookie``   -- cookie template, defaults to ``LIVE_COOKIE``;
                               a falsy value disables the cookie.
        """
        super(RewriteHandler, self).__init__(config)

        self.default_proxy = config.get('proxyhostport')

        # NOTE(review): is_frame_mode is not set in this class; presumably
        # provided by the base handler -- confirm.
        self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode,
                                     default_proxy=self.default_proxy)

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE)

        self.ydl = None

    def handle_request(self, wbrequest):
        """Render the live content for ``wbrequest``.

        Any exception raised while rendering is printed with its traceback
        and replaced by a ``LiveResourceException`` carrying the target url.

        Raises:
            LiveResourceException: if the content could not be rendered.
        """
        try:
            return self.render_content(wbrequest)

        except Exception:
            import traceback

            # format_exc() formats the exception currently being handled;
            # its positional argument is a frame *limit*, not the exception.
            err_details = traceback.format_exc()
            print(err_details)

            url = wbrequest.wb_url.url
            msg = 'Could not load the url from the live web: ' + url
            raise LiveResourceException(msg=msg, url=url)

    def _live_request_headers(self, wbrequest):
        """Return extra headers for the live request; empty by default."""
        return {}

    def render_content(self, wbrequest):
        """Fetch and rewrite the live url, going through ``range_cache``.

        ``vi_`` requests are delegated to get_video_info().
        """
        if wbrequest.wb_url.mod == 'vi_':
            return self.get_video_info(wbrequest)

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        req_headers = self._live_request_headers(wbrequest)

        # Expose the url of the referring page (without the wb prefix)
        # to the rewriter through the request environ.
        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url

        def do_req():
            # Full (non-range) fetch; also handed to range_cache as the
            # callback it may use to obtain the complete response.
            result = self.rewriter.fetch_request(
                wbrequest.wb_url.url,
                wbrequest.urlrewriter,
                head_insert_func=head_insert_func,
                req_headers=req_headers,
                env=wbrequest.env)

            return self._make_response(wbrequest, *result)

        cdx = dict(url=wbrequest.wb_url.url)

        range_status, range_iter = range_cache(wbrequest, cdx, do_req)

        # No range response available: serve the regular response.
        if not range_status or not range_iter:
            return do_req()

        return self._make_response(wbrequest, range_status, range_iter, False)

    def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
        """Build the ``WbResponse``, adding the timestamp cookie if enabled.

        ``is_rewritten`` is accepted for signature compatibility with the
        tuple returned by ``fetch_request`` and is not used here.
        """
        # if cookie set, pass recorded timestamp info via cookie
        # so that client side may be able to access it
        # used by framed mode to update frame banner
        if self.live_cookie:
            cdx = wbrequest.env.get('pywb.cdx')
            if cdx:
                value = self.live_cookie.format(cdx['timestamp'])
                status_headers.headers.append(('Set-Cookie', value))

        return WbResponse(status_headers, gen)

    def get_video_info(self, wbrequest):
        """Return youtube-dl metadata for the requested url as JSON text.

        When a default proxy is configured, the same JSON is also sent
        through that proxy with a custom ``PUTMETA`` request.
        """
        if not self.youtubedl:
            self.youtubedl = YoutubeDLWrapper()

        info = self.youtubedl.extract_info(wbrequest.wb_url.url)

        content_type = 'application/vnd.youtube-dl_formats+json'
        metadata = json.dumps(info)

        # The PUTMETA request is only meaningful with a proxy, and
        # ``proxies`` is only defined in that case, so the whole request
        # stays inside this branch.
        if self.default_proxy:
            proxies = {'http': self.default_proxy}

            headers = self._live_request_headers(wbrequest)
            headers['Content-Type'] = content_type

            url = HttpsUrlRewriter.remove_https(wbrequest.wb_url.url)

            # verify=False: certificate verification is disabled for
            # this request.
            requests.request(method='PUTMETA',
                             url=url,
                             data=metadata,
                             headers=headers,
                             proxies=proxies,
                             verify=False)

        return WbResponse.text_response(metadata, content_type=content_type)

    def __str__(self):
        return 'Live Web Rewrite Handler'
|
|
|
|
|
|
#=================================================================
|
|
class YoutubeDLWrapper(object):
    """ Wraps the youtube_dl import and its use.

    Importing youtube_dl currently replaces the global
    HTMLParser.locatestarttagend regex with a different regex
    that doesn't quite work.

    This wrapper keeps both regexes and makes sure the youtube_dl one is
    only installed while YoutubeDL is extracting, and the original one is
    in place otherwise.
    """

    def __init__(self):
        import HTMLParser as htmlparser
        self.htmlparser = htmlparser

        # Regex in place before youtube_dl is imported.
        self.orig_tagregex = htmlparser.locatestarttagend

        from youtube_dl import YoutubeDL

        # Regex in place after the import, i.e. the one youtube_dl uses.
        self.ydl_tagregex = htmlparser.locatestarttagend

        # Put the original back until extract_info() needs the other one.
        htmlparser.locatestarttagend = self.orig_tagregex

        options = {'simulate': True,
                   'youtube_include_dash_manifest': False}

        self.ydl = YoutubeDL(options)
        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """Return the info youtube_dl extracts for ``url``.

        The youtube_dl regex is installed for the duration of the call and
        the original regex is restored afterwards, even if extraction
        raises.
        """
        try:
            self.htmlparser.locatestarttagend = self.ydl_tagregex
            return self.ydl.extract_info(url)
        finally:
            self.htmlparser.locatestarttagend = self.orig_tagregex
|
|
|
|
|
|
#=================================================================
|
|
def create_live_rewriter_app(config=None):
    """Create the router for a stand-alone live rewriting app.

    Two routes are registered: ``rewrite`` served by ``RewriteHandler``
    and ``static/default`` serving files from ``pywb/static/``.

    :param config: optional settings dict passed to ``RewriteHandler``;
                   a new empty dict is used when omitted.
    :return: an ``ArchivalRouter`` for host path ``http://localhost:8080``
    """
    # A fresh dict per call: a ``{}`` default would be shared between
    # calls and any mutation by a handler would leak into later apps.
    if config is None:
        config = {}

    routes = [
        Route('rewrite', RewriteHandler(config)),
        Route('static/default', StaticHandler('pywb/static/')),
    ]

    return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])
|