1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

added 'upstream' handler for connecting to another webagg when 'upstream_url' is set

output 'is_live' as string in live index
This commit is contained in:
Ilya Kreymer 2016-03-06 09:10:17 -08:00
parent 20ebccc13e
commit 0823ff4bd0
5 changed files with 54 additions and 8 deletions

View File

@ -113,7 +113,7 @@ class TestResAgg(object):
res = to_json_list(resp.text) res = to_json_list(resp.text)
res[0]['timestamp'] = '2016' res[0]['timestamp'] = '2016'
assert(res == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': True, assert(res == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}]) 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
def test_live_resource(self): def test_live_resource(self):

View File

@ -46,6 +46,9 @@ def list_routes():
#============================================================================= #=============================================================================
def err_handler(exc): def err_handler(exc):
if bottle.debug:
print(exc)
traceback.print_exc()
response.status = exc.status_code response.status = exc.status_code
response.content_type = JSON_CT response.content_type = JSON_CT
err_msg = json.dumps({'message': exc.body}) err_msg = json.dumps({'message': exc.body})

View File

@ -1,4 +1,4 @@
from webagg.responseloader import WARCPathLoader, LiveWebLoader from webagg.responseloader import WARCPathLoader, LiveWebLoader, UpstreamProxyLoader
from webagg.utils import MementoUtils from webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
@ -118,7 +118,8 @@ class ResourceHandler(IndexHandler):
class DefaultResourceHandler(ResourceHandler): class DefaultResourceHandler(ResourceHandler):
def __init__(self, index_source, warc_paths=''): def __init__(self, index_source, warc_paths=''):
loaders = [WARCPathLoader(warc_paths, index_source), loaders = [WARCPathLoader(warc_paths, index_source),
LiveWebLoader() UpstreamProxyLoader(),
LiveWebLoader(),
] ]
super(DefaultResourceHandler, self).__init__(index_source, loaders) super(DefaultResourceHandler, self).__init__(index_source, loaders)

View File

@ -51,9 +51,10 @@ class FileIndexSource(BaseIndexSource):
#============================================================================= #=============================================================================
class RemoteIndexSource(BaseIndexSource): class RemoteIndexSource(BaseIndexSource):
def __init__(self, api_url, replay_url): def __init__(self, api_url, replay_url, url_field='load_url'):
self.api_url_template = api_url self.api_url_template = api_url
self.replay_url = replay_url self.replay_url = replay_url
self.url_field = url_field
def load_index(self, params): def load_index(self, params):
api_url = res_template(self.api_url_template, params) api_url = res_template(self.api_url_template, params)
@ -65,13 +66,19 @@ class RemoteIndexSource(BaseIndexSource):
def do_load(lines): def do_load(lines):
for line in lines: for line in lines:
cdx = CDXObject(line) cdx = CDXObject(line)
cdx['load_url'] = self.replay_url.format( cdx[self.url_field] = self.replay_url.format(
timestamp=cdx['timestamp'], timestamp=cdx['timestamp'],
url=cdx['url']) url=cdx['url'])
yield cdx yield cdx
return do_load(lines) return do_load(lines)
@staticmethod
def upstream_webagg(base_url):
    """Create a RemoteIndexSource that points at another webagg instance.

    The index is queried at ``<base_url>/index?url={url}``.  Each record
    yielded by the resulting source gets an ``'upstream_url'`` field set
    to ``<base_url>/resource?url={url}&closest={timestamp}``, formatted
    with that record's own ``url`` and ``timestamp``.

    :param base_url: root url of the upstream webagg endpoint; trailing
        slashes are ignored
    :return: a RemoteIndexSource using ``'upstream_url'`` as its url field
    """
    # Strip trailing slashes so that joining the paths below never
    # produces '//index' or '//resource'.
    base_url = base_url.rstrip('/')

    api_url = base_url + '/index?url={url}'
    proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
    return RemoteIndexSource(api_url, proxy_url, 'upstream_url')
def __str__(self): def __str__(self):
return 'remote' return 'remote'
@ -84,7 +91,7 @@ class LiveIndexSource(BaseIndexSource):
cdx['timestamp'] = timestamp_now() cdx['timestamp'] = timestamp_now()
cdx['url'] = params['url'] cdx['url'] = params['url']
cdx['load_url'] = params['url'] cdx['load_url'] = params['url']
cdx['is_live'] = True cdx['is_live'] = 'true'
def live(): def live():
yield cdx yield cdx

View File

@ -1,5 +1,6 @@
from webagg.liverec import BaseRecorder from webagg.liverec import BaseRecorder
from webagg.liverec import request as remote_request from webagg.liverec import request as remote_request
from requests import request
from webagg.utils import MementoUtils from webagg.utils import MementoUtils
@ -159,6 +160,40 @@ class HeaderRecorder(BaseRecorder):
self.target_ip = ip[0] self.target_ip = ip[0]
#=============================================================================
class UpstreamProxyLoader(BaseLoader):
    """Loader that forwards the incoming request to an upstream url.

    Only handles index records that carry an ``'upstream_url'`` field;
    all other records are declined by returning ``(None, None)``.
    """

    def _load_resource(self, cdx, params):
        """Fetch the resource for ``cdx`` from its ``'upstream_url'``.

        :param cdx: index record; its ``'upstream_url'`` field (if any)
            is the url that is requested
        :param params: request params; ``params['_input_req']`` must
            provide ``get_req_method()``, ``get_req_body()`` and
            ``get_req_headers()``; the optional ``'_timeout'`` entry is
            passed through as the request timeout
        :return: ``(None, None)`` when the record has no upstream url,
            otherwise ``(headers, StreamIter)`` for the upstream response
        :raises LiveResourceException: if the upstream request fails
        """
        load_url = cdx.get('upstream_url')
        if not load_url:
            # NOTE(review): presumably lets the caller fall through to
            # the next loader -- confirm against BaseLoader.
            return None, None

        input_req = params['_input_req']

        method = input_req.get_req_method()
        data = input_req.get_req_body()
        req_headers = input_req.get_req_headers()

        try:
            # stream=True leaves the body unread so that it can be
            # handed on via upstream_res.raw; redirects are returned
            # to the caller as-is rather than followed here.
            upstream_res = request(url=load_url,
                                   method=method,
                                   stream=True,
                                   allow_redirects=False,
                                   headers=req_headers,
                                   data=data,
                                   timeout=params.get('_timeout'))
        except Exception:
            # Any failure is reported as a LiveResourceException for
            # this url; the original traceback is printed first so the
            # underlying cause is not lost.
            import traceback
            traceback.print_exc()
            raise LiveResourceException(load_url)

        out_headers = upstream_res.headers

        return out_headers, StreamIter(upstream_res.raw)

    def __str__(self):
        """Return the name identifying this loader."""
        return 'UpstreamProxyLoader'
#============================================================================= #=============================================================================
class LiveWebLoader(BaseLoader): class LiveWebLoader(BaseLoader):
SKIP_HEADERS = (b'link', SKIP_HEADERS = (b'link',