mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
loaders: return full WARC record in response, no need for upstream response handler
add UpstreamAggIndexSource to simplify upstream aggregator config, add test for upstream config; bottle app: wrap in a ResAggApp, allow multiple bottle apps; py2: non-gevent concurrency not supported
This commit is contained in:
parent
0823ff4bd0
commit
c1895ae70f
@ -8,9 +8,12 @@ from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSou
|
||||
from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
|
||||
from webagg.aggregator import DirectoryIndexSource
|
||||
|
||||
from webagg.app import add_route, application
|
||||
from webagg.app import ResAggApp
|
||||
from webagg.utils import MementoUtils
|
||||
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
from io import BytesIO
|
||||
|
||||
import webtest
|
||||
import bottle
|
||||
|
||||
@ -30,32 +33,32 @@ testapp = None
|
||||
def setup_module(self):
|
||||
live_source = SimpleAggregator({'live': LiveIndexSource()})
|
||||
live_handler = DefaultResourceHandler(live_source)
|
||||
add_route('/live', live_handler)
|
||||
app = ResAggApp()
|
||||
app.add_route('/live', live_handler)
|
||||
|
||||
source1 = GeventTimeoutAggregator(sources)
|
||||
handler1 = DefaultResourceHandler(source1, to_path('testdata/'))
|
||||
add_route('/many', handler1)
|
||||
app.add_route('/many', handler1)
|
||||
|
||||
source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))})
|
||||
handler2 = DefaultResourceHandler(source2, to_path('testdata/'))
|
||||
add_route('/posttest', handler2)
|
||||
app.add_route('/posttest', handler2)
|
||||
|
||||
source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
|
||||
handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
|
||||
|
||||
add_route('/fallback', HandlerSeq([handler3,
|
||||
app.add_route('/fallback', HandlerSeq([handler3,
|
||||
handler2,
|
||||
live_handler]))
|
||||
|
||||
add_route('/seq', HandlerSeq([handler3,
|
||||
app.add_route('/seq', HandlerSeq([handler3,
|
||||
handler2]))
|
||||
|
||||
add_route('/empty', HandlerSeq([]))
|
||||
add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
|
||||
app.add_route('/empty', HandlerSeq([]))
|
||||
app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
|
||||
|
||||
application.debug = True
|
||||
global testapp
|
||||
testapp = webtest.TestApp(application)
|
||||
testapp = webtest.TestApp(app.application)
|
||||
|
||||
|
||||
def to_json_list(text):
|
||||
@ -66,6 +69,15 @@ class TestResAgg(object):
|
||||
def setup(self):
|
||||
self.testapp = testapp
|
||||
|
||||
def _check_uri_date(self, resp, uri, dt):
    """Assert that the WARC record in the response body has the expected headers.

    The response body is parsed as a ``WARC/1.0`` record and its
    ``WARC-Target-URI`` and ``WARC-Date`` headers are checked.

    :param resp: test response whose ``body`` holds the raw WARC record bytes
    :param uri: expected value of the ``WARC-Target-URI`` header
    :param dt: expected ``WARC-Date`` string, or ``True`` to only require
               that some non-empty date is present (e.g. for live captures
               whose timestamp is not known in advance)
    """
    buff = BytesIO(resp.body)
    status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
    assert status_headers.get_header('WARC-Target-URI') == uri

    warc_date = status_headers.get_header('WARC-Date')
    if dt is True:
        # Truthiness rejects an empty value and also a missing header
        # (presumably get_header yields None in that case -- TODO confirm),
        # which a plain `!= ''` comparison would silently accept.
        assert warc_date
    else:
        assert warc_date == dt
|
||||
|
||||
def test_list_routes(self):
|
||||
resp = self.testapp.get('/')
|
||||
res = resp.json
|
||||
@ -120,9 +132,9 @@ class TestResAgg(object):
|
||||
headers = {'foo': 'bar'}
|
||||
resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
assert resp.headers['Source-Coll'] == 'live'
|
||||
|
||||
self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
@ -136,9 +148,9 @@ class TestResAgg(object):
|
||||
resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
|
||||
OrderedDict([('foo', 'bar')]))
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
assert resp.headers['Source-Coll'] == 'live'
|
||||
|
||||
self._check_uri_date(resp, 'http://httpbin.org/post', True)
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
@ -151,9 +163,10 @@ class TestResAgg(object):
|
||||
def test_agg_select_mem_1(self):
|
||||
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'rhiz'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.vvork.com/'
|
||||
assert resp.headers['WARC-Date'] == '2014-10-06T18:43:57Z'
|
||||
assert resp.headers['Source-Coll'] == 'rhiz'
|
||||
|
||||
self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z')
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original')
|
||||
@ -164,9 +177,10 @@ class TestResAgg(object):
|
||||
def test_agg_select_mem_2(self):
|
||||
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'ia'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
|
||||
assert resp.headers['WARC-Date'] == '2016-01-10T13:48:55Z'
|
||||
assert resp.headers['Source-Coll'] == 'ia'
|
||||
|
||||
self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z')
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
|
||||
@ -177,9 +191,9 @@ class TestResAgg(object):
|
||||
def test_agg_select_live(self):
|
||||
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
assert resp.headers['Source-Coll'] == 'live'
|
||||
|
||||
self._check_uri_date(resp, 'http://vvork.com/', True)
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
@ -189,9 +203,9 @@ class TestResAgg(object):
|
||||
def test_agg_select_local(self):
|
||||
resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'local'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
|
||||
assert resp.headers['Source-Coll'] == 'local'
|
||||
|
||||
self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
|
||||
@ -208,9 +222,9 @@ Host: iana.org
|
||||
|
||||
resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'local'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
|
||||
assert resp.headers['Source-Coll'] == 'local'
|
||||
|
||||
self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
|
||||
@ -227,9 +241,9 @@ Host: httpbin.org
|
||||
|
||||
resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
assert resp.headers['Source-Coll'] == 'live'
|
||||
|
||||
self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
@ -252,9 +266,9 @@ foo=bar&test=abc"""
|
||||
|
||||
resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'post'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
assert resp.headers['Source-Coll'] == 'post'
|
||||
|
||||
self._check_uri_date(resp, 'http://httpbin.org/post', True)
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
@ -271,8 +285,10 @@ foo=bar&test=abc"""
|
||||
|
||||
resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'post'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
|
||||
assert resp.headers['Source-Coll'] == 'post'
|
||||
|
||||
self._check_uri_date(resp, 'http://httpbin.org/post', True)
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
@ -285,8 +301,10 @@ foo=bar&test=abc"""
|
||||
def test_agg_seq_fallback_1(self):
|
||||
resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert resp.headers['Source-Coll'] == 'live'
|
||||
|
||||
self._check_uri_date(resp, 'http://www.iana.org/', True)
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
@ -296,9 +314,9 @@ foo=bar&test=abc"""
|
||||
def test_agg_seq_fallback_2(self):
|
||||
resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'example'
|
||||
assert resp.headers['WARC-Date'] == '2016-02-25T04:23:29Z'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
|
||||
assert resp.headers['Source-Coll'] == 'example'
|
||||
|
||||
self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z')
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'
|
||||
@ -318,11 +336,14 @@ foo=bar&test=abc"""
|
||||
def test_agg_local_revisit(self):
|
||||
resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'local'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://example.com'
|
||||
assert resp.headers['WARC-Date'] == '2014-01-27T17:12:51Z'
|
||||
assert resp.headers['WARC-Refers-To-Target-URI'] == 'http://example.com'
|
||||
assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
|
||||
assert resp.headers['Source-Coll'] == 'local'
|
||||
|
||||
buff = BytesIO(resp.body)
|
||||
status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
|
||||
assert status_headers.get_header('WARC-Target-URI') == 'http://example.com'
|
||||
assert status_headers.get_header('WARC-Date') == '2014-01-27T17:12:51Z'
|
||||
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://example.com'
|
||||
assert status_headers.get_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z'
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||
|
@ -9,6 +9,7 @@ from .testutils import json_list, to_path
|
||||
import json
|
||||
import pytest
|
||||
import time
|
||||
import six
|
||||
|
||||
from webagg.handlers import IndexHandler
|
||||
|
||||
@ -39,8 +40,13 @@ agg_nf = {'simple': SimpleAggregator(nf),
|
||||
'processes': ThreadedTimeoutAggregator(nf, timeout=5.0, use_processes=True),
|
||||
}
|
||||
|
||||
#def pytest_generate_tests(metafunc):
|
||||
# metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
# Per the commit notes, non-gevent concurrency is not supported on Python 2,
# so drop the thread- and process-based aggregators from every test matrix.
if six.PY2:
    for _agg_group in (aggs, agg_tm, agg_nf):
        for _unsupported in ('threaded', 'processes'):
            del _agg_group[_unsupported]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
|
@ -1,7 +1,7 @@
|
||||
from webagg.liverec import request as remote_request
|
||||
|
||||
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from bottle import route, request, response, default_app, abort
|
||||
from bottle import route, request, response, abort, Bottle
|
||||
import bottle
|
||||
|
||||
import traceback
|
||||
@ -11,49 +11,46 @@ JSON_CT = 'application/json; charset=utf-8'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
route_dict = {}
|
||||
class ResAggApp(object):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.application = Bottle()
|
||||
self.application.default_error_handler = self.err_handler
|
||||
self.route_dict = {}
|
||||
|
||||
@self.application.route('/')
|
||||
def list_routes():
|
||||
return self.route_dict
|
||||
|
||||
#=============================================================================
|
||||
def add_route(path, handler):
|
||||
@route([path, path + '/<mode:path>'], 'ANY')
|
||||
@wrap_error
|
||||
def direct_input_request(mode=''):
|
||||
params = dict(request.query)
|
||||
params['mode'] = mode
|
||||
params['_input_req'] = DirectWSGIInputRequest(request.environ)
|
||||
return handler(params)
|
||||
def add_route(self, path, handler):
|
||||
@self.application.route([path, path + '/<mode:path>'], 'ANY')
|
||||
@wrap_error
|
||||
def direct_input_request(mode=''):
|
||||
params = dict(request.query)
|
||||
params['mode'] = mode
|
||||
params['_input_req'] = DirectWSGIInputRequest(request.environ)
|
||||
return handler(params)
|
||||
|
||||
@route([path + '/postreq', path + '/<mode:path>/postreq'], 'POST')
|
||||
@wrap_error
|
||||
def post_fullrequest(mode=''):
|
||||
params = dict(request.query)
|
||||
params['mode'] = mode
|
||||
params['_input_req'] = POSTInputRequest(request.environ)
|
||||
return handler(params)
|
||||
@self.application.route([path + '/postreq', path + '/<mode:path>/postreq'], 'POST')
|
||||
@wrap_error
|
||||
def post_fullrequest(mode=''):
|
||||
params = dict(request.query)
|
||||
params['mode'] = mode
|
||||
params['_input_req'] = POSTInputRequest(request.environ)
|
||||
return handler(params)
|
||||
|
||||
global route_dict
|
||||
handler_dict = handler.get_supported_modes()
|
||||
route_dict[path] = handler_dict
|
||||
route_dict[path + '/postreq'] = handler_dict
|
||||
handler_dict = handler.get_supported_modes()
|
||||
self.route_dict[path] = handler_dict
|
||||
self.route_dict[path + '/postreq'] = handler_dict
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@route('/')
|
||||
def list_routes():
|
||||
return route_dict
|
||||
|
||||
|
||||
#=============================================================================
|
||||
def err_handler(exc):
|
||||
if bottle.debug:
|
||||
print(exc)
|
||||
traceback.print_exc()
|
||||
response.status = exc.status_code
|
||||
response.content_type = JSON_CT
|
||||
err_msg = json.dumps({'message': exc.body})
|
||||
response.headers['ResErrors'] = err_msg
|
||||
return err_msg
|
||||
def err_handler(self, exc):
|
||||
if bottle.debug:
|
||||
print(exc)
|
||||
traceback.print_exc()
|
||||
response.status = exc.status_code
|
||||
response.content_type = JSON_CT
|
||||
err_msg = json.dumps({'message': exc.body})
|
||||
response.headers['ResErrors'] = err_msg
|
||||
return err_msg
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -99,8 +96,3 @@ def wrap_error(func):
|
||||
return wrap_func
|
||||
|
||||
|
||||
#=============================================================================
|
||||
application = default_app()
|
||||
application.default_error_handler = err_handler
|
||||
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
from webagg.responseloader import WARCPathLoader, LiveWebLoader, UpstreamProxyLoader
|
||||
from webagg.responseloader import WARCPathLoader, LiveWebLoader
|
||||
from webagg.utils import MementoUtils
|
||||
from pywb.utils.wbexception import BadRequestException, WbException
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
@ -118,7 +118,7 @@ class ResourceHandler(IndexHandler):
|
||||
class DefaultResourceHandler(ResourceHandler):
|
||||
def __init__(self, index_source, warc_paths=''):
|
||||
loaders = [WARCPathLoader(warc_paths, index_source),
|
||||
UpstreamProxyLoader(),
|
||||
# UpstreamProxyLoader(),
|
||||
LiveWebLoader(),
|
||||
]
|
||||
super(DefaultResourceHandler, self).__init__(index_source, loaders)
|
||||
|
@ -66,23 +66,33 @@ class RemoteIndexSource(BaseIndexSource):
|
||||
def do_load(lines):
|
||||
for line in lines:
|
||||
cdx = CDXObject(line)
|
||||
cdx[self.url_field] = self.replay_url.format(
|
||||
timestamp=cdx['timestamp'],
|
||||
url=cdx['url'])
|
||||
self._set_load_url(cdx)
|
||||
yield cdx
|
||||
|
||||
return do_load(lines)
|
||||
|
||||
@staticmethod
|
||||
def upstream_webagg(base_url):
|
||||
api_url = base_url + '/index?url={url}'
|
||||
proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
|
||||
return RemoteIndexSource(api_url, proxy_url, 'upstream_url')
|
||||
def _set_load_url(self, cdx):
|
||||
cdx[self.url_field] = self.replay_url.format(
|
||||
timestamp=cdx['timestamp'],
|
||||
url=cdx['url'])
|
||||
|
||||
def __str__(self):
|
||||
return 'remote'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class UpstreamAggIndexSource(RemoteIndexSource):
    """Remote index source pointing at another (upstream) aggregator.

    Both endpoints are derived from a single base url: ``/index`` is used
    for the index query and ``/resource`` as the per-entry replay url.
    """

    def __init__(self, base_url):
        """
        :param base_url: root url of the upstream aggregator collection,
                         without a trailing slash
        """
        super(UpstreamAggIndexSource, self).__init__(
            base_url + '/index?url={url}',
            base_url + '/resource?url={url}&closest={timestamp}',
            'filename')

    def _set_load_url(self, cdx):
        """Store the upstream resource url as ``filename`` at offset ``'0'``.

        Any ``load_url`` on the entry is discarded.
        NOTE(review): presumably this routes the entry to the filename/offset
        based WARC loader instead of the live loader -- confirm.
        """
        super(UpstreamAggIndexSource, self)._set_load_url(cdx)
        cdx['offset'] = '0'
        cdx.pop('load_url', '')
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class LiveIndexSource(BaseIndexSource):
|
||||
def load_index(self, params):
|
||||
|
@ -1,34 +1,44 @@
|
||||
from webagg.liverec import BaseRecorder
|
||||
from webagg.liverec import request as remote_request
|
||||
from requests import request
|
||||
|
||||
from webagg.utils import MementoUtils
|
||||
|
||||
from requests import session
|
||||
|
||||
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
|
||||
from pywb.utils.timeutils import iso_date_to_datetime
|
||||
from pywb.utils.wbexception import LiveResourceException
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
import uuid
|
||||
import six
|
||||
import itertools
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class StreamIter(six.Iterator):
|
||||
def __init__(self, stream, header=None, size=8192):
|
||||
def __init__(self, stream, header1=None, header2=None, size=8192):
|
||||
self.stream = stream
|
||||
self.header = header
|
||||
self.header1 = header1
|
||||
self.header2 = header2
|
||||
self.size = size
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
if self.header:
|
||||
header = self.header
|
||||
self.header = None
|
||||
if self.header1:
|
||||
header = self.header1
|
||||
self.header1 = None
|
||||
return header
|
||||
elif self.header2:
|
||||
header = self.header2
|
||||
self.header2 = None
|
||||
return header
|
||||
|
||||
data = self.stream.read(self.size)
|
||||
@ -52,22 +62,44 @@ class StreamIter(six.Iterator):
|
||||
#=============================================================================
|
||||
class BaseLoader(object):
|
||||
def __call__(self, cdx, params):
|
||||
out_headers, res = self._load_resource(cdx, params)
|
||||
if not res:
|
||||
entry = self._load_resource(cdx, params)
|
||||
if not entry:
|
||||
return None, None
|
||||
|
||||
out_headers['WARC-Coll'] = cdx.get('source', '')
|
||||
warc_headers, other_headers_buff, stream = entry
|
||||
|
||||
out_headers = {}
|
||||
out_headers['Source-Coll'] = cdx.get('source', '')
|
||||
|
||||
out_headers['Link'] = MementoUtils.make_link(
|
||||
out_headers['WARC-Target-URI'],
|
||||
'original')
|
||||
warc_headers.get_header('WARC-Target-URI'),
|
||||
'original')
|
||||
|
||||
memento_dt = iso_date_to_datetime(out_headers['WARC-Date'])
|
||||
out_headers['Content-Type'] = 'application/warc-record'
|
||||
|
||||
memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
|
||||
out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
|
||||
return out_headers, res
|
||||
|
||||
def _load_resource(self, cdx, params): #pragma: no cover
|
||||
raise NotImplemented()
|
||||
warc_headers_buff = warc_headers.to_bytes()
|
||||
|
||||
self._set_content_len(warc_headers.get_header('Content-Length'),
|
||||
out_headers,
|
||||
len(warc_headers_buff))
|
||||
|
||||
return out_headers, StreamIter(stream,
|
||||
header1=warc_headers_buff,
|
||||
header2=other_headers_buff)
|
||||
|
||||
def _set_content_len(self, content_len_str, headers, existing_len):
|
||||
# Try to set content-length, if it is available and valid
|
||||
try:
|
||||
content_len = int(content_len_str)
|
||||
except (KeyError, TypeError):
|
||||
content_len = -1
|
||||
|
||||
if content_len >= 0:
|
||||
content_len += existing_len
|
||||
headers['Content-Length'] = str(content_len)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -104,7 +136,7 @@ class WARCPathLoader(BaseLoader):
|
||||
|
||||
def _load_resource(self, cdx, params):
|
||||
if not cdx.get('filename') or cdx.get('offset') is None:
|
||||
return None, None
|
||||
return None
|
||||
|
||||
cdx._formatter = params.get('_formatter')
|
||||
failed_files = []
|
||||
@ -112,88 +144,29 @@ class WARCPathLoader(BaseLoader):
|
||||
load_headers_and_payload(cdx,
|
||||
failed_files,
|
||||
self.cdx_index_source))
|
||||
|
||||
record = payload
|
||||
out_headers = {}
|
||||
|
||||
for n, v in record.rec_headers.headers:
|
||||
out_headers[n] = v
|
||||
warc_headers = payload.rec_headers
|
||||
|
||||
if headers != payload:
|
||||
out_headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI')
|
||||
out_headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date')
|
||||
out_headers['WARC-Refers-To-Target-URI'] = payload.rec_headers.get_header('WARC-Target-URI')
|
||||
out_headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
|
||||
warc_headers.replace_header('WARC-Refers-To-Target-URI',
|
||||
payload.rec_headers.get_header('WARC-Target-URI'))
|
||||
|
||||
warc_headers.replace_header('WARC-Refers-To-Date',
|
||||
payload.rec_headers.get_header('WARC-Date'))
|
||||
|
||||
warc_headers.replace_header('WARC-Target-URI',
|
||||
headers.rec_headers.get_header('WARC-Target-URI'))
|
||||
|
||||
warc_headers.replace_header('WARC-Date',
|
||||
headers.rec_headers.get_header('WARC-Date'))
|
||||
|
||||
headers.stream.close()
|
||||
|
||||
return out_headers, StreamIter(record.stream)
|
||||
return (warc_headers, None, payload.stream)
|
||||
|
||||
def __str__(self):
|
||||
return 'WARCPathLoader'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class HeaderRecorder(BaseRecorder):
|
||||
def __init__(self, skip_list=None):
|
||||
self.buff = BytesIO()
|
||||
self.skip_list = skip_list
|
||||
self.skipped = []
|
||||
self.target_ip = None
|
||||
|
||||
def write_response_header_line(self, line):
|
||||
if self.accept_header(line):
|
||||
self.buff.write(line)
|
||||
|
||||
def get_header(self):
|
||||
return self.buff.getvalue()
|
||||
|
||||
def accept_header(self, line):
|
||||
if self.skip_list and line.lower().startswith(self.skip_list):
|
||||
self.skipped.append(line)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def finish_request(self, socket):
|
||||
ip = socket.getpeername()
|
||||
if ip:
|
||||
self.target_ip = ip[0]
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class UpstreamProxyLoader(BaseLoader):
|
||||
def _load_resource(self, cdx, params):
|
||||
load_url = cdx.get('upstream_url')
|
||||
if not load_url:
|
||||
return None, None
|
||||
|
||||
input_req = params['_input_req']
|
||||
|
||||
method = input_req.get_req_method()
|
||||
data = input_req.get_req_body()
|
||||
req_headers = input_req.get_req_headers()
|
||||
|
||||
try:
|
||||
upstream_res = request(url=load_url,
|
||||
method=method,
|
||||
stream=True,
|
||||
allow_redirects=False,
|
||||
headers=req_headers,
|
||||
data=data,
|
||||
timeout=params.get('_timeout'))
|
||||
except Exception as e:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise LiveResourceException(load_url)
|
||||
|
||||
out_headers = upstream_res.headers
|
||||
|
||||
return out_headers, StreamIter(upstream_res.raw)
|
||||
|
||||
def __str__(self):
|
||||
return 'UpstreamProxyLoader'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class LiveWebLoader(BaseLoader):
|
||||
SKIP_HEADERS = (b'link',
|
||||
@ -204,7 +177,7 @@ class LiveWebLoader(BaseLoader):
|
||||
def _load_resource(self, cdx, params):
|
||||
load_url = cdx.get('load_url')
|
||||
if not load_url:
|
||||
return None, None
|
||||
return None
|
||||
|
||||
recorder = HeaderRecorder(self.SKIP_HEADERS)
|
||||
|
||||
@ -236,31 +209,28 @@ class LiveWebLoader(BaseLoader):
|
||||
headers=req_headers,
|
||||
data=data,
|
||||
timeout=params.get('_timeout'))
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
raise LiveResourceException(load_url)
|
||||
|
||||
resp_headers = recorder.get_header()
|
||||
http_headers_buff = recorder.get_headers_buff()
|
||||
|
||||
out_headers = {}
|
||||
out_headers['Content-Type'] = 'application/http; msgtype=response'
|
||||
warc_headers = {}
|
||||
|
||||
out_headers['WARC-Type'] = 'response'
|
||||
out_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||
out_headers['WARC-Target-URI'] = cdx['url']
|
||||
out_headers['WARC-Date'] = self._make_date(dt)
|
||||
warc_headers['WARC-Type'] = 'response'
|
||||
warc_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||
warc_headers['WARC-Target-URI'] = cdx['url']
|
||||
warc_headers['WARC-Date'] = self._make_date(dt)
|
||||
if recorder.target_ip:
|
||||
out_headers['WARC-IP-Address'] = recorder.target_ip
|
||||
warc_headers['WARC-IP-Address'] = recorder.target_ip
|
||||
|
||||
# Try to set content-length, if it is available and valid
|
||||
try:
|
||||
content_len = int(upstream_res.headers.get('content-length', 0))
|
||||
if content_len > 0:
|
||||
content_len += len(resp_headers)
|
||||
out_headers['Content-Length'] = content_len
|
||||
except (KeyError, TypeError):
|
||||
pass
|
||||
warc_headers['Content-Type'] = 'application/http; msgtype=response'
|
||||
|
||||
return out_headers, StreamIter(upstream_res.raw, header=resp_headers)
|
||||
self._set_content_len(upstream_res.headers.get('Content-Length', -1),
|
||||
warc_headers,
|
||||
len(http_headers_buff))
|
||||
|
||||
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
|
||||
return (warc_headers, http_headers_buff, upstream_res.raw)
|
||||
|
||||
@staticmethod
|
||||
def _make_date(dt):
|
||||
@ -275,3 +245,32 @@ class LiveWebLoader(BaseLoader):
|
||||
def __str__(self):
|
||||
return 'LiveWebLoader'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class HeaderRecorder(BaseRecorder):
    """Recorder that buffers response header lines and notes the peer IP.

    Header lines whose lowercased form starts with one of the prefixes in
    ``skip_list`` are kept out of the buffer and collected in ``skipped``.
    """

    def __init__(self, skip_list=None):
        """
        :param skip_list: prefix (or tuple of prefixes) of header lines to
                          leave out of the buffer; ``None`` keeps every line
        """
        self.buff = BytesIO()
        self.skip_list = skip_list
        self.skipped = []
        self.target_ip = None

    def write_response_header_line(self, line):
        """Append *line* to the buffer unless it is filtered out."""
        if not self.accept_header(line):
            return

        self.buff.write(line)

    def get_headers_buff(self):
        """Return everything buffered so far."""
        return self.buff.getvalue()

    def accept_header(self, line):
        """Return ``True`` if *line* should be buffered.

        A rejected line is remembered in ``self.skipped``.
        """
        prefixes = self.skip_list
        if not prefixes or not line.lower().startswith(prefixes):
            return True

        self.skipped.append(line)
        return False

    def finish_request(self, socket):
        """Remember the first element of the socket's peer name as the target IP."""
        peer = socket.getpeername()
        if peer:
            self.target_ip = peer[0]
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user