diff --git a/test/test_handlers.py b/test/test_handlers.py index b4da3ace..f5c96e0f 100644 --- a/test/test_handlers.py +++ b/test/test_handlers.py @@ -8,9 +8,12 @@ from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSou from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator from webagg.aggregator import DirectoryIndexSource -from webagg.app import add_route, application +from webagg.app import ResAggApp from webagg.utils import MementoUtils +from pywb.utils.statusandheaders import StatusAndHeadersParser +from io import BytesIO + import webtest import bottle @@ -30,32 +33,32 @@ testapp = None def setup_module(self): live_source = SimpleAggregator({'live': LiveIndexSource()}) live_handler = DefaultResourceHandler(live_source) - add_route('/live', live_handler) + app = ResAggApp() + app.add_route('/live', live_handler) source1 = GeventTimeoutAggregator(sources) handler1 = DefaultResourceHandler(source1, to_path('testdata/')) - add_route('/many', handler1) + app.add_route('/many', handler1) source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))}) handler2 = DefaultResourceHandler(source2, to_path('testdata/')) - add_route('/posttest', handler2) + app.add_route('/posttest', handler2) source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))}) handler3 = DefaultResourceHandler(source3, to_path('testdata/')) - add_route('/fallback', HandlerSeq([handler3, + app.add_route('/fallback', HandlerSeq([handler3, handler2, live_handler])) - add_route('/seq', HandlerSeq([handler3, + app.add_route('/seq', HandlerSeq([handler3, handler2])) - add_route('/empty', HandlerSeq([])) - add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})])) + app.add_route('/empty', HandlerSeq([])) + app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})])) - application.debug = True global testapp - testapp = webtest.TestApp(application) + testapp = webtest.TestApp(app.application) def to_json_list(text): @@ -66,6 +69,15 @@ class TestResAgg(object): def setup(self): self.testapp = testapp + def _check_uri_date(self, resp, uri, dt): + buff = BytesIO(resp.body) + status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff) + assert status_headers.get_header('WARC-Target-URI') == uri + if dt == True: + assert status_headers.get_header('WARC-Date') != '' + else: + assert status_headers.get_header('WARC-Date') == dt + def test_list_routes(self): resp = self.testapp.get('/') res = resp.json @@ -120,9 +132,9 @@ class TestResAgg(object): headers = {'foo': 'bar'} resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers) - assert resp.headers['WARC-Coll'] == 'live' - assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar' - assert resp.headers['WARC-Date'] != '' + assert resp.headers['Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True) assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original') assert resp.headers['Memento-Datetime'] != '' @@ -136,9 +148,9 @@ class TestResAgg(object): resp = self.testapp.post('/live/resource?url=http://httpbin.org/post', OrderedDict([('foo', 'bar')])) - assert resp.headers['WARC-Coll'] == 'live' - assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post' - assert resp.headers['WARC-Date'] != '' + assert resp.headers['Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://httpbin.org/post', True) assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original') assert resp.headers['Memento-Datetime'] != '' @@ -151,9 +163,10 @@ class TestResAgg(object): def test_agg_select_mem_1(self): resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001') - assert resp.headers['WARC-Coll'] == 'rhiz' - assert resp.headers['WARC-Target-URI'] == 'http://www.vvork.com/' - assert resp.headers['WARC-Date'] == '2014-10-06T18:43:57Z' + assert resp.headers['Source-Coll'] == 'rhiz' + + self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z') + assert b'HTTP/1.1 200 OK' in resp.body assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original') @@ -164,9 +177,10 @@ class TestResAgg(object): def test_agg_select_mem_2(self): resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231') - assert resp.headers['WARC-Coll'] == 'ia' - assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/' - assert resp.headers['WARC-Date'] == '2016-01-10T13:48:55Z' + assert resp.headers['Source-Coll'] == 'ia' + + self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z') + assert b'HTTP/1.1 200 OK' in resp.body assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original') @@ -177,9 +191,9 @@ class TestResAgg(object): def test_agg_select_live(self): resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016') - assert resp.headers['WARC-Coll'] == 'live' - assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/' - assert resp.headers['WARC-Date'] != '' + assert resp.headers['Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://vvork.com/', True) assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original') assert resp.headers['Memento-Datetime'] != '' @@ -189,9 +203,9 @@ class TestResAgg(object): def test_agg_select_local(self): resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624') - assert resp.headers['WARC-Coll'] == 'local' - assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/' - assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z' + assert resp.headers['Source-Coll'] == 'local' + + self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z') assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original') assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT' @@ -208,9 +222,9 @@ Host: iana.org resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data) - assert resp.headers['WARC-Coll'] == 'local' - assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/' - assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z' + assert resp.headers['Source-Coll'] == 'local' + + self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z') assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original') assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT' @@ -227,9 +241,9 @@ Host: httpbin.org resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data) - assert resp.headers['WARC-Coll'] == 'live' - assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar' - assert resp.headers['WARC-Date'] != '' + assert resp.headers['Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True) assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original') assert resp.headers['Memento-Datetime'] != '' @@ -252,9 +266,9 @@ foo=bar&test=abc""" resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data) - assert resp.headers['WARC-Coll'] == 'post' - assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post' - assert resp.headers['WARC-Date'] != '' + assert resp.headers['Source-Coll'] == 'post' + + self._check_uri_date(resp, 'http://httpbin.org/post', True) assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original') assert resp.headers['Memento-Datetime'] != '' @@ -271,8 +285,10 @@ foo=bar&test=abc""" resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data) - assert resp.headers['WARC-Coll'] == 'post' - assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post' + assert resp.headers['Source-Coll'] == 'post' + + self._check_uri_date(resp, 'http://httpbin.org/post', True) + assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original') assert b'HTTP/1.1 200 OK' in resp.body @@ -285,8 +301,10 @@ foo=bar&test=abc""" def test_agg_seq_fallback_1(self): resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/') - assert resp.headers['WARC-Coll'] == 'live' - assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/' + assert resp.headers['Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://www.iana.org/', True) + assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original') assert b'HTTP/1.1 200 OK' in resp.body @@ -296,9 +314,9 @@ foo=bar&test=abc""" def test_agg_seq_fallback_2(self): resp = self.testapp.get('/fallback/resource?url=http://www.example.com/') - assert resp.headers['WARC-Coll'] == 'example' - assert resp.headers['WARC-Date'] == '2016-02-25T04:23:29Z' - assert resp.headers['WARC-Target-URI'] == 'http://example.com/' + assert resp.headers['Source-Coll'] == 'example' + + self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z') assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original') assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT' @@ -318,11 +336,14 @@ foo=bar&test=abc""" def test_agg_local_revisit(self): resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local') - assert resp.headers['WARC-Coll'] == 'local' - assert resp.headers['WARC-Target-URI'] == 'http://example.com' - assert resp.headers['WARC-Date'] == '2014-01-27T17:12:51Z' - assert resp.headers['WARC-Refers-To-Target-URI'] == 'http://example.com' - assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z' + assert resp.headers['Source-Coll'] == 'local' + + buff = BytesIO(resp.body) + status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff) + assert status_headers.get_header('WARC-Target-URI') == 'http://example.com' + assert status_headers.get_header('WARC-Date') == '2014-01-27T17:12:51Z' + assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://example.com' + assert status_headers.get_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z' assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original') assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' diff --git a/test/test_memento_agg.py b/test/test_memento_agg.py index 017d1871..88f36daf 100644 --- a/test/test_memento_agg.py +++ b/test/test_memento_agg.py @@ -9,6 +9,7 @@ from .testutils import json_list, to_path import json import pytest import time +import six from webagg.handlers import IndexHandler @@ -39,8 +40,13 @@ agg_nf = {'simple': SimpleAggregator(nf), 'processes': ThreadedTimeoutAggregator(nf, timeout=5.0, use_processes=True), } -#def pytest_generate_tests(metafunc): -# metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +if six.PY2: + del aggs['threaded'] + del aggs['processes'] + del agg_tm['threaded'] + del agg_tm['processes'] + del agg_nf['threaded'] + del agg_nf['processes'] @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) diff --git a/webagg/app.py b/webagg/app.py index 437c1105..2745223d 100644 --- a/webagg/app.py +++ b/webagg/app.py @@ -1,7 +1,7 @@ from webagg.liverec import request as remote_request from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest -from bottle import route, request, response, default_app, abort +from bottle import route, request, response, abort, Bottle import bottle import traceback @@ -11,49 +11,46 @@ JSON_CT = 'application/json; charset=utf-8' #============================================================================= -route_dict = {} +class ResAggApp(object): + def __init__(self, *args, **kwargs): + self.application = Bottle() + self.application.default_error_handler = self.err_handler + self.route_dict = {} + @self.application.route('/') + def list_routes(): + return self.route_dict -#============================================================================= -def add_route(path, handler): - @route([path, path + '/'], 'ANY') - @wrap_error - def direct_input_request(mode=''): - params = dict(request.query) - params['mode'] = mode - params['_input_req'] = DirectWSGIInputRequest(request.environ) - return handler(params) + def add_route(self, path, handler): + @self.application.route([path, path + '/'], 'ANY') + @wrap_error + def direct_input_request(mode=''): + params = dict(request.query) + params['mode'] = mode + params['_input_req'] = DirectWSGIInputRequest(request.environ) + return handler(params) - @route([path + '/postreq', path + '//postreq'], 'POST') - @wrap_error - def post_fullrequest(mode=''): - params = dict(request.query) - params['mode'] = mode - params['_input_req'] = POSTInputRequest(request.environ) - return handler(params) + @self.application.route([path + '/postreq', path + '//postreq'], 'POST') + @wrap_error + def post_fullrequest(mode=''): + params = dict(request.query) + params['mode'] = mode + params['_input_req'] = POSTInputRequest(request.environ) + return handler(params) - global route_dict - handler_dict = handler.get_supported_modes() - route_dict[path] = handler_dict - route_dict[path + '/postreq'] = handler_dict + handler_dict = handler.get_supported_modes() + self.route_dict[path] = handler_dict + self.route_dict[path + '/postreq'] = handler_dict - -#============================================================================= -@route('/') -def list_routes(): - return route_dict - - -#============================================================================= -def err_handler(exc): - if bottle.debug: - print(exc) - traceback.print_exc() - response.status = exc.status_code - response.content_type = JSON_CT - err_msg = json.dumps({'message': exc.body}) - response.headers['ResErrors'] = err_msg - return err_msg + def err_handler(self, exc): + if bottle.debug: + print(exc) + traceback.print_exc() + response.status = exc.status_code + response.content_type = JSON_CT + err_msg = json.dumps({'message': exc.body}) + response.headers['ResErrors'] = err_msg + return err_msg #============================================================================= @@ -99,8 +96,3 @@ def wrap_error(func): return wrap_func -#============================================================================= -application = default_app() -application.default_error_handler = err_handler - - diff --git a/webagg/handlers.py b/webagg/handlers.py index b604bd62..55529156 100644 --- a/webagg/handlers.py +++ b/webagg/handlers.py @@ -1,4 +1,4 @@ -from webagg.responseloader import WARCPathLoader, LiveWebLoader, UpstreamProxyLoader +from webagg.responseloader import WARCPathLoader, LiveWebLoader from webagg.utils import MementoUtils from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import NotFoundException @@ -118,7 +118,7 @@ class ResourceHandler(IndexHandler): class DefaultResourceHandler(ResourceHandler): def __init__(self, index_source, warc_paths=''): loaders = [WARCPathLoader(warc_paths, index_source), - UpstreamProxyLoader(), + # UpstreamProxyLoader(), LiveWebLoader(), ] super(DefaultResourceHandler, self).__init__(index_source, loaders) diff --git a/webagg/indexsource.py b/webagg/indexsource.py index 32fd3804..6989b894 100644 --- a/webagg/indexsource.py +++ b/webagg/indexsource.py @@ -66,23 +66,33 @@ class RemoteIndexSource(BaseIndexSource): def do_load(lines): for line in lines: cdx = CDXObject(line) - cdx[self.url_field] = self.replay_url.format( - timestamp=cdx['timestamp'], - url=cdx['url']) + self._set_load_url(cdx) yield cdx return do_load(lines) - @staticmethod - def upstream_webagg(base_url): - api_url = base_url + '/index?url={url}' - proxy_url = base_url + '/resource?url={url}&closest={timestamp}' - return RemoteIndexSource(api_url, proxy_url, 'upstream_url') + def _set_load_url(self, cdx): + cdx[self.url_field] = self.replay_url.format( + timestamp=cdx['timestamp'], + url=cdx['url']) def __str__(self): return 'remote' +#============================================================================= +class UpstreamAggIndexSource(RemoteIndexSource): + def __init__(self, base_url): + api_url = base_url + '/index?url={url}' + proxy_url = base_url + '/resource?url={url}&closest={timestamp}' + super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename') + + def _set_load_url(self, cdx): + super(UpstreamAggIndexSource, self)._set_load_url(cdx) + cdx['offset'] = '0' + cdx.pop('load_url', '') + + #============================================================================= class LiveIndexSource(BaseIndexSource): def load_index(self, params): diff --git a/webagg/responseloader.py b/webagg/responseloader.py index 48459345..d29b629d 100644 --- a/webagg/responseloader.py +++ b/webagg/responseloader.py @@ -1,34 +1,44 @@ from webagg.liverec import BaseRecorder from webagg.liverec import request as remote_request -from requests import request from webagg.utils import MementoUtils +from requests import session + from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date from pywb.utils.timeutils import iso_date_to_datetime from pywb.utils.wbexception import LiveResourceException +from pywb.utils.statusandheaders import StatusAndHeaders + from pywb.warc.resolvingloader import ResolvingLoader + from io import BytesIO import uuid import six +import itertools #============================================================================= class StreamIter(six.Iterator): - def __init__(self, stream, header=None, size=8192): + def __init__(self, stream, header1=None, header2=None, size=8192): self.stream = stream - self.header = header + self.header1 = header1 + self.header2 = header2 self.size = size def __iter__(self): return self def __next__(self): - if self.header: - header = self.header - self.header = None + if self.header1: + header = self.header1 + self.header1 = None + return header + elif self.header2: + header = self.header2 + self.header2 = None return header data = self.stream.read(self.size) @@ -52,22 +62,44 @@ class StreamIter(six.Iterator): #============================================================================= class BaseLoader(object): def __call__(self, cdx, params): - out_headers, res = self._load_resource(cdx, params) - if not res: + entry = self._load_resource(cdx, params) + if not entry: return None, None - out_headers['WARC-Coll'] = cdx.get('source', '') + warc_headers, other_headers_buff, stream = entry + + out_headers = {} + out_headers['Source-Coll'] = cdx.get('source', '') out_headers['Link'] = MementoUtils.make_link( - out_headers['WARC-Target-URI'], - 'original') + warc_headers.get_header('WARC-Target-URI'), + 'original') - memento_dt = iso_date_to_datetime(out_headers['WARC-Date']) + out_headers['Content-Type'] = 'application/warc-record' + + memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date')) out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt) - return out_headers, res - def _load_resource(self, cdx, params): #pragma: no cover - raise NotImplemented() + warc_headers_buff = warc_headers.to_bytes() + + self._set_content_len(warc_headers.get_header('Content-Length'), + out_headers, + len(warc_headers_buff)) + + return out_headers, StreamIter(stream, + header1=warc_headers_buff, + header2=other_headers_buff) + + def _set_content_len(self, content_len_str, headers, existing_len): + # Try to set content-length, if it is available and valid + try: + content_len = int(content_len_str) + except (KeyError, TypeError): + content_len = -1 + + if content_len >= 0: + content_len += existing_len + headers['Content-Length'] = str(content_len) #============================================================================= @@ -104,7 +136,7 @@ class WARCPathLoader(BaseLoader): def _load_resource(self, cdx, params): if not cdx.get('filename') or cdx.get('offset') is None: - return None, None + return None cdx._formatter = params.get('_formatter') failed_files = [] @@ -112,88 +144,29 @@ class WARCPathLoader(BaseLoader): load_headers_and_payload(cdx, failed_files, self.cdx_index_source)) - - record = payload - out_headers = {} - - for n, v in record.rec_headers.headers: - out_headers[n] = v + warc_headers = payload.rec_headers if headers != payload: - out_headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI') - out_headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date') - out_headers['WARC-Refers-To-Target-URI'] = payload.rec_headers.get_header('WARC-Target-URI') - out_headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date') + warc_headers.replace_header('WARC-Refers-To-Target-URI', + payload.rec_headers.get_header('WARC-Target-URI')) + + warc_headers.replace_header('WARC-Refers-To-Date', + payload.rec_headers.get_header('WARC-Date')) + + warc_headers.replace_header('WARC-Target-URI', + headers.rec_headers.get_header('WARC-Target-URI')) + + warc_headers.replace_header('WARC-Date', + headers.rec_headers.get_header('WARC-Date')) + headers.stream.close() - return out_headers, StreamIter(record.stream) + return (warc_headers, None, payload.stream) def __str__(self): return 'WARCPathLoader' -#============================================================================= -class HeaderRecorder(BaseRecorder): - def __init__(self, skip_list=None): - self.buff = BytesIO() - self.skip_list = skip_list - self.skipped = [] - self.target_ip = None - - def write_response_header_line(self, line): - if self.accept_header(line): - self.buff.write(line) - - def get_header(self): - return self.buff.getvalue() - - def accept_header(self, line): - if self.skip_list and line.lower().startswith(self.skip_list): - self.skipped.append(line) - return False - - return True - - def finish_request(self, socket): - ip = socket.getpeername() - if ip: - self.target_ip = ip[0] - - -#============================================================================= -class UpstreamProxyLoader(BaseLoader): - def _load_resource(self, cdx, params): - load_url = cdx.get('upstream_url') - if not load_url: - return None, None - - input_req = params['_input_req'] - - method = input_req.get_req_method() - data = input_req.get_req_body() - req_headers = input_req.get_req_headers() - - try: - upstream_res = request(url=load_url, - method=method, - stream=True, - allow_redirects=False, - headers=req_headers, - data=data, - timeout=params.get('_timeout')) - except Exception as e: - import traceback - traceback.print_exc() - raise LiveResourceException(load_url) - - out_headers = upstream_res.headers - - return out_headers, StreamIter(upstream_res.raw) - - def __str__(self): - return 'UpstreamProxyLoader' - - #============================================================================= class LiveWebLoader(BaseLoader): SKIP_HEADERS = (b'link', @@ -204,7 +177,7 @@ class LiveWebLoader(BaseLoader): def _load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: - return None, None + return None recorder = HeaderRecorder(self.SKIP_HEADERS) @@ -236,31 +209,28 @@ class LiveWebLoader(BaseLoader): headers=req_headers, data=data, timeout=params.get('_timeout')) - except Exception: + except Exception as e: raise LiveResourceException(load_url) - resp_headers = recorder.get_header() + http_headers_buff = recorder.get_headers_buff() - out_headers = {} - out_headers['Content-Type'] = 'application/http; msgtype=response' + warc_headers = {} - out_headers['WARC-Type'] = 'response' - out_headers['WARC-Record-ID'] = self._make_warc_id() - out_headers['WARC-Target-URI'] = cdx['url'] - out_headers['WARC-Date'] = self._make_date(dt) + warc_headers['WARC-Type'] = 'response' + warc_headers['WARC-Record-ID'] = self._make_warc_id() + warc_headers['WARC-Target-URI'] = cdx['url'] + warc_headers['WARC-Date'] = self._make_date(dt) if recorder.target_ip: - out_headers['WARC-IP-Address'] = recorder.target_ip + warc_headers['WARC-IP-Address'] = recorder.target_ip - # Try to set content-length, if it is available and valid - try: - content_len = int(upstream_res.headers.get('content-length', 0)) - if content_len > 0: - content_len += len(resp_headers) - out_headers['Content-Length'] = content_len - except (KeyError, TypeError): - pass + warc_headers['Content-Type'] = 'application/http; msgtype=response' - return out_headers, StreamIter(upstream_res.raw, header=resp_headers) + self._set_content_len(upstream_res.headers.get('Content-Length', -1), + warc_headers, + len(http_headers_buff)) + + warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) + return (warc_headers, http_headers_buff, upstream_res.raw) @staticmethod def _make_date(dt): @@ -275,3 +245,32 @@ class LiveWebLoader(BaseLoader): def __str__(self): return 'LiveWebLoader' + +#============================================================================= +class HeaderRecorder(BaseRecorder): + def __init__(self, skip_list=None): + self.buff = BytesIO() + self.skip_list = skip_list + self.skipped = [] + self.target_ip = None + + def write_response_header_line(self, line): + if self.accept_header(line): + self.buff.write(line) + + def get_headers_buff(self): + return self.buff.getvalue() + + def accept_header(self, line): + if self.skip_list and line.lower().startswith(self.skip_list): + self.skipped.append(line) + return False + + return True + + def finish_request(self, socket): + ip = socket.getpeername() + if ip: + self.target_ip = ip[0] + +