From c66d251a90c9f3ecddad64c0851283cb799d659c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 1 Mar 2017 12:48:06 -0800 Subject: [PATCH 1/7] warc: make ArchiveIterator an actual iterator warc indexing test: add test for reading warc with interspersed empty gzip records, ensure they are ignored --- pywb/warc/archiveiterator.py | 83 ++++++++++++++++++--------------- pywb/warc/test/test_indexing.py | 24 ++++++++++ 2 files changed, 69 insertions(+), 38 deletions(-) diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 69883304..c4785d3a 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -7,6 +7,7 @@ from pywb.warc.recordloader import ArcWarcRecordLoader import hashlib import base64 +import six import re import sys @@ -17,8 +18,11 @@ except ImportError: # pragma: no cover from ordereddict import OrderedDict -#================================================================= -class ArchiveIterator(object): +# ============================================================================ +BUFF_SIZE = 16384 + + +class ArchiveIterator(six.Iterator): """ Iterate over records in WARC and ARC files, both gzip chunk compressed and uncompressed @@ -52,9 +56,9 @@ class ArchiveIterator(object): Remainder: {1} """ - def __init__(self, fileobj, no_record_parse=False, - verify_http=False, arc2warc=False): + verify_http=False, arc2warc=False, block_size=BUFF_SIZE): + self.fh = fileobj self.loader = ArcWarcRecordLoader(verify_http=verify_http, @@ -69,55 +73,59 @@ class ArchiveIterator(object): self.member_info = None self.no_record_parse = no_record_parse - def __call__(self, block_size=16384): - """ iterate over each record - """ - - decomp_type = 'gzip' - self.reader = DecompressingBufferedReader(self.fh, block_size=block_size) self.offset = self.fh.tell() self.next_line = None - raise_invalid_gzip = False - empty_record = False - record = None + self._raise_invalid_gzip = False + self._is_empty = False + self._is_first = True + self.last_record = None + def __iter__(self): + return self + + def __next__(self): while True: + if not self._is_first: + self._finish_record() + + self._is_first = False + try: - curr_offset = self.fh.tell() - record = self._next_record(self.next_line) - if raise_invalid_gzip: + self.last_record = self._next_record(self.next_line) + if self._raise_invalid_gzip: self._raise_invalid_gzip_err() - yield record + return self.last_record except EOFError: - empty_record = True + self._is_empty = True - if record: - self.read_to_end(record) + def _finish_record(self): + if self.last_record: + self.read_to_end(self.last_record) - if self.reader.decompressor: - # if another gzip member, continue - if self.reader.read_next_member(): - continue + if self.reader.decompressor: + # if another gzip member, continue + if self.reader.read_next_member(): + return - # if empty record, then we're done - elif empty_record: - break + # if empty record, then we're done + elif self._is_empty: + raise StopIteration() - # otherwise, probably a gzip - # containing multiple non-chunked records - # raise this as an error - else: - raise_invalid_gzip = True + # otherwise, probably a gzip + # containing multiple non-chunked records + # raise this as an error + else: + self._raise_invalid_gzip = True - # non-gzip, so we're done - elif empty_record: - break + # non-gzip, so we're done + elif self._is_empty: + raise StopIteration() def _raise_invalid_gzip_err(self): """ A gzip file with multiple ARC/WARC records, non-chunked @@ -185,7 +193,7 @@ class 
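[Annotation — illustrative sketch, not part of the patch; the file name is a placeholder.] With six.Iterator as the base class, an ArchiveIterator instance now supports the iterator protocol directly instead of being called to obtain a generator, and interspersed empty gzip members are ignored, as the new indexing test below exercises:

    from pywb.warc.archiveiterator import ArchiveIterator

    with open('example.warc.gz', 'rb') as fh:     # placeholder path
        # old API: for record in ArchiveIterator(fh)():
        # new API: the instance itself is the iterator
        for record in ArchiveIterator(fh):
            print(record.rec_type,
                  record.rec_headers.get_header('WARC-Target-URI'))
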
ArchiveIterator(object): curr_offset = self.offset while True: - b = record.stream.read(8192) + b = record.stream.read(BUFF_SIZE) if not b: break num += len(b) @@ -349,7 +357,6 @@ class DefaultRecordParser(object): def create_record_iter(self, raw_iter): append_post = self.options.get('append_post') include_all = self.options.get('include_all') - block_size = self.options.get('block_size', 16384) surt_ordered = self.options.get('surt_ordered', True) minimal = self.options.get('minimal') @@ -357,7 +364,7 @@ class DefaultRecordParser(object): raise Exception('Sorry, minimal index option and ' + 'append POST options can not be used together') - for record in raw_iter(block_size): + for record in raw_iter: entry = None if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index 40238bca..afc148ff 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -370,6 +370,30 @@ def test_cdxj_empty(): assert buff.getvalue() == b'' +def test_cdxj_middle_empty_records(): + empty_gzip_record = b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00' + + new_warc = BytesIO() + + with open(TEST_WARC_DIR + 'example2.warc.gz', 'rb') as fh: + new_warc.write(empty_gzip_record) + new_warc.write(fh.read()) + new_warc.write(empty_gzip_record) + new_warc.write(empty_gzip_record) + fh.seek(0) + new_warc.write(fh.read()) + + options = dict(cdxj=True) + + buff = BytesIO() + new_warc.seek(0) + + write_cdx_index(buff, new_warc, 'empty.warc.gz', **options) + + lines = buff.getvalue().rstrip().split(b'\n') + + assert len(lines) == 2, lines + if __name__ == "__main__": import doctest From 3faa55906a68d69232ee3b9b1627e01e840c9f99 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 1 Mar 2017 12:50:32 -0800 Subject: [PATCH 2/7] warcwriter: attempt to separate warc writing semantics from the recorder use StatusAndHeaders instead of requests CaseInsensitiveDict for consistency refactor writer api: create_warc_record() for creating new record copy_warc_record() for copying a full record from a stream add writer tests, separate from recorder --- pywb/recorder/recorderapp.py | 34 +++-- pywb/recorder/test/test_recorder.py | 46 +++---- pywb/recorder/test/test_writer.py | 121 +++++++++++++++++ pywb/recorder/warcwriter.py | 194 +++++++++++++--------------- pywb/utils/statusandheaders.py | 7 +- 5 files changed, 256 insertions(+), 146 deletions(-) create mode 100644 pywb/recorder/test/test_writer.py diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index bea431c8..03c101cf 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -5,11 +5,12 @@ from pywb.webagg.inputrequest import DirectWSGIInputRequest from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter from six.moves.urllib.parse import parse_qsl +import six import json import tempfile -from requests.structures import CaseInsensitiveDict +#from requests.structures import CaseInsensitiveDict import requests import traceback @@ -67,10 +68,15 @@ class RecorderApp(object): req_head, req_pay, resp_head, resp_pay, params = result - resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay) + #resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay) + resp = self.writer.copy_warc_record(resp_pay) - if resp_type == 'response': - req = self.writer.create_req_record(req_head, req_pay) + if resp.rec_type == 'response': + uri = 
resp.rec_headers.get_header('WARC-Target-Uri') + req = self.writer.create_warc_record(uri=uri, + record_type='request', + payload=req_pay, + warc_headers_dict=req_head) self.writer.write_req_resp(req, resp, params) @@ -127,16 +133,16 @@ class RecorderApp(object): content_type = headers.get('Content-Type') - record = self.writer.create_custom_record(params['url'], - req_stream.out, - record_type, - content_type, - req_stream.headers) + record = self.writer.create_warc_record(uri=params['url'], + record_type=record_type, + payload=req_stream.out, + warc_content_type=content_type, + warc_headers_dict=req_stream.headers) self.writer.write_record(record, params) msg = {'success': 'true', - 'WARC-Date': record.rec_headers.get('WARC-Date')} + 'WARC-Date': record.rec_headers.get_header('WARC-Date')} finally: if req_stream: @@ -311,11 +317,11 @@ class RespWrapper(Wrapper): class ReqWrapper(Wrapper): def __init__(self, stream, req_headers, params, create_func): super(ReqWrapper, self).__init__(stream, params, create_func) - self.headers = CaseInsensitiveDict(req_headers) + self.headers = {} - for n in req_headers.keys(): - if not n.upper().startswith('WARC-'): - del self.headers[n] + for n in six.iterkeys(req_headers): + if n.upper().startswith('WARC-'): + self.headers[n] = req_headers[n] def close(self): # no need to close wsgi.input diff --git a/pywb/recorder/test/test_recorder.py b/pywb/recorder/test/test_recorder.py index 7710aa52..122c57ba 100644 --- a/pywb/recorder/test/test_recorder.py +++ b/pywb/recorder/test/test_recorder.py @@ -94,6 +94,8 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))] assert len(files) == num assert all(x.endswith('.warc.gz') for x in files) + + self._verify_content_len(coll_dir, files) return files, coll_dir def _load_resp_req(self, base_path): @@ -105,7 +107,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) stored_req = None with open(os.path.join(base_path, warc), 'rb') as fh: - for rec in ArchiveIterator(fh)(): + for rec in ArchiveIterator(fh): if rec.rec_type == 'response': stored_resp = rec elif rec.rec_type == 'request': @@ -115,6 +117,15 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert stored_req is not None return stored_req, stored_resp + def _verify_content_len(self, base_dir, files): + for filename in files: + filename = os.path.join(base_dir, filename) + with open(filename, 'rb') as fh: + for record in ArchiveIterator(fh, no_record_parse=True): + assert record.status_headers == None + assert int(record.rec_headers.get_header('Content-Length')) == record.length + assert record.length == len(record.stream.read()) + def test_record_warc_1(self): recorder_app = RecorderApp(self.upstream_url, PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))) @@ -168,6 +179,8 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert ('X-Other', 'foo') in stored_req.status_headers.headers assert ('Cookie', 'boo=far') in stored_req.status_headers.headers + self._test_all_warcs('/warcs/cookiecheck/', 1) + def test_record_cookies_skip_header(self): warc_path = to_path(self.root_dir + '/warcs/cookieskip/') header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie']) @@ -191,6 +204,8 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert ('X-Other', 'foo') in stored_req.status_headers.headers assert ('Cookie', 
'boo=far') not in stored_req.status_headers.headers + self._test_all_warcs('/warcs/cookieskip/', 1) + def test_record_skip_wrong_coll(self): recorder_app = RecorderApp(self.upstream_url, writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live') @@ -470,34 +485,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) self._test_all_warcs('/warcs/GOO/', 2) - def test_warcinfo_record(self): - simplewriter = SimpleTempWARCWriter(gzip=False) - params = {'software': 'recorder test', - 'format': 'WARC File Format 1.0', - 'json-metadata': json.dumps({'foo': 'bar'})} - - record = simplewriter.create_warcinfo_record('testfile.warc.gz', params) - simplewriter.write_record(record) - buff = simplewriter.get_buffer() - assert isinstance(buff, bytes) - - buff = BytesIO(buff) - parsed_record = ArcWarcRecordLoader().parse_record_stream(buff) - - assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo' - assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields' - assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz' - - buff = parsed_record.stream.read().decode('utf-8') - - length = parsed_record.rec_headers.get_header('Content-Length') - - assert len(buff) == int(length) - - assert 'json-metadata: {"foo": "bar"}\r\n' in buff - assert 'format: WARC File Format 1.0\r\n' in buff - assert 'json-metadata: {"foo": "bar"}\r\n' in buff - def test_record_custom_record(self): dedup_index = self._get_dedup_index(user=False) @@ -584,4 +571,3 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json' assert status_headers.get_header('WARC-Block-Digest') != '' assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest') - diff --git a/pywb/recorder/test/test_writer.py b/pywb/recorder/test/test_writer.py new file mode 100644 index 00000000..40bf1e97 --- /dev/null +++ b/pywb/recorder/test/test_writer.py @@ -0,0 +1,121 @@ +from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.recorder.warcwriter import SimpleTempWARCWriter +from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.warc.archiveiterator import ArchiveIterator + +from io import BytesIO +from collections import OrderedDict +import json + + +# ============================================================================ +class FixedTestWARCWriter(SimpleTempWARCWriter): + @classmethod + def _make_warc_id(cls, id_=None): + return '' + + @classmethod + def _make_warc_date(cls): + return '2000-01-01T00:00:00Z' + + +# ============================================================================ +class TestWarcWriter(object): + def _validate_record_content_len(self, stream): + for record in ArchiveIterator(stream, no_record_parse=True): + assert record.status_headers == None + assert int(record.rec_headers.get_header('Content-Length')) == record.length + assert record.length == len(record.stream.read()) + + + def test_warcinfo_record(self): + simplewriter = FixedTestWARCWriter(gzip=False) + params = OrderedDict([('software', 'recorder test'), + ('format', 'WARC File Format 1.0'), + ('json-metadata', json.dumps({'foo': 'bar'}))]) + + record = simplewriter.create_warcinfo_record('testfile.warc.gz', params) + simplewriter.write_record(record) + buff = simplewriter.get_buffer() + assert isinstance(buff, bytes) + + buff = BytesIO(buff) + parsed_record = 
ArcWarcRecordLoader().parse_record_stream(buff) + + assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo' + assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields' + assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz' + + buff = parsed_record.stream.read().decode('utf-8') + + length = parsed_record.rec_headers.get_header('Content-Length') + + assert len(buff) == int(length) + + assert 'json-metadata: {"foo": "bar"}\r\n' in buff + assert 'format: WARC File Format 1.0\r\n' in buff + + warcinfo_record = '\ +WARC/1.0\r\n\ +WARC-Type: warcinfo\r\n\ +WARC-Record-ID: \r\n\ +WARC-Filename: testfile.warc.gz\r\n\ +WARC-Date: 2000-01-01T00:00:00Z\r\n\ +Content-Type: application/warc-fields\r\n\ +Content-Length: 86\r\n\ +\r\n\ +software: recorder test\r\n\ +format: WARC File Format 1.0\r\n\ +json-metadata: {"foo": "bar"}\r\n\ +\r\n\ +\r\n\ +' + + assert simplewriter.get_buffer().decode('utf-8') == warcinfo_record + + def test_generate_response(self): + headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'), + ('Custom-Header', 'somevalue') + ] + + payload = b'some\ntext' + + status_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0') + + + writer = FixedTestWARCWriter(gzip=False) + + record = writer.create_warc_record('http://example.com/', 'response', + payload=BytesIO(payload), + length=len(payload), + status_headers=status_headers) + + + writer.write_record(record) + + buff = writer.get_buffer() + + self._validate_record_content_len(BytesIO(buff)) + + warc_record = '\ +WARC/1.0\r\n\ +WARC-Type: response\r\n\ +WARC-Record-ID: \r\n\ +WARC-Target-URI: http://example.com/\r\n\ +WARC-Date: 2000-01-01T00:00:00Z\r\n\ +WARC-Block-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\ +WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\ +Content-Type: application/http; msgtype=response\r\n\ +Content-Length: 97\r\n\ +\r\n\ +HTTP/1.0 200 OK\r\n\ +Content-Type: text/plain; charset="UTF-8"\r\n\ +Custom-Header: somevalue\r\n\ +\r\n\ +some\n\ +text\ +\r\n\ +\r\n\ +' + assert buff.decode('utf-8') == warc_record + diff --git a/pywb/recorder/warcwriter.py b/pywb/recorder/warcwriter.py index 5f26ee24..6c9eca8a 100644 --- a/pywb/recorder/warcwriter.py +++ b/pywb/recorder/warcwriter.py @@ -11,8 +11,6 @@ import shutil import traceback -from collections import OrderedDict - from socket import gethostname from io import BytesIO @@ -22,15 +20,12 @@ from pywb.utils.loaders import LimitReader, to_native_str from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date -from pywb.utils.statusandheaders import StatusAndHeadersParser +from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders from pywb.warc.recordloader import ArcWarcRecord from pywb.warc.recordloader import ArcWarcRecordLoader -from requests.structures import CaseInsensitiveDict from pywb.webagg.utils import res_template, BUFF_SIZE -from pywb.recorder.filters import ExcludeNone - # ============================================================================ class BaseWARCWriter(object): @@ -45,14 +40,18 @@ class BaseWARCWriter(object): FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz' + WARC_VERSION = 'WARC/1.0' + def __init__(self, gzip=True, dedup_index=None, - header_filter=ExcludeNone(), *args, **kwargs): + header_filter=None, *args, **kwargs): + self.gzip = gzip self.dedup_index = dedup_index self.header_filter = header_filter self.hostname = 
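[Annotation — illustrative sketch, not part of the patch.] The round trip exercised by test_generate_response above can be reproduced standalone; SimpleTempWARCWriter and get_buffer() are the names in use at this point in the series:

    from io import BytesIO

    from pywb.recorder.warcwriter import SimpleTempWARCWriter
    from pywb.utils.statusandheaders import StatusAndHeaders
    from pywb.warc.recordloader import ArcWarcRecordLoader

    payload = b'some\ntext'
    status_headers = StatusAndHeaders('200 OK',
                                      [('Content-Type', 'text/plain; charset="UTF-8"')],
                                      protocol='HTTP/1.0')

    writer = SimpleTempWARCWriter(gzip=False)
    record = writer.create_warc_record('http://example.com/', 'response',
                                       payload=BytesIO(payload),
                                       length=len(payload),
                                       status_headers=status_headers)
    writer.write_record(record)

    # parse the serialized bytes back into a single record
    parsed = ArcWarcRecordLoader().parse_record_stream(BytesIO(writer.get_buffer()))
    assert parsed.rec_type == 'response'
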
gethostname() self.parser = StatusAndHeadersParser([], verify=False) + self.warc_version = kwargs.get('warc_version', self.WARC_VERSION) @staticmethod def _iter_stream(stream): @@ -64,8 +63,8 @@ class BaseWARCWriter(object): yield buf def ensure_digest(self, record): - block_digest = record.rec_headers.get('WARC-Block-Digest') - payload_digest = record.rec_headers.get('WARC-Payload-Digest') + block_digest = record.rec_headers.get_header('WARC-Block-Digest') + payload_digest = record.rec_headers.get_header('WARC-Payload-Digest') if block_digest and payload_digest: return @@ -82,28 +81,30 @@ class BaseWARCWriter(object): payload_digester.update(buf) record.stream.seek(pos) - record.rec_headers['WARC-Block-Digest'] = str(block_digester) - record.rec_headers['WARC-Payload-Digest'] = str(payload_digester) + record.rec_headers.add_header('WARC-Block-Digest', str(block_digester)) + record.rec_headers.add_header('WARC-Payload-Digest', str(payload_digester)) def _create_digester(self): return Digester('sha1') def _set_header_buff(self, record): - exclude_list = self.header_filter(record) + exclude_list = None + if self.header_filter: + exclude_list = self.header_filter(record) buff = record.status_headers.to_bytes(exclude_list) record.status_headers.headers_buff = buff def write_req_resp(self, req, resp, params): - url = resp.rec_headers.get('WARC-Target-URI') - dt = resp.rec_headers.get('WARC-Date') + url = resp.rec_headers.get_header('WARC-Target-URI') + dt = resp.rec_headers.get_header('WARC-Date') #req.rec_headers['Content-Type'] = req.content_type - req.rec_headers['WARC-Target-URI'] = url - req.rec_headers['WARC-Date'] = dt + req.rec_headers.replace_header('WARC-Target-URI', url) + req.rec_headers.replace_header('WARC-Date', dt) - resp_id = resp.rec_headers.get('WARC-Record-ID') + resp_id = resp.rec_headers.get_header('WARC-Record-ID') if resp_id: - req.rec_headers['WARC-Concurrent-To'] = resp_id + req.rec_headers.add_header('WARC-Concurrent-To', resp_id) resp = self._check_revisit(resp, params) if not resp: @@ -112,55 +113,13 @@ class BaseWARCWriter(object): self._do_write_req_resp(req, resp, params) - def create_req_record(self, req_headers, payload): - len_ = payload.tell() - payload.seek(0) - - warc_headers = req_headers - warc_headers['WARC-Type'] = 'request' - if not warc_headers.get('WARC-Record-ID'): - warc_headers['WARC-Record-ID'] = self._make_warc_id() - - status_headers = self.parser.parse(payload) - - record = ArcWarcRecord('warc', 'request', warc_headers, payload, - status_headers, '', len_) - - self._set_header_buff(record) - - return record - - def read_resp_record(self, resp_headers, payload): - len_ = payload.tell() - payload.seek(0) - - warc_headers = self.parser.parse(payload) - warc_headers = CaseInsensitiveDict(warc_headers.headers) - - record_type = warc_headers.get('WARC-Type', 'response') - - if record_type == 'response': - status_headers = self.parser.parse(payload) - else: - status_headers = None - - record = ArcWarcRecord('warc', record_type, warc_headers, payload, - status_headers, '', len_) - - if record_type == 'response': - self._set_header_buff(record) - - self.ensure_digest(record) - - return record_type, record - def create_warcinfo_record(self, filename, info): - warc_headers = {} - warc_headers['WARC-Record-ID'] = self._make_warc_id() - warc_headers['WARC-Type'] = 'warcinfo' + warc_headers = StatusAndHeaders(self.warc_version, []) + warc_headers.add_header('WARC-Type', 'warcinfo') + warc_headers.add_header('WARC-Record-ID', self._make_warc_id()) if 
filename: - warc_headers['WARC-Filename'] = filename - warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow()) + warc_headers.add_header('WARC-Filename', filename) + warc_headers.add_header('WARC-Date', self._make_warc_date()) warcinfo = BytesIO() for n, v in six.iteritems(info): @@ -173,24 +132,54 @@ class BaseWARCWriter(object): return record - def create_custom_record(self, uri, payload, record_type, content_type, - warc_headers=None): + def copy_warc_record(self, payload): len_ = payload.tell() payload.seek(0) - warc_headers = warc_headers or {} - warc_headers['WARC-Record-ID'] = self._make_warc_id() - warc_headers['WARC-Type'] = record_type - warc_headers['WARC-Target-URI'] = uri + warc_headers = self.parser.parse(payload) - if 'WARC-Date' not in warc_headers: - warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow()) + record_type = warc_headers.get_header('WARC-Type', 'response') + + return self._fill_record(record_type, warc_headers, None, payload, '', len_) + + def create_warc_record(self, uri, record_type, payload, + length=None, + warc_content_type='', + warc_headers_dict={}, + status_headers=None): + + if length is None: + length = payload.tell() + payload.seek(0) + + warc_headers = StatusAndHeaders(self.warc_version, list(warc_headers_dict.items())) + warc_headers.replace_header('WARC-Type', record_type) + if not warc_headers.get_header('WARC-Record-ID'): + warc_headers.add_header('WARC-Record-ID', self._make_warc_id()) + + if uri: + warc_headers.replace_header('WARC-Target-URI', uri) + + if not warc_headers.get_header('WARC-Date'): + warc_headers.add_header('WARC-Date', self._make_warc_date()) + + return self._fill_record(record_type, warc_headers, status_headers, + payload, warc_content_type, length) + + def _fill_record(self, record_type, warc_headers, status_headers, payload, warc_content_type, len_): + has_http_headers = (record_type in ('request', 'response', 'revisit')) + + if not status_headers and has_http_headers: + status_headers = self.parser.parse(payload) record = ArcWarcRecord('warc', record_type, warc_headers, payload, - None, content_type, len_) + status_headers, warc_content_type, len_) self.ensure_digest(record) + if has_http_headers: + self._set_header_buff(record) + return record def _check_revisit(self, record, params): @@ -198,9 +187,9 @@ class BaseWARCWriter(object): return record try: - url = record.rec_headers.get('WARC-Target-URI') - digest = record.rec_headers.get('WARC-Payload-Digest') - iso_dt = record.rec_headers.get('WARC-Date') + url = record.rec_headers.get_header('WARC-Target-URI') + digest = record.rec_headers.get_header('WARC-Payload-Digest') + iso_dt = record.rec_headers.get_header('WARC-Date') result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt) except Exception as e: traceback.print_exc() @@ -210,11 +199,11 @@ class BaseWARCWriter(object): return None if isinstance(result, tuple) and result[0] == 'revisit': - record.rec_headers['WARC-Type'] = 'revisit' - record.rec_headers['WARC-Profile'] = self.REVISIT_PROFILE + record.rec_headers.replace_header('WARC-Type', 'revisit') + record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE) - record.rec_headers['WARC-Refers-To-Target-URI'] = result[1] - record.rec_headers['WARC-Refers-To-Date'] = result[2] + record.rec_headers.add_header('WARC-Refers-To-Target-URI', result[1]) + record.rec_headers.add_header('WARC-Refers-To-Date', result[2]) return record @@ -222,31 +211,26 @@ class BaseWARCWriter(object): if self.gzip: out = 
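[Annotation — hypothetical stub for illustration, not part of the patch.] _check_revisit() only assumes a dedup index exposing lookup_revisit(params, digest, url, iso_dt) that returns 'skip', a ('revisit', uri, date) tuple, or None; a minimal implementation of that contract:

    class AlwaysRevisitIndex(object):
        def lookup_revisit(self, params, digest, url, iso_dt):
            # treat every capture as previously seen at the same url/date
            return ('revisit', url, iso_dt)

A writer constructed with dedup_index=AlwaysRevisitIndex() then rewrites each matching response as a WARC revisit record, adding the WARC-Profile, WARC-Refers-To-Target-URI and WARC-Refers-To-Date headers shown above.
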
GzippingWrapper(out) - self._line(out, b'WARC/1.0') - - for n, v in six.iteritems(record.rec_headers): - if n.lower() in ('content-length', 'content-type'): - continue - - self._header(out, n, v) - - content_type = record.rec_headers.get('Content-Type') + # compute Content-Type + content_type = record.rec_headers.get_header('Content-Type') if not content_type: content_type = record.content_type - if not content_type: - content_type = self.WARC_RECORDS.get(record.rec_headers['WARC-Type']) + if not content_type: + content_type = self.WARC_RECORDS.get(record.rec_headers.get_header('WARC-Type')) - if content_type: - self._header(out, 'Content-Type', content_type) + if content_type: + record.rec_headers.replace_header('Content-Type', content_type) + #self._header(out, 'Content-Type', content_type) - if record.rec_headers['WARC-Type'] == 'revisit': + if record.rec_headers.get_header('WARC-Type') == 'revisit': http_headers_only = True else: http_headers_only = False - if record.length: + # compute Content-Length + if record.length or record.status_headers: actual_len = 0 if record.status_headers: actual_len = len(record.status_headers.headers_buff) @@ -259,10 +243,14 @@ class BaseWARCWriter(object): actual_len = record.length - diff - self._header(out, 'Content-Length', str(actual_len)) + record.rec_headers.replace_header('Content-Length', str(actual_len)) + #self._header(out, 'Content-Length', str(actual_len)) # add empty line - self._line(out, b'') + #self._line(out, b'') + + # write record headers + out.write(record.rec_headers.to_bytes()) # write headers buffer, if any if record.status_headers: @@ -290,12 +278,16 @@ class BaseWARCWriter(object): def _line(self, out, line): out.write(line + b'\r\n') - @staticmethod - def _make_warc_id(id_=None): + @classmethod + def _make_warc_id(cls, id_=None): if not id_: id_ = uuid.uuid1() return ''.format(id_) + @classmethod + def _make_warc_date(cls): + return datetime_to_iso_date(datetime.datetime.utcnow()) + # ============================================================================ class GzippingWrapper(object): @@ -421,7 +413,7 @@ class MultiFileWARCWriter(BaseWARCWriter): def _do_write_req_resp(self, req, resp, params): def write_callback(out, filename): - url = resp.rec_headers.get('WARC-Target-URI') + #url = resp.rec_headers.get_header('WARC-Target-URI') #print('Writing req/resp {0} to {1} '.format(url, filename)) if resp and self._is_write_resp(resp, params): diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 82f6caf1..a5edea20 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -26,7 +26,7 @@ class StatusAndHeaders(object): self.protocol = protocol self.total_len = total_len - def get_header(self, name): + def get_header(self, name, default_value=None): """ return header (name, value) if found @@ -36,6 +36,11 @@ class StatusAndHeaders(object): if value[0].lower() == name_lower: return value[1] + return default_value + + def add_header(self, name, value): + self.headers.append((name, value)) + def replace_header(self, name, value): """ replace header with new value or add new header From 1213466afb888e8da24ad9117e0711a966a07ca0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 1 Mar 2017 14:18:44 -0800 Subject: [PATCH 3/7] warc & recorder refactor: split BaseWARCWriter from MultiWARCWriter, move to warc/warcwriter.py, recorder/multifilewarcwriter.py split indexing functionality from base warc iterator, move to archiveindexer.py --- pywb/recorder/multifilewarcwriter.py | 269 
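[Annotation — illustrative sketch, not part of the patch; the fixed id value is a hypothetical stand-in for a urn:uuid constant.] Turning _make_warc_id() and _make_warc_date() into classmethods is what lets tests pin record identity: a subclass can override both to get byte-identical WARC output, which is the FixedTestWARCWriter pattern used in the writer tests above:

    from pywb.recorder.warcwriter import SimpleTempWARCWriter

    class FixedIdWARCWriter(SimpleTempWARCWriter):   # hypothetical name
        @classmethod
        def _make_warc_id(cls, id_=None):
            return '<urn:uuid:00000000-0000-0000-0000-000000000000>'

        @classmethod
        def _make_warc_date(cls):
            return '2000-01-01T00:00:00Z'
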
+++++++++++++++ pywb/recorder/test/simplerec.py | 2 +- pywb/recorder/test/test_recorder.py | 2 +- pywb/utils/timeutils.py | 4 +- pywb/warc/archiveindexer.py | 342 ++++++++++++++++++++ pywb/warc/archiveiterator.py | 335 ------------------- pywb/warc/cdxindexer.py | 2 +- pywb/{recorder => warc}/test/test_writer.py | 10 +- pywb/{recorder => warc}/warcwriter.py | 313 ++---------------- 9 files changed, 654 insertions(+), 625 deletions(-) create mode 100644 pywb/recorder/multifilewarcwriter.py create mode 100644 pywb/warc/archiveindexer.py rename pywb/{recorder => warc}/test/test_writer.py (93%) rename pywb/{recorder => warc}/warcwriter.py (51%) diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py new file mode 100644 index 00000000..673ac042 --- /dev/null +++ b/pywb/recorder/multifilewarcwriter.py @@ -0,0 +1,269 @@ +import base64 +import datetime +import os +import shutil + +import traceback + +import portalocker + +from pywb.utils.timeutils import timestamp20_now + +from pywb.webagg.utils import res_template + +from pywb.warc.warcwriter import BaseWARCWriter + + +# ============================================================================ +class MultiFileWARCWriter(BaseWARCWriter): + FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz' + + def __init__(self, dir_template, filename_template=None, max_size=0, + max_idle_secs=1800, *args, **kwargs): + super(MultiFileWARCWriter, self).__init__(*args, **kwargs) + + if not filename_template: + dir_template, filename_template = os.path.split(dir_template) + dir_template += os.path.sep + + if not filename_template: + filename_template = self.FILE_TEMPLATE + + self.dir_template = dir_template + self.key_template = kwargs.get('key_template', self.dir_template) + self.dedup_index = kwargs.get('dedup_index') + self.filename_template = filename_template + self.max_size = max_size + if max_idle_secs > 0: + self.max_idle_time = datetime.timedelta(seconds=max_idle_secs) + else: + self.max_idle_time = None + + self.fh_cache = {} + + def write_req_resp(self, req, resp, params): + url = resp.rec_headers.get_header('WARC-Target-URI') + dt = resp.rec_headers.get_header('WARC-Date') + + #req.rec_headers['Content-Type'] = req.content_type + req.rec_headers.replace_header('WARC-Target-URI', url) + req.rec_headers.replace_header('WARC-Date', dt) + + resp_id = resp.rec_headers.get_header('WARC-Record-ID') + if resp_id: + req.rec_headers.add_header('WARC-Concurrent-To', resp_id) + + resp = self._check_revisit(resp, params) + if not resp: + print('Skipping due to dedup') + return + + self._do_write_req_resp(req, resp, params) + + def _check_revisit(self, record, params): + if not self.dedup_index: + return record + + try: + url = record.rec_headers.get_header('WARC-Target-URI') + digest = record.rec_headers.get_header('WARC-Payload-Digest') + iso_dt = record.rec_headers.get_header('WARC-Date') + result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt) + except Exception as e: + traceback.print_exc() + result = None + + if result == 'skip': + return None + + if isinstance(result, tuple) and result[0] == 'revisit': + record.rec_headers.replace_header('WARC-Type', 'revisit') + record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE) + + record.rec_headers.add_header('WARC-Refers-To-Target-URI', result[1]) + record.rec_headers.add_header('WARC-Refers-To-Date', result[2]) + + return record + + def get_new_filename(self, dir_, params): + timestamp = timestamp20_now() + + randstr = 
base64.b32encode(os.urandom(5)).decode('utf-8') + + filename = dir_ + res_template(self.filename_template, params, + hostname=self.hostname, + timestamp=timestamp, + random=randstr) + + return filename + + def allow_new_file(self, filename, params): + return True + + def _open_file(self, filename, params): + path, name = os.path.split(filename) + + try: + os.makedirs(path) + except: + pass + + fh = open(filename, 'a+b') + + if self.dedup_index: + self.dedup_index.add_warc_file(filename, params) + + return fh + + def _close_file(self, fh): + try: + portalocker.lock(fh, portalocker.LOCK_UN) + fh.close() + except Exception as e: + print(e) + + def get_dir_key(self, params): + return res_template(self.key_template, params) + + def close_key(self, dir_key): + if isinstance(dir_key, dict): + dir_key = self.get_dir_key(dir_key) + + result = self.fh_cache.pop(dir_key, None) + if not result: + return + + out, filename = result + self._close_file(out) + return filename + + def close_file(self, match_filename): + for dir_key, out, filename in self.iter_open_files(): + if filename == match_filename: + return self.close_key(dir_key) + + def _is_write_resp(self, resp, params): + return True + + def _is_write_req(self, req, params): + return True + + def write_record(self, record, params=None): + params = params or {} + self._do_write_req_resp(None, record, params) + + def _do_write_req_resp(self, req, resp, params): + def write_callback(out, filename): + #url = resp.rec_headers.get_header('WARC-Target-URI') + #print('Writing req/resp {0} to {1} '.format(url, filename)) + + if resp and self._is_write_resp(resp, params): + self._write_warc_record(out, resp) + + if req and self._is_write_req(req, params): + self._write_warc_record(out, req) + + return self._write_to_file(params, write_callback) + + def write_stream_to_file(self, params, stream): + def write_callback(out, filename): + #print('Writing stream to {0}'.format(filename)) + shutil.copyfileobj(stream, out) + + return self._write_to_file(params, write_callback) + + def _write_to_file(self, params, write_callback): + full_dir = res_template(self.dir_template, params) + dir_key = self.get_dir_key(params) + + result = self.fh_cache.get(dir_key) + + close_file = False + + if result: + out, filename = result + is_new = False + else: + filename = self.get_new_filename(full_dir, params) + + if not self.allow_new_file(filename, params): + return False + + out = self._open_file(filename, params) + + is_new = True + + try: + start = out.tell() + + write_callback(out, filename) + + out.flush() + + new_size = out.tell() + + out.seek(start) + + if self.dedup_index: + self.dedup_index.add_urls_to_index(out, params, + filename, + new_size - start) + + return True + + except Exception as e: + traceback.print_exc() + close_file = True + return False + + finally: + # check for rollover + if self.max_size and new_size > self.max_size: + close_file = True + + if close_file: + self._close_file(out) + if not is_new: + self.fh_cache.pop(dir_key, None) + + elif is_new: + portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB) + self.fh_cache[dir_key] = (out, filename) + + def iter_open_files(self): + for n, v in list(self.fh_cache.items()): + out, filename = v + yield n, out, filename + + def close(self): + for dir_key, out, filename in self.iter_open_files(): + self._close_file(out) + + self.fh_cache = {} + + def close_idle_files(self): + if not self.max_idle_time: + return + + now = datetime.datetime.now() + + for dir_key, out, filename in 
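[Annotation — illustrative usage sketch, not part of the patch; directory, size limit, URL, payload and record type are placeholders.] Taken together, the open/lock/rollover methods above amount to the following usage pattern:

    from io import BytesIO

    from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter

    writer = MultiFileWARCWriter('./warcs/', max_size=500 * 1024 * 1024)

    record = writer.create_warc_record(uri='http://example.com/',
                                       record_type='resource',
                                       payload=BytesIO(b'some content'))

    writer.write_record(record)   # opens, locks and appends to the current file
    writer.close()                # unlocks and closes all cached handles

Files are named from FILE_TEMPLATE (rec-{timestamp}-{hostname}.warc.gz) and roll over to a new file once max_size is exceeded.
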
self.iter_open_files(): + try: + mtime = os.path.getmtime(filename) + except: + self.close_key(dir_key) + return + + mtime = datetime.datetime.fromtimestamp(mtime) + + if (now - mtime) > self.max_idle_time: + print('Closing idle ' + filename) + self.close_key(dir_key) + + +# ============================================================================ +class PerRecordWARCWriter(MultiFileWARCWriter): + def __init__(self, *args, **kwargs): + kwargs['max_size'] = 1 + super(PerRecordWARCWriter, self).__init__(*args, **kwargs) + diff --git a/pywb/recorder/test/simplerec.py b/pywb/recorder/test/simplerec.py index e7665c6b..8cfeddb5 100644 --- a/pywb/recorder/test/simplerec.py +++ b/pywb/recorder/test/simplerec.py @@ -3,7 +3,7 @@ from gevent import monkey; monkey.patch_all() from pywb.recorder.recorderapp import RecorderApp from pywb.recorder.redisindexer import WritableRedisIndexer -from pywb.recorder.warcwriter import MultiFileWARCWriter +from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter from pywb.recorder.filters import SkipDupePolicy import atexit diff --git a/pywb/recorder/test/test_recorder.py b/pywb/recorder/test/test_recorder.py index 122c57ba..8a32ec59 100644 --- a/pywb/recorder/test/test_recorder.py +++ b/pywb/recorder/test/test_recorder.py @@ -13,7 +13,7 @@ from fakeredis import FakeStrictRedis from pywb.recorder.recorderapp import RecorderApp from pywb.recorder.redisindexer import WritableRedisIndexer -from pywb.recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter +from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARCWriter from pywb.recorder.filters import ExcludeSpecificHeaders from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index 3c5cd9f3..93cd2c2f 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -7,7 +7,7 @@ import re import time import datetime import calendar -from six.moves import map + from email.utils import parsedate, formatdate #================================================================= @@ -37,7 +37,7 @@ def iso_date_to_datetime(string): if nums[-1] == '': nums = nums[:-1] - the_datetime = datetime.datetime(*map(int, nums)) + the_datetime = datetime.datetime(*(int(num) for num in nums)) return the_datetime diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py new file mode 100644 index 00000000..e5e8cfe5 --- /dev/null +++ b/pywb/warc/archiveindexer.py @@ -0,0 +1,342 @@ +from pywb.utils.timeutils import iso_date_to_timestamp +from pywb.utils.canonicalize import canonicalize +from pywb.utils.loaders import extract_post_query, append_post_query + +from pywb.warc.archiveiterator import ArchiveIterator + +import hashlib +import base64 +import six + +import re +import sys + +try: # pragma: no cover + from collections import OrderedDict +except ImportError: # pragma: no cover + from ordereddict import OrderedDict + + +#================================================================= +class ArchiveIndexEntryMixin(object): + MIME_RE = re.compile('[; ]') + + def __init__(self): + super(ArchiveIndexEntryMixin, self).__init__() + self.reset_entry() + + def reset_entry(self): + self['urlkey'] = '' + self['metadata'] = '' + self.buffer = None + self.record = None + + + def extract_mime(self, mime, def_mime='unk'): + """ Utility function to extract mimetype only + from a full content type, removing charset settings + """ + self['mime'] = def_mime + if mime: + 
self['mime'] = self.MIME_RE.split(mime, 1)[0] + self['_content_type'] = mime + + def extract_status(self, status_headers): + """ Extract status code only from status line + """ + self['status'] = status_headers.get_statuscode() + if not self['status']: + self['status'] = '-' + elif self['status'] == '204' and 'Error' in status_headers.statusline: + self['status'] = '-' + + def set_rec_info(self, offset, length): + self['length'] = str(length) + self['offset'] = str(offset) + + def merge_request_data(self, other, options): + surt_ordered = options.get('surt_ordered', True) + + if other.record.rec_type != 'request': + return False + + # two requests, not correct + if self.record.rec_type == 'request': + return False + + # merge POST/PUT body query + post_query = other.get('_post_query') + if post_query: + url = append_post_query(self['url'], post_query) + self['urlkey'] = canonicalize(url, surt_ordered) + other['urlkey'] = self['urlkey'] + + referer = other.record.status_headers.get_header('referer') + if referer: + self['_referer'] = referer + + return True + + +#================================================================= +class DefaultRecordParser(object): + def __init__(self, **options): + self.options = options + self.entry_cache = {} + self.digester = None + self.buff = None + + def _create_index_entry(self, rec_type): + try: + entry = self.entry_cache[rec_type] + entry.reset_entry() + except: + if self.options.get('cdxj'): + entry = OrderedArchiveIndexEntry() + else: + entry = ArchiveIndexEntry() + + # don't reuse when using append post + # entry may be cached + if not self.options.get('append_post'): + self.entry_cache[rec_type] = entry + + return entry + + def begin_payload(self, compute_digest, entry): + if compute_digest: + self.digester = hashlib.sha1() + else: + self.digester = None + + self.entry = entry + entry.buffer = self.create_payload_buffer(entry) + + def handle_payload(self, buff): + if self.digester: + self.digester.update(buff) + + if self.entry and self.entry.buffer: + self.entry.buffer.write(buff) + + def end_payload(self, entry): + if self.digester: + entry['digest'] = base64.b32encode(self.digester.digest()).decode('ascii') + + self.entry = None + + def create_payload_buffer(self, entry): + return None + + def create_record_iter(self, raw_iter): + append_post = self.options.get('append_post') + include_all = self.options.get('include_all') + surt_ordered = self.options.get('surt_ordered', True) + minimal = self.options.get('minimal') + + if append_post and minimal: + raise Exception('Sorry, minimal index option and ' + + 'append POST options can not be used together') + + for record in raw_iter: + entry = None + + if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): + continue + + if record.rec_type == 'arc_header': + continue + + if record.format == 'warc': + if (record.rec_type in ('request', 'warcinfo') and + not include_all and + not append_post): + continue + + elif (not include_all and + record.content_type == 'application/warc-fields'): + continue + + entry = self.parse_warc_record(record) + elif record.format == 'arc': + entry = self.parse_arc_record(record) + + if not entry: + continue + + if entry.get('url') and not entry.get('urlkey'): + entry['urlkey'] = canonicalize(entry['url'], surt_ordered) + + compute_digest = False + + if (entry.get('digest', '-') == '-' and + record.rec_type not in ('revisit', 'request', 'warcinfo')): + + compute_digest = True + + elif not minimal and record.rec_type == 'request' and 
append_post: + method = record.status_headers.protocol + len_ = record.status_headers.get_header('Content-Length') + + post_query = extract_post_query(method, + entry.get('_content_type'), + len_, + record.stream) + + entry['_post_query'] = post_query + + entry.record = record + + self.begin_payload(compute_digest, entry) + raw_iter.read_to_end(record, self.handle_payload) + + entry.set_rec_info(*raw_iter.member_info) + self.end_payload(entry) + + yield entry + + def join_request_records(self, entry_iter): + prev_entry = None + + for entry in entry_iter: + if not prev_entry: + prev_entry = entry + continue + + # check for url match + if (entry['url'] != prev_entry['url']): + pass + + # check for concurrency also + elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') != + prev_entry.record.rec_headers.get_header('WARC-Record-ID')): + pass + + elif (entry.merge_request_data(prev_entry, self.options) or + prev_entry.merge_request_data(entry, self.options)): + yield prev_entry + yield entry + prev_entry = None + continue + + yield prev_entry + prev_entry = entry + + if prev_entry: + yield prev_entry + + + #================================================================= + def parse_warc_record(self, record): + """ Parse warc record + """ + + entry = self._create_index_entry(record.rec_type) + + if record.rec_type == 'warcinfo': + entry['url'] = record.rec_headers.get_header('WARC-Filename') + entry['urlkey'] = entry['url'] + entry['_warcinfo'] = record.stream.read(record.length) + return entry + + entry['url'] = record.rec_headers.get_header('WARC-Target-Uri') + + # timestamp + entry['timestamp'] = iso_date_to_timestamp(record.rec_headers. + get_header('WARC-Date')) + + # mime + if record.rec_type == 'revisit': + entry['mime'] = 'warc/revisit' + elif self.options.get('minimal'): + entry['mime'] = '-' + else: + def_mime = '-' if record.rec_type == 'request' else 'unk' + entry.extract_mime(record.status_headers. 
+ get_header('Content-Type'), + def_mime) + + # status -- only for response records (by convention): + if record.rec_type == 'response' and not self.options.get('minimal'): + entry.extract_status(record.status_headers) + else: + entry['status'] = '-' + + # digest + digest = record.rec_headers.get_header('WARC-Payload-Digest') + entry['digest'] = digest + if digest and digest.startswith('sha1:'): + entry['digest'] = digest[len('sha1:'):] + + elif not entry.get('digest'): + entry['digest'] = '-' + + # optional json metadata, if present + metadata = record.rec_headers.get_header('WARC-Json-Metadata') + if metadata: + entry['metadata'] = metadata + + return entry + + + #================================================================= + def parse_arc_record(self, record): + """ Parse arc record + """ + url = record.rec_headers.get_header('uri') + url = url.replace('\r', '%0D') + url = url.replace('\n', '%0A') + # replace formfeed + url = url.replace('\x0c', '%0C') + # replace nulls + url = url.replace('\x00', '%00') + + entry = self._create_index_entry(record.rec_type) + entry['url'] = url + + # timestamp + entry['timestamp'] = record.rec_headers.get_header('archive-date') + if len(entry['timestamp']) > 14: + entry['timestamp'] = entry['timestamp'][:14] + + if not self.options.get('minimal'): + # mime + entry.extract_mime(record.rec_headers.get_header('content-type')) + + # status + entry.extract_status(record.status_headers) + + # digest + entry['digest'] = '-' + + return entry + + def __call__(self, fh): + aiter = ArchiveIterator(fh, self.options.get('minimal', False), + self.options.get('verify_http', False), + self.options.get('arc2warc', False)) + + entry_iter = self.create_record_iter(aiter) + + if self.options.get('append_post'): + entry_iter = self.join_request_records(entry_iter) + + for entry in entry_iter: + if (entry.record.rec_type in ('request', 'warcinfo') and + not self.options.get('include_all')): + continue + + yield entry + + def open(self, filename): + with open(filename, 'rb') as fh: + for entry in self(fh): + yield entry + +class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict): + pass + +class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict): + pass + + diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index c4785d3a..69296e5f 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -1,22 +1,10 @@ -from pywb.utils.timeutils import iso_date_to_timestamp from pywb.utils.bufferedreaders import DecompressingBufferedReader -from pywb.utils.canonicalize import canonicalize -from pywb.utils.loaders import extract_post_query, append_post_query from pywb.warc.recordloader import ArcWarcRecordLoader -import hashlib -import base64 import six - -import re import sys -try: # pragma: no cover - from collections import OrderedDict -except ImportError: # pragma: no cover - from ordereddict import OrderedDict - # ============================================================================ BUFF_SIZE = 16384 @@ -243,326 +231,3 @@ class ArchiveIterator(six.Iterator): return record -#================================================================= -class ArchiveIndexEntryMixin(object): - MIME_RE = re.compile('[; ]') - - def __init__(self): - super(ArchiveIndexEntryMixin, self).__init__() - self.reset_entry() - - def reset_entry(self): - self['urlkey'] = '' - self['metadata'] = '' - self.buffer = None - self.record = None - - - def extract_mime(self, mime, def_mime='unk'): - """ Utility function to extract mimetype only - from a full 
content type, removing charset settings - """ - self['mime'] = def_mime - if mime: - self['mime'] = self.MIME_RE.split(mime, 1)[0] - self['_content_type'] = mime - - def extract_status(self, status_headers): - """ Extract status code only from status line - """ - self['status'] = status_headers.get_statuscode() - if not self['status']: - self['status'] = '-' - elif self['status'] == '204' and 'Error' in status_headers.statusline: - self['status'] = '-' - - def set_rec_info(self, offset, length): - self['length'] = str(length) - self['offset'] = str(offset) - - def merge_request_data(self, other, options): - surt_ordered = options.get('surt_ordered', True) - - if other.record.rec_type != 'request': - return False - - # two requests, not correct - if self.record.rec_type == 'request': - return False - - # merge POST/PUT body query - post_query = other.get('_post_query') - if post_query: - url = append_post_query(self['url'], post_query) - self['urlkey'] = canonicalize(url, surt_ordered) - other['urlkey'] = self['urlkey'] - - referer = other.record.status_headers.get_header('referer') - if referer: - self['_referer'] = referer - - return True - - -#================================================================= -class DefaultRecordParser(object): - def __init__(self, **options): - self.options = options - self.entry_cache = {} - self.digester = None - self.buff = None - - def _create_index_entry(self, rec_type): - try: - entry = self.entry_cache[rec_type] - entry.reset_entry() - except: - if self.options.get('cdxj'): - entry = OrderedArchiveIndexEntry() - else: - entry = ArchiveIndexEntry() - - # don't reuse when using append post - # entry may be cached - if not self.options.get('append_post'): - self.entry_cache[rec_type] = entry - - return entry - - def begin_payload(self, compute_digest, entry): - if compute_digest: - self.digester = hashlib.sha1() - else: - self.digester = None - - self.entry = entry - entry.buffer = self.create_payload_buffer(entry) - - def handle_payload(self, buff): - if self.digester: - self.digester.update(buff) - - if self.entry and self.entry.buffer: - self.entry.buffer.write(buff) - - def end_payload(self, entry): - if self.digester: - entry['digest'] = base64.b32encode(self.digester.digest()).decode('ascii') - - self.entry = None - - def create_payload_buffer(self, entry): - return None - - def create_record_iter(self, raw_iter): - append_post = self.options.get('append_post') - include_all = self.options.get('include_all') - surt_ordered = self.options.get('surt_ordered', True) - minimal = self.options.get('minimal') - - if append_post and minimal: - raise Exception('Sorry, minimal index option and ' + - 'append POST options can not be used together') - - for record in raw_iter: - entry = None - - if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): - continue - - if record.rec_type == 'arc_header': - continue - - if record.format == 'warc': - if (record.rec_type in ('request', 'warcinfo') and - not include_all and - not append_post): - continue - - elif (not include_all and - record.content_type == 'application/warc-fields'): - continue - - entry = self.parse_warc_record(record) - elif record.format == 'arc': - entry = self.parse_arc_record(record) - - if not entry: - continue - - if entry.get('url') and not entry.get('urlkey'): - entry['urlkey'] = canonicalize(entry['url'], surt_ordered) - - compute_digest = False - - if (entry.get('digest', '-') == '-' and - record.rec_type not in ('revisit', 'request', 'warcinfo')): - - 
compute_digest = True - - elif not minimal and record.rec_type == 'request' and append_post: - method = record.status_headers.protocol - len_ = record.status_headers.get_header('Content-Length') - - post_query = extract_post_query(method, - entry.get('_content_type'), - len_, - record.stream) - - entry['_post_query'] = post_query - - entry.record = record - - self.begin_payload(compute_digest, entry) - raw_iter.read_to_end(record, self.handle_payload) - - entry.set_rec_info(*raw_iter.member_info) - self.end_payload(entry) - - yield entry - - def join_request_records(self, entry_iter): - prev_entry = None - - for entry in entry_iter: - if not prev_entry: - prev_entry = entry - continue - - # check for url match - if (entry['url'] != prev_entry['url']): - pass - - # check for concurrency also - elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') != - prev_entry.record.rec_headers.get_header('WARC-Record-ID')): - pass - - elif (entry.merge_request_data(prev_entry, self.options) or - prev_entry.merge_request_data(entry, self.options)): - yield prev_entry - yield entry - prev_entry = None - continue - - yield prev_entry - prev_entry = entry - - if prev_entry: - yield prev_entry - - - #================================================================= - def parse_warc_record(self, record): - """ Parse warc record - """ - - entry = self._create_index_entry(record.rec_type) - - if record.rec_type == 'warcinfo': - entry['url'] = record.rec_headers.get_header('WARC-Filename') - entry['urlkey'] = entry['url'] - entry['_warcinfo'] = record.stream.read(record.length) - return entry - - entry['url'] = record.rec_headers.get_header('WARC-Target-Uri') - - # timestamp - entry['timestamp'] = iso_date_to_timestamp(record.rec_headers. - get_header('WARC-Date')) - - # mime - if record.rec_type == 'revisit': - entry['mime'] = 'warc/revisit' - elif self.options.get('minimal'): - entry['mime'] = '-' - else: - def_mime = '-' if record.rec_type == 'request' else 'unk' - entry.extract_mime(record.status_headers. 
- get_header('Content-Type'), - def_mime) - - # status -- only for response records (by convention): - if record.rec_type == 'response' and not self.options.get('minimal'): - entry.extract_status(record.status_headers) - else: - entry['status'] = '-' - - # digest - digest = record.rec_headers.get_header('WARC-Payload-Digest') - entry['digest'] = digest - if digest and digest.startswith('sha1:'): - entry['digest'] = digest[len('sha1:'):] - - elif not entry.get('digest'): - entry['digest'] = '-' - - # optional json metadata, if present - metadata = record.rec_headers.get_header('WARC-Json-Metadata') - if metadata: - entry['metadata'] = metadata - - return entry - - - #================================================================= - def parse_arc_record(self, record): - """ Parse arc record - """ - url = record.rec_headers.get_header('uri') - url = url.replace('\r', '%0D') - url = url.replace('\n', '%0A') - # replace formfeed - url = url.replace('\x0c', '%0C') - # replace nulls - url = url.replace('\x00', '%00') - - entry = self._create_index_entry(record.rec_type) - entry['url'] = url - - # timestamp - entry['timestamp'] = record.rec_headers.get_header('archive-date') - if len(entry['timestamp']) > 14: - entry['timestamp'] = entry['timestamp'][:14] - - if not self.options.get('minimal'): - # mime - entry.extract_mime(record.rec_headers.get_header('content-type')) - - # status - entry.extract_status(record.status_headers) - - # digest - entry['digest'] = '-' - - return entry - - def __call__(self, fh): - aiter = ArchiveIterator(fh, self.options.get('minimal', False), - self.options.get('verify_http', False), - self.options.get('arc2warc', False)) - - entry_iter = self.create_record_iter(aiter) - - if self.options.get('append_post'): - entry_iter = self.join_request_records(entry_iter) - - for entry in entry_iter: - if (entry.record.rec_type in ('request', 'warcinfo') and - not self.options.get('include_all')): - continue - - yield entry - - def open(self, filename): - with open(filename, 'rb') as fh: - for entry in self(fh): - yield entry - -class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict): - pass - -class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict): - pass - - diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 13e7ba26..fd633eed 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -31,7 +31,7 @@ from bisect import insort from six import StringIO -from pywb.warc.archiveiterator import DefaultRecordParser +from pywb.warc.archiveindexer import DefaultRecordParser import codecs import six diff --git a/pywb/recorder/test/test_writer.py b/pywb/warc/test/test_writer.py similarity index 93% rename from pywb/recorder/test/test_writer.py rename to pywb/warc/test/test_writer.py index 40bf1e97..dd593575 100644 --- a/pywb/recorder/test/test_writer.py +++ b/pywb/warc/test/test_writer.py @@ -1,5 +1,5 @@ from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.recorder.warcwriter import SimpleTempWARCWriter +from pywb.warc.warcwriter import BufferWARCWriter from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.archiveiterator import ArchiveIterator @@ -9,7 +9,7 @@ import json # ============================================================================ -class FixedTestWARCWriter(SimpleTempWARCWriter): +class FixedTestWARCWriter(BufferWARCWriter): @classmethod def _make_warc_id(cls, id_=None): return '' @@ -36,7 +36,7 @@ class TestWarcWriter(object): record = simplewriter.create_warcinfo_record('testfile.warc.gz', 
params) simplewriter.write_record(record) - buff = simplewriter.get_buffer() + buff = simplewriter.get_contents() assert isinstance(buff, bytes) buff = BytesIO(buff) @@ -71,7 +71,7 @@ json-metadata: {"foo": "bar"}\r\n\ \r\n\ ' - assert simplewriter.get_buffer().decode('utf-8') == warcinfo_record + assert simplewriter.get_contents().decode('utf-8') == warcinfo_record def test_generate_response(self): headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'), @@ -93,7 +93,7 @@ json-metadata: {"foo": "bar"}\r\n\ writer.write_record(record) - buff = writer.get_buffer() + buff = writer.get_contents() self._validate_record_content_len(BytesIO(buff)) diff --git a/pywb/recorder/warcwriter.py b/pywb/warc/warcwriter.py similarity index 51% rename from pywb/recorder/warcwriter.py rename to pywb/warc/warcwriter.py index 6c9eca8a..e98d46d3 100644 --- a/pywb/recorder/warcwriter.py +++ b/pywb/warc/warcwriter.py @@ -4,31 +4,24 @@ import base64 import hashlib import datetime import zlib -import sys -import os import six -import shutil - -import traceback from socket import gethostname from io import BytesIO -import portalocker - -from pywb.utils.loaders import LimitReader, to_native_str -from pywb.utils.bufferedreaders import BufferedReader -from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date +from pywb.utils.loaders import to_native_str +from pywb.utils.timeutils import datetime_to_iso_date from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders + from pywb.warc.recordloader import ArcWarcRecord from pywb.warc.recordloader import ArcWarcRecordLoader -from pywb.webagg.utils import res_template, BUFF_SIZE - # ============================================================================ class BaseWARCWriter(object): + BUFF_SIZE = 16384 + WARC_RECORDS = {'warcinfo': 'application/warc-fields', 'response': 'application/http; msgtype=response', 'revisit': 'application/http; msgtype=response', @@ -38,25 +31,20 @@ class BaseWARCWriter(object): REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest' - FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz' - WARC_VERSION = 'WARC/1.0' - def __init__(self, gzip=True, dedup_index=None, - header_filter=None, *args, **kwargs): - + def __init__(self, gzip=True, header_filter=None, *args, **kwargs): self.gzip = gzip - self.dedup_index = dedup_index self.header_filter = header_filter self.hostname = gethostname() self.parser = StatusAndHeadersParser([], verify=False) self.warc_version = kwargs.get('warc_version', self.WARC_VERSION) - @staticmethod - def _iter_stream(stream): + @classmethod + def _iter_stream(cls, stream): while True: - buf = stream.read(BUFF_SIZE) + buf = stream.read(cls.BUFF_SIZE) if not buf: return @@ -94,25 +82,6 @@ class BaseWARCWriter(object): buff = record.status_headers.to_bytes(exclude_list) record.status_headers.headers_buff = buff - def write_req_resp(self, req, resp, params): - url = resp.rec_headers.get_header('WARC-Target-URI') - dt = resp.rec_headers.get_header('WARC-Date') - - #req.rec_headers['Content-Type'] = req.content_type - req.rec_headers.replace_header('WARC-Target-URI', url) - req.rec_headers.replace_header('WARC-Date', dt) - - resp_id = resp.rec_headers.get_header('WARC-Record-ID') - if resp_id: - req.rec_headers.add_header('WARC-Concurrent-To', resp_id) - - resp = self._check_revisit(resp, params) - if not resp: - print('Skipping due to dedup') - return - - self._do_write_req_resp(req, resp, params) - def create_warcinfo_record(self, filename, 
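The FixedTestWARCWriter pattern above pins the normally-random record id so serialized output can be compared byte-for-byte; the fixed value itself appears elided in the hunk. An equivalent sketch with a placeholder id (class name and id value are illustrative, not from the patch):

    from pywb.warc.warcwriter import BufferWARCWriter

    class FixedIdWARCWriter(BufferWARCWriter):
        @classmethod
        def _make_warc_id(cls, id_=None):
            # placeholder only -- the real test pins a specific urn:uuid value
            return '<urn:uuid:00000000-0000-0000-0000-000000000000>'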
info): warc_headers = StatusAndHeaders(self.warc_version, []) warc_headers.add_header('WARC-Type', 'warcinfo') @@ -182,31 +151,6 @@ class BaseWARCWriter(object): return record - def _check_revisit(self, record, params): - if not self.dedup_index: - return record - - try: - url = record.rec_headers.get_header('WARC-Target-URI') - digest = record.rec_headers.get_header('WARC-Payload-Digest') - iso_dt = record.rec_headers.get_header('WARC-Date') - result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt) - except Exception as e: - traceback.print_exc() - result = None - - if result == 'skip': - return None - - if isinstance(result, tuple) and result[0] == 'revisit': - record.rec_headers.replace_header('WARC-Type', 'revisit') - record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE) - - record.rec_headers.add_header('WARC-Refers-To-Target-URI', result[1]) - record.rec_headers.add_header('WARC-Refers-To-Date', result[2]) - - return record - def _write_warc_record(self, out, record, adjust_cl=True): if self.gzip: out = GzippingWrapper(out) @@ -321,231 +265,40 @@ class Digester(object): # ============================================================================ -class MultiFileWARCWriter(BaseWARCWriter): - def __init__(self, dir_template, filename_template=None, max_size=0, - max_idle_secs=1800, *args, **kwargs): - super(MultiFileWARCWriter, self).__init__(*args, **kwargs) - - if not filename_template: - dir_template, filename_template = os.path.split(dir_template) - dir_template += os.path.sep - - if not filename_template: - filename_template = self.FILE_TEMPLATE - - self.dir_template = dir_template - self.key_template = kwargs.get('key_template', self.dir_template) - self.filename_template = filename_template - self.max_size = max_size - if max_idle_secs > 0: - self.max_idle_time = datetime.timedelta(seconds=max_idle_secs) - else: - self.max_idle_time = None - - self.fh_cache = {} - - def get_new_filename(self, dir_, params): - timestamp = timestamp20_now() - - randstr = base64.b32encode(os.urandom(5)).decode('utf-8') - - filename = dir_ + res_template(self.filename_template, params, - hostname=self.hostname, - timestamp=timestamp, - random=randstr) - - return filename - - def allow_new_file(self, filename, params): - return True - - def _open_file(self, filename, params): - path, name = os.path.split(filename) - - try: - os.makedirs(path) - except: - pass - - fh = open(filename, 'a+b') - - if self.dedup_index: - self.dedup_index.add_warc_file(filename, params) - - return fh - - def _close_file(self, fh): - try: - portalocker.lock(fh, portalocker.LOCK_UN) - fh.close() - except Exception as e: - print(e) - - def get_dir_key(self, params): - return res_template(self.key_template, params) - - def close_key(self, dir_key): - if isinstance(dir_key, dict): - dir_key = self.get_dir_key(dir_key) - - result = self.fh_cache.pop(dir_key, None) - if not result: - return - - out, filename = result - self._close_file(out) - return filename - - def close_file(self, match_filename): - for dir_key, out, filename in self.iter_open_files(): - if filename == match_filename: - return self.close_key(dir_key) - - def _is_write_resp(self, resp, params): - return True - - def _is_write_req(self, req, params): - return True - - def write_record(self, record, params=None): - params = params or {} - self._do_write_req_resp(None, record, params) - - def _do_write_req_resp(self, req, resp, params): - def write_callback(out, filename): - #url = resp.rec_headers.get_header('WARC-Target-URI') - 
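Taken together, the refactored API above separates record creation from writing. A hedged sketch of the round trip (the warcinfo field is illustrative):

    from pywb.warc.warcwriter import BufferWARCWriter

    writer = BufferWARCWriter(gzip=False)

    # 'info' is free-form warcinfo metadata; this key is illustrative
    record = writer.create_warcinfo_record('example.warc.gz', {'software': 'pywb'})
    writer.write_record(record)

    raw = writer.get_contents()   # serialized bytes of everything written so far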
#print('Writing req/resp {0} to {1} '.format(url, filename)) - - if resp and self._is_write_resp(resp, params): - self._write_warc_record(out, resp) - - if req and self._is_write_req(req, params): - self._write_warc_record(out, req) - - return self._write_to_file(params, write_callback) - - def write_stream_to_file(self, params, stream): - def write_callback(out, filename): - #print('Writing stream to {0}'.format(filename)) - shutil.copyfileobj(stream, out) - - return self._write_to_file(params, write_callback) - - def _write_to_file(self, params, write_callback): - full_dir = res_template(self.dir_template, params) - dir_key = self.get_dir_key(params) - - result = self.fh_cache.get(dir_key) - - close_file = False - - if result: - out, filename = result - is_new = False - else: - filename = self.get_new_filename(full_dir, params) - - if not self.allow_new_file(filename, params): - return False - - out = self._open_file(filename, params) - - is_new = True - - try: - start = out.tell() - - write_callback(out, filename) - - out.flush() - - new_size = out.tell() - - out.seek(start) - - if self.dedup_index: - self.dedup_index.add_urls_to_index(out, params, - filename, - new_size - start) - - return True - - except Exception as e: - traceback.print_exc() - close_file = True - return False - - finally: - # check for rollover - if self.max_size and new_size > self.max_size: - close_file = True - - if close_file: - self._close_file(out) - if not is_new: - self.fh_cache.pop(dir_key, None) - - elif is_new: - portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB) - self.fh_cache[dir_key] = (out, filename) - - def iter_open_files(self): - for n, v in list(self.fh_cache.items()): - out, filename = v - yield n, out, filename - - def close(self): - for dir_key, out, filename in self.iter_open_files(): - self._close_file(out) - - self.fh_cache = {} - - def close_idle_files(self): - if not self.max_idle_time: - return - - now = datetime.datetime.now() - - for dir_key, out, filename in self.iter_open_files(): - try: - mtime = os.path.getmtime(filename) - except: - self.close_key(dir_key) - return - - mtime = datetime.datetime.fromtimestamp(mtime) - - if (now - mtime) > self.max_idle_time: - print('Closing idle ' + filename) - self.close_key(dir_key) - - -# ============================================================================ -class PerRecordWARCWriter(MultiFileWARCWriter): +class BufferWARCWriter(BaseWARCWriter): def __init__(self, *args, **kwargs): - kwargs['max_size'] = 1 - super(PerRecordWARCWriter, self).__init__(*args, **kwargs) - - -# ============================================================================ -class SimpleTempWARCWriter(BaseWARCWriter): - def __init__(self, *args, **kwargs): - super(SimpleTempWARCWriter, self).__init__(*args, **kwargs) + super(BufferWARCWriter, self).__init__(*args, **kwargs) self.out = self._create_buffer() def _create_buffer(self): return tempfile.SpooledTemporaryFile(max_size=512*1024) - def _do_write_req_resp(self, req, resp, params): - self._write_warc_record(self.out, resp) - self._write_warc_record(self.out, req) - - def write_record(self, record, params=None): + def write_record(self, record): self._write_warc_record(self.out, record) - def get_buffer(self): + def get_contents(self): pos = self.out.tell() self.out.seek(0) buff = self.out.read() self.out.seek(pos) return buff + + +# ============================================================================ +class FileWARCWriter(BufferWARCWriter): + def __init__(self, *args, **kwargs): + 
file_or_buff = None + if len(args) > 0: + file_or_buff = args[0] + else: + file_or_buff = kwargs.get('file') + + if isinstance(file_or_buff, str): + self.out = open(file_or_buff, 'rb') + elif hasattr(file_or_buff, 'read'): + self.out = file_or_buff + else: + raise Exception('file must be a readable or valid filename') + + + From b7285b1a77cf6982465a756e39b2d1041b92b876 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 1 Mar 2017 14:57:44 -0800 Subject: [PATCH 4/7] refactor: split off BlockLoader support into BlockArcWarcRecordLoader, plain ArcWarcRecordLoader only includes parse_record_stream(), no load() use BlockArcWarcRecordLoader() only when needed for replay --- pywb/warc/blockrecordloader.py | 33 +++++++++++++++++++++++++++++++++ pywb/warc/recordloader.py | 32 ++------------------------------ pywb/warc/resolvingloader.py | 5 +++-- pywb/warc/test/test_loading.py | 3 ++- pywb/webapp/handlers.py | 4 ++-- 5 files changed, 42 insertions(+), 35 deletions(-) create mode 100644 pywb/warc/blockrecordloader.py diff --git a/pywb/warc/blockrecordloader.py b/pywb/warc/blockrecordloader.py new file mode 100644 index 00000000..fea50c31 --- /dev/null +++ b/pywb/warc/blockrecordloader.py @@ -0,0 +1,33 @@ +from pywb.utils.loaders import BlockLoader +from pywb.utils.bufferedreaders import DecompressingBufferedReader +from pywb.warc.recordloader import ArcWarcRecordLoader + + +#================================================================= +class BlockArcWarcRecordLoader(ArcWarcRecordLoader): + def __init__(self, loader=None, cookie_maker=None, block_size=8192, *args, **kwargs): + if not loader: + loader = BlockLoader(cookie_maker=cookie_maker) + + self.loader = loader + self.block_size = block_size + super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs) + + def load(self, url, offset, length, no_record_parse=False): + """ Load a single record from given url at offset with length + and parse as either warc or arc record + """ + try: + length = int(length) + except: + length = -1 + + stream = self.loader.load(url, int(offset), length) + decomp_type = 'gzip' + + # Create decompressing stream + stream = DecompressingBufferedReader(stream=stream, + decomp_type=decomp_type, + block_size=self.block_size) + + return self.parse_record_stream(stream, no_record_parse=no_record_parse) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 751f3787..3becb294 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -4,15 +4,13 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParserException -from pywb.utils.loaders import BlockLoader, LimitReader +from pywb.utils.loaders import LimitReader from pywb.utils.loaders import to_native_str -from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.wbexception import WbException from pywb.utils.timeutils import timestamp_to_iso_date from six.moves import zip -import six #================================================================= @@ -56,14 +54,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, loader=None, cookie_maker=None, block_size=8192, - verify_http=True, arc2warc=True): - if not loader: - loader = BlockLoader(cookie_maker=cookie_maker) - - self.loader = loader - self.block_size = block_size - + def __init__(self, verify_http=True, arc2warc=True): if arc2warc: 
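Note that FileWARCWriter as added above opens the target with 'rb', tests for a read() method, and never invokes the BufferWARCWriter initializer, so it cannot actually write. A corrected sketch of the apparent intent (writable handle, append mode) -- offered as an assumption, not the committed behavior:

    class FileWARCWriter(BufferWARCWriter):
        def __init__(self, file_or_buff=None, *args, **kwargs):
            super(FileWARCWriter, self).__init__(*args, **kwargs)

            file_or_buff = file_or_buff or kwargs.get('file')

            if isinstance(file_or_buff, str):
                # a writer needs a writable handle, not 'rb'
                self.out = open(file_or_buff, 'a+b')
            elif hasattr(file_or_buff, 'write'):
                self.out = file_or_buff
            else:
                raise Exception('file must be a writable stream or a filename')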
self.arc_parser = ARC2WARCHeadersParser() else: @@ -74,25 +65,6 @@ class ArcWarcRecordLoader(object): self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) - def load(self, url, offset, length, no_record_parse=False): - """ Load a single record from given url at offset with length - and parse as either warc or arc record - """ - try: - length = int(length) - except: - length = -1 - - stream = self.loader.load(url, int(offset), length) - decomp_type = 'gzip' - - # Create decompressing stream - stream = DecompressingBufferedReader(stream=stream, - decomp_type=decomp_type, - block_size=self.block_size) - - return self.parse_record_stream(stream, no_record_parse=no_record_parse) - def parse_record_stream(self, stream, statusline=None, known_format=None, diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index ed0f8049..5b74a9c7 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -1,5 +1,6 @@ from pywb.utils.timeutils import iso_date_to_timestamp -from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed +from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader +from pywb.warc.recordloader import ArchiveLoadFailed from pywb.utils.wbexception import NotFoundException import six @@ -9,7 +10,7 @@ import six class ResolvingLoader(object): MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded' - def __init__(self, path_resolvers, record_loader=ArcWarcRecordLoader(), no_record_parse=False): + def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False): self.path_resolvers = path_resolvers self.record_loader = record_loader self.no_record_parse = no_record_parse diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index ba3f669c..bd629a4a 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -293,6 +293,7 @@ import pprint import six from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed +from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.pathresolvers import PathResolverMapper from pywb.cdx.cdxobject import CDXObject @@ -326,7 +327,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ def load_test_archive(test_file, offset, length): path = test_warc_dir + test_file - testloader = ArcWarcRecordLoader() + testloader = BlockArcWarcRecordLoader() archive = testloader.load(path, offset, length) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index b7411ccb..2317d5db 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -12,7 +12,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse -from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.pathresolvers import PathResolverMapper @@ -134,7 +134,7 @@ class WBHandler(SearchPageWbUrlHandler): def _init_replay_view(self, config): cookie_maker = config.get('cookie_maker') - record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) + record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker) paths = config.get('archive_paths') From 2b3fde028f5fb6b2e380ebd265126373b29cd0d8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 1 Mar 2017 15:13:32 -0800 
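Replay-side usage of the split-out loader from PATCH 4 above looks roughly like this; the path, offset, and length are placeholders that would normally come from a CDX index line:

    from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader

    loader = BlockArcWarcRecordLoader()

    # length=-1 reads to the end of the member, per the load() fallback above
    record = loader.load('/path/to/example.warc.gz', offset=0, length=-1)
    print(record.rec_type)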
Subject: [PATCH 5/7] refactor: split LimitReader into limitreader.py --- pywb/utils/limitreader.py | 69 +++++++++++++++++++++++ pywb/utils/loaders.py | 73 +------------------------ pywb/utils/test/test_bufferedreaders.py | 2 +- pywb/utils/test/test_loaders.py | 3 +- pywb/warc/recordloader.py | 2 +- pywb/webagg/inputrequest.py | 2 +- pywb/webapp/rangecache.py | 2 +- pywb/webapp/replay_views.py | 2 +- 8 files changed, 77 insertions(+), 78 deletions(-) create mode 100644 pywb/utils/limitreader.py diff --git a/pywb/utils/limitreader.py b/pywb/utils/limitreader.py new file mode 100644 index 00000000..49d5d713 --- /dev/null +++ b/pywb/utils/limitreader.py @@ -0,0 +1,69 @@ +# ============================================================================ +class LimitReader(object): + """ + A reader which will not read more than specified limit + """ + + def __init__(self, stream, limit): + self.stream = stream + self.limit = limit + + if hasattr(stream, 'tell'): + self.tell = self._tell + + def _update(self, buff): + length = len(buff) + self.limit -= length + return buff + + def read(self, length=None): + if length is not None: + length = min(length, self.limit) + else: + length = self.limit + + if length == 0: + return b'' + + buff = self.stream.read(length) + return self._update(buff) + + def readline(self, length=None): + if length is not None: + length = min(length, self.limit) + else: + length = self.limit + + if length == 0: + return b'' + + buff = self.stream.readline(length) + return self._update(buff) + + def close(self): + self.stream.close() + + def _tell(self): + return self.stream.tell() + + @staticmethod + def wrap_stream(stream, content_length): + """ + If given content_length is an int > 0, wrap the stream + in a LimitReader. Otherwise, return the stream unaltered + """ + try: + content_length = int(content_length) + if content_length >= 0: + # optimize: if already a LimitStream, set limit to + # the smaller of the two limits + if isinstance(stream, LimitReader): + stream.limit = min(stream.limit, content_length) + else: + stream = LimitReader(stream, content_length) + + except (ValueError, TypeError): + pass + + return stream + diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index d803333d..53be7896 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -17,6 +17,7 @@ import base64 import cgi from io import open, BytesIO +from pywb.utils.limitreader import LimitReader try: from boto import connect_s3 @@ -500,78 +501,6 @@ class HMACCookieMaker(object): return cookie -#================================================================= -# Limit Reader -#================================================================= -class LimitReader(object): - """ - A reader which will not read more than specified limit - """ - - def __init__(self, stream, limit): - self.stream = stream - self.limit = limit - - if hasattr(stream, 'tell'): - self.tell = self._tell - - def _update(self, buff): - length = len(buff) - self.limit -= length - return buff - - def read(self, length=None): - if length is not None: - length = min(length, self.limit) - else: - length = self.limit - - if length == 0: - return b'' - - buff = self.stream.read(length) - return self._update(buff) - - def readline(self, length=None): - if length is not None: - length = min(length, self.limit) - else: - length = self.limit - - if length == 0: - return b'' - - buff = self.stream.readline(length) - return self._update(buff) - - def close(self): - self.stream.close() - - def _tell(self): - return 
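A quick sketch of the LimitReader contract just split out above: reads are capped at the limit, and wrap_stream() only wraps when content_length parses as a non-negative int:

    from io import BytesIO
    from pywb.utils.limitreader import LimitReader

    reader = LimitReader(BytesIO(b'abcdefghij'), 4)
    assert reader.read() == b'abcd'      # capped at the limit

    stream = LimitReader.wrap_stream(BytesIO(b'abcdefghij'), '3')
    assert stream.read() == b'abc'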
self.stream.tell() - - @staticmethod - def wrap_stream(stream, content_length): - """ - If given content_length is an int > 0, wrap the stream - in a LimitReader. Otherwise, return the stream unaltered - """ - try: - content_length = int(content_length) - if content_length >= 0: - # optimize: if already a LimitStream, set limit to - # the smaller of the two limits - if isinstance(stream, LimitReader): - stream.limit = min(stream.limit, content_length) - else: - stream = LimitReader(stream, content_length) - - except (ValueError, TypeError): - pass - - return stream - - # ============================================================================ BlockLoader.init_default_loaders() diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py index 1960e374..c5b91f21 100644 --- a/pywb/utils/test/test_bufferedreaders.py +++ b/pywb/utils/test/test_bufferedreaders.py @@ -101,7 +101,7 @@ Zero-Length chunk: from io import BytesIO from pywb.utils.bufferedreaders import ChunkedDataReader, ChunkedDataException from pywb.utils.bufferedreaders import DecompressingBufferedReader -from pywb.utils.loaders import LimitReader +from pywb.utils.limitreader import LimitReader from pywb import get_test_dir diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index dd5c3861..703ef81e 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -141,8 +141,9 @@ from io import BytesIO import requests from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url -from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query +from pywb.utils.loaders import extract_client_cookie, extract_post_query from pywb.utils.loaders import append_post_query, read_last_line +from pywb.utils.limitreader import LimitReader from pywb.utils.bufferedreaders import DecompressingBufferedReader diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 3becb294..90af9496 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -4,7 +4,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParserException -from pywb.utils.loaders import LimitReader +from pywb.utils.limitreader import LimitReader from pywb.utils.loaders import to_native_str from pywb.utils.wbexception import WbException diff --git a/pywb/webagg/inputrequest.py b/pywb/webagg/inputrequest.py index 50112959..1d2196cf 100644 --- a/pywb/webagg/inputrequest.py +++ b/pywb/webagg/inputrequest.py @@ -1,5 +1,5 @@ from pywb.utils.loaders import extract_post_query, append_post_query -from pywb.utils.loaders import LimitReader +from pywb.utils.limitreader import LimitReader from pywb.utils.statusandheaders import StatusAndHeadersParser from six.moves.urllib.parse import urlsplit, quote diff --git a/pywb/webapp/rangecache.py b/pywb/webapp/rangecache.py index a45b36f9..53a5e2ac 100644 --- a/pywb/webapp/rangecache.py +++ b/pywb/webapp/rangecache.py @@ -1,5 +1,5 @@ from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.utils.loaders import LimitReader +from pywb.utils.limitreader import LimitReader from pywb.framework.cache import create_cache from tempfile import NamedTemporaryFile, mkdtemp diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 69e9ea2d..b05ade12 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -7,7 +7,7 @@ from itertools import chain 
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import WbException, NotFoundException -from pywb.utils.loaders import LimitReader +from pywb.utils.limitreader import LimitReader from pywb.utils.timeutils import timestamp_now from pywb.framework.wbrequestresponse import WbResponse From 4a94699a65f4cdfce7e1a84159dfea4eeda0f364 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 1 Mar 2017 18:02:35 -0800 Subject: [PATCH 6/7] warc refactor: ArchiveLoadFailed no longer derived from WbException catch separately, set status to 503 Archive Not Available explicitly --- pywb/warc/recordloader.py | 8 +++----- pywb/webagg/app.py | 6 +++++- pywb/webagg/handlers.py | 3 ++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 90af9496..64c4181f 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -7,7 +7,7 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.limitreader import LimitReader from pywb.utils.loaders import to_native_str -from pywb.utils.wbexception import WbException +#from pywb.utils.wbexception import WbException from pywb.utils.timeutils import timestamp_to_iso_date from six.moves import zip @@ -27,7 +27,7 @@ class ArcWarcRecord(object): #================================================================= -class ArchiveLoadFailed(WbException): +class ArchiveLoadFailed(Exception): def __init__(self, reason, filename=''): if filename: msg = filename + ': ' + str(reason) @@ -35,9 +35,7 @@ class ArchiveLoadFailed(WbException): msg = str(reason) super(ArchiveLoadFailed, self).__init__(msg) - - def status(self): - return '503 Service Unavailable' + self.msg = msg #================================================================= diff --git a/pywb/webagg/app.py b/pywb/webagg/app.py index 90aadc5f..c2368c57 100644 --- a/pywb/webagg/app.py +++ b/pywb/webagg/app.py @@ -107,7 +107,11 @@ class ResAggApp(object): if self.debug: traceback.print_exc() - status = last_exc.status() + if not hasattr(last_exc, 'status'): + status = '503 Archive Not Available' + else: + status = last_exc.status() + message = last_exc.msg res = {'message': message} diff --git a/pywb/webagg/handlers.py b/pywb/webagg/handlers.py index 707a9c89..50a4362f 100644 --- a/pywb/webagg/handlers.py +++ b/pywb/webagg/handlers.py @@ -2,6 +2,7 @@ from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoad from pywb.webagg.utils import MementoUtils from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import NotFoundException +from pywb.warc.recordloader import ArchiveLoadFailed from pywb.cdx.query import CDXQuery from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules @@ -151,7 +152,7 @@ class ResourceHandler(IndexHandler): out_headers, resp = loader(cdx, params) if resp is not None: return out_headers, resp, errs - except WbException as e: + except (WbException, ArchiveLoadFailed) as e: last_exc = e errs[str(loader)] = str(e) From 0784e4e5aa3af5f442a81a2cc9bf3e970d580685 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 1 Mar 2017 18:37:38 -0800 Subject: [PATCH 7/7] spin-off warcio! 
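The error-handling contract changed by PATCH 6 above, in miniature: ArchiveLoadFailed is now a plain Exception, so callers probe for a status() method instead of assuming one exists:

    from pywb.warc.recordloader import ArchiveLoadFailed

    try:
        raise ArchiveLoadFailed('no-access', 'example.warc.gz')
    except Exception as last_exc:
        if not hasattr(last_exc, 'status'):
            status = '503 Archive Not Available'
        else:
            status = last_exc.status()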
update imports to point to warcio warcio rename fixes: - ArcWarcRecord.stream -> raw_stream - ArcWarcRecord.status_headers -> http_headers - ArchiveLoadFailed single param init --- pywb/cdx/cdxops.py | 5 +- pywb/cdx/test/test_redis_source.py | 2 +- pywb/cdx/zipnum.py | 2 +- pywb/framework/memento.py | 4 +- pywb/framework/proxy.py | 8 +- pywb/framework/proxy_resolvers.py | 4 +- pywb/framework/test/test_wbrequestresponse.py | 2 +- pywb/framework/wbrequestresponse.py | 2 +- pywb/framework/wsgi_wrappers.py | 7 +- pywb/manager/manager.py | 2 +- pywb/recorder/filters.py | 2 +- pywb/recorder/multifilewarcwriter.py | 49 ++- pywb/recorder/recorderapp.py | 13 +- pywb/recorder/redisindexer.py | 3 +- pywb/recorder/test/test_recorder.py | 44 +-- pywb/rewrite/header_rewriter.py | 4 +- pywb/rewrite/rewrite_content.py | 9 +- pywb/rewrite/rewrite_live.py | 5 +- pywb/rewrite/test/test_header_rewriter.py | 4 +- pywb/static/flowplayer/flowplayer-3.2.18.swf | Bin 129772 -> 129773 bytes .../flowplayer/flowplayer.audio-3.2.11.swf | Bin 4263 -> 4264 bytes .../flowplayer/flowplayer.controls-3.2.16.swf | Bin 38274 -> 38275 bytes .../flowplayer.pseudostreaming-3.2.13.swf | Bin 4850 -> 4851 bytes pywb/urlrewrite/cookies.py | 2 +- pywb/urlrewrite/platformhandler.py | 6 +- pywb/urlrewrite/rewriterapp.py | 15 +- pywb/urlrewrite/templateview.py | 5 +- pywb/utils/bufferedreaders.py | 336 ------------------ pywb/utils/limitreader.py | 69 ---- pywb/utils/loaders.py | 2 +- pywb/utils/statusandheaders.py | 285 --------------- pywb/utils/test/test_bufferedreaders.py | 174 --------- pywb/utils/test/test_loaders.py | 48 +-- pywb/utils/test/test_statusandheaders.py | 182 ---------- pywb/utils/timeutils.py | 330 ----------------- pywb/warc/archiveindexer.py | 22 +- pywb/warc/archiveiterator.py | 233 ------------ pywb/warc/blockrecordloader.py | 4 +- pywb/warc/recordloader.py | 298 ---------------- pywb/warc/resolvingloader.py | 17 +- pywb/warc/test/test_loading.py | 10 +- pywb/warc/test/test_writer.py | 121 ------- pywb/warc/warcwriter.py | 304 ---------------- pywb/webagg/aggregator.py | 3 +- pywb/webagg/handlers.py | 2 +- pywb/webagg/indexsource.py | 4 +- pywb/webagg/inputrequest.py | 5 +- pywb/webagg/proxyindexsource.py | 2 +- pywb/webagg/responseloader.py | 21 +- pywb/webagg/test/test_handlers.py | 12 +- pywb/webagg/test/test_indexsource.py | 2 +- pywb/webagg/test/test_upstream.py | 8 +- pywb/webagg/utils.py | 7 +- pywb/webagg/zipnum.py | 2 +- pywb/webapp/handlers.py | 5 +- pywb/webapp/rangecache.py | 5 +- pywb/webapp/replay_views.py | 9 +- pywb/webapp/views.py | 2 +- setup.py | 1 + 59 files changed, 186 insertions(+), 2538 deletions(-) delete mode 100644 pywb/utils/bufferedreaders.py delete mode 100644 pywb/utils/limitreader.py delete mode 100644 pywb/utils/statusandheaders.py delete mode 100644 pywb/utils/test/test_bufferedreaders.py delete mode 100644 pywb/utils/test/test_statusandheaders.py delete mode 100644 pywb/utils/timeutils.py delete mode 100644 pywb/warc/archiveiterator.py delete mode 100644 pywb/warc/recordloader.py delete mode 100644 pywb/warc/test/test_writer.py delete mode 100644 pywb/warc/warcwriter.py diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 18c420c5..42a1ac30 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -3,8 +3,9 @@ from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME from pywb.cdx.query import CDXQuery -from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp -from pywb.utils.timeutils import 
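For downstream code, the renames listed above translate roughly as follows (illustrative sketch; the file name is a placeholder):

    from warcio.archiveiterator import ArchiveIterator

    with open('example.warc.gz', 'rb') as fh:
        for record in ArchiveIterator(fh):
            http_headers = record.http_headers     # was record.status_headers
            payload = record.raw_stream.read()     # was record.stream.read()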
PAD_14_DOWN, PAD_14_UP + +from warcio.timeutils import timestamp_to_sec, pad_timestamp +from warcio.timeutils import PAD_14_DOWN, PAD_14_UP import bisect diff --git a/pywb/cdx/test/test_redis_source.py b/pywb/cdx/test/test_redis_source.py index f46a9478..1fa65209 100644 --- a/pywb/cdx/test/test_redis_source.py +++ b/pywb/cdx/test/test_redis_source.py @@ -14,7 +14,7 @@ com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ from fakeredis import FakeStrictRedis from mock import patch -from pywb.utils.timeutils import timestamp_to_sec +from warcio.timeutils import timestamp_to_sec from pywb.cdx.cdxsource import RedisCDXSource from pywb.cdx.cdxserver import CDXServer diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 9a51ae7f..a368083d 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -13,7 +13,7 @@ from pywb.cdx.cdxsource import CDXSource from pywb.cdx.cdxobject import IDXObject, CDXException from pywb.utils.loaders import BlockLoader, read_last_line -from pywb.utils.bufferedreaders import gzip_decompressor +from warcio.bufferedreaders import gzip_decompressor from pywb.utils.binsearch import iter_range, linearsearch, search diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py index 65f41306..9f6fbe87 100644 --- a/pywb/framework/memento.py +++ b/pywb/framework/memento.py @@ -1,6 +1,6 @@ from pywb.utils.wbexception import BadRequestException -from pywb.utils.timeutils import http_date_to_timestamp -from pywb.utils.timeutils import timestamp_to_http_date +from warcio.timeutils import http_date_to_timestamp +from warcio.timeutils import timestamp_to_http_date from pywb.framework.wbrequestresponse import WbRequest, WbResponse from pywb.rewrite.wburl import WbUrl diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 6b1cc739..9f5a1f26 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -4,6 +4,7 @@ from pywb.framework.wbrequestresponse import WbResponse, WbRequest from pywb.framework.archivalrouter import ArchivalRouter from six.moves.urllib.parse import urlsplit +from six import iteritems import base64 import socket @@ -15,8 +16,8 @@ from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter from pywb.rewrite.rewrite_content import RewriteContent from pywb.utils.wbexception import BadRequestException -from pywb.utils.bufferedreaders import BufferedReader -from pywb.utils.loaders import to_native_str +from warcio.bufferedreaders import BufferedReader +from warcio.utils import to_native_str from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver @@ -250,7 +251,8 @@ class ProxyRouter(object): # add extra headers for replay responses if wbrequest.wb_url and wbrequest.wb_url.is_replay(): - response.status_headers.replace_headers(self.extra_headers) + for name, value in iteritems(self.extra_headers): + response.status_headers.replace_header(name, value) # check for content-length res = response.status_headers.get_header('content-length') diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index 880cbc1c..e4b34cb5 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -1,7 +1,6 @@ from pywb.framework.wbrequestresponse import WbResponse from pywb.utils.loaders import extract_client_cookie from pywb.utils.wbexception import WbException -from pywb.utils.statusandheaders import StatusAndHeaders from pywb.rewrite.wburl import WbUrl from pywb.framework.cache import create_cache @@ -10,7 +9,8 @@ from 
pywb.framework.basehandlers import WbUrlHandler from six.moves.urllib.parse import parse_qs, urlsplit import six -from pywb.utils.loaders import to_native_str +from warcio.statusandheaders import StatusAndHeaders +from warcio.utils import to_native_str import base64 import os diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index 2c550255..587dcc9e 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -61,7 +61,7 @@ from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.utils.statusandheaders import StatusAndHeaders +from warcio.statusandheaders import StatusAndHeaders from pywb.framework.wbrequestresponse import WbRequest, WbResponse diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 8d96b815..1cbf171d 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -1,4 +1,4 @@ -from pywb.utils.statusandheaders import StatusAndHeaders +from warcio.statusandheaders import StatusAndHeaders from pywb.utils.loaders import extract_post_query, append_post_query from io import BytesIO diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index b0a2bf46..4e077977 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -1,7 +1,10 @@ from pywb.utils.wbexception import WbException, NotFoundException -from pywb.utils.loaders import load_yaml_config, to_native_str +from pywb.utils.loaders import load_yaml_config +from pywb.utils.loaders import load_yaml_config +from warcio.utils import to_native_str -from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders +from pywb.framework.wbrequestresponse import WbResponse +from warcio.statusandheaders import StatusAndHeaders import os diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 54722d0c..ebc18f5f 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -13,7 +13,7 @@ from pkg_resources import resource_string from argparse import ArgumentParser, RawTextHelpFormatter from pywb.utils.loaders import load_yaml_config -from pywb.utils.timeutils import timestamp20_now +from warcio.timeutils import timestamp20_now from pywb import DEFAULT_CONFIG diff --git a/pywb/recorder/filters.py b/pywb/recorder/filters.py index c9ab74ee..1d9c68e1 100644 --- a/pywb/recorder/filters.py +++ b/pywb/recorder/filters.py @@ -1,4 +1,4 @@ -from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date +from warcio.timeutils import timestamp_to_datetime, datetime_to_iso_date import re diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index 673ac042..475dce01 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -7,12 +7,11 @@ import traceback import portalocker -from pywb.utils.timeutils import timestamp20_now +from warcio.timeutils import timestamp20_now +from warcio.warcwriter import BaseWARCWriter from pywb.webagg.utils import res_template -from pywb.warc.warcwriter import BaseWARCWriter - # ============================================================================ class MultiFileWARCWriter(BaseWARCWriter): @@ -22,6 +21,8 @@ class MultiFileWARCWriter(BaseWARCWriter): max_idle_secs=1800, *args, **kwargs): super(MultiFileWARCWriter, self).__init__(*args, **kwargs) + self.header_filter = kwargs.get('header_filter') + if not filename_template: dir_template, 
filename_template = os.path.split(dir_template) dir_template += os.path.sep @@ -41,27 +42,8 @@ class MultiFileWARCWriter(BaseWARCWriter): self.fh_cache = {} - def write_req_resp(self, req, resp, params): - url = resp.rec_headers.get_header('WARC-Target-URI') - dt = resp.rec_headers.get_header('WARC-Date') - - #req.rec_headers['Content-Type'] = req.content_type - req.rec_headers.replace_header('WARC-Target-URI', url) - req.rec_headers.replace_header('WARC-Date', dt) - - resp_id = resp.rec_headers.get_header('WARC-Record-ID') - if resp_id: - req.rec_headers.add_header('WARC-Concurrent-To', resp_id) - - resp = self._check_revisit(resp, params) - if not resp: - print('Skipping due to dedup') - return - - self._do_write_req_resp(req, resp, params) - def _check_revisit(self, record, params): - if not self.dedup_index: + if not self.dedup_index or record.rec_type != 'response': return record try: @@ -77,14 +59,18 @@ class MultiFileWARCWriter(BaseWARCWriter): return None if isinstance(result, tuple) and result[0] == 'revisit': - record.rec_headers.replace_header('WARC-Type', 'revisit') - record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE) - - record.rec_headers.add_header('WARC-Refers-To-Target-URI', result[1]) - record.rec_headers.add_header('WARC-Refers-To-Date', result[2]) + record = self.create_revisit_record(url, digest, result[1], result[2], + http_headers=record.http_headers) return record + def _set_header_buff(self, record): + exclude_list = None + if self.header_filter: + exclude_list = self.header_filter(record) + buff = record.http_headers.to_bytes(exclude_list) + record.http_headers.headers_buff = buff + def get_new_filename(self, dir_, params): timestamp = timestamp20_now() @@ -153,6 +139,11 @@ class MultiFileWARCWriter(BaseWARCWriter): self._do_write_req_resp(None, record, params) def _do_write_req_resp(self, req, resp, params): + resp = self._check_revisit(resp, params) + if not resp: + print('Skipping due to dedup') + return + def write_callback(out, filename): #url = resp.rec_headers.get_header('WARC-Target-URI') #print('Writing req/resp {0} to {1} '.format(url, filename)) @@ -180,6 +171,8 @@ class MultiFileWARCWriter(BaseWARCWriter): close_file = False + new_size = start = 0 + if result: out, filename = result is_new = False diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 03c101cf..1d3a6992 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -69,16 +69,21 @@ class RecorderApp(object): req_head, req_pay, resp_head, resp_pay, params = result #resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay) - resp = self.writer.copy_warc_record(resp_pay) + resp_length = resp_pay.tell() + resp_pay.seek(0) + resp = self.writer.create_record_from_stream(resp_pay, resp_length) if resp.rec_type == 'response': uri = resp.rec_headers.get_header('WARC-Target-Uri') + req_length = req_pay.tell() + req_pay.seek(0) req = self.writer.create_warc_record(uri=uri, record_type='request', payload=req_pay, + length=req_length, warc_headers_dict=req_head) - self.writer.write_req_resp(req, resp, params) + self.writer.write_request_response_pair(req, resp, params) else: self.writer.write_record(resp, params) @@ -133,9 +138,13 @@ class RecorderApp(object): content_type = headers.get('Content-Type') + payload_length = req_stream.out.tell() + req_stream.out.seek(0) + record = self.writer.create_warc_record(uri=params['url'], record_type=record_type, payload=req_stream.out, + length=payload_length, warc_content_type=content_type, 
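The recorder changes above all follow one pattern: measure the buffered payload, rewind it, and pass an explicit length when creating the record. A sketch (writer stands in for the recorder's WARC writer instance; the body bytes are placeholders):

    from io import BytesIO

    payload = BytesIO()
    payload.write(b'...request body...')

    length = payload.tell()   # measure what was buffered
    payload.seek(0)           # rewind before handing off

    record = writer.create_warc_record(uri='http://example.com/',
                                       record_type='request',
                                       payload=payload,
                                       length=length)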
warc_headers_dict=req_stream.headers) diff --git a/pywb/recorder/redisindexer.py b/pywb/recorder/redisindexer.py index a990330e..142dad1e 100644 --- a/pywb/recorder/redisindexer.py +++ b/pywb/recorder/redisindexer.py @@ -1,7 +1,8 @@ from pywb.utils.canonicalize import calc_search_range from pywb.cdx.cdxobject import CDXObject from pywb.warc.cdxindexer import write_cdx_index -from pywb.utils.timeutils import iso_date_to_timestamp + +from warcio.timeutils import iso_date_to_timestamp from io import BytesIO import os diff --git a/pywb/recorder/test/test_recorder.py b/pywb/recorder/test/test_recorder.py index 8a32ec59..4efac766 100644 --- a/pywb/recorder/test/test_recorder.py +++ b/pywb/recorder/test/test_recorder.py @@ -20,11 +20,13 @@ from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitD from pywb.webagg.utils import MementoUtils from pywb.cdx.cdxobject import CDXObject -from pywb.utils.statusandheaders import StatusAndHeadersParser -from pywb.utils.bufferedreaders import DecompressingBufferedReader -from pywb.warc.recordloader import ArcWarcRecordLoader + +from warcio.statusandheaders import StatusAndHeadersParser +from warcio.bufferedreaders import DecompressingBufferedReader +from warcio.recordloader import ArcWarcRecordLoader +from warcio.archiveiterator import ArchiveIterator + from pywb.warc.cdxindexer import write_cdx_index -from pywb.warc.archiveiterator import ArchiveIterator from six.moves.urllib.parse import quote, unquote, urlencode from io import BytesIO @@ -122,9 +124,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) filename = os.path.join(base_dir, filename) with open(filename, 'rb') as fh: for record in ArchiveIterator(fh, no_record_parse=True): - assert record.status_headers == None + assert record.http_headers == None assert int(record.rec_headers.get_header('Content-Length')) == record.length - assert record.length == len(record.stream.read()) + assert record.length == len(record.raw_stream.read()) def test_record_warc_1(self): recorder_app = RecorderApp(self.upstream_url, @@ -168,16 +170,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) buff = BytesIO(resp.body) record = ArcWarcRecordLoader().parse_record_stream(buff) - assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers - assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers + assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers + assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers stored_req, stored_resp = self._load_resp_req(base_path) - assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers - assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers + assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.http_headers.headers + assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.http_headers.headers - assert ('X-Other', 'foo') in stored_req.status_headers.headers - assert ('Cookie', 'boo=far') in stored_req.status_headers.headers + assert ('X-Other', 'foo') in stored_req.http_headers.headers + assert ('Cookie', 'boo=far') in stored_req.http_headers.headers self._test_all_warcs('/warcs/cookiecheck/', 1) @@ -193,16 +195,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) buff = BytesIO(resp.body) record = ArcWarcRecordLoader().parse_record_stream(buff) - assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers - assert 
('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers + assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers + assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers stored_req, stored_resp = self._load_resp_req(warc_path) - assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers - assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers + assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.http_headers.headers + assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.http_headers.headers - assert ('X-Other', 'foo') in stored_req.status_headers.headers - assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers + assert ('X-Other', 'foo') in stored_req.http_headers.headers + assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers self._test_all_warcs('/warcs/cookieskip/', 1) @@ -530,10 +532,10 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert status_headers.get_header('Content-Length') == str(len(buff)) assert status_headers.get_header('WARC-Custom') == 'foo' - assert record.stream.read() == buff + assert record.raw_stream.read() == buff - status_headers = record.status_headers - assert len(record.status_headers.headers) == 2 + status_headers = record.http_headers + assert len(record.http_headers.headers) == 2 assert status_headers.get_header('Content-Type') == 'text/plain' assert status_headers.get_header('Content-Length') == str(len(buff)) diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index dbdee1ea..f88f73ad 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -1,5 +1,5 @@ -from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.utils.timeutils import datetime_to_http_date +from warcio.statusandheaders import StatusAndHeaders +from warcio.timeutils import datetime_to_http_date from datetime import datetime, timedelta import six diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 2805a37e..483b51fa 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -12,10 +12,11 @@ from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders from pywb.rewrite.rewriterules import RewriteRules from pywb.utils.dsrules import RuleSet -from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.utils.bufferedreaders import DecompressingBufferedReader -from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader -from pywb.utils.loaders import to_native_str + +from warcio.statusandheaders import StatusAndHeaders +from warcio.bufferedreaders import DecompressingBufferedReader +from warcio.bufferedreaders import ChunkedDataReader, BufferedReader +from warcio.utils import to_native_str from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index c3366376..38ea8b87 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -11,10 +11,11 @@ import os from six.moves.urllib.parse import urlsplit import six +from warcio.timeutils import timestamp_now +from warcio.statusandheaders import StatusAndHeaders + from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url from pywb.utils.loaders import extract_client_cookie -from pywb.utils.timeutils import timestamp_now -from pywb.utils.statusandheaders import StatusAndHeaders from 
pywb.utils.canonicalize import canonicalize from pywb.rewrite.rewrite_content import RewriteContent diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index e58c6d84..2d918f44 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -50,9 +50,9 @@ HTTP Headers Rewriting from pywb.rewrite.header_rewriter import HeaderRewriter from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.utils.statusandheaders import StatusAndHeaders +from warcio.statusandheaders import StatusAndHeaders -from pywb.utils.timeutils import datetime_to_http_date +from warcio.timeutils import datetime_to_http_date from datetime import datetime import pprint diff --git a/pywb/static/flowplayer/flowplayer-3.2.18.swf b/pywb/static/flowplayer/flowplayer-3.2.18.swf index aed1fcb126adc2b9e965246d51f708a9eb5e96cc..ae28ee238f86868ba940d8456767745677f19d09 100644 GIT binary patch delta 13 VcmaF!mHq8k_J%EtZ+ 0: - return - - if self.starting_data: - data = self.starting_data - self.starting_data = None - else: - if not block_size: - block_size = self.block_size - data = self.stream.read(block_size) - - self._process_read(data) - - def _process_read(self, data): - data = self._decompress(data) - self.buff_size = len(data) - self.num_read += self.buff_size - self.num_block_read += self.buff_size - self.buff = BytesIO(data) - - def _decompress(self, data): - if self.decompressor and data: - try: - data = self.decompressor.decompress(data) - except Exception as e: - # if first read attempt, assume non-gzipped stream - if self.num_block_read == 0: - if self.decomp_type == 'deflate': - self._init_decomp('deflate_alt') - data = self._decompress(data) - else: - self.decompressor = None - # otherwise (partly decompressed), something is wrong - else: - print(str(e)) - return b'' - return data - - def read(self, length=None): - """ - Fill bytes and read some number of bytes - (up to length if specified) - <= length bytes may be read if reached the end of input - if at buffer boundary, will attempt to read again until - specified length is read - """ - all_buffs = [] - while length is None or length > 0: - self._fillbuff() - buff = self.buff.read(length) - if not buff: - break - - all_buffs.append(buff) - if length: - length -= len(buff) - - return b''.join(all_buffs) - - - - def readline(self, length=None): - """ - Fill buffer and read a full line from the buffer - (up to specified length, if provided) - If no newline found at end, try filling buffer again in case - at buffer boundary. 
- """ - if length == 0: - return b'' - - self._fillbuff() - linebuff = self.buff.readline(length) - - # we may be at a boundary - while not linebuff.endswith(b'\n'): - if length: - length -= len(linebuff) - if length <= 0: - break - - self._fillbuff() - - if self.empty(): - break - - linebuff += self.buff.readline(length) - - return linebuff - - def empty(self): - return not self.buff or self.buff.tell() >= self.buff_size - - def read_next_member(self): - if not self.decompressor or not self.decompressor.unused_data: - return False - - self.starting_data = self.decompressor.unused_data - self._init_decomp(self.decomp_type) - return True - - def rem_length(self): - rem = 0 - if self.buff: - rem = self.buff_size - self.buff.tell() - - if self.decompressor and self.decompressor.unused_data: - rem += len(self.decompressor.unused_data) - return rem - - def close(self): - if self.stream: - self.stream.close() - self.stream = None - - @classmethod - def get_supported_decompressors(cls): - return cls.DECOMPRESSORS.keys() - - -#================================================================= -class DecompressingBufferedReader(BufferedReader): - """ - A BufferedReader which defaults to gzip decompression, - (unless different type specified) - """ - def __init__(self, *args, **kwargs): - if 'decomp_type' not in kwargs: - kwargs['decomp_type'] = 'gzip' - super(DecompressingBufferedReader, self).__init__(*args, **kwargs) - - -#================================================================= -class ChunkedDataException(Exception): - def __init__(self, msg, data=b''): - Exception.__init__(self, msg) - self.data = data - - -#================================================================= -class ChunkedDataReader(BufferedReader): - r""" - A ChunkedDataReader is a DecompressingBufferedReader - which also supports de-chunking of the data if it happens - to be http 'chunk-encoded'. - - If at any point the chunked header is not available, the stream is - assumed to not be chunked and no more dechunking occurs. - """ - def __init__(self, stream, raise_exceptions=False, **kwargs): - super(ChunkedDataReader, self).__init__(stream, **kwargs) - self.all_chunks_read = False - self.not_chunked = False - - # if False, we'll use best-guess fallback for parse errors - self.raise_chunked_data_exceptions = raise_exceptions - - def _fillbuff(self, block_size=None): - if self.not_chunked: - return super(ChunkedDataReader, self)._fillbuff(block_size) - - # Loop over chunks until there is some data (not empty()) - # In particular, gzipped data may require multiple chunks to - # return any decompressed result - while (self.empty() and - not self.all_chunks_read and - not self.not_chunked): - - try: - length_header = self.stream.readline(64) - self._try_decode(length_header) - except ChunkedDataException as e: - if self.raise_chunked_data_exceptions: - raise - - # Can't parse the data as chunked. - # It's possible that non-chunked data is served - # with a Transfer-Encoding: chunked. - # Treat this as non-chunk encoded from here on. 
- self._process_read(length_header + e.data) - self.not_chunked = True - - # parse as block as non-chunked - return super(ChunkedDataReader, self)._fillbuff(block_size) - - def _try_decode(self, length_header): - # decode length header - try: - chunk_size = int(length_header.strip().split(b';')[0], 16) - except ValueError: - raise ChunkedDataException(b"Couldn't decode length header " + - length_header) - - if not chunk_size: - # chunk_size 0 indicates end of file - self.all_chunks_read = True - self._process_read(b'') - return - - data_len = 0 - data = b'' - - # read chunk - while data_len < chunk_size: - new_data = self.stream.read(chunk_size - data_len) - - # if we unexpectedly run out of data, - # either raise an exception or just stop reading, - # assuming file was cut off - if not new_data: - if self.raise_chunked_data_exceptions: - msg = 'Ran out of data before end of chunk' - raise ChunkedDataException(msg, data) - else: - chunk_size = data_len - self.all_chunks_read = True - - data += new_data - data_len = len(data) - - # if we successfully read a block without running out, - # it should end in \r\n - if not self.all_chunks_read: - clrf = self.stream.read(2) - if clrf != b'\r\n': - raise ChunkedDataException(b"Chunk terminator not found.", - data) - - # hand to base class for further processing - self._process_read(data) - - def read(self, length=None): - """ read bytes from stream, if length specified, - may read across multiple chunks to get exact length - """ - buf = super(ChunkedDataReader, self).read(length) - if not length: - return buf - - # if length specified, attempt to read exact length - rem = length - len(buf) - while rem > 0: - new_buf = super(ChunkedDataReader, self).read(rem) - if not new_buf: - break - - buf += new_buf - rem -= len(new_buf) - - return buf diff --git a/pywb/utils/limitreader.py b/pywb/utils/limitreader.py deleted file mode 100644 index 49d5d713..00000000 --- a/pywb/utils/limitreader.py +++ /dev/null @@ -1,69 +0,0 @@ -# ============================================================================ -class LimitReader(object): - """ - A reader which will not read more than specified limit - """ - - def __init__(self, stream, limit): - self.stream = stream - self.limit = limit - - if hasattr(stream, 'tell'): - self.tell = self._tell - - def _update(self, buff): - length = len(buff) - self.limit -= length - return buff - - def read(self, length=None): - if length is not None: - length = min(length, self.limit) - else: - length = self.limit - - if length == 0: - return b'' - - buff = self.stream.read(length) - return self._update(buff) - - def readline(self, length=None): - if length is not None: - length = min(length, self.limit) - else: - length = self.limit - - if length == 0: - return b'' - - buff = self.stream.readline(length) - return self._update(buff) - - def close(self): - self.stream.close() - - def _tell(self): - return self.stream.tell() - - @staticmethod - def wrap_stream(stream, content_length): - """ - If given content_length is an int > 0, wrap the stream - in a LimitReader. 
Otherwise, return the stream unaltered - """ - try: - content_length = int(content_length) - if content_length >= 0: - # optimize: if already a LimitStream, set limit to - # the smaller of the two limits - if isinstance(stream, LimitReader): - stream.limit = min(stream.limit, content_length) - else: - stream = LimitReader(stream, content_length) - - except (ValueError, TypeError): - pass - - return stream - diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 53be7896..84d28785 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -17,7 +17,7 @@ import base64 import cgi from io import open, BytesIO -from pywb.utils.limitreader import LimitReader +from warcio.limitreader import LimitReader try: from boto import connect_s3 diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py deleted file mode 100644 index a5edea20..00000000 --- a/pywb/utils/statusandheaders.py +++ /dev/null @@ -1,285 +0,0 @@ -""" -Representation and parsing of HTTP-style status + headers -""" - -from pprint import pformat -from copy import copy -from six.moves import range -from six import iteritems -from pywb.utils.loaders import to_native_str -import uuid - - -WRAP_WIDTH = 80 - -#================================================================= -class StatusAndHeaders(object): - """ - Representation of parsed http-style status line and headers - Status Line if first line of request/response - Headers is a list of (name, value) tuples - An optional protocol which appears on first line may be specified - """ - def __init__(self, statusline, headers, protocol='', total_len=0): - self.statusline = statusline - self.headers = headers - self.protocol = protocol - self.total_len = total_len - - def get_header(self, name, default_value=None): - """ - return header (name, value) - if found - """ - name_lower = name.lower() - for value in self.headers: - if value[0].lower() == name_lower: - return value[1] - - return default_value - - def add_header(self, name, value): - self.headers.append((name, value)) - - def replace_header(self, name, value): - """ - replace header with new value or add new header - return old header value, if any - """ - name_lower = name.lower() - for index in range(len(self.headers) - 1, -1, -1): - curr_name, curr_value = self.headers[index] - if curr_name.lower() == name_lower: - self.headers[index] = (curr_name, value) - return curr_value - - self.headers.append((name, value)) - return None - - def replace_headers(self, header_dict): - """ - replace all headers in header_dict that already exist - add any remaining headers - """ - header_dict = copy(header_dict) - - for index in range(len(self.headers) - 1, -1, -1): - curr_name, curr_value = self.headers[index] - name_lower = curr_name.lower() - if name_lower in header_dict: - self.headers[index] = (curr_name, header_dict[name_lower]) - del header_dict[name_lower] - - for name, value in iteritems(header_dict): - self.headers.append((name, value)) - - def remove_header(self, name): - """ - Remove header (case-insensitive) - return True if header removed, False otherwise - """ - name_lower = name.lower() - for index in range(len(self.headers) - 1, -1, -1): - if self.headers[index][0].lower() == name_lower: - del self.headers[index] - return True - - return False - - def get_statuscode(self): - """ - Return the statuscode part of the status response line - (Assumes no protocol in the statusline) - """ - code = self.statusline.split(' ', 1)[0] - return code - - def validate_statusline(self, valid_statusline): - 
""" - Check that the statusline is valid, eg. starts with a numeric - code. If not, replace with passed in valid_statusline - """ - code = self.get_statuscode() - try: - code = int(code) - assert(code > 0) - return True - except(ValueError, AssertionError): - self.statusline = valid_statusline - return False - - def add_range(self, start, part_len, total_len): - """ - Add range headers indicating that this a partial response - """ - content_range = 'bytes {0}-{1}/{2}'.format(start, - start + part_len - 1, - total_len) - - self.statusline = '206 Partial Content' - self.replace_header('Content-Range', content_range) - self.replace_header('Accept-Ranges', 'bytes') - return self - - def __repr__(self): - headers_str = pformat(self.headers, indent=2, width=WRAP_WIDTH) - return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \ -headers = {2})".format(self.protocol, self.statusline, headers_str) - - def __eq__(self, other): - return (self.statusline == other.statusline and - self.headers == other.headers and - self.protocol == other.protocol) - - def __str__(self, exclude_list=None): - return self.to_str(exclude_list) - - def __bool__(self): - return bool(self.statusline or self.headers) - - __nonzero__ = __bool__ - - def to_str(self, exclude_list): - string = self.protocol - - if string and self.statusline: - string += ' ' - - if self.statusline: - string += self.statusline - - if string: - string += '\r\n' - - for h in self.headers: - if exclude_list and h[0].lower() in exclude_list: - continue - - string += ': '.join(h) + '\r\n' - - return string - - def to_bytes(self, exclude_list=None): - return self.to_str(exclude_list).encode('iso-8859-1') + b'\r\n' - - -#================================================================= -def _strip_count(string, total_read): - length = len(string) - return string.rstrip(), total_read + length - - -#================================================================= -class StatusAndHeadersParser(object): - """ - Parser which consumes a stream support readline() to read - status and headers and return a StatusAndHeaders object - """ - def __init__(self, statuslist, verify=True): - self.statuslist = statuslist - self.verify = verify - - def parse(self, stream, full_statusline=None): - """ - parse stream for status line and headers - return a StatusAndHeaders object - - support continuation headers starting with space or tab - """ - - def readline(): - return to_native_str(stream.readline()) - - # status line w newlines intact - if full_statusline is None: - full_statusline = readline() - else: - full_statusline = to_native_str(full_statusline) - - statusline, total_read = _strip_count(full_statusline, 0) - - headers = [] - - # at end of stream - if total_read == 0: - raise EOFError() - elif not statusline: - return StatusAndHeaders(statusline=statusline, - headers=headers, - protocol='', - total_len=total_read) - - # validate only if verify is set - if self.verify: - protocol_status = self.split_prefix(statusline, self.statuslist) - - if not protocol_status: - msg = 'Expected Status Line starting with {0} - Found: {1}' - msg = msg.format(self.statuslist, statusline) - raise StatusAndHeadersParserException(msg, full_statusline) - else: - protocol_status = statusline.split(' ', 1) - - line, total_read = _strip_count(readline(), total_read) - while line: - result = line.split(':', 1) - if len(result) == 2: - name = result[0].rstrip(' \t') - value = result[1].lstrip() - else: - name = result[0] - value = None - - next_line, total_read = 
_strip_count(readline(), - total_read) - - # append continuation lines, if any - while next_line and next_line.startswith((' ', '\t')): - if value is not None: - value += next_line - next_line, total_read = _strip_count(readline(), - total_read) - - if value is not None: - header = (name, value) - headers.append(header) - - line = next_line - - if len(protocol_status) > 1: - statusline = protocol_status[1].strip() - else: - statusline = '' - - return StatusAndHeaders(statusline=statusline, - headers=headers, - protocol=protocol_status[0], - total_len=total_read) - - @staticmethod - def split_prefix(key, prefixs): - """ - split key string into prefix and remainder - for first matching prefix from a list - """ - key_upper = key.upper() - for prefix in prefixs: - if key_upper.startswith(prefix): - plen = len(prefix) - return (key_upper[:plen], key[plen:]) - - @staticmethod - def make_warc_id(id_=None): - if not id_: - id_ = uuid.uuid1() - return ''.format(id_) - - -#================================================================= -class StatusAndHeadersParserException(Exception): - """ - status + headers parsing exception - """ - def __init__(self, msg, statusline): - super(StatusAndHeadersParserException, self).__init__(msg) - self.statusline = statusline diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py deleted file mode 100644 index c5b91f21..00000000 --- a/pywb/utils/test/test_bufferedreaders.py +++ /dev/null @@ -1,174 +0,0 @@ -r""" -# DecompressingBufferedReader Tests -#================================================================= - -# DecompressingBufferedReader readline() ->>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()) -' CDX N b a m s k r M S V g\n' - -# detect not compressed ->>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline()) -' CDX N b a m s k r M S V g\n' - -# decompress with on the fly compression, default gzip compression ->>> print_str(DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read()) -'ABC\n1234\n' - -# decompress with on the fly compression, default 'inflate' compression ->>> print_str(DecompressingBufferedReader(BytesIO(compress_alt('ABC\n1234\n')), decomp_type='deflate').read()) -'ABC\n1234\n' - -# error: invalid compress type ->>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = 'bzip2').read() -Traceback (most recent call last): -Exception: Decompression type not supported: bzip2 - -# invalid output when reading compressed data as not compressed ->>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != b'ABC' -True - - -# DecompressingBufferedReader readline() with decompression (zipnum file, no header) ->>> print_str(DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline()) -'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n' - -# test very small block size ->>> dbr = DecompressingBufferedReader(BytesIO(b'ABCDEFG\nHIJKLMN\nOPQR\nXYZ'), block_size = 3) ->>> print_str(dbr.readline()); print_str(dbr.readline(4)); print_str(dbr.readline()); print_str(dbr.readline()); print_str(dbr.readline(2)); print_str(dbr.readline()); print_str(dbr.readline()) -'ABCDEFG\n' -'HIJK' -'LMN\n' -'OPQR\n' -'XY' -'Z' -'' - -# test zero length reads ->>> x = DecompressingBufferedReader(LimitReader(BytesIO(b'\r\n'), 1)) ->>> 
print_str(x.readline(0)); print_str(x.read(0)) -'' -'' - -# Chunk-Decoding Buffered Reader Tests -#================================================================= - -Properly formatted chunked data: ->>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n0\r\n\r\n")); ->>> print_str(c.read() + c.read() + c.read()) -'1234' - -Non-chunked data: ->>> print_str(ChunkedDataReader(BytesIO(b"xyz123!@#")).read()) -'xyz123!@#' - -Non-chunked, compressed data, specify decomp_type ->>> print_str(ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read()) -'ABCDEF' - -Non-chunked, compressed data, specifiy compression seperately ->>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); print_str(c.read()) -'ABCDEF' - -Non-chunked, compressed data, wrap in DecompressingBufferedReader ->>> print_str(DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read()) -'\nABCDEF\nGHIJ' - -Chunked compressed data -Split compressed stream into 10-byte chunk and a remainder chunk ->>> b = compress('ABCDEFGHIJKLMNOP') ->>> l = len(b) ->>> in_ = format(10, 'x').encode('utf-8') + b"\r\n" + b[:10] + b"\r\n" + format(l - 10, 'x').encode('utf-8') + b"\r\n" + b[10:] + b"\r\n0\r\n\r\n" ->>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip') ->>> print_str(c.read()) -'ABCDEFGHIJKLMNOP' - -Starts like chunked data, but isn't: ->>> c = ChunkedDataReader(BytesIO(b"1\r\nxyz123!@#")); ->>> print_str(c.read() + c.read()) -'1\r\nx123!@#' - -Chunked data cut off part way through: ->>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12")); ->>> print_str(c.read() + c.read()) -'123412' - -Zero-Length chunk: ->>> print_str(ChunkedDataReader(BytesIO(b"0\r\n\r\n")).read()) -'' - -""" - -from io import BytesIO -from pywb.utils.bufferedreaders import ChunkedDataReader, ChunkedDataException -from pywb.utils.bufferedreaders import DecompressingBufferedReader -from pywb.utils.limitreader import LimitReader - -from pywb import get_test_dir - -import six - -import zlib -import pytest - -test_cdx_dir = get_test_dir() + 'cdx/' -test_zip_dir = get_test_dir() + 'zipcdx/' - - -def compress(buff): - buff = buff.encode('utf-8') - compressobj = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16) - compressed = compressobj.compress(buff) - compressed += compressobj.flush() - - return compressed - -# plain "inflate" -def compress_alt(buff): - buff = buff.encode('utf-8') - compressobj = zlib.compressobj(6, zlib.DEFLATED) - compressed = compressobj.compress(buff) - compressed += compressobj.flush() - # drop gzip headers/tail - compressed = compressed[2:-4] - - return compressed - -# Brotli - -def test_brotli(): - with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh: - x = DecompressingBufferedReader(fh, decomp_type='br') - x.read() == b'The quick brown fox jumps over the lazy dog' * 4096 - - -# Compression -def test_compress_mix(): - # error: compressed member, followed by not compressed -- now allowed! 
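
The member-by-member behavior this test exercises comes from zlib itself: decompression stops at the end of the first gzip member and the remainder is exposed via unused_data, which read_next_member() re-feeds. A minimal sketch (helper name is illustrative):

    import zlib

    def gzip_member(data):
        co = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
        return co.compress(data) + co.flush()

    buff = gzip_member(b'ABC') + gzip_member(b'123')

    d = zlib.decompressobj(zlib.MAX_WBITS + 16)
    assert d.decompress(buff) == b'ABC'    # stops at end of the first member
    assert d.unused_data                   # second member is left over

    d2 = zlib.decompressobj(zlib.MAX_WBITS + 16)
    assert d2.decompress(d.unused_data) == b'123'
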
- x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip') - b = x.read() - assert b == b'ABC' - x.read_next_member() - assert x.read() == b'123' - #with pytest.raises(zlib.error): - # x.read() - #error: Error -3 while decompressing: incorrect header check - -# Errors - -def test_err_chunk_cut_off(): - # Chunked data cut off with exceptions - c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"), raise_exceptions=True) - with pytest.raises(ChunkedDataException): - c.read() + c.read() - #ChunkedDataException: Ran out of data before end of chunk - - - -def print_str(string): - return string.decode('utf-8') if six.PY3 else string - - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 703ef81e..9842c7a3 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -1,27 +1,5 @@ -#================================================================= r""" -# LimitReader Tests ->>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26) -'abcdefghji' - ->>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26) -'abcdefgh' - ->>> LimitReader.wrap_stream(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8), 4).readline(26) -'abcd' - ->>> read_multiple(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) -'efghji' - -# zero-length read ->>> print_str(LimitReader(StringIO('a'), 0).readline(0)) -'' - -# don't wrap if invalid length ->>> b = StringIO('b') ->>> LimitReader.wrap_stream(b, 'abc') == b -True - +#================================================================= # BlockLoader Tests (includes LimitReader) # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes >>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400)) @@ -143,27 +121,14 @@ import requests from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url from pywb.utils.loaders import extract_client_cookie, extract_post_query from pywb.utils.loaders import append_post_query, read_last_line -from pywb.utils.limitreader import LimitReader -from pywb.utils.bufferedreaders import DecompressingBufferedReader +from warcio.bufferedreaders import DecompressingBufferedReader from pywb import get_test_dir test_cdx_dir = get_test_dir() + 'cdx/' -def read_multiple(reader, inc_reads): - result = None - for x in inc_reads: - result = reader.read(x) - return result - - -def seek_read_full(seekable_reader, offset): - seekable_reader.seek(offset) - seekable_reader.readline() #skip - return seekable_reader.readline() - def test_s3_read_1(): pytest.importorskip('boto') @@ -178,15 +143,6 @@ def test_s3_read_1(): assert reader.readline() == b'WARC/1.0\r\n' assert reader.readline() == b'WARC-Type: response\r\n' -def test_limit_post(): - reader = LimitReader(BytesIO(b'abcdefg'), 3) - r = requests.request(method='POST', - url='http://httpbin.org/post', - data=reader, - headers={'Content-Length': '3'}) - - assert '"abc"' in r.text - # Error def test_err_no_such_file(): # no such file diff --git a/pywb/utils/test/test_statusandheaders.py b/pywb/utils/test/test_statusandheaders.py deleted file mode 100644 index 9d198647..00000000 --- a/pywb/utils/test/test_statusandheaders.py +++ /dev/null @@ -1,182 +0,0 @@ -""" ->>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1)) ->>> st1 -StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), - ('Some', 'Value'), - 
('Multi-Line', 'Value1 Also This')]) - -# add range ->>> StatusAndHeaders(statusline = '200 OK', headers=[('Content-Type', 'text/plain')]).add_range(10, 4, 100) -StatusAndHeaders(protocol = '', statusline = '206 Partial Content', headers = [ ('Content-Type', 'text/plain'), - ('Content-Range', 'bytes 10-13/100'), - ('Accept-Ranges', 'bytes')]) - -# other protocol expected ->>> StatusAndHeadersParser(['Other']).parse(StringIO(status_headers_1)) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK - ->>> StatusAndHeadersParser(['Other'], verify=False).parse(StringIO(status_headers_1)) -StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), - ('Some', 'Value'), - ('Multi-Line', 'Value1 Also This')]) - - -# verify protocol line ->>> StatusAndHeadersParser(['HTTP/1.0'], verify=True).parse(StringIO(unknown_protocol_headers)) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0'] - Found: OtherBlah - - -# allow unexpected/invalid protocol line ->>> StatusAndHeadersParser(['HTTP/1.0'], verify=False).parse(StringIO(unknown_protocol_headers)) -StatusAndHeaders(protocol = 'OtherBlah', statusline = '', headers = [('Foo', 'Bar')]) - - - -# test equality op ->>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1)) -True - -# replace header, print new headers ->>> st1.replace_header('some', 'Another-Value'); st1 -'Value' -StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), - ('Some', 'Another-Value'), - ('Multi-Line', 'Value1 Also This')]) - - -# remove header ->>> st1.remove_header('some') -True - -# already removed ->>> st1.remove_header('Some') -False - -# empty ->>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2 -StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []) - - ->>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_3)) -StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')]) - -# case-insensitive match ->>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_4)) -StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')]) - - -""" - - -from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders -from six import StringIO -import pytest - - -status_headers_1 = "\ -HTTP/1.0 200 OK\r\n\ -Content-Type: ABC\r\n\ -HTTP/1.0 200 OK\r\n\ -Some: Value\r\n\ -Multi-Line: Value1\r\n\ - Also This\r\n\ -\r\n\ -Body" - - -status_headers_2 = """ - -""" - -status_headers_3 = "\ -HTTP/1.0 204 Empty\r\n\ -Content-Type: Value\r\n\ -%Invalid%\r\n\ -\tMultiline\r\n\ -Content-Length: 0\r\n\ -\r\n" - -status_headers_4 = "\ -http/1.0 204 empty\r\n\ -Content-Type: Value\r\n\ -%Invalid%\r\n\ -\tMultiline\r\n\ -Content-Length: 0\r\n\ -\r\n" - -unknown_protocol_headers = "\ -OtherBlah\r\n\ -Foo: Bar\r\n\ -\r\n" - - -req_headers = "\ -GET / HTTP/1.0\r\n\ -Foo: Bar\r\n\ -Content-Length: 0\r\n" - - -if __name__ == "__main__": - import doctest - doctest.testmod() - - - -def test_to_str_1(): - res = str(StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))) - - exp = "\ -HTTP/1.0 
200 OK\r\n\ -Content-Type: ABC\r\n\ -Some: Value\r\n\ -Multi-Line: Value1 Also This\r\n\ -" - assert(res == exp) - - -def test_to_str_exclude(): - sah = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1)) - res = sah.to_str(['multi-line']) - - exp = "\ -HTTP/1.0 200 OK\r\n\ -Content-Type: ABC\r\n\ -Some: Value\r\n\ -" - assert(res == exp) - - assert(sah.to_bytes(['multi-line']) == (exp.encode('latin-1') + b'\r\n')) - - -def test_to_str_2(): - res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))) - - assert(res == req_headers) - - res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers + '\r\n'))) - - assert(res == req_headers) - - -def test_to_str_with_remove(): - res = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers)) - res.remove_header('Foo') - - exp = "\ -GET / HTTP/1.0\r\n\ -Content-Length: 0\r\n" - - assert(str(res) == exp) - -def test_status_empty(): - with pytest.raises(EOFError): - StatusAndHeadersParser([], verify=False).parse(StringIO('')) - - -def test_status_one_word(): - res = StatusAndHeadersParser(['GET'], verify=False).parse(StringIO('A')) - assert(str(res) == 'A\r\n') - - diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py deleted file mode 100644 index 93cd2c2f..00000000 --- a/pywb/utils/timeutils.py +++ /dev/null @@ -1,330 +0,0 @@ -""" -utility functions for converting between -datetime, iso date and 14-digit timestamp -""" - -import re -import time -import datetime -import calendar - -from email.utils import parsedate, formatdate - -#================================================================= -# str <-> datetime conversion -#================================================================= - -DATE_TIMESPLIT = re.compile(r'[^\d]') - -TIMESTAMP_14 = '%Y%m%d%H%M%S' -ISO_DT = '%Y-%m-%dT%H:%M:%SZ' - -PAD_14_DOWN = '10000101000000' -PAD_14_UP = '29991231235959' -PAD_6_UP = '299912' - - -def iso_date_to_datetime(string): - """ - >>> iso_date_to_datetime('2013-12-26T10:11:12Z') - datetime.datetime(2013, 12, 26, 10, 11, 12) - - >>> iso_date_to_datetime('2013-12-26T10:11:12Z') - datetime.datetime(2013, 12, 26, 10, 11, 12) - """ - - nums = DATE_TIMESPLIT.split(string) - if nums[-1] == '': - nums = nums[:-1] - - the_datetime = datetime.datetime(*(int(num) for num in nums)) - return the_datetime - - -def http_date_to_datetime(string): - """ - >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT') - datetime.datetime(2013, 12, 26, 9, 50, 10) - """ - return datetime.datetime(*parsedate(string)[:6]) - - -def datetime_to_http_date(the_datetime): - """ - >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10)) - 'Thu, 26 Dec 2013 09:50:10 GMT' - - # Verify inverses - >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT' - >>> datetime_to_http_date(http_date_to_datetime(x)) == x - True - """ - timeval = calendar.timegm(the_datetime.utctimetuple()) - return formatdate(timeval=timeval, - localtime=False, - usegmt=True) - - -def datetime_to_iso_date(the_datetime): - """ - >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12)) - '2013-12-26T10:11:12Z' - - >>> datetime_to_iso_date( datetime.datetime(2013, 12, 26, 10, 11, 12)) - '2013-12-26T10:11:12Z' - """ - - return the_datetime.strftime(ISO_DT) - - -def datetime_to_timestamp(the_datetime): - """ - >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12)) - '20131226101112' - """ - - return the_datetime.strftime(TIMESTAMP_14) - - -def timestamp_now(): - """ - >>> len(timestamp_now()) - 14 - """ - return 
datetime_to_timestamp(datetime.datetime.utcnow()) - - -def timestamp20_now(): - """ - Create 20-digit timestamp, useful to timestamping temp files - - >>> n = timestamp20_now() - >>> timestamp20_now() >= n - True - - >>> len(n) - 20 - - """ - now = datetime.datetime.utcnow() - return now.strftime('%Y%m%d%H%M%S%f') - - -def iso_date_to_timestamp(string): - """ - >>> iso_date_to_timestamp('2013-12-26T10:11:12Z') - '20131226101112' - - >>> iso_date_to_timestamp('2013-12-26T10:11:12') - '20131226101112' - """ - - return datetime_to_timestamp(iso_date_to_datetime(string)) - -def timestamp_to_iso_date(string): - """ - >>> timestamp_to_iso_date('20131226101112') - '2013-12-26T10:11:12Z' - - >>> timestamp_to_iso_date('20131226101112') - '2013-12-26T10:11:12Z' - """ - - - return datetime_to_iso_date(timestamp_to_datetime(string)) - - -def http_date_to_timestamp(string): - """ - >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT') - '20131226095000' - - >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT') - '20140126200804' - """ - return datetime_to_timestamp(http_date_to_datetime(string)) - - -# pad to certain length (default 6) -def pad_timestamp(string, pad_str=PAD_6_UP): - """ - >>> pad_timestamp('20') - '209912' - - >>> pad_timestamp('2014') - '201412' - - >>> pad_timestamp('20141011') - '20141011' - - >>> pad_timestamp('201410110010') - '201410110010' - """ - - str_len = len(string) - pad_len = len(pad_str) - - if str_len < pad_len: - string = string + pad_str[str_len:] - - return string - - -def timestamp_to_datetime(string): - """ - # >14-digit -- rest ignored - >>> timestamp_to_datetime('2014122609501011') - datetime.datetime(2014, 12, 26, 9, 50, 10) - - # 14-digit - >>> timestamp_to_datetime('20141226095010') - datetime.datetime(2014, 12, 26, 9, 50, 10) - - # 13-digit padding - >>> timestamp_to_datetime('2014122609501') - datetime.datetime(2014, 12, 26, 9, 50, 59) - - # 12-digit padding - >>> timestamp_to_datetime('201412260950') - datetime.datetime(2014, 12, 26, 9, 50, 59) - - # 11-digit padding - >>> timestamp_to_datetime('20141226095') - datetime.datetime(2014, 12, 26, 9, 59, 59) - - # 10-digit padding - >>> timestamp_to_datetime('2014122609') - datetime.datetime(2014, 12, 26, 9, 59, 59) - - # 9-digit padding - >>> timestamp_to_datetime('201412260') - datetime.datetime(2014, 12, 26, 23, 59, 59) - - # 8-digit padding - >>> timestamp_to_datetime('20141226') - datetime.datetime(2014, 12, 26, 23, 59, 59) - - # 7-digit padding - >>> timestamp_to_datetime('2014122') - datetime.datetime(2014, 12, 31, 23, 59, 59) - - # 6-digit padding - >>> timestamp_to_datetime('201410') - datetime.datetime(2014, 10, 31, 23, 59, 59) - - # 5-digit padding - >>> timestamp_to_datetime('20141') - datetime.datetime(2014, 12, 31, 23, 59, 59) - - # 4-digit padding - >>> timestamp_to_datetime('2014') - datetime.datetime(2014, 12, 31, 23, 59, 59) - - # 3-digit padding - >>> timestamp_to_datetime('201') - datetime.datetime(2019, 12, 31, 23, 59, 59) - - # 2-digit padding - >>> timestamp_to_datetime('20') - datetime.datetime(2099, 12, 31, 23, 59, 59) - - # 1-digit padding - >>> timestamp_to_datetime('2') - datetime.datetime(2999, 12, 31, 23, 59, 59) - - # 1-digit out-of-range padding - >>> timestamp_to_datetime('3') - datetime.datetime(2999, 12, 31, 23, 59, 59) - - # 0-digit padding - >>> timestamp_to_datetime('') - datetime.datetime(2999, 12, 31, 23, 59, 59) - - # bad month - >>> timestamp_to_datetime('20131709005601') - datetime.datetime(2013, 12, 9, 0, 56, 1) - - # all out of range except minutes - >>> 
timestamp_to_datetime('40001965252477') - datetime.datetime(2999, 12, 31, 23, 24, 59) - - # not a number! - >>> timestamp_to_datetime('2010abc') - datetime.datetime(2010, 12, 31, 23, 59, 59) - - """ - - # pad to 6 digits - string = pad_timestamp(string, PAD_6_UP) - - def clamp(val, min_, max_): - try: - val = int(val) - val = max(min_, min(val, max_)) - return val - except: - return max_ - - def extract(string, start, end, min_, max_): - if len(string) >= end: - return clamp(string[start:end], min_, max_) - else: - return max_ - - # now parse, clamp to boundary - year = extract(string, 0, 4, 1900, 2999) - month = extract(string, 4, 6, 1, 12) - day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1]) - hour = extract(string, 8, 10, 0, 23) - minute = extract(string, 10, 12, 0, 59) - second = extract(string, 12, 14, 0, 59) - - return datetime.datetime(year=year, - month=month, - day=day, - hour=hour, - minute=minute, - second=second) - - #return time.strptime(pad_timestamp(string), TIMESTAMP_14) - - -def timestamp_to_sec(string): - """ - >>> timestamp_to_sec('20131226095010') - 1388051410 - - # rounds to end of 2014 - >>> timestamp_to_sec('2014') - 1420070399 - """ - - return calendar.timegm(timestamp_to_datetime(string).utctimetuple()) - - -def sec_to_timestamp(secs): - """ - >>> sec_to_timestamp(1388051410) - '20131226095010' - - >>> sec_to_timestamp(1420070399) - '20141231235959' - """ - - return datetime_to_timestamp(datetime.datetime.utcfromtimestamp(secs)) - - -def timestamp_to_http_date(string): - """ - >>> timestamp_to_http_date('20131226095000') - 'Thu, 26 Dec 2013 09:50:00 GMT' - - >>> timestamp_to_http_date('20140126200804') - 'Sun, 26 Jan 2014 20:08:04 GMT' - """ - return datetime_to_http_date(timestamp_to_datetime(string)) - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py index e5e8cfe5..e28271f2 100644 --- a/pywb/warc/archiveindexer.py +++ b/pywb/warc/archiveindexer.py @@ -1,8 +1,8 @@ -from pywb.utils.timeutils import iso_date_to_timestamp from pywb.utils.canonicalize import canonicalize from pywb.utils.loaders import extract_post_query, append_post_query -from pywb.warc.archiveiterator import ArchiveIterator +from warcio.timeutils import iso_date_to_timestamp +from warcio.archiveiterator import ArchiveIterator import hashlib import base64 @@ -71,7 +71,7 @@ class ArchiveIndexEntryMixin(object): self['urlkey'] = canonicalize(url, surt_ordered) other['urlkey'] = self['urlkey'] - referer = other.record.status_headers.get_header('referer') + referer = other.record.http_headers.get_header('referer') if referer: self['_referer'] = referer @@ -141,7 +141,7 @@ class DefaultRecordParser(object): for record in raw_iter: entry = None - if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): + if not include_all and not minimal and (record.http_headers.get_statuscode() == '-'): continue if record.rec_type == 'arc_header': @@ -175,13 +175,13 @@ class DefaultRecordParser(object): compute_digest = True elif not minimal and record.rec_type == 'request' and append_post: - method = record.status_headers.protocol - len_ = record.status_headers.get_header('Content-Length') + method = record.http_headers.protocol + len_ = record.http_headers.get_header('Content-Length') post_query = extract_post_query(method, entry.get('_content_type'), len_, - record.stream) + record.raw_stream) entry['_post_query'] = post_query @@ -236,7 +236,7 @@ class DefaultRecordParser(object): 
if record.rec_type == 'warcinfo': entry['url'] = record.rec_headers.get_header('WARC-Filename') entry['urlkey'] = entry['url'] - entry['_warcinfo'] = record.stream.read(record.length) + entry['_warcinfo'] = record.raw_stream.read(record.length) return entry entry['url'] = record.rec_headers.get_header('WARC-Target-Uri') @@ -252,13 +252,13 @@ class DefaultRecordParser(object): entry['mime'] = '-' else: def_mime = '-' if record.rec_type == 'request' else 'unk' - entry.extract_mime(record.status_headers. + entry.extract_mime(record.http_headers. get_header('Content-Type'), def_mime) # status -- only for response records (by convention): if record.rec_type == 'response' and not self.options.get('minimal'): - entry.extract_status(record.status_headers) + entry.extract_status(record.http_headers) else: entry['status'] = '-' @@ -304,7 +304,7 @@ class DefaultRecordParser(object): entry.extract_mime(record.rec_headers.get_header('content-type')) # status - entry.extract_status(record.status_headers) + entry.extract_status(record.http_headers) # digest entry['digest'] = '-' diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py deleted file mode 100644 index 69296e5f..00000000 --- a/pywb/warc/archiveiterator.py +++ /dev/null @@ -1,233 +0,0 @@ -from pywb.utils.bufferedreaders import DecompressingBufferedReader - -from pywb.warc.recordloader import ArcWarcRecordLoader - -import six -import sys - - -# ============================================================================ -BUFF_SIZE = 16384 - - -class ArchiveIterator(six.Iterator): - """ Iterate over records in WARC and ARC files, both gzip chunk - compressed and uncompressed - - The indexer will automatically detect format, and decompress - if necessary. - - """ - - GZIP_ERR_MSG = """ - ERROR: Non-chunked gzip file detected, gzip block continues - beyond single record. - - This file is probably not a multi-chunk gzip but a single gzip file. - - To allow seek, a gzipped {1} must have each record compressed into - a single gzip chunk and concatenated together. 
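
The layout this message requires (one gzip member per record, members simply concatenated) can be produced with the standard library alone; a sketch, with placeholder record bytes:

    import gzip
    from io import BytesIO

    out = BytesIO()
    for rec in (b'WARC/1.0\r\n...record 1...', b'WARC/1.0\r\n...record 2...'):
        out.write(gzip.compress(rec))   # each record is its own gzip member
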
- - This file is likely still valid and you can use it by decompressing it: - - gunzip myfile.{0}.gz - - You can then also use the 'warc2warc' tool from the 'warc-tools' - package which will create a properly chunked gzip file: - - warc2warc -Z myfile.{0} > myfile.{0}.gz -""" - - INC_RECORD = """\ - WARNING: Record not followed by newline, perhaps Content-Length is invalid - Offset: {0} - Remainder: {1} -""" - - def __init__(self, fileobj, no_record_parse=False, - verify_http=False, arc2warc=False, block_size=BUFF_SIZE): - - self.fh = fileobj - - self.loader = ArcWarcRecordLoader(verify_http=verify_http, - arc2warc=arc2warc) - self.reader = None - - self.offset = 0 - self.known_format = None - - self.mixed_arc_warc = arc2warc - - self.member_info = None - self.no_record_parse = no_record_parse - - self.reader = DecompressingBufferedReader(self.fh, - block_size=block_size) - self.offset = self.fh.tell() - - self.next_line = None - - self._raise_invalid_gzip = False - self._is_empty = False - self._is_first = True - self.last_record = None - - def __iter__(self): - return self - - def __next__(self): - while True: - if not self._is_first: - self._finish_record() - - self._is_first = False - - try: - self.last_record = self._next_record(self.next_line) - if self._raise_invalid_gzip: - self._raise_invalid_gzip_err() - - return self.last_record - - except EOFError: - self._is_empty = True - - def _finish_record(self): - if self.last_record: - self.read_to_end(self.last_record) - - if self.reader.decompressor: - # if another gzip member, continue - if self.reader.read_next_member(): - return - - # if empty record, then we're done - elif self._is_empty: - raise StopIteration() - - # otherwise, probably a gzip - # containing multiple non-chunked records - # raise this as an error - else: - self._raise_invalid_gzip = True - - # non-gzip, so we're done - elif self._is_empty: - raise StopIteration() - - def _raise_invalid_gzip_err(self): - """ A gzip file with multiple ARC/WARC records, non-chunked - has been detected. 
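
With __iter__ and __next__ in place as above, the class can be driven as a plain iterator; a usage sketch (the file name is illustrative):

    with open('example.warc.gz', 'rb') as fh:
        for record in ArchiveIterator(fh):
            print(record.rec_type,
                  record.rec_headers.get_header('WARC-Target-URI'))
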
This is not valid for replay, so notify user - """ - frmt = 'warc/arc' - if self.known_format: - frmt = self.known_format - - frmt_up = frmt.upper() - - msg = self.GZIP_ERR_MSG.format(frmt, frmt_up) - raise Exception(msg) - - def _consume_blanklines(self): - """ Consume blank lines that are between records - - For warcs, there are usually 2 - - For arcs, may be 1 or 0 - - For block gzipped files, these are at end of each gzip envelope - and are included in record length which is the full gzip envelope - - For uncompressed, they are between records and so are NOT part of - the record length - - count empty_size so that it can be substracted from - the record length for uncompressed - - if first line read is not blank, likely error in WARC/ARC, - display a warning - """ - empty_size = 0 - first_line = True - - while True: - line = self.reader.readline() - if len(line) == 0: - return None, empty_size - - stripped = line.rstrip() - - if len(stripped) == 0 or first_line: - empty_size += len(line) - - if len(stripped) != 0: - # if first line is not blank, - # likely content-length was invalid, display warning - err_offset = self.fh.tell() - self.reader.rem_length() - empty_size - sys.stderr.write(self.INC_RECORD.format(err_offset, line)) - - first_line = False - continue - - return line, empty_size - - def read_to_end(self, record, payload_callback=None): - """ Read remainder of the stream - If a digester is included, update it - with the data read - """ - - # already at end of this record, don't read until it is consumed - if self.member_info: - return None - - num = 0 - curr_offset = self.offset - - while True: - b = record.stream.read(BUFF_SIZE) - if not b: - break - num += len(b) - if payload_callback: - payload_callback(b) - - """ - - For compressed files, blank lines are consumed - since they are part of record length - - For uncompressed files, blank lines are read later, - and not included in the record length - """ - #if self.reader.decompressor: - self.next_line, empty_size = self._consume_blanklines() - - self.offset = self.fh.tell() - self.reader.rem_length() - #if self.offset < 0: - # raise Exception('Not Gzipped Properly') - - if self.next_line: - self.offset -= len(self.next_line) - - length = self.offset - curr_offset - - if not self.reader.decompressor: - length -= empty_size - - self.member_info = (curr_offset, length) - #return self.member_info - #return next_line - - def _next_record(self, next_line): - """ Use loader to parse the record from the reader stream - Supporting warc and arc records - """ - record = self.loader.parse_record_stream(self.reader, - next_line, - self.known_format, - self.no_record_parse) - - self.member_info = None - - # Track known format for faster parsing of other records - if not self.mixed_arc_warc: - self.known_format = record.format - - return record - - diff --git a/pywb/warc/blockrecordloader.py b/pywb/warc/blockrecordloader.py index fea50c31..679ab51b 100644 --- a/pywb/warc/blockrecordloader.py +++ b/pywb/warc/blockrecordloader.py @@ -1,6 +1,6 @@ +from warcio.bufferedreaders import DecompressingBufferedReader +from warcio.recordloader import ArcWarcRecordLoader from pywb.utils.loaders import BlockLoader -from pywb.utils.bufferedreaders import DecompressingBufferedReader -from pywb.warc.recordloader import ArcWarcRecordLoader #================================================================= diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py deleted file mode 100644 index 64c4181f..00000000 --- a/pywb/warc/recordloader.py +++ 
/dev/null @@ -1,298 +0,0 @@ -import collections - -from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.utils.statusandheaders import StatusAndHeadersParser -from pywb.utils.statusandheaders import StatusAndHeadersParserException - -from pywb.utils.limitreader import LimitReader -from pywb.utils.loaders import to_native_str - -#from pywb.utils.wbexception import WbException -from pywb.utils.timeutils import timestamp_to_iso_date - -from six.moves import zip - - -#================================================================= -#ArcWarcRecord = collections.namedtuple('ArcWarcRecord', -# 'format, rec_type, rec_headers, ' + -# 'stream, status_headers, ' + -# 'content_type, length') - -#================================================================= -class ArcWarcRecord(object): - def __init__(self, *args): - (self.format, self.rec_type, self.rec_headers, self.stream, - self.status_headers, self.content_type, self.length) = args - - -#================================================================= -class ArchiveLoadFailed(Exception): - def __init__(self, reason, filename=''): - if filename: - msg = filename + ': ' + str(reason) - else: - msg = str(reason) - - super(ArchiveLoadFailed, self).__init__(msg) - self.msg = msg - - -#================================================================= -class ArcWarcRecordLoader(object): - WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18'] - - HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1'] - - HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE', - 'OPTIONS', 'CONNECT', 'PATCH'] - - NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource') - - NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') - HTTP_SCHEMES = ('http:', 'https:') - - def __init__(self, verify_http=True, arc2warc=True): - if arc2warc: - self.arc_parser = ARC2WARCHeadersParser() - else: - self.arc_parser = ARCHeadersParser() - - self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES) - self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http) - - self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) - - def parse_record_stream(self, stream, - statusline=None, - known_format=None, - no_record_parse=False): - """ Parse file-like stream and return an ArcWarcRecord - encapsulating the record headers, http headers (if any), - and a stream limited to the remainder of the record. - - Pass statusline and known_format to detect_type_loader_headers() - to faciliate parsing. - """ - (the_format, rec_headers) = (self. 
- _detect_type_load_headers(stream, - statusline, - known_format)) - - if the_format == 'arc': - uri = rec_headers.get_header('uri') - length = rec_headers.get_header('length') - content_type = rec_headers.get_header('content-type') - sub_len = rec_headers.total_len - if uri and uri.startswith('filedesc://'): - rec_type = 'arc_header' - else: - rec_type = 'response' - - elif the_format in ('warc', 'arc2warc'): - rec_type = rec_headers.get_header('WARC-Type') - uri = rec_headers.get_header('WARC-Target-URI') - length = rec_headers.get_header('Content-Length') - content_type = rec_headers.get_header('Content-Type') - if the_format == 'warc': - sub_len = 0 - else: - sub_len = rec_headers.total_len - the_format = 'warc' - - is_err = False - - try: - if length is not None: - length = int(length) - sub_len - if length < 0: - is_err = True - - except (ValueError, TypeError): - is_err = True - - # err condition - if is_err: - length = 0 - # or status and headers are completely empty (blank lines found) - elif not rec_headers: - length = 0 - - # limit stream to the length for all valid records - if length is not None and length >= 0: - stream = LimitReader.wrap_stream(stream, length) - - # don't parse the http record at all - if no_record_parse: - status_headers = None#StatusAndHeaders('', []) - - # if empty record (error or otherwise) set status to 204 - elif length == 0: - if is_err: - msg = '204 Possible Error' - else: - msg = '204 No Content' - - status_headers = StatusAndHeaders(msg, []) - - # response record or non-empty revisit: parse HTTP status and headers! - elif (rec_type in ('response', 'revisit') - and uri.startswith(self.HTTP_SCHEMES)): - status_headers = self.http_parser.parse(stream) - - # request record: parse request - elif ((rec_type == 'request') - and uri.startswith(self.HTTP_SCHEMES)): - status_headers = self.http_req_parser.parse(stream) - - # everything else: create a no-status entry, set content-type - else: - content_type_header = [('Content-Type', content_type)] - - if length is not None and length >= 0: - content_type_header.append(('Content-Length', str(length))) - - status_headers = StatusAndHeaders('200 OK', content_type_header) - - return ArcWarcRecord(the_format, rec_type, - rec_headers, stream, status_headers, - content_type, length) - - def _detect_type_load_headers(self, stream, - statusline=None, known_format=None): - """ If known_format is specified ('warc' or 'arc'), - parse only as that format. - - Otherwise, try parsing record as WARC, then try parsing as ARC. - if neither one succeeds, we're out of luck. 
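
Reduced to a sketch, the detection order described here is: parse as WARC, and on failure re-feed the already-consumed first line to the ARC parser. Parser construction below mirrors the surrounding code but is a sketch only:

    warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])

    def sniff_format(stream):
        try:
            return 'warc', warc_parser.parse(stream)
        except StatusAndHeadersParserException as e:
            # first line was not WARC; retry the same line as an ARC header
            return 'arc', ARCHeadersParser().parse(stream, e.statusline)
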
- """ - - if known_format != 'arc': - # try as warc first - try: - rec_headers = self.warc_parser.parse(stream, statusline) - return 'warc', rec_headers - except StatusAndHeadersParserException as se: - if known_format == 'warc': - msg = 'Invalid WARC record, first line: ' - raise ArchiveLoadFailed(msg + str(se.statusline)) - - statusline = se.statusline - pass - - # now try as arc - try: - rec_headers = self.arc_parser.parse(stream, statusline) - return self.arc_parser.get_rec_type(), rec_headers - except StatusAndHeadersParserException as se: - if known_format == 'arc': - msg = 'Invalid ARC record, first line: ' - else: - msg = 'Unknown archive format, first line: ' - raise ArchiveLoadFailed(msg + str(se.statusline)) - - -#================================================================= -class ARCHeadersParser(object): - # ARC 1.0 headers - ARC_HEADERS = ["uri", "ip-address", "archive-date", - "content-type", "length"] - - def __init__(self): - self.headernames = self.get_header_names() - - def get_rec_type(self): - return 'arc' - - def parse(self, stream, headerline=None): - total_read = 0 - - def readline(): - return to_native_str(stream.readline()) - - # if headerline passed in, use that - if headerline is None: - headerline = readline() - else: - headerline = to_native_str(headerline) - - header_len = len(headerline) - - if header_len == 0: - raise EOFError() - - headerline = headerline.rstrip() - - headernames = self.headernames - - # if arc header, consume next two lines - if headerline.startswith('filedesc://'): - version = readline() # skip version - spec = readline() # skip header spec, use preset one - total_read += len(version) - total_read += len(spec) - - parts = headerline.split(' ') - - if len(parts) != len(headernames): - msg = 'Wrong # of headers, expected arc headers {0}, Found {1}' - msg = msg.format(headernames, parts) - raise StatusAndHeadersParserException(msg, parts) - - - protocol, headers = self._get_protocol_and_headers(headerline, parts) - - return StatusAndHeaders(statusline='', - headers=headers, - protocol='WARC/1.0', - total_len=total_read) - - @classmethod - def get_header_names(cls): - return cls.ARC_HEADERS - - def _get_protocol_and_headers(self, headerline, parts): - headers = [] - - for name, value in zip(self.headernames, parts): - headers.append((name, value)) - - return ('ARC/1.0', headers) - - -#================================================================= -class ARC2WARCHeadersParser(ARCHeadersParser): - # Headers for converting ARC -> WARC Header - ARC_TO_WARC_HEADERS = ["WARC-Target-URI", - "WARC-IP-Address", - "WARC-Date", - "Content-Type", - "Content-Length"] - - def get_rec_type(self): - return 'arc2warc' - - @classmethod - def get_header_names(cls): - return cls.ARC_TO_WARC_HEADERS - - def _get_protocol_and_headers(self, headerline, parts): - headers = [] - - for name, value in zip(self.headernames, parts): - if name == 'WARC-Date': - value = timestamp_to_iso_date(value) - - headers.append((name, value)) - - if headerline.startswith('filedesc://'): - rec_type = 'arc_header' - else: - rec_type = 'response' - - headers.append(('WARC-Type', rec_type)) - headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id())) - - return ('WARC/1.0', headers) - - diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 5b74a9c7..e95a5038 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -1,6 +1,7 @@ -from pywb.utils.timeutils import iso_date_to_timestamp +from warcio.recordloader import 
ArchiveLoadFailed +from warcio.timeutils import iso_date_to_timestamp + from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader -from pywb.warc.recordloader import ArchiveLoadFailed from pywb.utils.wbexception import NotFoundException import six @@ -27,20 +28,20 @@ class ResolvingLoader(object): elif headers_record != payload_record: # close remainder of stream as this record only used for # (already parsed) headers - headers_record.stream.close() + headers_record.raw_stream.close() # special case: check if headers record is actually empty # (eg empty revisit), then use headers from revisit - if not headers_record.status_headers.headers: + if not headers_record.http_headers.headers: headers_record = payload_record if not headers_record or not payload_record: raise ArchiveLoadFailed('Could not load ' + str(cdx)) # ensure status line is valid from here - headers_record.status_headers.validate_statusline('204 No Content') + headers_record.http_headers.validate_statusline('204 No Content') - return (headers_record.status_headers, payload_record.stream) + return (headers_record.http_headers, payload_record.raw_stream) def load_headers_and_payload(self, cdx, failed_files, cdx_loader): """ @@ -107,7 +108,7 @@ class ResolvingLoader(object): # optimization: if same file already failed this request, # don't try again if failed_files is not None and filename in failed_files: - raise ArchiveLoadFailed('Skipping Already Failed', filename) + raise ArchiveLoadFailed('Skipping Already Failed: ' + filename) any_found = False last_exc = None @@ -144,7 +145,7 @@ class ResolvingLoader(object): msg = 'Archive File Not Found' #raise ArchiveLoadFailed(msg, filename), None, last_traceback - six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback) + six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback) def _load_different_url_payload(self, cdx, headers_record, failed_files, cdx_loader): diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index bd629a4a..9f2902e7 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -292,13 +292,13 @@ import sys import pprint import six -from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed +from warcio.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.pathresolvers import PathResolverMapper from pywb.cdx.cdxobject import CDXObject -import pywb.utils.statusandheaders +import warcio.statusandheaders from pywb import get_test_dir @@ -331,10 +331,12 @@ def load_test_archive(test_file, offset, length): archive = testloader.load(path, offset, length) - pywb.utils.statusandheaders.WRAP_WIDTH = 160 + warcio.statusandheaders.WRAP_WIDTH = 160 pprint.pprint(((archive.format, archive.rec_type), - archive.rec_headers, archive.status_headers), indent=1, width=160) + archive.rec_headers, archive.http_headers), indent=1, width=160) + + warcio.statusandheaders.WRAP_WIDTH = 80 #============================================================================== diff --git a/pywb/warc/test/test_writer.py b/pywb/warc/test/test_writer.py deleted file mode 100644 index dd593575..00000000 --- a/pywb/warc/test/test_writer.py +++ /dev/null @@ -1,121 +0,0 @@ -from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.warc.warcwriter import BufferWARCWriter -from pywb.warc.recordloader import ArcWarcRecordLoader -from 
pywb.warc.archiveiterator import ArchiveIterator - -from io import BytesIO -from collections import OrderedDict -import json - - -# ============================================================================ -class FixedTestWARCWriter(BufferWARCWriter): - @classmethod - def _make_warc_id(cls, id_=None): - return '' - - @classmethod - def _make_warc_date(cls): - return '2000-01-01T00:00:00Z' - - -# ============================================================================ -class TestWarcWriter(object): - def _validate_record_content_len(self, stream): - for record in ArchiveIterator(stream, no_record_parse=True): - assert record.status_headers == None - assert int(record.rec_headers.get_header('Content-Length')) == record.length - assert record.length == len(record.stream.read()) - - - def test_warcinfo_record(self): - simplewriter = FixedTestWARCWriter(gzip=False) - params = OrderedDict([('software', 'recorder test'), - ('format', 'WARC File Format 1.0'), - ('json-metadata', json.dumps({'foo': 'bar'}))]) - - record = simplewriter.create_warcinfo_record('testfile.warc.gz', params) - simplewriter.write_record(record) - buff = simplewriter.get_contents() - assert isinstance(buff, bytes) - - buff = BytesIO(buff) - parsed_record = ArcWarcRecordLoader().parse_record_stream(buff) - - assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo' - assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields' - assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz' - - buff = parsed_record.stream.read().decode('utf-8') - - length = parsed_record.rec_headers.get_header('Content-Length') - - assert len(buff) == int(length) - - assert 'json-metadata: {"foo": "bar"}\r\n' in buff - assert 'format: WARC File Format 1.0\r\n' in buff - - warcinfo_record = '\ -WARC/1.0\r\n\ -WARC-Type: warcinfo\r\n\ -WARC-Record-ID: \r\n\ -WARC-Filename: testfile.warc.gz\r\n\ -WARC-Date: 2000-01-01T00:00:00Z\r\n\ -Content-Type: application/warc-fields\r\n\ -Content-Length: 86\r\n\ -\r\n\ -software: recorder test\r\n\ -format: WARC File Format 1.0\r\n\ -json-metadata: {"foo": "bar"}\r\n\ -\r\n\ -\r\n\ -' - - assert simplewriter.get_contents().decode('utf-8') == warcinfo_record - - def test_generate_response(self): - headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'), - ('Custom-Header', 'somevalue') - ] - - payload = b'some\ntext' - - status_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0') - - - writer = FixedTestWARCWriter(gzip=False) - - record = writer.create_warc_record('http://example.com/', 'response', - payload=BytesIO(payload), - length=len(payload), - status_headers=status_headers) - - - writer.write_record(record) - - buff = writer.get_contents() - - self._validate_record_content_len(BytesIO(buff)) - - warc_record = '\ -WARC/1.0\r\n\ -WARC-Type: response\r\n\ -WARC-Record-ID: \r\n\ -WARC-Target-URI: http://example.com/\r\n\ -WARC-Date: 2000-01-01T00:00:00Z\r\n\ -WARC-Block-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\ -WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\ -Content-Type: application/http; msgtype=response\r\n\ -Content-Length: 97\r\n\ -\r\n\ -HTTP/1.0 200 OK\r\n\ -Content-Type: text/plain; charset="UTF-8"\r\n\ -Custom-Header: somevalue\r\n\ -\r\n\ -some\n\ -text\ -\r\n\ -\r\n\ -' - assert buff.decode('utf-8') == warc_record - diff --git a/pywb/warc/warcwriter.py b/pywb/warc/warcwriter.py deleted file mode 100644 index e98d46d3..00000000 --- a/pywb/warc/warcwriter.py +++ /dev/null @@ 
-1,304 +0,0 @@ -import tempfile -import uuid -import base64 -import hashlib -import datetime -import zlib -import six - -from socket import gethostname -from io import BytesIO - -from pywb.utils.loaders import to_native_str -from pywb.utils.timeutils import datetime_to_iso_date - -from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders - -from pywb.warc.recordloader import ArcWarcRecord -from pywb.warc.recordloader import ArcWarcRecordLoader - - -# ============================================================================ -class BaseWARCWriter(object): - BUFF_SIZE = 16384 - - WARC_RECORDS = {'warcinfo': 'application/warc-fields', - 'response': 'application/http; msgtype=response', - 'revisit': 'application/http; msgtype=response', - 'request': 'application/http; msgtype=request', - 'metadata': 'application/warc-fields', - } - - REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest' - - WARC_VERSION = 'WARC/1.0' - - def __init__(self, gzip=True, header_filter=None, *args, **kwargs): - self.gzip = gzip - self.header_filter = header_filter - self.hostname = gethostname() - - self.parser = StatusAndHeadersParser([], verify=False) - self.warc_version = kwargs.get('warc_version', self.WARC_VERSION) - - @classmethod - def _iter_stream(cls, stream): - while True: - buf = stream.read(cls.BUFF_SIZE) - if not buf: - return - - yield buf - - def ensure_digest(self, record): - block_digest = record.rec_headers.get_header('WARC-Block-Digest') - payload_digest = record.rec_headers.get_header('WARC-Payload-Digest') - if block_digest and payload_digest: - return - - block_digester = self._create_digester() - payload_digester = self._create_digester() - - pos = record.stream.tell() - - if record.status_headers and hasattr(record.status_headers, 'headers_buff'): - block_digester.update(record.status_headers.headers_buff) - - for buf in self._iter_stream(record.stream): - block_digester.update(buf) - payload_digester.update(buf) - - record.stream.seek(pos) - record.rec_headers.add_header('WARC-Block-Digest', str(block_digester)) - record.rec_headers.add_header('WARC-Payload-Digest', str(payload_digester)) - - def _create_digester(self): - return Digester('sha1') - - def _set_header_buff(self, record): - exclude_list = None - if self.header_filter: - exclude_list = self.header_filter(record) - buff = record.status_headers.to_bytes(exclude_list) - record.status_headers.headers_buff = buff - - def create_warcinfo_record(self, filename, info): - warc_headers = StatusAndHeaders(self.warc_version, []) - warc_headers.add_header('WARC-Type', 'warcinfo') - warc_headers.add_header('WARC-Record-ID', self._make_warc_id()) - if filename: - warc_headers.add_header('WARC-Filename', filename) - warc_headers.add_header('WARC-Date', self._make_warc_date()) - - warcinfo = BytesIO() - for n, v in six.iteritems(info): - self._header(warcinfo, n, v) - - warcinfo.seek(0) - - record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo, - None, '', len(warcinfo.getvalue())) - - return record - - def copy_warc_record(self, payload): - len_ = payload.tell() - payload.seek(0) - - warc_headers = self.parser.parse(payload) - - record_type = warc_headers.get_header('WARC-Type', 'response') - - return self._fill_record(record_type, warc_headers, None, payload, '', len_) - - def create_warc_record(self, uri, record_type, payload, - length=None, - warc_content_type='', - warc_headers_dict={}, - status_headers=None): - - if length is None: - length = payload.tell() - 
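
The tell()/seek(0) pair here measures an unspecified payload length by stream position, then rewinds so the body can be streamed back out; a toy illustration:

    from io import BytesIO

    buf = BytesIO(b'payload bytes')
    buf.seek(0, 2)           # callers hand the buffer in positioned at its end
    length = buf.tell()      # 13 == len(b'payload bytes')
    buf.seek(0)              # rewind before writing the record body
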
-            payload.seek(0)
-
-        warc_headers = StatusAndHeaders(self.warc_version, list(warc_headers_dict.items()))
-        warc_headers.replace_header('WARC-Type', record_type)
-        if not warc_headers.get_header('WARC-Record-ID'):
-            warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
-
-        if uri:
-            warc_headers.replace_header('WARC-Target-URI', uri)
-
-        if not warc_headers.get_header('WARC-Date'):
-            warc_headers.add_header('WARC-Date', self._make_warc_date())
-
-        return self._fill_record(record_type, warc_headers, status_headers,
-                                 payload, warc_content_type, length)
-
-    def _fill_record(self, record_type, warc_headers, status_headers, payload, warc_content_type, len_):
-        has_http_headers = (record_type in ('request', 'response', 'revisit'))
-
-        if not status_headers and has_http_headers:
-            status_headers = self.parser.parse(payload)
-
-        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
-                               status_headers, warc_content_type, len_)
-
-        self.ensure_digest(record)
-
-        if has_http_headers:
-            self._set_header_buff(record)
-
-        return record
-
-    def _write_warc_record(self, out, record, adjust_cl=True):
-        if self.gzip:
-            out = GzippingWrapper(out)
-
-        # compute Content-Type
-        content_type = record.rec_headers.get_header('Content-Type')
-
-        if not content_type:
-            content_type = record.content_type
-
-        if not content_type:
-            content_type = self.WARC_RECORDS.get(record.rec_headers.get_header('WARC-Type'))
-
-        if content_type:
-            record.rec_headers.replace_header('Content-Type', content_type)
-            #self._header(out, 'Content-Type', content_type)
-
-        if record.rec_headers.get_header('WARC-Type') == 'revisit':
-            http_headers_only = True
-        else:
-            http_headers_only = False
-
-        # compute Content-Length
-        if record.length or record.status_headers:
-            actual_len = 0
-            if record.status_headers:
-                actual_len = len(record.status_headers.headers_buff)
-
-            if not http_headers_only:
-                if adjust_cl:
-                    diff = record.stream.tell() - actual_len
-                else:
-                    diff = 0
-
-                actual_len = record.length - diff
-
-            record.rec_headers.replace_header('Content-Length', str(actual_len))
-            #self._header(out, 'Content-Length', str(actual_len))
-
-            # add empty line
-            #self._line(out, b'')
-
-            # write record headers
-            out.write(record.rec_headers.to_bytes())
-
-            # write headers buffer, if any
-            if record.status_headers:
-                out.write(record.status_headers.headers_buff)
-
-            if not http_headers_only:
-                for buf in self._iter_stream(record.stream):
-                    out.write(buf)
-                #out.write(record.stream.read())
-
-            # add two lines
-            self._line(out, b'\r\n')
-        else:
-            # add three lines (1 for end of header, 2 for end of record)
-            self._line(out, b'Content-Length: 0\r\n\r\n')
-
-        out.flush()
-
-    def _header(self, out, name, value):
-        if not value:
-            return
-
-        self._line(out, (name + ': ' + str(value)).encode('latin-1'))
-
-    def _line(self, out, line):
-        out.write(line + b'\r\n')
-
-    @classmethod
-    def _make_warc_id(cls, id_=None):
-        if not id_:
-            id_ = uuid.uuid1()
-        return '<urn:uuid:{0}>'.format(id_)
-
-    @classmethod
-    def _make_warc_date(cls):
-        return datetime_to_iso_date(datetime.datetime.utcnow())
-
-
-# ============================================================================
-class GzippingWrapper(object):
-    def __init__(self, out):
-        self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-        self.out = out
-
-    def write(self, buff):
-        #if isinstance(buff, str):
-        #    buff = buff.encode('utf-8')
-        buff = self.compressor.compress(buff)
-        self.out.write(buff)
-
-    def flush(self):
-        buff = self.compressor.flush()
-        self.out.write(buff)
-        self.out.flush()
-
-
-# ============================================================================
-class Digester(object):
-    def __init__(self, type_='sha1'):
-        self.type_ = type_
-        self.digester = hashlib.new(type_)
-
-    def update(self, buff):
-        self.digester.update(buff)
-
-    def __str__(self):
-        return self.type_ + ':' + to_native_str(base64.b32encode(self.digester.digest()))
-
-
-# ============================================================================
-class BufferWARCWriter(BaseWARCWriter):
-    def __init__(self, *args, **kwargs):
-        super(BufferWARCWriter, self).__init__(*args, **kwargs)
-        self.out = self._create_buffer()
-
-    def _create_buffer(self):
-        return tempfile.SpooledTemporaryFile(max_size=512*1024)
-
-    def write_record(self, record):
-        self._write_warc_record(self.out, record)
-
-    def get_contents(self):
-        pos = self.out.tell()
-        self.out.seek(0)
-        buff = self.out.read()
-        self.out.seek(pos)
-        return buff
-
-
-# ============================================================================
-class FileWARCWriter(BufferWARCWriter):
-    def __init__(self, *args, **kwargs):
-        file_or_buff = None
-        if len(args) > 0:
-            file_or_buff = args[0]
-        else:
-            file_or_buff = kwargs.get('file')
-
-        if isinstance(file_or_buff, str):
-            self.out = open(file_or_buff, 'rb')
-        elif hasattr(file_or_buff, 'read'):
-            self.out = file_or_buff
-        else:
-            raise Exception('file must be a readable or valid filename')
-
-
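For orientation, the file deleted above is the writer API whose semantics this patch series moves out of pywb. A minimal usage sketch of that API as it existed before this patch, using only names and signatures visible in the deleted code (pre-patch import paths; illustrative, not part of the patch):

    from io import BytesIO

    from pywb.warc.warcwriter import BufferWARCWriter
    from pywb.utils.statusandheaders import StatusAndHeaders

    payload = b'some\ntext'
    http_headers = StatusAndHeaders('200 OK',
                                    [('Content-Type', 'text/plain')],
                                    protocol='HTTP/1.0')

    # with gzip=True, each record is compressed as its own gzip member
    # (see GzippingWrapper above)
    writer = BufferWARCWriter(gzip=True)

    record = writer.create_warc_record('http://example.com/', 'response',
                                       payload=BytesIO(payload),
                                       length=len(payload),
                                       status_headers=http_headers)
    writer.write_record(record)

    warc_bytes = writer.get_contents()
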
diff --git a/pywb/webagg/aggregator.py b/pywb/webagg/aggregator.py
index 5357dfe8..c5fe9739 100644
--- a/pywb/webagg/aggregator.py
+++ b/pywb/webagg/aggregator.py
@@ -5,7 +5,8 @@ import json
 import time
 import os
 
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_now
+
 from pywb.cdx.cdxops import process_cdx
 from pywb.cdx.query import CDXQuery
 
diff --git a/pywb/webagg/handlers.py b/pywb/webagg/handlers.py
index 50a4362f..6044edc7 100644
--- a/pywb/webagg/handlers.py
+++ b/pywb/webagg/handlers.py
@@ -2,7 +2,7 @@ from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoad
 from pywb.webagg.utils import MementoUtils
 from pywb.utils.wbexception import BadRequestException, WbException
 from pywb.utils.wbexception import NotFoundException
-from pywb.warc.recordloader import ArchiveLoadFailed
+from warcio.recordloader import ArchiveLoadFailed
 
 from pywb.cdx.query import CDXQuery
 from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
diff --git a/pywb/webagg/indexsource.py b/pywb/webagg/indexsource.py
index d6564364..cc5cb249 100644
--- a/pywb/webagg/indexsource.py
+++ b/pywb/webagg/indexsource.py
@@ -1,6 +1,6 @@
 from pywb.utils.binsearch import iter_range
-from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
+from warcio.timeutils import timestamp_now
 
 from pywb.utils.canonicalize import canonicalize
 from pywb.utils.wbexception import NotFoundException
diff --git a/pywb/webagg/inputrequest.py b/pywb/webagg/inputrequest.py
index 1d2196cf..f76ed067 100644
--- a/pywb/webagg/inputrequest.py
+++ b/pywb/webagg/inputrequest.py
@@ -1,6 +1,7 @@
+from warcio.limitreader import LimitReader
+from warcio.statusandheaders import StatusAndHeadersParser
+
 from pywb.utils.loaders import extract_post_query, append_post_query
-from pywb.utils.limitreader import LimitReader
-from pywb.utils.statusandheaders import StatusAndHeadersParser
 
 from six.moves.urllib.parse import urlsplit, quote
 from six import iteritems, StringIO
diff --git a/pywb/webagg/proxyindexsource.py b/pywb/webagg/proxyindexsource.py
index 741f116b..60b0f195 100644
--- a/pywb/webagg/proxyindexsource.py
+++ b/pywb/webagg/proxyindexsource.py
@@ -3,7 +3,7 @@ from pywb.utils.wbexception import NotFoundException
 from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
 from pywb.webagg.responseloader import LiveWebLoader
 from pywb.webagg.utils import ParamFormatter, res_template
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_now
 
 
 #=============================================================================
diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py
index 4af342a4..343f17c7 100644
--- a/pywb/webagg/responseloader.py
+++ b/pywb/webagg/responseloader.py
@@ -2,12 +2,13 @@ from pywb.webagg.utils import MementoUtils, StreamIter, compress_gzip_iter
 from pywb.webagg.utils import ParamFormatter
 from pywb.webagg.indexsource import RedisIndexSource
 
-from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
-from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
-from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
+from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
+from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
+from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
+
+from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
 
 from pywb.utils.wbexception import LiveResourceException, WbException
-from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
 
 from pywb.warc.resolvingloader import ResolvingLoader
 
@@ -243,11 +244,11 @@ class WARCPathLoader(BaseLoader):
             status = cdx.get('status')
 
             if not status or status.startswith('3'):
-                status_headers = self.headers_parser.parse(payload.stream)
+                http_headers = self.headers_parser.parse(payload.raw_stream)
                 self.raise_on_self_redirect(params, cdx,
-                                            status_headers.get_statuscode(),
-                                            status_headers.get_header('Location'))
-                http_headers_buff = status_headers.to_bytes()
+                                            http_headers.get_statuscode(),
+                                            http_headers.get_header('Location'))
+                http_headers_buff = http_headers.to_bytes()
             else:
                 http_headers_buff = None
 
@@ -266,9 +267,9 @@ class WARCPathLoader(BaseLoader):
                 warc_headers.replace_header('WARC-Date',
                                             headers.rec_headers.get_header('WARC-Date'))
 
-            headers.stream.close()
+            headers.raw_stream.close()
 
-            return (warc_headers, http_headers_buff, payload.stream)
+            return (warc_headers, http_headers_buff, payload.raw_stream)
 
     def __str__(self):
         return 'WARCPathLoader'
diff --git a/pywb/webagg/test/test_handlers.py b/pywb/webagg/test/test_handlers.py
index ecf489ce..1efc37e3 100644
--- a/pywb/webagg/test/test_handlers.py
+++ b/pywb/webagg/test/test_handlers.py
@@ -13,10 +13,10 @@ from pywb.webagg.aggregator import DirectoryIndexSource
 from pywb.webagg.app import ResAggApp
 from pywb.webagg.utils import MementoUtils
 
-from pywb.warc.recordloader import ArcWarcRecordLoader
+from warcio.recordloader import ArcWarcRecordLoader
+from warcio.statusandheaders import StatusAndHeadersParser
+from warcio.bufferedreaders import ChunkedDataReader
 
-from pywb.utils.statusandheaders import StatusAndHeadersParser
-from pywb.utils.bufferedreaders import ChunkedDataReader
 from io import BytesIO
 from six.moves.urllib.parse import urlencode
 
@@ -215,9 +215,9 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
 
         buff = BytesIO(resp.body)
         record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
-        print(record.status_headers)
-        assert record.status_headers.get_statuscode() == '302'
-        assert record.status_headers.get_header('Location') == 'https://www.iana.org/'
+        print(record.http_headers)
+        assert record.http_headers.get_statuscode() == '302'
+        assert record.http_headers.get_header('Location') == 'https://www.iana.org/'
 
     @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
     def test_agg_select_live(self):
diff --git a/pywb/webagg/test/test_indexsource.py b/pywb/webagg/test/test_indexsource.py
index c9c19c55..e53ad9cd 100644
--- a/pywb/webagg/test/test_indexsource.py
+++ b/pywb/webagg/test/test_indexsource.py
@@ -3,7 +3,7 @@ from pywb.webagg.indexsource import LiveIndexSource
 
 from pywb.webagg.aggregator import SimpleAggregator
 
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_now
 
 from .testutils import key_ts_res, TEST_CDX_PATH
 
diff --git a/pywb/webagg/test/test_upstream.py b/pywb/webagg/test/test_upstream.py
index 5dc32959..037a9ea8 100644
--- a/pywb/webagg/test/test_upstream.py
+++ b/pywb/webagg/test/test_upstream.py
@@ -8,7 +8,7 @@ from pywb.webagg.handlers import DefaultResourceHandler
 from pywb.webagg.aggregator import SimpleAggregator
 from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
 
-from pywb.warc.recordloader import ArcWarcRecordLoader
+from warcio.recordloader import ArcWarcRecordLoader
 
 from .testutils import LiveServerTests, BaseTestClass
 
@@ -48,7 +48,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
         record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
 
         assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
-        assert record.status_headers.get_header('Date') != ''
+        assert record.http_headers.get_header('Date') != ''
 
     def test_upstream_1(self):
         resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
@@ -58,7 +58,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
         record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
 
         assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
-        assert record.status_headers.get_header('Date') != ''
+        assert record.http_headers.get_header('Date') != ''
 
     def test_upstream_2(self):
         resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
@@ -68,7 +68,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
         record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
 
         assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
-        assert record.status_headers.get_header('Date') != ''
+        assert record.http_headers.get_header('Date') != ''
 
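The test changes above exercise the central rename in these hunks: HTTP-level headers on a parsed record are now exposed as record.http_headers (formerly record.status_headers), while record.rec_headers continues to hold the WARC-level headers. A minimal sketch of the distinction, assuming warc_bytes holds a single gzip- or plain-encoded WARC response record (illustrative, not part of the patch):

    from io import BytesIO

    from warcio.recordloader import ArcWarcRecordLoader

    warc_bytes = b'...'  # placeholder: bytes of one WARC response record

    record = ArcWarcRecordLoader().parse_record_stream(BytesIO(warc_bytes))

    print(record.rec_headers.get_header('WARC-Target-URI'))  # WARC-level header
    print(record.http_headers.get_statuscode())              # HTTP-level header,
                                                             # was record.status_headers
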
diff --git a/pywb/webagg/utils.py b/pywb/webagg/utils.py
index 24729022..41ff4216 100644
--- a/pywb/webagg/utils.py
+++ b/pywb/webagg/utils.py
@@ -7,7 +7,9 @@ import zlib
 
 from contextlib import closing
 
-from pywb.utils.timeutils import timestamp_to_http_date
+from warcio.timeutils import timestamp_to_http_date
+from warcio.utils import BUFF_SIZE
+
 from pywb.utils.wbexception import BadRequestException
 from pywb.utils.loaders import load_yaml_config
 
@@ -20,9 +22,6 @@ LINK_SEG_SPLIT = re.compile(';\s*')
 LINK_URL = re.compile('<(.*)>')
 LINK_PROP = re.compile('([\w]+)="([^"]+)')
 
-BUFF_SIZE = 16384
-
-
 #=============================================================================
 class MementoException(BadRequestException):
     pass
diff --git a/pywb/webagg/zipnum.py b/pywb/webagg/zipnum.py
index 85026646..6b57b446 100644
--- a/pywb/webagg/zipnum.py
+++ b/pywb/webagg/zipnum.py
@@ -15,7 +15,7 @@ from pywb.cdx.cdxobject import IDXObject, CDXException, CDXObject
 from pywb.cdx.query import CDXQuery
 
 from pywb.utils.loaders import BlockLoader, read_last_line
-from pywb.utils.bufferedreaders import gzip_decompressor
+from warcio.bufferedreaders import gzip_decompressor
 from pywb.utils.binsearch import iter_range, linearsearch, search
 
diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py
index 2317d5db..968dfd21 100644
--- a/pywb/webapp/handlers.py
+++ b/pywb/webapp/handlers.py
@@ -5,9 +5,11 @@ import logging
 
 from datetime import datetime
 
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.timeutils import datetime_to_timestamp
+
 from pywb.utils.wbexception import NotFoundException
 from pywb.utils.loaders import LocalFileLoader
-from pywb.utils.statusandheaders import StatusAndHeaders
 
 from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
 from pywb.framework.wbrequestresponse import WbResponse
@@ -19,7 +21,6 @@ from pywb.warc.pathresolvers import PathResolverMapper
 from pywb.webapp.views import J2TemplateView, init_view
 from pywb.webapp.replay_views import ReplayView
 from pywb.framework.memento import MementoResponse
-from pywb.utils.timeutils import datetime_to_timestamp
 
 
 #=================================================================
diff --git a/pywb/webapp/rangecache.py b/pywb/webapp/rangecache.py
index 53a5e2ac..9c024b54 100644
--- a/pywb/webapp/rangecache.py
+++ b/pywb/webapp/rangecache.py
@@ -1,5 +1,6 @@
-from pywb.utils.statusandheaders import StatusAndHeaders
-from pywb.utils.limitreader import LimitReader
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.limitreader import LimitReader
+
 from pywb.framework.cache import create_cache
 
 from tempfile import NamedTemporaryFile, mkdtemp
diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py
index b05ade12..7b6652f2 100644
--- a/pywb/webapp/replay_views.py
+++ b/pywb/webapp/replay_views.py
@@ -5,16 +5,17 @@ from io import BytesIO
 from six.moves.urllib.parse import urlsplit
 from itertools import chain
 
-from pywb.utils.statusandheaders import StatusAndHeaders
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.limitreader import LimitReader
+from warcio.timeutils import timestamp_now
+from warcio.recordloader import ArchiveLoadFailed
+
 from pywb.utils.wbexception import WbException, NotFoundException
-from pywb.utils.limitreader import LimitReader
-from pywb.utils.timeutils import timestamp_now
 
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import MementoResponse
 
 from pywb.rewrite.rewrite_content import RewriteContent
-from pywb.warc.recordloader import ArchiveLoadFailed
 
 from pywb.webapp.views import HeadInsertView
 
diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py
index 26a8bd51..7ec27faa 100644
--- a/pywb/webapp/views.py
+++ b/pywb/webapp/views.py
@@ -1,4 +1,4 @@
-from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
+from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import make_timemap, LINK_FORMAT
 
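The hunks above all apply the same mechanical migration: shared parsing, buffering, timestamp, and header utilities move from pywb.utils (and pywb.warc) to the external warcio package, with identical names. A before/after sketch of the swap on two representative imports, taken from the hunks above (illustrative only):

    # before this patch:
    #   from pywb.utils.timeutils import timestamp_now
    #   from pywb.utils.statusandheaders import StatusAndHeaders

    # after this patch:
    from warcio.timeutils import timestamp_now
    from warcio.statusandheaders import StatusAndHeaders

    headers = StatusAndHeaders('200 OK',
                               [('Content-Type', 'text/html')],
                               protocol='HTTP/1.1')
    print(timestamp_now())  # 14-digit timestamp, e.g. '20170301124806'

The setup.py change below makes warcio an explicit install-time dependency to match.
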
diff --git a/setup.py b/setup.py
index e439588a..585a213e 100755
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@ setup(
         ],
     install_requires=[
         'six',
+        'warcio',
         'chardet',
         'requests',
         'redis',