diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index a7188c38..124b7f75 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -7,6 +7,7 @@ from copy import copy from six.moves import range from six import iteritems from pywb.utils.loaders import to_native_str +import uuid WRAP_WIDTH = 80 @@ -257,6 +258,12 @@ class StatusAndHeadersParser(object): plen = len(prefix) return (key_upper[:plen], key[plen:]) + @staticmethod + def make_warc_id(id_=None): + if not id_: + id_ = uuid.uuid1() + return ''.format(id_) + #================================================================= class StatusAndHeadersParserException(Exception): diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index 08ea700d..3c5cd9f3 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -121,6 +121,18 @@ def iso_date_to_timestamp(string): return datetime_to_timestamp(iso_date_to_datetime(string)) +def timestamp_to_iso_date(string): + """ + >>> timestamp_to_iso_date('20131226101112') + '2013-12-26T10:11:12Z' + + >>> timestamp_to_iso_date('20131226101112') + '2013-12-26T10:11:12Z' + """ + + + return datetime_to_iso_date(timestamp_to_datetime(string)) + def http_date_to_timestamp(string): """ diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 4ff500d4..69883304 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -54,15 +54,18 @@ class ArchiveIterator(object): def __init__(self, fileobj, no_record_parse=False, - verify_http=False): + verify_http=False, arc2warc=False): self.fh = fileobj - self.loader = ArcWarcRecordLoader(verify_http=verify_http) + self.loader = ArcWarcRecordLoader(verify_http=verify_http, + arc2warc=arc2warc) self.reader = None self.offset = 0 self.known_format = None + self.mixed_arc_warc = arc2warc + self.member_info = None self.no_record_parse = no_record_parse @@ -226,7 +229,8 @@ class ArchiveIterator(object): self.member_info = None # Track known format for faster parsing of other records - self.known_format = record.format + if not self.mixed_arc_warc: + self.known_format = record.format return record @@ -359,6 +363,9 @@ class DefaultRecordParser(object): if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): continue + if record.rec_type == 'arc_header': + continue + if record.format == 'warc': if (record.rec_type in ('request', 'warcinfo') and not include_all and @@ -495,9 +502,6 @@ class DefaultRecordParser(object): def parse_arc_record(self, record): """ Parse arc record """ - if record.rec_type == 'arc_header': - return None - url = record.rec_headers.get_header('uri') url = url.replace('\r', '%0D') url = url.replace('\n', '%0A') @@ -528,7 +532,8 @@ class DefaultRecordParser(object): def __call__(self, fh): aiter = ArchiveIterator(fh, self.options.get('minimal', False), - self.options.get('verify_http', False)) + self.options.get('verify_http', False), + self.options.get('arc2warc', False)) entry_iter = self.create_record_iter(aiter) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 402d1524..43931958 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -9,6 +9,7 @@ from pywb.utils.loaders import to_native_str from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.wbexception import WbException +from pywb.utils.timeutils import timestamp_to_iso_date from six.moves import zip import six @@ -37,11 +38,6 @@ class ArchiveLoadFailed(WbException): #================================================================= class ArcWarcRecordLoader(object): - # Standard ARC v1.0 headers - # TODO: support ARC v2.0 also? - ARC_HEADERS = ["uri", "ip-address", "archive-date", - "content-type", "length"] - WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18'] HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1'] @@ -55,14 +51,17 @@ class ArcWarcRecordLoader(object): HTTP_SCHEMES = ('http:', 'https:') def __init__(self, loader=None, cookie_maker=None, block_size=8192, - verify_http=True): + verify_http=True, arc2warc=True): if not loader: loader = BlockLoader(cookie_maker=cookie_maker) self.loader = loader self.block_size = block_size - self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) + if arc2warc: + self.arc_parser = ARC2WARCHeadersParser() + else: + self.arc_parser = ARCHeadersParser() self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES) self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http) @@ -114,12 +113,16 @@ class ArcWarcRecordLoader(object): else: rec_type = 'response' - elif the_format == 'warc': + elif the_format in ('warc', 'arc2warc'): rec_type = rec_headers.get_header('WARC-Type') uri = rec_headers.get_header('WARC-Target-URI') length = rec_headers.get_header('Content-Length') content_type = rec_headers.get_header('Content-Type') - sub_len = 0 + if the_format == 'warc': + sub_len = 0 + else: + sub_len = rec_headers.total_len + the_format = 'warc' is_err = False @@ -201,7 +204,7 @@ class ArcWarcRecordLoader(object): # now try as arc try: rec_headers = self.arc_parser.parse(stream, statusline) - return 'arc', rec_headers + return self.arc_parser.get_rec_type(), rec_headers except StatusAndHeadersParserException as se: if known_format == 'arc': msg = 'Invalid ARC record, first line: ' @@ -212,8 +215,15 @@ class ArcWarcRecordLoader(object): #================================================================= class ARCHeadersParser(object): - def __init__(self, headernames): - self.headernames = headernames + # ARC 1.0 headers + ARC_HEADERS = ["uri", "ip-address", "archive-date", + "content-type", "length"] + + def __init__(self): + self.headernames = self.get_header_names() + + def get_rec_type(self): + return 'arc' def parse(self, stream, headerline=None): total_read = 0 @@ -250,12 +260,60 @@ class ARCHeadersParser(object): msg = msg.format(headernames, parts) raise StatusAndHeadersParserException(msg, parts) - headers = [] - for name, value in zip(headernames, parts): - headers.append((name, value)) + protocol, headers = self._get_protocol_and_headers(headerline, parts) return StatusAndHeaders(statusline='', headers=headers, - protocol='ARC/1.0', + protocol='WARC/1.0', total_len=total_read) + + @classmethod + def get_header_names(cls): + return cls.ARC_HEADERS + + def _get_protocol_and_headers(self, headerline, parts): + headers = [] + + for name, value in zip(self.headernames, parts): + headers.append((name, value)) + + return ('ARC/1.0', headers) + + +#================================================================= +class ARC2WARCHeadersParser(ARCHeadersParser): + # Headers for converting ARC -> WARC Header + ARC_TO_WARC_HEADERS = ["WARC-Target-URI", + "WARC-IP-Address", + "WARC-Date", + "Content-Type", + "Content-Length"] + + def get_rec_type(self): + return 'arc2warc' + + @classmethod + def get_header_names(cls): + return cls.ARC_TO_WARC_HEADERS + + def _get_protocol_and_headers(self, headerline, parts): + headers = [] + + for name, value in zip(self.headernames, parts): + if name == 'WARC-Date': + value = timestamp_to_iso_date(value) + + headers.append((name, value)) + + if headerline.startswith('filedesc://'): + rec_type = 'arc_header' + else: + rec_type = 'response' + + headers.append(('WARC-Type', rec_type)) + headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id())) + + return ('WARC/1.0', headers) + + diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index c38d3a08..daabf7bb 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -44,6 +44,19 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ CDX N b a m s k r M S V g com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc +# arc.gz +>>> print_cdx_index('example.arc.gz', arc2warc=True) + CDX N b a m s k r M S V g +com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz + +# arc +>>> print_cdx_index('example.arc', arc2warc=True) + CDX N b a m s k r M S V g +com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc + + + + # wget warc, includes metadata by default >>> print_cdx_index('example-wget-1-14.warc.gz') CDX N b a m s k r M S V g @@ -328,6 +341,22 @@ def test_cdxj_arc_minimal(): com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"} """) +def test_cdxj_arc_conv(): + # arc.gz -- json + res = cdx_index('example.arc.gz', cdxj=True, arc2warc=True) + assert parse_cdxj(res) == parse_cdxj(b""" +com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"} +""") + +def test_cdxj_arc_minimal_conv(): + # arc.gz -- minimal + json + res = cdx_index('example.arc.gz', cdxj=True, minimal=True, arc2warc=True) + assert parse_cdxj(res) == parse_cdxj(b""" +com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"} +""") + + + def test_cdxj_empty(): options = dict(cdxj=True)