record parser: arc-to-warc: support converting ARC records to WARC 'response' records on the fly

This simplifies processing for tools that read WARC records. ARC headers are converted to their equivalent WARC headers, and a WARC-Record-ID is generated on the fly (#190).
2025-03-15 00:03:28 +01:00 · 2016-07-31 22:31:21 -04:00 · 2016-07-31 22:31:21 -04:00 · 68b94fe671
commit 68b94fe671
parent 66ca8d8b26
5 changed files with 134 additions and 23 deletions
--- a/pywb/utils/statusandheaders.py
+++ b/pywb/utils/statusandheaders.py
@@ -7,6 +7,7 @@ from copy import copy
 from six.moves import range
 from six import iteritems
 from pywb.utils.loaders import to_native_str
+import uuid


 WRAP_WIDTH = 80
@@ -257,6 +258,12 @@ class StatusAndHeadersParser(object):
                plen = len(prefix)
                return (key_upper[:plen], key[plen:])

+    @staticmethod
+    def make_warc_id(id_=None):
+        if not id_:
+            id_ = uuid.uuid1()
+        return '<urn:uuid:{0}>'.format(id_)
+

 #=================================================================
 class StatusAndHeadersParserException(Exception):
--- a/pywb/utils/timeutils.py
+++ b/pywb/utils/timeutils.py
@@ -121,6 +121,18 @@ def iso_date_to_timestamp(string):

    return datetime_to_timestamp(iso_date_to_datetime(string))

+def timestamp_to_iso_date(string):
+    """
+    >>> timestamp_to_iso_date('20131226101112')
+    '2013-12-26T10:11:12Z'
+
+    >>> timestamp_to_iso_date('20131226101112')
+    '2013-12-26T10:11:12Z'
+    """
+
+
+    return datetime_to_iso_date(timestamp_to_datetime(string))
+

 def http_date_to_timestamp(string):
    """
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -54,15 +54,18 @@ class ArchiveIterator(object):


    def __init__(self, fileobj, no_record_parse=False,
-                 verify_http=False):
+                 verify_http=False, arc2warc=False):
        self.fh = fileobj

-        self.loader = ArcWarcRecordLoader(verify_http=verify_http)
+        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
+                                          arc2warc=arc2warc)
        self.reader = None

        self.offset = 0
        self.known_format = None

+        self.mixed_arc_warc = arc2warc
+
        self.member_info = None
        self.no_record_parse = no_record_parse

@@ -226,7 +229,8 @@ class ArchiveIterator(object):
        self.member_info = None

        # Track known format for faster parsing of other records
-        self.known_format = record.format
+        if not self.mixed_arc_warc:
+            self.known_format = record.format

        return record

@@ -359,6 +363,9 @@ class DefaultRecordParser(object):
            if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
                continue

+            if record.rec_type == 'arc_header':
+                continue
+
            if record.format == 'warc':
                if (record.rec_type in ('request', 'warcinfo') and
                     not include_all and
@@ -495,9 +502,6 @@ class DefaultRecordParser(object):
    def parse_arc_record(self, record):
        """ Parse arc record
        """
-        if record.rec_type == 'arc_header':
-            return None
-
        url = record.rec_headers.get_header('uri')
        url = url.replace('\r', '%0D')
        url = url.replace('\n', '%0A')
@@ -528,7 +532,8 @@ class DefaultRecordParser(object):

    def __call__(self, fh):
        aiter = ArchiveIterator(fh, self.options.get('minimal', False),
-                                    self.options.get('verify_http', False))
+                                    self.options.get('verify_http', False),
+                                    self.options.get('arc2warc', False))

        entry_iter = self.create_record_iter(aiter)

--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@@ -9,6 +9,7 @@ from pywb.utils.loaders import to_native_str
 from pywb.utils.bufferedreaders import DecompressingBufferedReader

 from pywb.utils.wbexception import WbException
+from pywb.utils.timeutils import timestamp_to_iso_date

 from six.moves import zip
 import six
@@ -37,11 +38,6 @@ class ArchiveLoadFailed(WbException):

 #=================================================================
 class ArcWarcRecordLoader(object):
-    # Standard ARC v1.0 headers
-    # TODO: support ARC v2.0 also?
-    ARC_HEADERS = ["uri", "ip-address", "archive-date",
-                   "content-type", "length"]
-
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
@@ -55,14 +51,17 @@ class ArcWarcRecordLoader(object):
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, loader=None, cookie_maker=None, block_size=8192,
-                 verify_http=True):
+                 verify_http=True, arc2warc=True):
        if not loader:
            loader = BlockLoader(cookie_maker=cookie_maker)

        self.loader = loader
        self.block_size = block_size

-        self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
+        if arc2warc:
+            self.arc_parser = ARC2WARCHeadersParser()
+        else:
+            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
@@ -114,12 +113,16 @@ class ArcWarcRecordLoader(object):
            else:
                rec_type = 'response'

-        elif the_format == 'warc':
+        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
-            sub_len = 0
+            if the_format == 'warc':
+                sub_len = 0
+            else:
+                sub_len = rec_headers.total_len
+                the_format = 'warc'

        is_err = False

@@ -201,7 +204,7 @@ class ArcWarcRecordLoader(object):
        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
-            return 'arc', rec_headers
+            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
@@ -212,8 +215,15 @@ class ArcWarcRecordLoader(object):

 #=================================================================
 class ARCHeadersParser(object):
-    def __init__(self, headernames):
-        self.headernames = headernames
+    # ARC 1.0 headers
+    ARC_HEADERS = ["uri", "ip-address", "archive-date",
+                       "content-type", "length"]
+
+    def __init__(self):
+        self.headernames = self.get_header_names()
+
+    def get_rec_type(self):
+        return 'arc'

    def parse(self, stream, headerline=None):
        total_read = 0
@@ -250,12 +260,60 @@ class ARCHeadersParser(object):
            msg = msg.format(headernames, parts)
            raise StatusAndHeadersParserException(msg, parts)

-        headers = []

-        for name, value in zip(headernames, parts):
-            headers.append((name, value))
+        protocol, headers = self._get_protocol_and_headers(headerline, parts)

        return StatusAndHeaders(statusline='',
                                headers=headers,
-                                protocol='ARC/1.0',
+                                protocol='WARC/1.0',
                                total_len=total_read)
+
+    @classmethod
+    def get_header_names(cls):
+        return cls.ARC_HEADERS
+
+    def _get_protocol_and_headers(self, headerline, parts):
+        headers = []
+
+        for name, value in zip(self.headernames, parts):
+            headers.append((name, value))
+
+        return ('ARC/1.0', headers)
+
+
+#=================================================================
+class ARC2WARCHeadersParser(ARCHeadersParser):
+    # Headers for converting ARC -> WARC Header
+    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
+                           "WARC-IP-Address",
+                           "WARC-Date",
+                           "Content-Type",
+                           "Content-Length"]
+
+    def get_rec_type(self):
+        return 'arc2warc'
+
+    @classmethod
+    def get_header_names(cls):
+        return cls.ARC_TO_WARC_HEADERS
+
+    def _get_protocol_and_headers(self, headerline, parts):
+        headers = []
+
+        for name, value in zip(self.headernames, parts):
+            if name == 'WARC-Date':
+                value = timestamp_to_iso_date(value)
+
+            headers.append((name, value))
+
+        if headerline.startswith('filedesc://'):
+            rec_type = 'arc_header'
+        else:
+            rec_type = 'response'
+
+        headers.append(('WARC-Type', rec_type))
+        headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))
+
+        return ('WARC/1.0', headers)
+
+
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -44,6 +44,19 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
 CDX N b a m s k r M S V g
 com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc

+# arc.gz
+>>> print_cdx_index('example.arc.gz', arc2warc=True)
+ CDX N b a m s k r M S V g
+com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
+
+# arc
+>>> print_cdx_index('example.arc', arc2warc=True)
+ CDX N b a m s k r M S V g
+com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
+
+
+
+
 # wget warc, includes metadata by default
 >>> print_cdx_index('example-wget-1-14.warc.gz')
 CDX N b a m s k r M S V g
@@ -328,6 +341,22 @@ def test_cdxj_arc_minimal():
 com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
 """)

+def test_cdxj_arc_conv():
+    # arc.gz -- json
+    res = cdx_index('example.arc.gz', cdxj=True, arc2warc=True)
+    assert parse_cdxj(res) == parse_cdxj(b"""
+com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
+""")
+
+def test_cdxj_arc_minimal_conv():
+    # arc.gz -- minimal + json
+    res = cdx_index('example.arc.gz', cdxj=True, minimal=True, arc2warc=True)
+    assert parse_cdxj(res) == parse_cdxj(b"""
+com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
+""")
+
+
+

 def test_cdxj_empty():
    options = dict(cdxj=True)