1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

record parser: arc-to-warc: support converting ARC records to WARC 'response' records on the fly, to simplify

processing for tools that read WARC records. ARC headers are converted to the equivalent WARC headers, and a WARC-Record-ID
is generated on the fly (#190)
This commit is contained in:
Ilya Kreymer 2016-07-31 22:31:21 -04:00
parent 66ca8d8b26
commit 68b94fe671
5 changed files with 134 additions and 23 deletions

View File

@ -7,6 +7,7 @@ from copy import copy
from six.moves import range
from six import iteritems
from pywb.utils.loaders import to_native_str
import uuid
WRAP_WIDTH = 80
@ -257,6 +258,12 @@ class StatusAndHeadersParser(object):
plen = len(prefix)
return (key_upper[:plen], key[plen:])
@staticmethod
def make_warc_id(id_=None):
if not id_:
id_ = uuid.uuid1()
return '<urn:uuid:{0}>'.format(id_)
#=================================================================
class StatusAndHeadersParserException(Exception):

View File

@ -121,6 +121,18 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string))
def timestamp_to_iso_date(string):
    """
    Convert a timestamp string to an ISO date string by parsing it with
    ``timestamp_to_datetime`` and formatting the result with
    ``datetime_to_iso_date``.

    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'
    """
    parsed = timestamp_to_datetime(string)
    return datetime_to_iso_date(parsed)
def http_date_to_timestamp(string):
"""

View File

@ -54,15 +54,18 @@ class ArchiveIterator(object):
def __init__(self, fileobj, no_record_parse=False,
verify_http=False):
verify_http=False, arc2warc=False):
self.fh = fileobj
self.loader = ArcWarcRecordLoader(verify_http=verify_http)
self.loader = ArcWarcRecordLoader(verify_http=verify_http,
arc2warc=arc2warc)
self.reader = None
self.offset = 0
self.known_format = None
self.mixed_arc_warc = arc2warc
self.member_info = None
self.no_record_parse = no_record_parse
@ -226,7 +229,8 @@ class ArchiveIterator(object):
self.member_info = None
# Track known format for faster parsing of other records
self.known_format = record.format
if not self.mixed_arc_warc:
self.known_format = record.format
return record
@ -359,6 +363,9 @@ class DefaultRecordParser(object):
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
continue
if record.rec_type == 'arc_header':
continue
if record.format == 'warc':
if (record.rec_type in ('request', 'warcinfo') and
not include_all and
@ -495,9 +502,6 @@ class DefaultRecordParser(object):
def parse_arc_record(self, record):
""" Parse arc record
"""
if record.rec_type == 'arc_header':
return None
url = record.rec_headers.get_header('uri')
url = url.replace('\r', '%0D')
url = url.replace('\n', '%0A')
@ -528,7 +532,8 @@ class DefaultRecordParser(object):
def __call__(self, fh):
aiter = ArchiveIterator(fh, self.options.get('minimal', False),
self.options.get('verify_http', False))
self.options.get('verify_http', False),
self.options.get('arc2warc', False))
entry_iter = self.create_record_iter(aiter)

View File

@ -9,6 +9,7 @@ from pywb.utils.loaders import to_native_str
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date
from six.moves import zip
import six
@ -37,11 +38,6 @@ class ArchiveLoadFailed(WbException):
#=================================================================
class ArcWarcRecordLoader(object):
# Standard ARC v1.0 headers
# TODO: support ARC v2.0 also?
ARC_HEADERS = ["uri", "ip-address", "archive-date",
"content-type", "length"]
WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
@ -55,14 +51,17 @@ class ArcWarcRecordLoader(object):
HTTP_SCHEMES = ('http:', 'https:')
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
verify_http=True):
verify_http=True, arc2warc=True):
if not loader:
loader = BlockLoader(cookie_maker=cookie_maker)
self.loader = loader
self.block_size = block_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
if arc2warc:
self.arc_parser = ARC2WARCHeadersParser()
else:
self.arc_parser = ARCHeadersParser()
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
@ -114,12 +113,16 @@ class ArcWarcRecordLoader(object):
else:
rec_type = 'response'
elif the_format == 'warc':
elif the_format in ('warc', 'arc2warc'):
rec_type = rec_headers.get_header('WARC-Type')
uri = rec_headers.get_header('WARC-Target-URI')
length = rec_headers.get_header('Content-Length')
content_type = rec_headers.get_header('Content-Type')
sub_len = 0
if the_format == 'warc':
sub_len = 0
else:
sub_len = rec_headers.total_len
the_format = 'warc'
is_err = False
@ -201,7 +204,7 @@ class ArcWarcRecordLoader(object):
# now try as arc
try:
rec_headers = self.arc_parser.parse(stream, statusline)
return 'arc', rec_headers
return self.arc_parser.get_rec_type(), rec_headers
except StatusAndHeadersParserException as se:
if known_format == 'arc':
msg = 'Invalid ARC record, first line: '
@ -212,8 +215,15 @@ class ArcWarcRecordLoader(object):
#=================================================================
class ARCHeadersParser(object):
def __init__(self, headernames):
self.headernames = headernames
# ARC 1.0 headers
ARC_HEADERS = ["uri", "ip-address", "archive-date",
"content-type", "length"]
def __init__(self):
self.headernames = self.get_header_names()
def get_rec_type(self):
return 'arc'
def parse(self, stream, headerline=None):
total_read = 0
@ -250,12 +260,60 @@ class ARCHeadersParser(object):
msg = msg.format(headernames, parts)
raise StatusAndHeadersParserException(msg, parts)
headers = []
for name, value in zip(headernames, parts):
headers.append((name, value))
protocol, headers = self._get_protocol_and_headers(headerline, parts)
return StatusAndHeaders(statusline='',
headers=headers,
protocol='ARC/1.0',
protocol='WARC/1.0',
total_len=total_read)
@classmethod
def get_header_names(cls):
return cls.ARC_HEADERS
def _get_protocol_and_headers(self, headerline, parts):
headers = []
for name, value in zip(self.headernames, parts):
headers.append((name, value))
return ('ARC/1.0', headers)
#=================================================================
class ARC2WARCHeadersParser(ARCHeadersParser):
    """ARC header parser that reports its fields under WARC header names.

    The ARC header-line values are paired positionally with
    ``ARC_TO_WARC_HEADERS``; a ``WARC-Type`` and a generated
    ``WARC-Record-ID`` header are appended afterwards.
    """

    # WARC header names, listed in the same positional order as the
    # fields of the ARC header line they are zipped against.
    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
                           "WARC-IP-Address",
                           "WARC-Date",
                           "Content-Type",
                           "Content-Length"]

    def get_rec_type(self):
        """Return the format tag reported for records parsed by this class."""
        return 'arc2warc'

    @classmethod
    def get_header_names(cls):
        """Return the WARC header names used in place of the ARC ones."""
        return cls.ARC_TO_WARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """Return ``('WARC/1.0', headers)`` for one ARC header line.

        :param headerline: the raw ARC header line; a ``filedesc://``
            prefix marks the record as ``arc_header`` rather than
            ``response``.
        :param parts: the header line's field values, in order.
        """
        # The WARC-Date value is converted via timestamp_to_iso_date;
        # every other value is passed through unchanged.
        warc_headers = [
            (name, timestamp_to_iso_date(value) if name == 'WARC-Date' else value)
            for name, value in zip(self.headernames, parts)
        ]

        is_filedesc = headerline.startswith('filedesc://')
        warc_type = 'arc_header' if is_filedesc else 'response'

        warc_headers.append(('WARC-Type', warc_type))
        warc_headers.append(('WARC-Record-ID',
                             StatusAndHeadersParser.make_warc_id()))

        return ('WARC/1.0', warc_headers)

View File

@ -44,6 +44,19 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
# arc.gz
>>> print_cdx_index('example.arc.gz', arc2warc=True)
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
# arc
>>> print_cdx_index('example.arc', arc2warc=True)
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
# wget warc, includes metadata by default
>>> print_cdx_index('example-wget-1-14.warc.gz')
CDX N b a m s k r M S V g
@ -328,6 +341,22 @@ def test_cdxj_arc_minimal():
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
def test_cdxj_arc_conv():
    # arc.gz indexed as cdxj, with arc records converted to warc on the fly
    actual = cdx_index('example.arc.gz', cdxj=True, arc2warc=True)
    expected = b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
"""
    assert parse_cdxj(actual) == parse_cdxj(expected)
def test_cdxj_arc_minimal_conv():
    # arc.gz indexed as minimal cdxj, with arc records converted to warc on the fly
    actual = cdx_index('example.arc.gz', cdxj=True, minimal=True, arc2warc=True)
    expected = b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
"""
    assert parse_cdxj(actual) == parse_cdxj(expected)
def test_cdxj_empty():
options = dict(cdxj=True)