mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
record parser: arc-to-warc: support converting ARC records to WARC 'response' records on the fly, to simplify
processing for tools that read WARC records. ARC headers are converted to the equivalent WARC headers, and a WARC-Record-ID is generated on the fly #190
This commit is contained in:
parent
66ca8d8b26
commit
68b94fe671
@ -7,6 +7,7 @@ from copy import copy
|
||||
from six.moves import range
|
||||
from six import iteritems
|
||||
from pywb.utils.loaders import to_native_str
|
||||
import uuid
|
||||
|
||||
|
||||
WRAP_WIDTH = 80
|
||||
@ -257,6 +258,12 @@ class StatusAndHeadersParser(object):
|
||||
plen = len(prefix)
|
||||
return (key_upper[:plen], key[plen:])
|
||||
|
||||
@staticmethod
def make_warc_id(id_=None):
    """Format a ``<urn:uuid:...>`` record id string.

    :param id_: value to embed after ``urn:uuid:``. When it is falsy
        (``None``, an empty string, ...) a fresh ``uuid.uuid1()`` value
        is generated and used instead.
    :return: the string ``'<urn:uuid:{id_}>'``.
    """
    # uuid1() is only evaluated when no usable id was supplied.
    record_id = id_ or uuid.uuid1()
    return '<urn:uuid:{0}>'.format(record_id)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class StatusAndHeadersParserException(Exception):
|
||||
|
@ -121,6 +121,18 @@ def iso_date_to_timestamp(string):
|
||||
|
||||
return datetime_to_timestamp(iso_date_to_datetime(string))
|
||||
|
||||
def timestamp_to_iso_date(string):
    """Convert a timestamp string such as '20131226101112' to an ISO date.

    The input is parsed with ``timestamp_to_datetime`` and the resulting
    datetime is rendered with ``datetime_to_iso_date``.

    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'

    """
    parsed = timestamp_to_datetime(string)
    return datetime_to_iso_date(parsed)
|
||||
|
||||
|
||||
def http_date_to_timestamp(string):
|
||||
"""
|
||||
|
@ -54,15 +54,18 @@ class ArchiveIterator(object):
|
||||
|
||||
|
||||
def __init__(self, fileobj, no_record_parse=False,
|
||||
verify_http=False):
|
||||
verify_http=False, arc2warc=False):
|
||||
self.fh = fileobj
|
||||
|
||||
self.loader = ArcWarcRecordLoader(verify_http=verify_http)
|
||||
self.loader = ArcWarcRecordLoader(verify_http=verify_http,
|
||||
arc2warc=arc2warc)
|
||||
self.reader = None
|
||||
|
||||
self.offset = 0
|
||||
self.known_format = None
|
||||
|
||||
self.mixed_arc_warc = arc2warc
|
||||
|
||||
self.member_info = None
|
||||
self.no_record_parse = no_record_parse
|
||||
|
||||
@ -226,7 +229,8 @@ class ArchiveIterator(object):
|
||||
self.member_info = None
|
||||
|
||||
# Track known format for faster parsing of other records
|
||||
self.known_format = record.format
|
||||
if not self.mixed_arc_warc:
|
||||
self.known_format = record.format
|
||||
|
||||
return record
|
||||
|
||||
@ -359,6 +363,9 @@ class DefaultRecordParser(object):
|
||||
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
|
||||
continue
|
||||
|
||||
if record.rec_type == 'arc_header':
|
||||
continue
|
||||
|
||||
if record.format == 'warc':
|
||||
if (record.rec_type in ('request', 'warcinfo') and
|
||||
not include_all and
|
||||
@ -495,9 +502,6 @@ class DefaultRecordParser(object):
|
||||
def parse_arc_record(self, record):
|
||||
""" Parse arc record
|
||||
"""
|
||||
if record.rec_type == 'arc_header':
|
||||
return None
|
||||
|
||||
url = record.rec_headers.get_header('uri')
|
||||
url = url.replace('\r', '%0D')
|
||||
url = url.replace('\n', '%0A')
|
||||
@ -528,7 +532,8 @@ class DefaultRecordParser(object):
|
||||
|
||||
def __call__(self, fh):
|
||||
aiter = ArchiveIterator(fh, self.options.get('minimal', False),
|
||||
self.options.get('verify_http', False))
|
||||
self.options.get('verify_http', False),
|
||||
self.options.get('arc2warc', False))
|
||||
|
||||
entry_iter = self.create_record_iter(aiter)
|
||||
|
||||
|
@ -9,6 +9,7 @@ from pywb.utils.loaders import to_native_str
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.timeutils import timestamp_to_iso_date
|
||||
|
||||
from six.moves import zip
|
||||
import six
|
||||
@ -37,11 +38,6 @@ class ArchiveLoadFailed(WbException):
|
||||
|
||||
#=================================================================
|
||||
class ArcWarcRecordLoader(object):
|
||||
# Standard ARC v1.0 headers
|
||||
# TODO: support ARC v2.0 also?
|
||||
ARC_HEADERS = ["uri", "ip-address", "archive-date",
|
||||
"content-type", "length"]
|
||||
|
||||
WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
|
||||
|
||||
HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
|
||||
@ -55,14 +51,17 @@ class ArcWarcRecordLoader(object):
|
||||
HTTP_SCHEMES = ('http:', 'https:')
|
||||
|
||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
||||
verify_http=True):
|
||||
verify_http=True, arc2warc=True):
|
||||
if not loader:
|
||||
loader = BlockLoader(cookie_maker=cookie_maker)
|
||||
|
||||
self.loader = loader
|
||||
self.block_size = block_size
|
||||
|
||||
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
||||
if arc2warc:
|
||||
self.arc_parser = ARC2WARCHeadersParser()
|
||||
else:
|
||||
self.arc_parser = ARCHeadersParser()
|
||||
|
||||
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
|
||||
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
|
||||
@ -114,12 +113,16 @@ class ArcWarcRecordLoader(object):
|
||||
else:
|
||||
rec_type = 'response'
|
||||
|
||||
elif the_format == 'warc':
|
||||
elif the_format in ('warc', 'arc2warc'):
|
||||
rec_type = rec_headers.get_header('WARC-Type')
|
||||
uri = rec_headers.get_header('WARC-Target-URI')
|
||||
length = rec_headers.get_header('Content-Length')
|
||||
content_type = rec_headers.get_header('Content-Type')
|
||||
sub_len = 0
|
||||
if the_format == 'warc':
|
||||
sub_len = 0
|
||||
else:
|
||||
sub_len = rec_headers.total_len
|
||||
the_format = 'warc'
|
||||
|
||||
is_err = False
|
||||
|
||||
@ -201,7 +204,7 @@ class ArcWarcRecordLoader(object):
|
||||
# now try as arc
|
||||
try:
|
||||
rec_headers = self.arc_parser.parse(stream, statusline)
|
||||
return 'arc', rec_headers
|
||||
return self.arc_parser.get_rec_type(), rec_headers
|
||||
except StatusAndHeadersParserException as se:
|
||||
if known_format == 'arc':
|
||||
msg = 'Invalid ARC record, first line: '
|
||||
@ -212,8 +215,15 @@ class ArcWarcRecordLoader(object):
|
||||
|
||||
#=================================================================
|
||||
class ARCHeadersParser(object):
|
||||
def __init__(self, headernames):
|
||||
self.headernames = headernames
|
||||
# ARC 1.0 headers
|
||||
ARC_HEADERS = ["uri", "ip-address", "archive-date",
|
||||
"content-type", "length"]
|
||||
|
||||
def __init__(self):
|
||||
self.headernames = self.get_header_names()
|
||||
|
||||
def get_rec_type(self):
|
||||
return 'arc'
|
||||
|
||||
def parse(self, stream, headerline=None):
|
||||
total_read = 0
|
||||
@ -250,12 +260,60 @@ class ARCHeadersParser(object):
|
||||
msg = msg.format(headernames, parts)
|
||||
raise StatusAndHeadersParserException(msg, parts)
|
||||
|
||||
headers = []
|
||||
|
||||
for name, value in zip(headernames, parts):
|
||||
headers.append((name, value))
|
||||
protocol, headers = self._get_protocol_and_headers(headerline, parts)
|
||||
|
||||
return StatusAndHeaders(statusline='',
|
||||
headers=headers,
|
||||
protocol='ARC/1.0',
|
||||
protocol='WARC/1.0',
|
||||
total_len=total_read)
|
||||
|
||||
@classmethod
|
||||
def get_header_names(cls):
|
||||
return cls.ARC_HEADERS
|
||||
|
||||
def _get_protocol_and_headers(self, headerline, parts):
|
||||
headers = []
|
||||
|
||||
for name, value in zip(self.headernames, parts):
|
||||
headers.append((name, value))
|
||||
|
||||
return ('ARC/1.0', headers)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ARC2WARCHeadersParser(ARCHeadersParser):
    """ARC header parser variant that produces WARC-named headers.

    Each ARC header field is emitted under the WARC header name at the
    same position in ``ARC_TO_WARC_HEADERS``, and ``WARC-Type`` and
    ``WARC-Record-ID`` headers are appended.
    """

    # WARC header names, listed in the same positional order as the
    # ARC 1.0 fields (uri, ip-address, archive-date, content-type, length).
    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
                           "WARC-IP-Address",
                           "WARC-Date",
                           "Content-Type",
                           "Content-Length"]

    def get_rec_type(self):
        """Return the format tag reported for records from this parser."""
        return 'arc2warc'

    @classmethod
    def get_header_names(cls):
        """Return the WARC header names used in place of the ARC ones."""
        return cls.ARC_TO_WARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """Build the WARC-style header list for one ARC header line.

        :param headerline: the ARC header line; a line starting with
            ``filedesc://`` is typed ``arc_header``, anything else
            ``response``.
        :param parts: field values, in the order of ``self.headernames``.
        :return: ``('WARC/1.0', headers)`` with ``headers`` a list of
            ``(name, value)`` tuples.
        """
        def _convert(name, value):
            # Only the date field is rewritten (timestamp -> ISO date).
            if name == 'WARC-Date':
                return timestamp_to_iso_date(value)
            return value

        headers = [(name, _convert(name, value))
                   for name, value in zip(self.headernames, parts)]

        is_file_desc = headerline.startswith('filedesc://')
        headers.append(('WARC-Type',
                        'arc_header' if is_file_desc else 'response'))

        # A new record id is generated for every converted record.
        headers.append(('WARC-Record-ID',
                        StatusAndHeadersParser.make_warc_id()))

        return ('WARC/1.0', headers)
|
||||
|
||||
|
||||
|
@ -44,6 +44,19 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||
|
||||
# arc.gz
|
||||
>>> print_cdx_index('example.arc.gz', arc2warc=True)
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
|
||||
|
||||
# arc
|
||||
>>> print_cdx_index('example.arc', arc2warc=True)
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||
|
||||
|
||||
|
||||
|
||||
# wget warc, includes metadata by default
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
@ -328,6 +341,22 @@ def test_cdxj_arc_minimal():
|
||||
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
|
||||
""")
|
||||
|
||||
def test_cdxj_arc_conv():
    """Index example.arc.gz as cdxj with arc2warc conversion enabled."""
    # arc.gz -- json
    actual = parse_cdxj(cdx_index('example.arc.gz', cdxj=True, arc2warc=True))
    expected = parse_cdxj(b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
    assert actual == expected
|
||||
|
||||
def test_cdxj_arc_minimal_conv():
    """Index example.arc.gz as minimal cdxj with arc2warc conversion enabled."""
    # arc.gz -- minimal + json
    actual = parse_cdxj(cdx_index('example.arc.gz', cdxj=True,
                                  minimal=True, arc2warc=True))
    expected = parse_cdxj(b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
    assert actual == expected
|
||||
|
||||
|
||||
|
||||
|
||||
def test_cdxj_empty():
|
||||
options = dict(cdxj=True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user