From 312bd715685b9e81a61ce1963c559e67db687feb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2014 00:13:15 -0800 Subject: [PATCH] automatic record (warc/arc) format detection and decompression if needed. no need to rely on file type listing --- pywb/utils/statusandheaders.py | 7 +++- pywb/warc/README.md | 14 ++++--- pywb/warc/recordloader.py | 73 ++++++++++++++++++---------------- pywb/warc/test/test_loading.py | 26 ++++++++++-- 4 files changed, 74 insertions(+), 46 deletions(-) diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 85fd241e..01bb6614 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -72,7 +72,7 @@ class StatusAndHeadersParser(object): if not protocol_status: msg = 'Expected Status Line - Found: ' + statusline - raise StatusAndHeadersParserException(msg) + raise StatusAndHeadersParserException(msg, statusline) headers = [] @@ -104,4 +104,7 @@ class StatusAndHeadersParserException(Exception): """ status + headers parsing exception """ - pass + def __init__(self, msg, statusline): + super(StatusAndHeadersParserException, self).__init__(msg) + self.statusline = statusline + diff --git a/pywb/warc/README.md b/pywb/warc/README.md index f3a4bad4..91cc3036 100644 --- a/pywb/warc/README.md +++ b/pywb/warc/README.md @@ -1,17 +1,20 @@ ### pywb.warc This is the WARC/ARC record loading component of pywb wayback tool suite. 
- - -This package provides the following facilities: +The package provides the following facilities: * Resolve relative WARC/ARC filenames to a full path based on configurable resolvers * Resolve 'revisit' records from provided index to find a full record with headers and payload content -* Load WARC and ARC records either locally or via http using http 1.1 range requests +* Load WARC/ARC records either locally or via http using http 1.1 range requests +When loading archived content, the format type (WARC vs ARC) is detected and compressed ARCs/WARCs +are decompressed automatically. +No assumption is made about format based on filename, content type +or any external parameters other than the content itself. + ### Tests This package will includes a test suite for loading a variety of WARC and ARC records. @@ -26,5 +29,4 @@ Tests so far: TODO: -* Different url revisit record resolving (TODO) -* File type detection (no .warc, .arc extensions) +* Different url revisit record resolving diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 5937202c..05973f6b 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -4,6 +4,7 @@ import collections from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeadersParser +from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.loaders import FileLoader, HttpLoader from pywb.utils.bufferedreaders import BufferedReader @@ -31,17 +32,6 @@ class ArcWarcRecordLoader: ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"] - # Since loading a range request, - # can only determine gzip-ness based on file extension - # (BufferedReader will however default to non-gzip if - # decompression fails) - FORMAT_MAP = { - '.warc.gz': ('warc', True), - '.arc.gz': ('arc', True), - '.warc': ('warc', False), - '.arc': ('arc', False), - } - @staticmethod def create_default_loaders(cookie_maker=None): http = 
HttpLoader(cookie_maker) @@ -74,21 +64,6 @@ class ArcWarcRecordLoader: if not loader: raise ArchiveLoadFailed('Unknown Protocol', url) - the_format = None - - for ext, iformat in self.FORMAT_MAP.iteritems(): - if url.endswith(ext): - the_format = iformat - break - - if the_format is None: - raise ArchiveLoadFailed('Unknown file format', url) - - (a_format, is_gzip) = the_format - - #decomp = utils.create_decompressor() if is_gzip else None - decomp_type = 'gzip' if is_gzip else None - try: length = int(length) except: @@ -96,15 +71,17 @@ class ArcWarcRecordLoader: raw = loader.load(url, long(offset), length) + decomp_type = 'gzip' + stream = BufferedReader(raw, length, self.chunk_size, decomp_type) - if a_format == 'arc': - rec_headers = self.arc_parser.parse(stream) + (the_format, rec_headers) = self._load_headers(stream) + + if the_format == 'arc': rec_type = 'response' empty = (rec_headers.get_header('length') == 0) - elif a_format == 'warc': - rec_headers = self.warc_parser.parse(stream) + elif the_format == 'warc': rec_type = rec_headers.get_header('WARC-Type') empty = (rec_headers.get_header('Content-Length') == '0') @@ -131,17 +108,44 @@ class ArcWarcRecordLoader: #(statusline, http_headers) = self.parse_http_headers(stream) status_headers = self.http_parser.parse(stream) - return ArcWarcRecord((a_format, rec_type), + return ArcWarcRecord((the_format, rec_type), rec_headers, stream, status_headers) + def _load_headers(self, stream): + """ + Try parsing record as WARC, then try parsing as ARC. + if neither one succeeds, we're out of luck. 
+ """ + + statusline = None + + # try as warc first + try: + rec_headers = self.warc_parser.parse(stream) + return 'warc', rec_headers + except StatusAndHeadersParserException as se: + statusline = se.statusline + pass + + # now try as arc + try: + rec_headers = self.arc_parser.parse(stream, statusline) + return 'arc', rec_headers + except StatusAndHeadersParserException as se: + msg = 'Unknown archive format, first line: ' + se.statusline + raise ArchiveLoadFailed(msg) + #================================================================= class ARCHeadersParser: def __init__(self, headernames): self.headernames = headernames - def parse(self, stream): - headerline = stream.readline().rstrip() + def parse(self, stream, headerline=None): + + # if headerline passed in, use that + if not headerline: + headerline = stream.readline().rstrip() parts = headerline.split() @@ -149,7 +153,8 @@ class ARCHeadersParser: if len(parts) != len(headernames): msg = 'Wrong # of headers, expected arc headers {0}, Found {1}' - raise ArchiveLoadFailed(msg.format(headernames, parts)) + msg = msg.format(headernames, parts) + raise StatusAndHeadersParserException(msg, headernames) headers = [] diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index e1a40950..47176e3e 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -2,7 +2,7 @@ """ Test loading different types of records from a variety of formats -# Load response record from WARC +# Load response record from compressed WARC >>> load_test_archive('example.warc.gz', '333', '1043') (('warc', 'response'), StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'), @@ -26,7 +26,7 @@ Test loading different types of records from a variety of formats ('Content-Length', '1270'), ('Connection', 'close')])) -# Load revisit record from WARC +# Load revisit record from compressed WARC >>> load_test_archive('example.warc.gz', '1864', '553') (('warc', 
'revisit'), StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'), @@ -59,7 +59,7 @@ Test loading different types of records from a variety of formats # Print parsed http headers + 2 lines of content # ============================================================================== -# Test loading from ARC based on cdx line +# Test loading from compressed ARC based on cdx line >>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz') StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), ('Cache-Control', 'max-age=604800'), @@ -75,6 +75,7 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc +# Uncompressed arc >>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc') StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), ('Cache-Control', 'max-age=604800'), @@ -91,7 +92,7 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc -# Test loading from WARC based on cdx line +# Test loading from compressed WARC based on cdx line >>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz') StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), ('Cache-Control', 'max-age=604800'), @@ -108,6 +109,23 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc +# Uncompressed WARC +>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 460 example.warc') +StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ 
('Accept-Ranges', 'bytes'), + ('Cache-Control', 'max-age=604800'), + ('Content-Type', 'text/html'), + ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), + ('Etag', '"359670651"'), + ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'), + ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('x-ec-custom-error', '1'), + ('Content-Length', '1270'), + ('Connection', 'close')]) + + + # Test cdx w/ revisit >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz') StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),