1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-28 16:42:29 +01:00

automatic record (warc/arc) format detection and decompression if needed.

no need to rely on file type listing
This commit is contained in:
Ilya Kreymer 2014-02-19 00:13:15 -08:00
parent 84e0121aa5
commit 312bd71568
4 changed files with 74 additions and 46 deletions

View File

@ -72,7 +72,7 @@ class StatusAndHeadersParser(object):
if not protocol_status: if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline msg = 'Expected Status Line - Found: ' + statusline
raise StatusAndHeadersParserException(msg) raise StatusAndHeadersParserException(msg, statusline)
headers = [] headers = []
@ -104,4 +104,7 @@ class StatusAndHeadersParserException(Exception):
""" """
status + headers parsing exception status + headers parsing exception
""" """
pass def __init__(self, msg, statusline):
super(StatusAndHeadersParserException, self).__init__(msg)
self.statusline = statusline

View File

@ -1,17 +1,20 @@
### pywb.warc ### pywb.warc
This is the WARC/ARC record loading component of pywb wayback tool suite. This is the WARC/ARC record loading component of pywb wayback tool suite.
The package provides the following facilities:
This package provides the following facilities:
* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers * Resolve relative WARC/ARC filenames to a full path based on configurable resolvers
* Resolve 'revisit' records from provided index to find a full record with headers and payload content * Resolve 'revisit' records from provided index to find a full record with headers and payload content
* Load WARC and ARC records either locally or via http using http 1.1 range requests * Load WARC/ARC records either locally or via http using http 1.1 range requests
When loading archived content, the format type (WARC vs ARC) is detected and compressed ARCs/WARCs
are decompressed automatically.
No assumption is made about format based on filename, content type
or other external parameters other than the content itself.
### Tests ### Tests
This package includes a test suite for loading a variety of WARC and ARC records. This package includes a test suite for loading a variety of WARC and ARC records.
@ -26,5 +29,4 @@ Tests so far:
TODO: TODO:
* Different url revisit record resolving (TODO) * Different url revisit record resolving
* File type detection (no .warc, .arc extensions)

View File

@ -4,6 +4,7 @@ import collections
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import BufferedReader
@ -31,17 +32,6 @@ class ArcWarcRecordLoader:
ARC_HEADERS = ["uri", "ip-address", "creation-date", ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"] "content-type", "length"]
# Since loading a range request,
# can only determine gzip-ness based on file extension
# (BufferedReader will however default to non-gzip if
# decompression fails)
FORMAT_MAP = {
'.warc.gz': ('warc', True),
'.arc.gz': ('arc', True),
'.warc': ('warc', False),
'.arc': ('arc', False),
}
@staticmethod @staticmethod
def create_default_loaders(cookie_maker=None): def create_default_loaders(cookie_maker=None):
http = HttpLoader(cookie_maker) http = HttpLoader(cookie_maker)
@ -74,21 +64,6 @@ class ArcWarcRecordLoader:
if not loader: if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url) raise ArchiveLoadFailed('Unknown Protocol', url)
the_format = None
for ext, iformat in self.FORMAT_MAP.iteritems():
if url.endswith(ext):
the_format = iformat
break
if the_format is None:
raise ArchiveLoadFailed('Unknown file format', url)
(a_format, is_gzip) = the_format
#decomp = utils.create_decompressor() if is_gzip else None
decomp_type = 'gzip' if is_gzip else None
try: try:
length = int(length) length = int(length)
except: except:
@ -96,15 +71,17 @@ class ArcWarcRecordLoader:
raw = loader.load(url, long(offset), length) raw = loader.load(url, long(offset), length)
decomp_type = 'gzip'
stream = BufferedReader(raw, length, self.chunk_size, decomp_type) stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
if a_format == 'arc': (the_format, rec_headers) = self._load_headers(stream)
rec_headers = self.arc_parser.parse(stream)
if the_format == 'arc':
rec_type = 'response' rec_type = 'response'
empty = (rec_headers.get_header('length') == 0) empty = (rec_headers.get_header('length') == 0)
elif a_format == 'warc': elif the_format == 'warc':
rec_headers = self.warc_parser.parse(stream)
rec_type = rec_headers.get_header('WARC-Type') rec_type = rec_headers.get_header('WARC-Type')
empty = (rec_headers.get_header('Content-Length') == '0') empty = (rec_headers.get_header('Content-Length') == '0')
@ -131,16 +108,43 @@ class ArcWarcRecordLoader:
#(statusline, http_headers) = self.parse_http_headers(stream) #(statusline, http_headers) = self.parse_http_headers(stream)
status_headers = self.http_parser.parse(stream) status_headers = self.http_parser.parse(stream)
return ArcWarcRecord((a_format, rec_type), return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers) rec_headers, stream, status_headers)
def _load_headers(self, stream):
    """
    Detect the record format from the content itself and parse its headers.

    The stream is first parsed as a WARC record. If the WARC parser
    rejects it, the status line carried by its exception is handed to the
    ARC parser, which uses that line instead of reading a new one.

    Returns a (format, rec_headers) tuple, where format is 'warc' or 'arc'.
    Raises ArchiveLoadFailed if the record parses as neither format.
    """
    # try as warc first
    try:
        return 'warc', self.warc_parser.parse(stream)
    except StatusAndHeadersParserException as se:
        # keep the line the warc parser reported, for the arc attempt
        statusline = se.statusline

    # now try as arc
    try:
        return 'arc', self.arc_parser.parse(stream, statusline)
    except StatusAndHeadersParserException as se:
        # se.statusline is not guaranteed to be a string (the arc parser
        # raises with its header names rather than a line), so convert
        # before concatenating to avoid a TypeError masking the real error
        msg = 'Unknown archive format, first line: ' + str(se.statusline)
        raise ArchiveLoadFailed(msg)
#================================================================= #=================================================================
class ARCHeadersParser: class ARCHeadersParser:
def __init__(self, headernames): def __init__(self, headernames):
self.headernames = headernames self.headernames = headernames
def parse(self, stream): def parse(self, stream, headerline=None):
# if headerline passed in, use that
if not headerline:
headerline = stream.readline().rstrip() headerline = stream.readline().rstrip()
parts = headerline.split() parts = headerline.split()
@ -149,7 +153,8 @@ class ARCHeadersParser:
if len(parts) != len(headernames): if len(parts) != len(headernames):
msg = 'Wrong # of headers, expected arc headers {0}, Found {1}' msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
raise ArchiveLoadFailed(msg.format(headernames, parts)) msg = msg.format(headernames, parts)
raise StatusAndHeadersParserException(msg, headernames)
headers = [] headers = []

View File

@ -2,7 +2,7 @@
""" """
Test loading different types of records from a variety of formats Test loading different types of records from a variety of formats
# Load response record from WARC # Load response record from compressed WARC
>>> load_test_archive('example.warc.gz', '333', '1043') >>> load_test_archive('example.warc.gz', '333', '1043')
(('warc', 'response'), (('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'), StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
@ -26,7 +26,7 @@ Test loading different types of records from a variety of formats
('Content-Length', '1270'), ('Content-Length', '1270'),
('Connection', 'close')])) ('Connection', 'close')]))
# Load revisit record from WARC # Load revisit record from compressed WARC
>>> load_test_archive('example.warc.gz', '1864', '553') >>> load_test_archive('example.warc.gz', '1864', '553')
(('warc', 'revisit'), (('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'), StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
@ -59,7 +59,7 @@ Test loading different types of records from a variety of formats
# Print parsed http headers + 2 lines of content # Print parsed http headers + 2 lines of content
# ============================================================================== # ==============================================================================
# Test loading from ARC based on cdx line # Test loading from compressed ARC based on cdx line
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz') >>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'), ('Cache-Control', 'max-age=604800'),
@ -75,6 +75,7 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
<!doctype html> <!doctype html>
<html> <html>
# Uncompressed arc
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc') >>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'), ('Cache-Control', 'max-age=604800'),
@ -91,7 +92,7 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
<html> <html>
# Test loading from WARC based on cdx line # Test loading from compressed WARC based on cdx line
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz') >>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'), ('Cache-Control', 'max-age=604800'),
@ -108,6 +109,23 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
<!doctype html> <!doctype html>
<html> <html>
# Uncompressed WARC
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 460 example.warc')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>
# Test cdx w/ revisit # Test cdx w/ revisit
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz') >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),