mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-28 16:42:29 +01:00
automatic record (warc/arc) format detection and decompression if needed.
no need to rely on file type listing
This commit is contained in:
parent
84e0121aa5
commit
312bd71568
@ -72,7 +72,7 @@ class StatusAndHeadersParser(object):
|
|||||||
|
|
||||||
if not protocol_status:
|
if not protocol_status:
|
||||||
msg = 'Expected Status Line - Found: ' + statusline
|
msg = 'Expected Status Line - Found: ' + statusline
|
||||||
raise StatusAndHeadersParserException(msg)
|
raise StatusAndHeadersParserException(msg, statusline)
|
||||||
|
|
||||||
headers = []
|
headers = []
|
||||||
|
|
||||||
@ -104,4 +104,7 @@ class StatusAndHeadersParserException(Exception):
|
|||||||
"""
|
"""
|
||||||
status + headers parsing exception
|
status + headers parsing exception
|
||||||
"""
|
"""
|
||||||
pass
|
def __init__(self, msg, statusline):
|
||||||
|
super(StatusAndHeadersParserException, self).__init__(msg)
|
||||||
|
self.statusline = statusline
|
||||||
|
|
||||||
|
@ -1,17 +1,20 @@
|
|||||||
### pywb.warc
|
### pywb.warc
|
||||||
|
|
||||||
This is the WARC/ARC record loading component of pywb wayback tool suite.
|
This is the WARC/ARC record loading component of pywb wayback tool suite.
|
||||||
|
The package provides the following facilities:
|
||||||
|
|
||||||
This package provides the following facilities:
|
|
||||||
|
|
||||||
* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers
|
* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers
|
||||||
|
|
||||||
* Resolve 'revisit' records from provided index to find a full record with headers and payload content
|
* Resolve 'revisit' records from provided index to find a full record with headers and payload content
|
||||||
|
|
||||||
* Load WARC and ARC records either locally or via http using http 1.1 range requests
|
* Load WARC/ARC records either locally or via http using http 1.1 range requests
|
||||||
|
|
||||||
|
|
||||||
|
When loading archived content, the format type (WARC vs ARC) is detected, and compressed ARCs/WARCs
|
||||||
|
are decompressed automatically.
|
||||||
|
No assumption is made about format based on filename, content type
|
||||||
|
or any other external parameters; only the content itself is examined.
|
||||||
|
|
||||||
### Tests
|
### Tests
|
||||||
|
|
||||||
This package includes a test suite for loading a variety of WARC and ARC records.
|
This package includes a test suite for loading a variety of WARC and ARC records.
|
||||||
@ -26,5 +29,4 @@ Tests so far:
|
|||||||
|
|
||||||
TODO:
|
TODO:
|
||||||
|
|
||||||
* Different url revisit record resolving (TODO)
|
* Different url revisit record resolving
|
||||||
* File type detection (no .warc, .arc extensions)
|
|
||||||
|
@ -4,6 +4,7 @@ import collections
|
|||||||
|
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||||
|
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
||||||
|
|
||||||
from pywb.utils.loaders import FileLoader, HttpLoader
|
from pywb.utils.loaders import FileLoader, HttpLoader
|
||||||
from pywb.utils.bufferedreaders import BufferedReader
|
from pywb.utils.bufferedreaders import BufferedReader
|
||||||
@ -31,17 +32,6 @@ class ArcWarcRecordLoader:
|
|||||||
ARC_HEADERS = ["uri", "ip-address", "creation-date",
|
ARC_HEADERS = ["uri", "ip-address", "creation-date",
|
||||||
"content-type", "length"]
|
"content-type", "length"]
|
||||||
|
|
||||||
# Since loading a range request,
|
|
||||||
# can only determine gzip-ness based on file extension
|
|
||||||
# (BufferedReader will however default to non-gzip if
|
|
||||||
# decompression fails)
|
|
||||||
FORMAT_MAP = {
|
|
||||||
'.warc.gz': ('warc', True),
|
|
||||||
'.arc.gz': ('arc', True),
|
|
||||||
'.warc': ('warc', False),
|
|
||||||
'.arc': ('arc', False),
|
|
||||||
}
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_default_loaders(cookie_maker=None):
|
def create_default_loaders(cookie_maker=None):
|
||||||
http = HttpLoader(cookie_maker)
|
http = HttpLoader(cookie_maker)
|
||||||
@ -74,21 +64,6 @@ class ArcWarcRecordLoader:
|
|||||||
if not loader:
|
if not loader:
|
||||||
raise ArchiveLoadFailed('Unknown Protocol', url)
|
raise ArchiveLoadFailed('Unknown Protocol', url)
|
||||||
|
|
||||||
the_format = None
|
|
||||||
|
|
||||||
for ext, iformat in self.FORMAT_MAP.iteritems():
|
|
||||||
if url.endswith(ext):
|
|
||||||
the_format = iformat
|
|
||||||
break
|
|
||||||
|
|
||||||
if the_format is None:
|
|
||||||
raise ArchiveLoadFailed('Unknown file format', url)
|
|
||||||
|
|
||||||
(a_format, is_gzip) = the_format
|
|
||||||
|
|
||||||
#decomp = utils.create_decompressor() if is_gzip else None
|
|
||||||
decomp_type = 'gzip' if is_gzip else None
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
length = int(length)
|
length = int(length)
|
||||||
except:
|
except:
|
||||||
@ -96,15 +71,17 @@ class ArcWarcRecordLoader:
|
|||||||
|
|
||||||
raw = loader.load(url, long(offset), length)
|
raw = loader.load(url, long(offset), length)
|
||||||
|
|
||||||
|
decomp_type = 'gzip'
|
||||||
|
|
||||||
stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
|
stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
|
||||||
|
|
||||||
if a_format == 'arc':
|
(the_format, rec_headers) = self._load_headers(stream)
|
||||||
rec_headers = self.arc_parser.parse(stream)
|
|
||||||
|
if the_format == 'arc':
|
||||||
rec_type = 'response'
|
rec_type = 'response'
|
||||||
empty = (rec_headers.get_header('length') == 0)
|
empty = (rec_headers.get_header('length') == 0)
|
||||||
|
|
||||||
elif a_format == 'warc':
|
elif the_format == 'warc':
|
||||||
rec_headers = self.warc_parser.parse(stream)
|
|
||||||
rec_type = rec_headers.get_header('WARC-Type')
|
rec_type = rec_headers.get_header('WARC-Type')
|
||||||
empty = (rec_headers.get_header('Content-Length') == '0')
|
empty = (rec_headers.get_header('Content-Length') == '0')
|
||||||
|
|
||||||
@ -131,16 +108,43 @@ class ArcWarcRecordLoader:
|
|||||||
#(statusline, http_headers) = self.parse_http_headers(stream)
|
#(statusline, http_headers) = self.parse_http_headers(stream)
|
||||||
status_headers = self.http_parser.parse(stream)
|
status_headers = self.http_parser.parse(stream)
|
||||||
|
|
||||||
return ArcWarcRecord((a_format, rec_type),
|
return ArcWarcRecord((the_format, rec_type),
|
||||||
rec_headers, stream, status_headers)
|
rec_headers, stream, status_headers)
|
||||||
|
|
||||||
|
def _load_headers(self, stream):
|
||||||
|
"""
|
||||||
|
Try parsing record as WARC, then try parsing as ARC.
|
||||||
|
if neither one succeeds, we're out of luck.
|
||||||
|
"""
|
||||||
|
|
||||||
|
statusline = None
|
||||||
|
|
||||||
|
# try as warc first
|
||||||
|
try:
|
||||||
|
rec_headers = self.warc_parser.parse(stream)
|
||||||
|
return 'warc', rec_headers
|
||||||
|
except StatusAndHeadersParserException as se:
|
||||||
|
statusline = se.statusline
|
||||||
|
pass
|
||||||
|
|
||||||
|
# now try as arc
|
||||||
|
try:
|
||||||
|
rec_headers = self.arc_parser.parse(stream, statusline)
|
||||||
|
return 'arc', rec_headers
|
||||||
|
except StatusAndHeadersParserException as se:
|
||||||
|
msg = 'Unknown archive format, first line: ' + se.statusline
|
||||||
|
raise ArchiveLoadFailed(msg)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ARCHeadersParser:
|
class ARCHeadersParser:
|
||||||
def __init__(self, headernames):
|
def __init__(self, headernames):
|
||||||
self.headernames = headernames
|
self.headernames = headernames
|
||||||
|
|
||||||
def parse(self, stream):
|
def parse(self, stream, headerline=None):
|
||||||
|
|
||||||
|
# if headerline passed in, use that
|
||||||
|
if not headerline:
|
||||||
headerline = stream.readline().rstrip()
|
headerline = stream.readline().rstrip()
|
||||||
|
|
||||||
parts = headerline.split()
|
parts = headerline.split()
|
||||||
@ -149,7 +153,8 @@ class ARCHeadersParser:
|
|||||||
|
|
||||||
if len(parts) != len(headernames):
|
if len(parts) != len(headernames):
|
||||||
msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
|
msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
|
||||||
raise ArchiveLoadFailed(msg.format(headernames, parts))
|
msg = msg.format(headernames, parts)
|
||||||
|
raise StatusAndHeadersParserException(msg, headernames)
|
||||||
|
|
||||||
headers = []
|
headers = []
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
Test loading different types of records from a variety of formats
|
Test loading different types of records from a variety of formats
|
||||||
|
|
||||||
# Load response record from WARC
|
# Load response record from compressed WARC
|
||||||
>>> load_test_archive('example.warc.gz', '333', '1043')
|
>>> load_test_archive('example.warc.gz', '333', '1043')
|
||||||
(('warc', 'response'),
|
(('warc', 'response'),
|
||||||
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
|
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
|
||||||
@ -26,7 +26,7 @@ Test loading different types of records from a variety of formats
|
|||||||
('Content-Length', '1270'),
|
('Content-Length', '1270'),
|
||||||
('Connection', 'close')]))
|
('Connection', 'close')]))
|
||||||
|
|
||||||
# Load revisit record from WARC
|
# Load revisit record from compressed WARC
|
||||||
>>> load_test_archive('example.warc.gz', '1864', '553')
|
>>> load_test_archive('example.warc.gz', '1864', '553')
|
||||||
(('warc', 'revisit'),
|
(('warc', 'revisit'),
|
||||||
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
|
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
|
||||||
@ -59,7 +59,7 @@ Test loading different types of records from a variety of formats
|
|||||||
# Print parsed http headers + 2 lines of content
|
# Print parsed http headers + 2 lines of content
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# Test loading from ARC based on cdx line
|
# Test loading from compressed ARC based on cdx line
|
||||||
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
|
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
|
||||||
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
||||||
('Cache-Control', 'max-age=604800'),
|
('Cache-Control', 'max-age=604800'),
|
||||||
@ -75,6 +75,7 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
|
|
||||||
|
# Uncompressed arc
|
||||||
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
|
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
|
||||||
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
||||||
('Cache-Control', 'max-age=604800'),
|
('Cache-Control', 'max-age=604800'),
|
||||||
@ -91,7 +92,7 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
|
|||||||
<html>
|
<html>
|
||||||
|
|
||||||
|
|
||||||
# Test loading from WARC based on cdx line
|
# Test loading from compressed WARC based on cdx line
|
||||||
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
|
||||||
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
||||||
('Cache-Control', 'max-age=604800'),
|
('Cache-Control', 'max-age=604800'),
|
||||||
@ -108,6 +109,23 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
|
|
||||||
|
# Uncompressed WARC
|
||||||
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 460 example.warc')
|
||||||
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
||||||
|
('Cache-Control', 'max-age=604800'),
|
||||||
|
('Content-Type', 'text/html'),
|
||||||
|
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
||||||
|
('Etag', '"359670651"'),
|
||||||
|
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
|
||||||
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
||||||
|
('Server', 'ECS (sjc/4FCE)'),
|
||||||
|
('X-Cache', 'HIT'),
|
||||||
|
('x-ec-custom-error', '1'),
|
||||||
|
('Content-Length', '1270'),
|
||||||
|
('Connection', 'close')])
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
|
||||||
# Test cdx w/ revisit
|
# Test cdx w/ revisit
|
||||||
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz')
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz')
|
||||||
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user