1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/warc/recordloader.py
2017-03-01 14:57:44 -08:00

301 lines
10 KiB
Python

import collections
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import LimitReader
from pywb.utils.loaders import to_native_str
from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date
from six.moves import zip
#=================================================================
#ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
# 'format, rec_type, rec_headers, ' +
# 'stream, status_headers, ' +
# 'content_type, length')
#=================================================================
class ArcWarcRecord(object):
    """ Plain container for the pieces of one parsed ARC/WARC record.

    Must be constructed with exactly seven positional arguments, in order:
    format, rec_type, rec_headers, stream, status_headers,
    content_type, length. Each is stored as the attribute of the same name.
    """
    def __init__(self, *args):
        # Unpack into locals first: a wrong argument count raises ValueError
        # before any attribute has been assigned.
        (fmt, rec_type, rec_headers, stream,
         status_headers, content_type, length) = args

        self.format = fmt
        self.rec_type = rec_type
        self.rec_headers = rec_headers
        self.stream = stream
        self.status_headers = status_headers
        self.content_type = content_type
        self.length = length
#=================================================================
class ArchiveLoadFailed(WbException):
    """ Raised when an archive record cannot be loaded or parsed.

    The exception message is str(reason), prefixed with
    '<filename>: ' when a non-empty filename is supplied.
    """
    def __init__(self, reason, filename=''):
        prefix = (filename + ': ') if filename else ''
        super(ArchiveLoadFailed, self).__init__(prefix + str(reason))

    def status(self):
        """ Return the HTTP status line reported for this failure. """
        return '503 Service Unavailable'
#=================================================================
class ArcWarcRecordLoader(object):
    """ Load a single ARC or WARC record from a stream.

    The record headers are parsed with a WARC parser first and, failing
    that, with an ARC parser. Depending on the record type and target uri,
    the HTTP status line and headers contained in the record payload are
    parsed as well. The result is returned as an ArcWarcRecord.
    """
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        """ Create the header parsers used by this loader.

        :param verify_http: passed through as the second argument of the
            HTTP response and HTTP request StatusAndHeadersParser instances
            (presumably toggles strict status-line checks -- confirm
            against StatusAndHeadersParser).
        :param arc2warc: if true, ARC headers are parsed with
            ARC2WARCHeadersParser (WARC-style header names), otherwise
            with ARCHeadersParser (native ARC header names).
        """
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES,
                                                  verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS,
                                                      verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        statusline and known_format are passed on to
        _detect_type_load_headers() to facilitate parsing.

        If no_record_parse is true, the record payload is not parsed and
        the returned record's status_headers is None.

        Raises ArchiveLoadFailed if the record headers can be parsed
        neither as WARC nor as ARC.
        """
        (the_format, rec_headers) = self._detect_type_load_headers(
            stream, statusline, known_format)

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # ARC headers converted to WARC: the extra header lines the
                # ARC parser consumed still count against the ARC length
                sub_len = rec_headers.total_len
                the_format = 'warc'

        length, is_err = self._resolve_length(length, sub_len)

        # status and headers are completely empty (blank lines found)
        if not is_err and not rec_headers:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit') and
              self._is_http_uri(uri)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif rec_type == 'request' and self._is_http_uri(uri):
            status_headers = self.http_req_parser.parse(stream)

        # everything else (including records that carry no target uri):
        # create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, status_headers,
                             content_type, length)

    @staticmethod
    def _resolve_length(length, sub_len):
        """ Convert a declared record length into a usable payload length.

        Returns a (length, is_err) tuple:
        - (None, False) if no length was declared
        - (int(length) - sub_len, False) if that value is valid and >= 0
        - (0, True) if the value is not numeric or the result is negative
        """
        if length is None:
            return None, False

        try:
            length = int(length) - sub_len
        except (ValueError, TypeError):
            return 0, True

        if length < 0:
            return 0, True

        return length, False

    def _is_http_uri(self, uri):
        """ Return True only if uri is non-empty and starts with one of
        HTTP_SCHEMES. A missing (None) uri yields False instead of raising,
        so records without a target uri are not parsed as HTTP.
        """
        return bool(uri) and uri.startswith(self.HTTP_SCHEMES)

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        If neither one succeeds, we're out of luck.

        Returns a (format, rec_headers) tuple, where format is 'warc' or
        the value of the ARC parser's get_rec_type().

        Raises ArchiveLoadFailed if the headers cannot be parsed.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # the first line was already consumed from the stream:
                # hand it to the arc parser instead of reading another
                statusline = se.statusline

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))
#=================================================================
class ARCHeadersParser(object):
    """ Parse the single space-delimited header line of an ARC record
    into a StatusAndHeaders object keyed by the ARC 1.0 field names.
    """
    # ARC 1.0 headers
    ARC_HEADERS = ["uri", "ip-address", "archive-date",
                   "content-type", "length"]

    def __init__(self):
        self.headernames = self.get_header_names()

    def get_rec_type(self):
        """ Return the format identifier for records from this parser. """
        return 'arc'

    def parse(self, stream, headerline=None):
        """ Parse an ARC header line into StatusAndHeaders.

        If headerline is not supplied, one line is read from stream.
        For a 'filedesc://' header, the following two lines (version and
        header spec) are also consumed from stream, and their combined
        length is reported as total_len of the result.

        Raises EOFError if the header line is empty, and
        StatusAndHeadersParserException if the number of fields does not
        match the expected header names.
        """
        # use the supplied first line if there is one, else read it
        if headerline is None:
            headerline = stream.readline()
        headerline = to_native_str(headerline)

        if len(headerline) == 0:
            raise EOFError()

        headerline = headerline.rstrip()
        expected = self.headernames

        # if arc header, consume next two lines:
        # the version line and the header spec (the preset one is used)
        consumed = 0
        if headerline.startswith('filedesc://'):
            version_line = to_native_str(stream.readline())
            spec_line = to_native_str(stream.readline())
            consumed = len(version_line) + len(spec_line)

        fields = headerline.split(' ')

        if len(fields) != len(expected):
            template = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            raise StatusAndHeadersParserException(
                template.format(expected, fields), fields)

        # NOTE(review): the protocol returned by the hook is not used; the
        # result is always tagged 'WARC/1.0' -- confirm this is intended.
        _, headers = self._get_protocol_and_headers(headerline, fields)

        return StatusAndHeaders(statusline='',
                                headers=headers,
                                protocol='WARC/1.0',
                                total_len=consumed)

    @classmethod
    def get_header_names(cls):
        """ Return the ordered list of field names for the header line. """
        return cls.ARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """ Pair each header name with its value from parts and return
        a ('ARC/1.0', headers) tuple.
        """
        return ('ARC/1.0', list(zip(self.headernames, parts)))
#=================================================================
class ARC2WARCHeadersParser(ARCHeadersParser):
    """ ARC header parser that emits WARC-style header names, so that
    ARC records can be handled like WARC records.
    """
    # Headers for converting ARC -> WARC Header
    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
                           "WARC-IP-Address",
                           "WARC-Date",
                           "Content-Type",
                           "Content-Length"]

    def get_rec_type(self):
        """ Return the format identifier for records from this parser. """
        return 'arc2warc'

    @classmethod
    def get_header_names(cls):
        """ Return the WARC header names, in ARC field order. """
        return cls.ARC_TO_WARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """ Build WARC-style headers from the ARC fields and return a
        ('WARC/1.0', headers) tuple.

        The WARC-Date value is passed through timestamp_to_iso_date, and
        WARC-Type and WARC-Record-ID headers are appended.
        """
        headers = [
            (name, timestamp_to_iso_date(value) if name == 'WARC-Date' else value)
            for name, value in zip(self.headernames, parts)
        ]

        is_arc_header = headerline.startswith('filedesc://')
        headers.append(('WARC-Type',
                        'arc_header' if is_arc_header else 'response'))
        headers.append(('WARC-Record-ID',
                        StatusAndHeadersParser.make_warc_id()))

        return ('WARC/1.0', headers)