1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

warc load: make http response/request protocol/verb validation optional

Validation is enabled for replay and disabled by default for cdx-indexing,
though it can be enabled there with the -v option #99
This commit is contained in:
Ilya Kreymer 2015-04-20 08:29:18 -07:00
parent 28e3cd791b
commit 08064f3806
4 changed files with 31 additions and 12 deletions

View File

@ -133,8 +133,9 @@ class StatusAndHeadersParser(object):
Parser which consumes a stream supporting readline() to read
status and headers and return a StatusAndHeaders object
"""
def __init__(self, statuslist):
def __init__(self, statuslist, verify=True):
self.statuslist = statuslist
self.verify = verify
def parse(self, stream, full_statusline=None):
"""
@ -160,12 +161,16 @@ class StatusAndHeadersParser(object):
protocol='',
total_len=total_read)
protocol_status = self.split_prefix(statusline, self.statuslist)
# validate only if verify is set
if self.verify:
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status:
msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, full_statusline)
if not protocol_status:
msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, full_statusline)
else:
protocol_status = statusline.split(' ', 1)
line, total_read = _strip_count(stream.readline(), total_read)
while line:

View File

@ -45,10 +45,11 @@ class ArchiveIterator(object):
warc2warc -Z myfile.{0} > myfile.{0}.gz
"""
def __init__(self, fileobj, no_record_parse=False):
def __init__(self, fileobj, no_record_parse=False,
verify_http=False):
self.fh = fileobj
self.loader = ArcWarcRecordLoader()
self.loader = ArcWarcRecordLoader(verify_http=verify_http)
self.reader = None
self.offset = 0
@ -445,7 +446,8 @@ class DefaultRecordIter(object):
return entry
def __call__(self, fh):
aiter = ArchiveIterator(fh, self.options.get('minimal', False))
aiter = ArchiveIterator(fh, self.options.get('minimal', False),
self.options.get('verify_http', False))
entry_iter = self.create_record_iter(aiter)

View File

@ -311,6 +311,12 @@ Sort the output to each file before writing to create a total ordering
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
urls for the cdx key. Default is to use SURT keys.
Not-recommended for new cdx, use only for backwards-compatibility.
"""
verify_help = """
Verify HTTP protocol (1.0/1.1) status in response records and http verb
on request records, ensuring the protocol or verb matches the expected list.
Raise an exception on failure. (This was previously the default behavior).
"""
cdx09_help = """
@ -391,6 +397,10 @@ instead of current working directory
action='store_true',
help=unsurt_help)
parser.add_argument('-v', '--verify',
action='store_true',
help=verify_help)
group = parser.add_mutually_exclusive_group()
group.add_argument('-9', '--cdx09',
action='store_true',
@ -416,6 +426,7 @@ instead of current working directory
append_post=cmd.postappend,
recurse=cmd.recurse,
rel_root=cmd.dir_root,
verify_http=cmd.verify,
cdx09=cmd.cdx09,
cdxj=cmd.cdxj,
minimal=cmd.minimal_cdxj)

View File

@ -49,7 +49,8 @@ class ArcWarcRecordLoader:
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
verify_http=True):
if not loader:
loader = BlockLoader(cookie_maker)
@ -59,9 +60,9 @@ class ArcWarcRecordLoader:
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
def load(self, url, offset, length):
""" Load a single record from given url at offset with length