mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
warc load: make http response/request protocol/verb validation optional
Validation stays enabled for replay; it is disabled by default for cdx-indexing, but can be enabled with the -v option (#99).
This commit is contained in:
parent
28e3cd791b
commit
08064f3806
@ -133,8 +133,9 @@ class StatusAndHeadersParser(object):
|
||||
Parser which consumes a stream supporting readline() to read
|
||||
status and headers and return a StatusAndHeaders object
|
||||
"""
|
||||
def __init__(self, statuslist):
|
||||
def __init__(self, statuslist, verify=True):
|
||||
self.statuslist = statuslist
|
||||
self.verify = verify
|
||||
|
||||
def parse(self, stream, full_statusline=None):
|
||||
"""
|
||||
@ -160,12 +161,16 @@ class StatusAndHeadersParser(object):
|
||||
protocol='',
|
||||
total_len=total_read)
|
||||
|
||||
protocol_status = self.split_prefix(statusline, self.statuslist)
|
||||
# validate only if verify is set
|
||||
if self.verify:
|
||||
protocol_status = self.split_prefix(statusline, self.statuslist)
|
||||
|
||||
if not protocol_status:
|
||||
msg = 'Expected Status Line starting with {0} - Found: {1}'
|
||||
msg = msg.format(self.statuslist, statusline)
|
||||
raise StatusAndHeadersParserException(msg, full_statusline)
|
||||
if not protocol_status:
|
||||
msg = 'Expected Status Line starting with {0} - Found: {1}'
|
||||
msg = msg.format(self.statuslist, statusline)
|
||||
raise StatusAndHeadersParserException(msg, full_statusline)
|
||||
else:
|
||||
protocol_status = statusline.split(' ', 1)
|
||||
|
||||
line, total_read = _strip_count(stream.readline(), total_read)
|
||||
while line:
|
||||
|
@ -45,10 +45,11 @@ class ArchiveIterator(object):
|
||||
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
||||
"""
|
||||
|
||||
def __init__(self, fileobj, no_record_parse=False):
|
||||
def __init__(self, fileobj, no_record_parse=False,
|
||||
verify_http=False):
|
||||
self.fh = fileobj
|
||||
|
||||
self.loader = ArcWarcRecordLoader()
|
||||
self.loader = ArcWarcRecordLoader(verify_http=verify_http)
|
||||
self.reader = None
|
||||
|
||||
self.offset = 0
|
||||
@ -445,7 +446,8 @@ class DefaultRecordIter(object):
|
||||
return entry
|
||||
|
||||
def __call__(self, fh):
|
||||
aiter = ArchiveIterator(fh, self.options.get('minimal', False))
|
||||
aiter = ArchiveIterator(fh, self.options.get('minimal', False),
|
||||
self.options.get('verify_http', False))
|
||||
|
||||
entry_iter = self.create_record_iter(aiter)
|
||||
|
||||
|
@ -311,6 +311,12 @@ Sort the output to each file before writing to create a total ordering
|
||||
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
|
||||
urls for the cdx key. Default is to use SURT keys.
|
||||
Not-recommended for new cdx, use only for backwards-compatibility.
|
||||
"""
|
||||
|
||||
verify_help = """
|
||||
Verify HTTP protocol (1.0/1.1) status in response records and http verb
|
||||
on request records, ensuring the protocol or verb matches the expected list.
|
||||
Raise an exception on failure. (This was previously the default behavior).
|
||||
"""
|
||||
|
||||
cdx09_help = """
|
||||
@ -391,6 +397,10 @@ instead of current working directory
|
||||
action='store_true',
|
||||
help=unsurt_help)
|
||||
|
||||
parser.add_argument('-v', '--verify',
|
||||
action='store_true',
|
||||
help=verify_help)
|
||||
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument('-9', '--cdx09',
|
||||
action='store_true',
|
||||
@ -416,6 +426,7 @@ instead of current working directory
|
||||
append_post=cmd.postappend,
|
||||
recurse=cmd.recurse,
|
||||
rel_root=cmd.dir_root,
|
||||
verify_http=cmd.verify,
|
||||
cdx09=cmd.cdx09,
|
||||
cdxj=cmd.cdxj,
|
||||
minimal=cmd.minimal_cdxj)
|
||||
|
@ -49,7 +49,8 @@ class ArcWarcRecordLoader:
|
||||
|
||||
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
|
||||
|
||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
|
||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
||||
verify_http=True):
|
||||
if not loader:
|
||||
loader = BlockLoader(cookie_maker)
|
||||
|
||||
@ -59,9 +60,9 @@ class ArcWarcRecordLoader:
|
||||
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
||||
|
||||
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
|
||||
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)
|
||||
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
|
||||
|
||||
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
|
||||
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
|
||||
|
||||
def load(self, url, offset, length):
|
||||
""" Load a single record from given url at offset with length
|
||||
|
Loading…
x
Reference in New Issue
Block a user