mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-28 00:25:21 +01:00
warc load: make http response/request protocol/verb validation optional
Validation is enabled for replay and disabled by default for cdx-indexing, where it can be turned on with the -v option (#99).
This commit is contained in:
parent
28e3cd791b
commit
08064f3806
@ -133,8 +133,9 @@ class StatusAndHeadersParser(object):
|
|||||||
Parser which consumes a stream support readline() to read
|
Parser which consumes a stream support readline() to read
|
||||||
status and headers and return a StatusAndHeaders object
|
status and headers and return a StatusAndHeaders object
|
||||||
"""
|
"""
|
||||||
def __init__(self, statuslist):
|
def __init__(self, statuslist, verify=True):
|
||||||
self.statuslist = statuslist
|
self.statuslist = statuslist
|
||||||
|
self.verify = verify
|
||||||
|
|
||||||
def parse(self, stream, full_statusline=None):
|
def parse(self, stream, full_statusline=None):
|
||||||
"""
|
"""
|
||||||
@ -160,12 +161,16 @@ class StatusAndHeadersParser(object):
|
|||||||
protocol='',
|
protocol='',
|
||||||
total_len=total_read)
|
total_len=total_read)
|
||||||
|
|
||||||
|
# validate only if verify is set
|
||||||
|
if self.verify:
|
||||||
protocol_status = self.split_prefix(statusline, self.statuslist)
|
protocol_status = self.split_prefix(statusline, self.statuslist)
|
||||||
|
|
||||||
if not protocol_status:
|
if not protocol_status:
|
||||||
msg = 'Expected Status Line starting with {0} - Found: {1}'
|
msg = 'Expected Status Line starting with {0} - Found: {1}'
|
||||||
msg = msg.format(self.statuslist, statusline)
|
msg = msg.format(self.statuslist, statusline)
|
||||||
raise StatusAndHeadersParserException(msg, full_statusline)
|
raise StatusAndHeadersParserException(msg, full_statusline)
|
||||||
|
else:
|
||||||
|
protocol_status = statusline.split(' ', 1)
|
||||||
|
|
||||||
line, total_read = _strip_count(stream.readline(), total_read)
|
line, total_read = _strip_count(stream.readline(), total_read)
|
||||||
while line:
|
while line:
|
||||||
|
@ -45,10 +45,11 @@ class ArchiveIterator(object):
|
|||||||
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, fileobj, no_record_parse=False):
|
def __init__(self, fileobj, no_record_parse=False,
|
||||||
|
verify_http=False):
|
||||||
self.fh = fileobj
|
self.fh = fileobj
|
||||||
|
|
||||||
self.loader = ArcWarcRecordLoader()
|
self.loader = ArcWarcRecordLoader(verify_http=verify_http)
|
||||||
self.reader = None
|
self.reader = None
|
||||||
|
|
||||||
self.offset = 0
|
self.offset = 0
|
||||||
@ -445,7 +446,8 @@ class DefaultRecordIter(object):
|
|||||||
return entry
|
return entry
|
||||||
|
|
||||||
def __call__(self, fh):
|
def __call__(self, fh):
|
||||||
aiter = ArchiveIterator(fh, self.options.get('minimal', False))
|
aiter = ArchiveIterator(fh, self.options.get('minimal', False),
|
||||||
|
self.options.get('verify_http', False))
|
||||||
|
|
||||||
entry_iter = self.create_record_iter(aiter)
|
entry_iter = self.create_record_iter(aiter)
|
||||||
|
|
||||||
|
@ -311,6 +311,12 @@ Sort the output to each file before writing to create a total ordering
|
|||||||
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
|
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
|
||||||
urls for the cdx key. Default is to use SURT keys.
|
urls for the cdx key. Default is to use SURT keys.
|
||||||
Not-recommended for new cdx, use only for backwards-compatibility.
|
Not-recommended for new cdx, use only for backwards-compatibility.
|
||||||
|
"""
|
||||||
|
|
||||||
|
verify_help = """
|
||||||
|
Verify HTTP protocol (1.0/1.1) status in response records and http verb
|
||||||
|
on request records, ensuring the protocol or verb matches the expected list.
|
||||||
|
Raise an exception on failure. (This was previously the default behavior).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cdx09_help = """
|
cdx09_help = """
|
||||||
@ -391,6 +397,10 @@ instead of current working directory
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
help=unsurt_help)
|
help=unsurt_help)
|
||||||
|
|
||||||
|
parser.add_argument('-v', '--verify',
|
||||||
|
action='store_true',
|
||||||
|
help=verify_help)
|
||||||
|
|
||||||
group = parser.add_mutually_exclusive_group()
|
group = parser.add_mutually_exclusive_group()
|
||||||
group.add_argument('-9', '--cdx09',
|
group.add_argument('-9', '--cdx09',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
@ -416,6 +426,7 @@ instead of current working directory
|
|||||||
append_post=cmd.postappend,
|
append_post=cmd.postappend,
|
||||||
recurse=cmd.recurse,
|
recurse=cmd.recurse,
|
||||||
rel_root=cmd.dir_root,
|
rel_root=cmd.dir_root,
|
||||||
|
verify_http=cmd.verify,
|
||||||
cdx09=cmd.cdx09,
|
cdx09=cmd.cdx09,
|
||||||
cdxj=cmd.cdxj,
|
cdxj=cmd.cdxj,
|
||||||
minimal=cmd.minimal_cdxj)
|
minimal=cmd.minimal_cdxj)
|
||||||
|
@ -49,7 +49,8 @@ class ArcWarcRecordLoader:
|
|||||||
|
|
||||||
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
|
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
|
||||||
|
|
||||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
|
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
||||||
|
verify_http=True):
|
||||||
if not loader:
|
if not loader:
|
||||||
loader = BlockLoader(cookie_maker)
|
loader = BlockLoader(cookie_maker)
|
||||||
|
|
||||||
@ -59,9 +60,9 @@ class ArcWarcRecordLoader:
|
|||||||
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
||||||
|
|
||||||
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
|
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
|
||||||
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)
|
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
|
||||||
|
|
||||||
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
|
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
|
||||||
|
|
||||||
def load(self, url, offset, length):
|
def load(self, url, offset, length):
|
||||||
""" Load a single record from given url at offset with length
|
""" Load a single record from given url at offset with length
|
||||||
|
Loading…
x
Reference in New Issue
Block a user