1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-28 16:42:29 +01:00

warc load: make http response/request protocol/verb validation optional

Validation stays enabled for replay and is disabled by default for cdx-indexing,
where it can be turned back on with the -v option (#99).
This commit is contained in:
Ilya Kreymer 2015-04-20 08:29:18 -07:00
parent 28e3cd791b
commit 08064f3806
4 changed files with 31 additions and 12 deletions

View File

@ -133,8 +133,9 @@ class StatusAndHeadersParser(object):
Parser which consumes a stream support readline() to read Parser which consumes a stream support readline() to read
status and headers and return a StatusAndHeaders object status and headers and return a StatusAndHeaders object
""" """
def __init__(self, statuslist): def __init__(self, statuslist, verify=True):
self.statuslist = statuslist self.statuslist = statuslist
self.verify = verify
def parse(self, stream, full_statusline=None): def parse(self, stream, full_statusline=None):
""" """
@ -160,12 +161,16 @@ class StatusAndHeadersParser(object):
protocol='', protocol='',
total_len=total_read) total_len=total_read)
protocol_status = self.split_prefix(statusline, self.statuslist) # validate only if verify is set
if self.verify:
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status: if not protocol_status:
msg = 'Expected Status Line starting with {0} - Found: {1}' msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline) msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, full_statusline) raise StatusAndHeadersParserException(msg, full_statusline)
else:
protocol_status = statusline.split(' ', 1)
line, total_read = _strip_count(stream.readline(), total_read) line, total_read = _strip_count(stream.readline(), total_read)
while line: while line:

View File

@ -45,10 +45,11 @@ class ArchiveIterator(object):
warc2warc -Z myfile.{0} > myfile.{0}.gz warc2warc -Z myfile.{0} > myfile.{0}.gz
""" """
def __init__(self, fileobj, no_record_parse=False): def __init__(self, fileobj, no_record_parse=False,
verify_http=False):
self.fh = fileobj self.fh = fileobj
self.loader = ArcWarcRecordLoader() self.loader = ArcWarcRecordLoader(verify_http=verify_http)
self.reader = None self.reader = None
self.offset = 0 self.offset = 0
@ -445,7 +446,8 @@ class DefaultRecordIter(object):
return entry return entry
def __call__(self, fh): def __call__(self, fh):
aiter = ArchiveIterator(fh, self.options.get('minimal', False)) aiter = ArchiveIterator(fh, self.options.get('minimal', False),
self.options.get('verify_http', False))
entry_iter = self.create_record_iter(aiter) entry_iter = self.create_record_iter(aiter)

View File

@ -311,6 +311,12 @@ Sort the output to each file before writing to create a total ordering
Convert SURT (Sort-friendly URI Reordering Transform) back to regular Convert SURT (Sort-friendly URI Reordering Transform) back to regular
urls for the cdx key. Default is to use SURT keys. urls for the cdx key. Default is to use SURT keys.
Not-recommended for new cdx, use only for backwards-compatibility. Not-recommended for new cdx, use only for backwards-compatibility.
"""
verify_help = """
Verify HTTP protocol (1.0/1.1) status in response records and http verb
on request records, ensuring the protocol or verb matches the expected list.
Raise an exception on failure. (This was previously the default behavior).
""" """
cdx09_help = """ cdx09_help = """
@ -391,6 +397,10 @@ instead of current working directory
action='store_true', action='store_true',
help=unsurt_help) help=unsurt_help)
parser.add_argument('-v', '--verify',
action='store_true',
help=verify_help)
group = parser.add_mutually_exclusive_group() group = parser.add_mutually_exclusive_group()
group.add_argument('-9', '--cdx09', group.add_argument('-9', '--cdx09',
action='store_true', action='store_true',
@ -416,6 +426,7 @@ instead of current working directory
append_post=cmd.postappend, append_post=cmd.postappend,
recurse=cmd.recurse, recurse=cmd.recurse,
rel_root=cmd.dir_root, rel_root=cmd.dir_root,
verify_http=cmd.verify,
cdx09=cmd.cdx09, cdx09=cmd.cdx09,
cdxj=cmd.cdxj, cdxj=cmd.cdxj,
minimal=cmd.minimal_cdxj) minimal=cmd.minimal_cdxj)

View File

@ -49,7 +49,8 @@ class ArcWarcRecordLoader:
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource') NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
def __init__(self, loader=None, cookie_maker=None, block_size=8192): def __init__(self, loader=None, cookie_maker=None, block_size=8192,
verify_http=True):
if not loader: if not loader:
loader = BlockLoader(cookie_maker) loader = BlockLoader(cookie_maker)
@ -59,9 +60,9 @@ class ArcWarcRecordLoader:
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES) self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES) self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS) self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
def load(self, url, offset, length): def load(self, url, offset, length):
""" Load a single record from given url at offset with length """ Load a single record from given url at offset with length