diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 0ecfa59a..9fb1ea84 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -133,8 +133,9 @@ class StatusAndHeadersParser(object): Parser which consumes a stream support readline() to read status and headers and return a StatusAndHeaders object """ - def __init__(self, statuslist): + def __init__(self, statuslist, verify=True): self.statuslist = statuslist + self.verify = verify def parse(self, stream, full_statusline=None): """ @@ -160,12 +161,16 @@ class StatusAndHeadersParser(object): protocol='', total_len=total_read) - protocol_status = self.split_prefix(statusline, self.statuslist) + # validate only if verify is set + if self.verify: + protocol_status = self.split_prefix(statusline, self.statuslist) - if not protocol_status: - msg = 'Expected Status Line starting with {0} - Found: {1}' - msg = msg.format(self.statuslist, statusline) - raise StatusAndHeadersParserException(msg, full_statusline) + if not protocol_status: + msg = 'Expected Status Line starting with {0} - Found: {1}' + msg = msg.format(self.statuslist, statusline) + raise StatusAndHeadersParserException(msg, full_statusline) + else: + protocol_status = statusline.partition(' ')[::2] line, total_read = _strip_count(stream.readline(), total_read) while line: diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index c33ec313..07df7d8d 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -45,10 +45,11 @@ class ArchiveIterator(object): warc2warc -Z myfile.{0} > myfile.{0}.gz """ - def __init__(self, fileobj, no_record_parse=False): + def __init__(self, fileobj, no_record_parse=False, + verify_http=False): self.fh = fileobj - self.loader = ArcWarcRecordLoader() + self.loader = ArcWarcRecordLoader(verify_http=verify_http) self.reader = None self.offset = 0 @@ -445,7 +446,8 @@ class DefaultRecordIter(object): return entry def __call__(self, fh): - aiter = 
ArchiveIterator(fh, self.options.get('minimal', False)) + aiter = ArchiveIterator(fh, self.options.get('minimal', False), + self.options.get('verify_http', False)) entry_iter = self.create_record_iter(aiter) diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 06ef5176..28f9a023 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -311,6 +311,12 @@ Sort the output to each file before writing to create a total ordering Convert SURT (Sort-friendly URI Reordering Transform) back to regular urls for the cdx key. Default is to use SURT keys. Not-recommended for new cdx, use only for backwards-compatibility. +""" + + verify_help = """ +Verify HTTP protocol (1.0/1.1) status in response records and http verb +on request records, ensuring the protocol or verb matches the expected list. +Raise an exception on failure. (This was previously the default behavior). """ cdx09_help = """ @@ -391,6 +397,10 @@ instead of current working directory action='store_true', help=unsurt_help) + parser.add_argument('-v', '--verify', + action='store_true', + help=verify_help) + group = parser.add_mutually_exclusive_group() group.add_argument('-9', '--cdx09', action='store_true', @@ -416,6 +426,7 @@ instead of current working directory append_post=cmd.postappend, recurse=cmd.recurse, rel_root=cmd.dir_root, + verify_http=cmd.verify, cdx09=cmd.cdx09, cdxj=cmd.cdxj, minimal=cmd.minimal_cdxj) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index dae2fb9e..572429b5 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -49,7 +49,8 @@ class ArcWarcRecordLoader: NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource') - def __init__(self, loader=None, cookie_maker=None, block_size=8192): + def __init__(self, loader=None, cookie_maker=None, block_size=8192, + verify_http=True): if not loader: loader = BlockLoader(cookie_maker) @@ -59,9 +60,9 @@ class ArcWarcRecordLoader: self.arc_parser = 
ARCHeadersParser(self.ARC_HEADERS) self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES) - self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES) + self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http) - self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS) + self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) def load(self, url, offset, length): """ Load a single record from given url at offset with length