diff --git a/CHANGES.rst b/CHANGES.rst index ae02ded7..5bde2903 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,18 @@ Video Buffering Replay pywb 0.6.4 changelist ~~~~~~~~~~~~~~~~~~~~~ +* Ignore bad multiline headers in warc. + +* Rewrite fix: Don't parse html entities in HTML rewriter. + +* Ensure cdx iterator closed when reading. + +* Rewrite fix: remove pywb prefix from any query params. + +* Rewrite fix: better JS rewriting, avoid // comments when matching protocol-relative urls. + +* WARC metadata and resource records included in cdx from cdx-indexer by default + pywb 0.6.3 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 70ba850c..3f429814 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -169,7 +169,8 @@ class StatusAndHeadersParser(object): # append continuation lines, if any while next_line and next_line.startswith((' ', '\t')): - value += next_line + if value is not None: + value += next_line next_line, total_read = _strip_count(stream.readline(), total_read) diff --git a/pywb/utils/test/test_statusandheaders.py b/pywb/utils/test/test_statusandheaders.py index 2ee894b9..1929d17d 100644 --- a/pywb/utils/test/test_statusandheaders.py +++ b/pywb/utils/test/test_statusandheaders.py @@ -32,6 +32,10 @@ False # empty >>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2 StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []) + + +>>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_3)) +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')]) """ @@ -54,6 +58,14 @@ status_headers_2 = """ """ +status_headers_3 = "\ +HTTP/1.0 204 Empty\r\n\ +Content-Type: Value\r\n\ +%Invalid%\r\n\ +\tMultiline\r\n\ +Content-Length: 0\r\n\ +\r\n" + if __name__ == "__main__": import doctest 
diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index e1408432..e0994a7f 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -21,6 +21,25 @@ class ArchiveIterator(object): """ + GZIP_ERR_MSG = """ + ERROR: Non-chunked gzip file detected, gzip block continues + beyond single record. + + This file is probably not a multi-chunk gzip but a single gzip file. + + To allow seek, a gzipped {1} must have each record compressed into + a single gzip chunk and concatenated together. + + This file is likely still valid and you can use it by decompressing it: + + gunzip myfile.{0}.gz + + You can then also use the 'warc2warc' tool from the 'warc-tools' + package which will create a properly chunked gzip file: + + warc2warc -Z myfile.{0} > myfile.{0}.gz + """ + def __init__(self, fileobj): self.fh = fileobj @@ -42,27 +61,34 @@ class ArchiveIterator(object): block_size=block_size) self.offset = self.fh.tell() - next_line = None + self.next_line = None + + is_valid = True while True: try: - record = self._next_record(next_line) + record = self._next_record(self.next_line) + if not is_valid: + self._raise_err() + yield record except EOFError: break self.read_to_end(record) - # for non-compressed, consume blank lines here - if not self.reader.decompressor: - next_line = self._consume_blanklines() - if next_line is None: - # at end of file - break + if self.reader.decompressor: + is_valid = self.reader.read_next_member() - # reset reader for next member - else: - self.reader.read_next_member() + def _raise_err(self): + frmt = 'warc/arc' + if self.known_format: + frmt = self.known_format + + frmt_up = frmt.upper() + + msg = self.GZIP_ERR_MSG.format(frmt, frmt_up) + raise Exception(msg) def _consume_blanklines(self): """ Consume blank lines that are between records @@ -72,25 +98,31 @@ class ArchiveIterator(object): and are included in record length which is the full gzip envelope - For uncompressed, they are between records and so are 
NOT part of the record length + + count empty_size so that it can be subtracted from + the record length for uncompressed """ + empty_size = 0 while True: line = self.reader.readline() if len(line) == 0: - return None + return None, empty_size if line.rstrip() == '': - self.offset = self.fh.tell() - self.reader.rem_length() + empty_size += len(line) continue - return line + return line, empty_size def read_to_end(self, record, compute_digest=False): """ Read remainder of the stream If a digester is included, update it with the data read """ + + # already at end of this record, don't read until it is consumed if self.member_info: - return self.member_info + return None if compute_digest: digester = hashlib.sha1() @@ -114,19 +146,29 @@ - For uncompressed files, blank lines are read later, and not included in the record length """ - if self.reader.decompressor: - self._consume_blanklines() + #if self.reader.decompressor: + self.next_line, empty_size = self._consume_blanklines() self.offset = self.fh.tell() - self.reader.rem_length() + #if self.offset < 0: + # raise Exception('Not Gzipped Properly') + + if self.next_line: + self.offset -= len(self.next_line) + length = self.offset - curr_offset + if not self.reader.decompressor: + length -= empty_size + if compute_digest: digest = base64.b32encode(digester.digest()) else: digest = None self.member_info = (curr_offset, length, digest) - return self.member_info + #return self.member_info + #return next_line def _next_record(self, next_line): """ Use loader to parse the record from the reader stream @@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options): entry.post_query = post_query - entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest)) + #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest)) + arcv_iter.read_to_end(record, compute_digest) + entry.set_rec_info(*arcv_iter.member_info) entry.record = record yield entry diff --git 
a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index cb8dc4bb..c8584c8d 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -144,7 +144,6 @@ Total: 4 from pywb import get_test_dir -#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename from io import BytesIO @@ -154,6 +153,9 @@ import os import shutil import tempfile +from pytest import raises + + TEST_CDX_DIR = get_test_dir() + 'cdx/' TEST_WARC_DIR = get_test_dir() + 'warcs/' @@ -231,3 +233,11 @@ def cli_lines_with_dir(input_): print('Total: ' + str(len(lines))) +def test_non_chunked_gzip_err(): + with raises(Exception): + print_cdx_index('example-bad.warc.gz.bad') + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/sample_archive/warcs/example-bad.warc.gz.bad b/sample_archive/warcs/example-bad.warc.gz.bad new file mode 100644 index 00000000..95d2c415 Binary files /dev/null and b/sample_archive/warcs/example-bad.warc.gz.bad differ