mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge branch 'develop' into video
This commit is contained in:
commit
1a91f514c5
12
CHANGES.rst
12
CHANGES.rst
@ -7,6 +7,18 @@ Video Buffering Replay
|
|||||||
pywb 0.6.4 changelist
|
pywb 0.6.4 changelist
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
* Ignore bad multiline headers in warc.
|
||||||
|
|
||||||
|
* Rewrite fix: Don't parse html entities in HTML rewriter.
|
||||||
|
|
||||||
|
* Ensure cdx iterator closed when reading.
|
||||||
|
|
||||||
|
* Rewrite fix: remove pywb prefix from any query params.
|
||||||
|
|
||||||
|
* Rewrite fix: better JS rewriting, avoid // comments when matching protocol-relative urls.
|
||||||
|
|
||||||
|
* WARC metadata and resource records included in cdx from cdx-indexer by default
|
||||||
|
|
||||||
|
|
||||||
pywb 0.6.3 changelist
|
pywb 0.6.3 changelist
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
@ -169,7 +169,8 @@ class StatusAndHeadersParser(object):
|
|||||||
|
|
||||||
# append continuation lines, if any
|
# append continuation lines, if any
|
||||||
while next_line and next_line.startswith((' ', '\t')):
|
while next_line and next_line.startswith((' ', '\t')):
|
||||||
value += next_line
|
if value is not None:
|
||||||
|
value += next_line
|
||||||
next_line, total_read = _strip_count(stream.readline(),
|
next_line, total_read = _strip_count(stream.readline(),
|
||||||
total_read)
|
total_read)
|
||||||
|
|
||||||
|
@ -32,6 +32,10 @@ False
|
|||||||
# empty
|
# empty
|
||||||
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
|
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
|
||||||
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
|
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
|
||||||
|
|
||||||
|
|
||||||
|
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_3))
|
||||||
|
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -54,6 +58,14 @@ status_headers_2 = """
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
status_headers_3 = "\
|
||||||
|
HTTP/1.0 204 Empty\r\n\
|
||||||
|
Content-Type: Value\r\n\
|
||||||
|
%Invalid%\r\n\
|
||||||
|
\tMultiline\r\n\
|
||||||
|
Content-Length: 0\r\n\
|
||||||
|
\r\n"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
@ -21,6 +21,25 @@ class ArchiveIterator(object):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
GZIP_ERR_MSG = """
|
||||||
|
ERROR: Non-chunked gzip file detected, gzip block continues
|
||||||
|
beyond single record.
|
||||||
|
|
||||||
|
This file is probably not a multi-chunk gzip but a single gzip file.
|
||||||
|
|
||||||
|
To allow seek, a gzipped {1} must have each record compressed into
|
||||||
|
a single gzip chunk and concatenated together.
|
||||||
|
|
||||||
|
This file is likely still valid and you can use it by decompressing it:
|
||||||
|
|
||||||
|
gunzip myfile.{0}.gz
|
||||||
|
|
||||||
|
You can then also use the 'warc2warc' tool from the 'warc-tools'
|
||||||
|
package which will create a properly chunked gzip file:
|
||||||
|
|
||||||
|
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, fileobj):
|
def __init__(self, fileobj):
|
||||||
self.fh = fileobj
|
self.fh = fileobj
|
||||||
|
|
||||||
@ -42,27 +61,34 @@ class ArchiveIterator(object):
|
|||||||
block_size=block_size)
|
block_size=block_size)
|
||||||
self.offset = self.fh.tell()
|
self.offset = self.fh.tell()
|
||||||
|
|
||||||
next_line = None
|
self.next_line = None
|
||||||
|
|
||||||
|
is_valid = True
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
record = self._next_record(next_line)
|
record = self._next_record(self.next_line)
|
||||||
|
if not is_valid:
|
||||||
|
self._raise_err()
|
||||||
|
|
||||||
yield record
|
yield record
|
||||||
except EOFError:
|
except EOFError:
|
||||||
break
|
break
|
||||||
|
|
||||||
self.read_to_end(record)
|
self.read_to_end(record)
|
||||||
|
|
||||||
# for non-compressed, consume blank lines here
|
if self.reader.decompressor:
|
||||||
if not self.reader.decompressor:
|
is_valid = self.reader.read_next_member()
|
||||||
next_line = self._consume_blanklines()
|
|
||||||
if next_line is None:
|
|
||||||
# at end of file
|
|
||||||
break
|
|
||||||
|
|
||||||
# reset reader for next member
|
def _raise_err(self):
|
||||||
else:
|
frmt = 'warc/arc'
|
||||||
self.reader.read_next_member()
|
if self.known_format:
|
||||||
|
frmt = self.known_format
|
||||||
|
|
||||||
|
frmt_up = frmt.upper()
|
||||||
|
|
||||||
|
msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
|
||||||
|
raise Exception(msg)
|
||||||
|
|
||||||
def _consume_blanklines(self):
|
def _consume_blanklines(self):
|
||||||
""" Consume blank lines that are between records
|
""" Consume blank lines that are between records
|
||||||
@ -72,25 +98,31 @@ class ArchiveIterator(object):
|
|||||||
and are included in record length which is the full gzip envelope
|
and are included in record length which is the full gzip envelope
|
||||||
- For uncompressed, they are between records and so are NOT part of
|
- For uncompressed, they are between records and so are NOT part of
|
||||||
the record length
|
the record length
|
||||||
|
|
||||||
|
count empty_size so that it can be subtracted from
|
||||||
|
the record length for uncompressed
|
||||||
"""
|
"""
|
||||||
|
empty_size = 0
|
||||||
while True:
|
while True:
|
||||||
line = self.reader.readline()
|
line = self.reader.readline()
|
||||||
if len(line) == 0:
|
if len(line) == 0:
|
||||||
return None
|
return None, empty_size
|
||||||
|
|
||||||
if line.rstrip() == '':
|
if line.rstrip() == '':
|
||||||
self.offset = self.fh.tell() - self.reader.rem_length()
|
empty_size += len(line)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return line
|
return line, empty_size
|
||||||
|
|
||||||
def read_to_end(self, record, compute_digest=False):
|
def read_to_end(self, record, compute_digest=False):
|
||||||
""" Read remainder of the stream
|
""" Read remainder of the stream
|
||||||
If a digester is included, update it
|
If a digester is included, update it
|
||||||
with the data read
|
with the data read
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# already at end of this record, don't read until it is consumed
|
||||||
if self.member_info:
|
if self.member_info:
|
||||||
return self.member_info
|
return None
|
||||||
|
|
||||||
if compute_digest:
|
if compute_digest:
|
||||||
digester = hashlib.sha1()
|
digester = hashlib.sha1()
|
||||||
@ -114,19 +146,29 @@ class ArchiveIterator(object):
|
|||||||
- For uncompressed files, blank lines are read later,
|
- For uncompressed files, blank lines are read later,
|
||||||
and not included in the record length
|
and not included in the record length
|
||||||
"""
|
"""
|
||||||
if self.reader.decompressor:
|
#if self.reader.decompressor:
|
||||||
self._consume_blanklines()
|
self.next_line, empty_size = self._consume_blanklines()
|
||||||
|
|
||||||
self.offset = self.fh.tell() - self.reader.rem_length()
|
self.offset = self.fh.tell() - self.reader.rem_length()
|
||||||
|
#if self.offset < 0:
|
||||||
|
# raise Exception('Not Gzipped Properly')
|
||||||
|
|
||||||
|
if self.next_line:
|
||||||
|
self.offset -= len(self.next_line)
|
||||||
|
|
||||||
length = self.offset - curr_offset
|
length = self.offset - curr_offset
|
||||||
|
|
||||||
|
if not self.reader.decompressor:
|
||||||
|
length -= empty_size
|
||||||
|
|
||||||
if compute_digest:
|
if compute_digest:
|
||||||
digest = base64.b32encode(digester.digest())
|
digest = base64.b32encode(digester.digest())
|
||||||
else:
|
else:
|
||||||
digest = None
|
digest = None
|
||||||
|
|
||||||
self.member_info = (curr_offset, length, digest)
|
self.member_info = (curr_offset, length, digest)
|
||||||
return self.member_info
|
#return self.member_info
|
||||||
|
#return next_line
|
||||||
|
|
||||||
def _next_record(self, next_line):
|
def _next_record(self, next_line):
|
||||||
""" Use loader to parse the record from the reader stream
|
""" Use loader to parse the record from the reader stream
|
||||||
@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options):
|
|||||||
|
|
||||||
entry.post_query = post_query
|
entry.post_query = post_query
|
||||||
|
|
||||||
entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
|
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
|
||||||
|
arcv_iter.read_to_end(record, compute_digest)
|
||||||
|
entry.set_rec_info(*arcv_iter.member_info)
|
||||||
entry.record = record
|
entry.record = record
|
||||||
|
|
||||||
yield entry
|
yield entry
|
||||||
|
@ -144,7 +144,6 @@ Total: 4
|
|||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
|
|
||||||
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
|
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -154,6 +153,9 @@ import os
|
|||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
from pytest import raises
|
||||||
|
|
||||||
|
|
||||||
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
||||||
TEST_WARC_DIR = get_test_dir() + 'warcs/'
|
TEST_WARC_DIR = get_test_dir() + 'warcs/'
|
||||||
|
|
||||||
@ -231,3 +233,11 @@ def cli_lines_with_dir(input_):
|
|||||||
print('Total: ' + str(len(lines)))
|
print('Total: ' + str(len(lines)))
|
||||||
|
|
||||||
|
|
||||||
|
def test_non_chunked_gzip_err():
|
||||||
|
with raises(Exception):
|
||||||
|
print_cdx_index('example-bad.warc.gz.bad')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
||||||
|
BIN
sample_archive/warcs/example-bad.warc.gz.bad
Normal file
BIN
sample_archive/warcs/example-bad.warc.gz.bad
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user