mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'develop' for 0.6.5
This commit is contained in:
commit
238a45bcbe
17
CHANGES.rst
17
CHANGES.rst
@ -1,3 +1,20 @@
|
||||
pywb 0.6.5 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* fix static handling when content type cannot be guessed: default to 'application/octet-stream'
|
||||
|
||||
* rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly
|
||||
|
||||
* rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as //example.com
|
||||
|
||||
* cookies: add exact cookie rewriter which sets cookie to exact url only, never collection or host root
|
||||
|
||||
* don't rewrite rel=canonical links for services which rely on these
|
||||
|
||||
* cdx-indexer: Detect non-chunked gzip .warc.gz/.arc.gz archive files and show a meaningful
|
||||
error message explaining how to fix the issue (uncompress and possibly use warctools warc2warc to recompress)
|
||||
|
||||
|
||||
pywb 0.6.4 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
PyWb 0.6.4
|
||||
PyWb 0.6.5
|
||||
==========
|
||||
|
||||
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
|
||||
|
@ -131,7 +131,7 @@ class WbRequest(object):
|
||||
if not self.wb_url:
|
||||
return
|
||||
|
||||
mime = self.env.get('CONTENT_TYPE').split(';')[0]
|
||||
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
|
||||
length = self.env.get('CONTENT_LENGTH')
|
||||
stream = self.env['wsgi.input']
|
||||
|
||||
|
@ -55,6 +55,24 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
return morsel
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
"""
|
||||
Rewrite cookies only using exact path, useful for live rewrite
|
||||
without a timestamp and to minimize cookie pollution
|
||||
|
||||
If path or domain present, simply remove
|
||||
"""
|
||||
|
||||
def rewrite_cookie(self, name, morsel):
|
||||
if morsel.get('domain'):
|
||||
del morsel['domain']
|
||||
# also remove path, if present
|
||||
if morsel.get('path'):
|
||||
del morsel['path']
|
||||
|
||||
self._remove_age_opts(morsel)
|
||||
return morsel
|
||||
#=================================================================
|
||||
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
"""
|
||||
@ -79,5 +97,7 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
def get_cookie_rewriter(cookie_scope):
|
||||
if cookie_scope == 'root':
|
||||
return RootScopeCookieRewriter
|
||||
elif cookie_scope == 'exact':
|
||||
return ExactPathCookieRewriter
|
||||
else:
|
||||
return MinimalScopeCookieRewriter
|
||||
|
@ -174,6 +174,12 @@ class HTMLRewriterMixin(object):
|
||||
elif attr_name == 'crossorigin':
|
||||
attr_name = '_crossorigin'
|
||||
|
||||
# special case: don't rewrite href of link rel=canonical
|
||||
elif tag == 'link' and attr_name == 'href':
|
||||
if not self.has_attr(tag_attrs, ('rel', 'canonical')):
|
||||
rw_mod = handler.get(attr_name)
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
# special case: meta tag
|
||||
elif (tag == 'meta') and (attr_name == 'content'):
|
||||
if self.has_attr(tag_attrs, ('http-equiv', 'refresh')):
|
||||
|
@ -1,4 +1,5 @@
|
||||
r"""
|
||||
# Default -- MinimalScopeRewriter
|
||||
# No rewriting
|
||||
>>> rewrite_cookie('a=b; c=d;')
|
||||
[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')]
|
||||
@ -23,10 +24,17 @@ r"""
|
||||
>>> rewrite_cookie('abc@def=123')
|
||||
[]
|
||||
|
||||
# ExactPathCookieRewriter
|
||||
>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter)
|
||||
[('Set-Cookie', 'some=value')]
|
||||
|
||||
>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter)
|
||||
[('Set-Cookie', 'some=value')]
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
|
||||
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||
@ -34,6 +42,6 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
|
||||
urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
|
||||
|
||||
|
||||
def rewrite_cookie(cookie_str, rewriter=urlrewriter):
|
||||
return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)
|
||||
def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter):
|
||||
return cookie_rewriter(rewriter).rewrite(cookie_str)
|
||||
|
||||
|
@ -102,6 +102,10 @@ ur"""
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
|
||||
|
||||
# don't rewrite rel=canonical
|
||||
>>> parse('<link rel=canonical href="http://example.com/">')
|
||||
<link rel="canonical" href="http://example.com/">
|
||||
|
||||
# doctype
|
||||
>>> parse('<!doctype html PUBLIC "public">')
|
||||
<!doctype html PUBLIC "public">
|
||||
|
@ -45,6 +45,16 @@ r"""
|
||||
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
|
||||
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
|
||||
|
||||
# protocol-rel escapes
|
||||
>>> _test_js('"//example.com/"')
|
||||
'"/web/20131010/http://example.com/"'
|
||||
|
||||
>>> _test_js(r'"\/\/example.com/"')
|
||||
'"/web/20131010/http:\\/\\/example.com/"'
|
||||
|
||||
>>> _test_js(r'"\\/\\/example.com/"')
|
||||
'"/web/20131010/http:\\\\/\\\\/example.com/"'
|
||||
|
||||
# custom rules added
|
||||
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
|
||||
'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */'
|
||||
|
@ -50,6 +50,21 @@
|
||||
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
||||
|
||||
>>> do_rewrite(r'//some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http://some-other-site.com'
|
||||
|
||||
>>> do_rewrite(r'\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
||||
|
||||
>>> do_rewrite(r'\\/\\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
||||
|
||||
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
||||
|
||||
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
||||
|
||||
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
|
||||
'/2020/http://example.com/other.html'
|
||||
|
||||
|
@ -26,6 +26,13 @@
|
||||
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
|
||||
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
|
||||
|
||||
# Test scheme partially encoded urls
|
||||
>>> repr(WbUrl('https%3A//example.com/'))
|
||||
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
|
||||
|
||||
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
|
||||
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
|
||||
|
||||
# Query Urls
|
||||
# ======================
|
||||
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
|
||||
@ -57,6 +64,21 @@
|
||||
>>> repr(WbUrl('/example.com/'))
|
||||
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
|
||||
|
||||
# Is_ Tests
|
||||
>>> u = WbUrl('*/http://example.com/abc?def=a*')
|
||||
>>> u.is_url_query()
|
||||
True
|
||||
|
||||
>>> u.is_query()
|
||||
True
|
||||
|
||||
>>> u2 = WbUrl('20130102im_/https:/example.com')
|
||||
>>> u2.is_embed
|
||||
True
|
||||
|
||||
>>> u2.is_replay()
|
||||
True
|
||||
|
||||
|
||||
# Error Urls
|
||||
# ======================
|
||||
|
@ -17,6 +17,8 @@ class UrlRewriter(object):
|
||||
|
||||
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||
|
||||
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
|
||||
|
||||
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
|
||||
root_path=None, cookie_scope=None):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
@ -45,7 +47,7 @@ class UrlRewriter(object):
|
||||
|
||||
is_abs = any(url.startswith(x) for x in self.PROTOCOLS)
|
||||
|
||||
if url.startswith('//'):
|
||||
if url.startswith(self.REL_SCHEME):
|
||||
is_abs = True
|
||||
url = 'http:' + url
|
||||
|
||||
|
@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl):
|
||||
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')
|
||||
|
||||
DEFAULT_SCHEME = 'http://'
|
||||
|
||||
PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
|
||||
|
||||
# ======================
|
||||
|
||||
def __init__(self, url):
|
||||
@ -99,6 +102,14 @@ class WbUrl(BaseWbUrl):
|
||||
# protocol agnostic url -> http://
|
||||
# no protocol -> http://
|
||||
inx = self.url.find(':/')
|
||||
if inx < 0:
|
||||
# check for other partially encoded variants
|
||||
m = self.PARTIAL_ENC_RX.match(self.url)
|
||||
if m:
|
||||
len_ = len(m.group(0))
|
||||
self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
|
||||
inx = self.url.find(':/')
|
||||
|
||||
if inx < 0:
|
||||
self.url = self.DEFAULT_SCHEME + self.url
|
||||
else:
|
||||
|
@ -21,6 +21,25 @@ class ArchiveIterator(object):
|
||||
|
||||
"""
|
||||
|
||||
GZIP_ERR_MSG = """
|
||||
ERROR: Non-chunked gzip file detected, gzip block continues
|
||||
beyond single record.
|
||||
|
||||
This file is probably not a multi-chunk gzip but a single gzip file.
|
||||
|
||||
To allow seek, a gzipped {1} must have each record compressed into
|
||||
a single gzip chunk and concatenated together.
|
||||
|
||||
This file is likely still valid and you can use it by decompressing it:
|
||||
|
||||
gunzip myfile.{0}.gz
|
||||
|
||||
You can then also use the 'warc2warc' tool from the 'warc-tools'
|
||||
package which will create a properly chunked gzip file:
|
||||
|
||||
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
||||
"""
|
||||
|
||||
def __init__(self, fileobj):
|
||||
self.fh = fileobj
|
||||
|
||||
@ -42,27 +61,34 @@ class ArchiveIterator(object):
|
||||
block_size=block_size)
|
||||
self.offset = self.fh.tell()
|
||||
|
||||
next_line = None
|
||||
self.next_line = None
|
||||
|
||||
is_valid = True
|
||||
|
||||
while True:
|
||||
try:
|
||||
record = self._next_record(next_line)
|
||||
record = self._next_record(self.next_line)
|
||||
if not is_valid:
|
||||
self._raise_err()
|
||||
|
||||
yield record
|
||||
except EOFError:
|
||||
break
|
||||
|
||||
self.read_to_end(record)
|
||||
|
||||
# for non-compressed, consume blank lines here
|
||||
if not self.reader.decompressor:
|
||||
next_line = self._consume_blanklines()
|
||||
if next_line is None:
|
||||
# at end of file
|
||||
break
|
||||
if self.reader.decompressor:
|
||||
is_valid = self.reader.read_next_member()
|
||||
|
||||
# reset reader for next member
|
||||
else:
|
||||
self.reader.read_next_member()
|
||||
def _raise_err(self):
|
||||
frmt = 'warc/arc'
|
||||
if self.known_format:
|
||||
frmt = self.known_format
|
||||
|
||||
frmt_up = frmt.upper()
|
||||
|
||||
msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
|
||||
raise Exception(msg)
|
||||
|
||||
def _consume_blanklines(self):
|
||||
""" Consume blank lines that are between records
|
||||
@ -72,25 +98,31 @@ class ArchiveIterator(object):
|
||||
and are included in record length which is the full gzip envelope
|
||||
- For uncompressed, they are between records and so are NOT part of
|
||||
the record length
|
||||
|
||||
count empty_size so that it can be subtracted from
|
||||
the record length for uncompressed
|
||||
"""
|
||||
empty_size = 0
|
||||
while True:
|
||||
line = self.reader.readline()
|
||||
if len(line) == 0:
|
||||
return None
|
||||
return None, empty_size
|
||||
|
||||
if line.rstrip() == '':
|
||||
self.offset = self.fh.tell() - self.reader.rem_length()
|
||||
empty_size += len(line)
|
||||
continue
|
||||
|
||||
return line
|
||||
return line, empty_size
|
||||
|
||||
def read_to_end(self, record, compute_digest=False):
|
||||
""" Read remainder of the stream
|
||||
If a digester is included, update it
|
||||
with the data read
|
||||
"""
|
||||
|
||||
# already at end of this record, don't read until it is consumed
|
||||
if self.member_info:
|
||||
return self.member_info
|
||||
return None
|
||||
|
||||
if compute_digest:
|
||||
digester = hashlib.sha1()
|
||||
@ -114,19 +146,29 @@ class ArchiveIterator(object):
|
||||
- For uncompressed files, blank lines are read later,
|
||||
and not included in the record length
|
||||
"""
|
||||
if self.reader.decompressor:
|
||||
self._consume_blanklines()
|
||||
#if self.reader.decompressor:
|
||||
self.next_line, empty_size = self._consume_blanklines()
|
||||
|
||||
self.offset = self.fh.tell() - self.reader.rem_length()
|
||||
#if self.offset < 0:
|
||||
# raise Exception('Not Gzipped Properly')
|
||||
|
||||
if self.next_line:
|
||||
self.offset -= len(self.next_line)
|
||||
|
||||
length = self.offset - curr_offset
|
||||
|
||||
if not self.reader.decompressor:
|
||||
length -= empty_size
|
||||
|
||||
if compute_digest:
|
||||
digest = base64.b32encode(digester.digest())
|
||||
else:
|
||||
digest = None
|
||||
|
||||
self.member_info = (curr_offset, length, digest)
|
||||
return self.member_info
|
||||
#return self.member_info
|
||||
#return next_line
|
||||
|
||||
def _next_record(self, next_line):
|
||||
""" Use loader to parse the record from the reader stream
|
||||
@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options):
|
||||
|
||||
entry.post_query = post_query
|
||||
|
||||
entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
|
||||
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
|
||||
arcv_iter.read_to_end(record, compute_digest)
|
||||
entry.set_rec_info(*arcv_iter.member_info)
|
||||
entry.record = record
|
||||
|
||||
yield entry
|
||||
|
@ -144,7 +144,6 @@ Total: 4
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
|
||||
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
|
||||
|
||||
from io import BytesIO
|
||||
@ -154,6 +153,9 @@ import os
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
from pytest import raises
|
||||
|
||||
|
||||
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
||||
TEST_WARC_DIR = get_test_dir() + 'warcs/'
|
||||
|
||||
@ -231,3 +233,11 @@ def cli_lines_with_dir(input_):
|
||||
print('Total: ' + str(len(lines)))
|
||||
|
||||
|
||||
def test_non_chunked_gzip_err():
|
||||
with raises(Exception):
|
||||
print_cdx_index('example-bad.warc.gz.bad')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -193,7 +193,11 @@ class StaticHandler(BaseHandler):
|
||||
else:
|
||||
reader = iter(lambda: data.read(), '')
|
||||
|
||||
content_type, _ = mimetypes.guess_type(full_path)
|
||||
content_type = 'application/octet-stream'
|
||||
|
||||
guessed = mimetypes.guess_type(full_path)
|
||||
if guessed[0]:
|
||||
content_type = guessed[0]
|
||||
|
||||
return WbResponse.text_stream(data,
|
||||
content_type=content_type,
|
||||
|
BIN
sample_archive/warcs/example-bad.warc.gz.bad
Normal file
BIN
sample_archive/warcs/example-bad.warc.gz.bad
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user