diff --git a/CHANGES.rst b/CHANGES.rst
index 1ddaeea2..0fe9ae07 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,3 +1,20 @@
+pywb 0.6.5 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* fix static handling when content type cannot be guessed: default to 'application/octet-stream'
+
+* rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly
+
+* rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as //example.com
+
+* cookies: add exact cookie rewriter which sets cookie to exact url only, never collection or host root
+
+* don't rewrite rel=canonical links for services which rely on these
+
+* cdx-indexer: detect non-chunked gzip .warc.gz/.arc.gz archive files (gzip block spans more than one record) and show a meaningful
+  error message explaining how to fix the issue (uncompress and possibly use warc-tools warc2warc to recompress)
+
+
pywb 0.6.4 changelist
~~~~~~~~~~~~~~~~~~~~~
diff --git a/README.rst b/README.rst
index 6a913b78..5150c8f8 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-PyWb 0.6.4
+PyWb 0.6.5
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index 808563ea..06970316 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -131,7 +131,7 @@ class WbRequest(object):
if not self.wb_url:
return
- mime = self.env.get('CONTENT_TYPE').split(';')[0]
+ mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']
diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py
index 4724df4c..e9dd80ac 100644
--- a/pywb/rewrite/cookie_rewriter.py
+++ b/pywb/rewrite/cookie_rewriter.py
@@ -55,6 +55,24 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
return morsel
+#=================================================================
+class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
+ """
+ Rewrite cookies only using exact path, useful for live rewrite
+ without a timestamp and to minimize cookie pollution
+
+ If a path or domain is present, simply remove it
+ """
+
+ def rewrite_cookie(self, name, morsel):
+ if morsel.get('domain'):
+ del morsel['domain']
+ # likewise remove any explicit path
+ if morsel.get('path'):
+ del morsel['path']
+
+ self._remove_age_opts(morsel)
+ return morsel
#=================================================================
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
"""
@@ -79,5 +97,7 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
def get_cookie_rewriter(cookie_scope):
if cookie_scope == 'root':
return RootScopeCookieRewriter
+ elif cookie_scope == 'exact':
+ return ExactPathCookieRewriter
else:
return MinimalScopeCookieRewriter
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index f0c904c2..618c5191 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -174,6 +174,12 @@ class HTMLRewriterMixin(object):
elif attr_name == 'crossorigin':
attr_name = '_crossorigin'
+ # special case: rewrite link href, except for rel=canonical links
+ elif tag == 'link' and attr_name == 'href':
+ if not self.has_attr(tag_attrs, ('rel', 'canonical')):
+ rw_mod = handler.get(attr_name)
+ attr_value = self._rewrite_url(attr_value, rw_mod)
+
# special case: meta tag
elif (tag == 'meta') and (attr_name == 'content'):
if self.has_attr(tag_attrs, ('http-equiv', 'refresh')):
diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py
index c20f56f9..4f57464f 100644
--- a/pywb/rewrite/test/test_cookie_rewriter.py
+++ b/pywb/rewrite/test/test_cookie_rewriter.py
@@ -1,4 +1,5 @@
r"""
+# Default -- MinimalScopeCookieRewriter
# No rewriting
>>> rewrite_cookie('a=b; c=d;')
[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')]
@@ -23,10 +24,17 @@ r"""
>>> rewrite_cookie('abc@def=123')
[]
+# ExactPathCookieRewriter
+>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter)
+[('Set-Cookie', 'some=value')]
+
+>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter)
+[('Set-Cookie', 'some=value')]
+
"""
-from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
+from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
@@ -34,6 +42,6 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
-def rewrite_cookie(cookie_str, rewriter=urlrewriter):
- return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)
+def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter):
+ return cookie_rewriter(rewriter).rewrite(cookie_str)
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index 45df4dfb..710fa338 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -102,6 +102,10 @@ ur"""
>>> parse('
SomeTest
', head_insert = '')
SomeTest
+# don't rewrite rel=canonical
+>>> parse('')
+
+
# doctype
>>> parse('')
diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py
index 253328e5..92975a7f 100644
--- a/pywb/rewrite/test/test_regex_rewriters.py
+++ b/pywb/rewrite/test/test_regex_rewriters.py
@@ -45,6 +45,16 @@ r"""
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
+# protocol-rel escapes
+>>> _test_js('"//example.com/"')
+'"/web/20131010/http://example.com/"'
+
+>>> _test_js(r'"\/\/example.com/"')
+'"/web/20131010/http:\\/\\/example.com/"'
+
+>>> _test_js(r'"\\/\\/example.com/"')
+'"/web/20131010/http:\\\\/\\\\/example.com/"'
+
# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */'
diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py
index be0ca7da..3d324069 100644
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@@ -50,6 +50,21 @@
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+>>> do_rewrite(r'//some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http://some-other-site.com'
+
+>>> do_rewrite(r'\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+
+>>> do_rewrite(r'\\/\\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+
+>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+
+>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
'/2020/http://example.com/other.html'
diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py
index bcad948e..b4d15b5d 100644
--- a/pywb/rewrite/test/test_wburl.py
+++ b/pywb/rewrite/test/test_wburl.py
@@ -26,6 +26,13 @@
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
+# Test scheme partially encoded urls
+>>> repr(WbUrl('https%3A//example.com/'))
+"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
+
+>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
+"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
+
# Query Urls
# ======================
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
@@ -57,6 +64,21 @@
>>> repr(WbUrl('/example.com/'))
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
+# Is_ Tests
+>>> u = WbUrl('*/http://example.com/abc?def=a*')
+>>> u.is_url_query()
+True
+
+>>> u.is_query()
+True
+
+>>> u2 = WbUrl('20130102im_/https:/example.com')
+>>> u2.is_embed
+True
+
+>>> u2.is_replay()
+True
+
# Error Urls
# ======================
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index 61a48e50..aa87260c 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -17,7 +17,9 @@ class UrlRewriter(object):
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
- def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
+ REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
+
+ def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
root_path=None, cookie_scope=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
@@ -45,7 +47,7 @@ class UrlRewriter(object):
is_abs = any(url.startswith(x) for x in self.PROTOCOLS)
- if url.startswith('//'):
+ if url.startswith(self.REL_SCHEME):
is_abs = True
url = 'http:' + url
diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py
index 91d36455..5421a1de 100644
--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl):
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')
DEFAULT_SCHEME = 'http://'
+
+ PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
+
# ======================
def __init__(self, url):
@@ -99,6 +102,14 @@ class WbUrl(BaseWbUrl):
# protocol agnostic url -> http://
# no protocol -> http://
inx = self.url.find(':/')
+ if inx < 0:
+ # check for other partially encoded variants
+ m = self.PARTIAL_ENC_RX.match(self.url)
+ if m:
+ len_ = len(m.group(0))
+ self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
+ inx = self.url.find(':/')
+
if inx < 0:
self.url = self.DEFAULT_SCHEME + self.url
else:
diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py
index e1408432..e0994a7f 100644
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -21,6 +21,25 @@ class ArchiveIterator(object):
"""
+ GZIP_ERR_MSG = """
+ ERROR: Non-chunked gzip file detected, gzip block continues
+ beyond single record.
+
+ This file is probably not a multi-chunk gzip but a single gzip file.
+
+ To allow seek, a gzipped {1} must have each record compressed into
+ a single gzip chunk and concatenated together.
+
+ This file is likely still valid and you can use it by decompressing it:
+
+ gunzip myfile.{0}.gz
+
+ You can then also use the 'warc2warc' tool from the 'warc-tools'
+ package which will create a properly chunked gzip file:
+
+ warc2warc -Z myfile.{0} > myfile.{0}.gz
+ """
+
def __init__(self, fileobj):
self.fh = fileobj
@@ -42,27 +61,34 @@ class ArchiveIterator(object):
block_size=block_size)
self.offset = self.fh.tell()
- next_line = None
+ self.next_line = None
+
+ is_valid = True
while True:
try:
- record = self._next_record(next_line)
+ record = self._next_record(self.next_line)
+ if not is_valid:
+ self._raise_err()
+
yield record
except EOFError:
break
self.read_to_end(record)
- # for non-compressed, consume blank lines here
- if not self.reader.decompressor:
- next_line = self._consume_blanklines()
- if next_line is None:
- # at end of file
- break
+ if self.reader.decompressor:
+ is_valid = self.reader.read_next_member()
- # reset reader for next member
- else:
- self.reader.read_next_member()
+ def _raise_err(self):
+ frmt = 'warc/arc'
+ if self.known_format:
+ frmt = self.known_format
+
+ frmt_up = frmt.upper()
+
+ msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
+ raise Exception(msg)
def _consume_blanklines(self):
""" Consume blank lines that are between records
@@ -72,25 +98,31 @@ class ArchiveIterator(object):
and are included in record length which is the full gzip envelope
- For uncompressed, they are between records and so are NOT part of
the record length
+
+ Count empty_size (total length of the blank lines read) so that it can be
+ subtracted from the record length for uncompressed files
"""
+ empty_size = 0
while True:
line = self.reader.readline()
if len(line) == 0:
- return None
+ return None, empty_size
if line.rstrip() == '':
- self.offset = self.fh.tell() - self.reader.rem_length()
+ empty_size += len(line)
continue
- return line
+ return line, empty_size
def read_to_end(self, record, compute_digest=False):
""" Read remainder of the stream
If a digester is included, update it
with the data read
"""
+
+ # already at end of this record, don't read until it is consumed
if self.member_info:
- return self.member_info
+ return None
if compute_digest:
digester = hashlib.sha1()
@@ -114,19 +146,29 @@ class ArchiveIterator(object):
- For uncompressed files, blank lines are read later,
and not included in the record length
"""
- if self.reader.decompressor:
- self._consume_blanklines()
+ #if self.reader.decompressor:
+ self.next_line, empty_size = self._consume_blanklines()
self.offset = self.fh.tell() - self.reader.rem_length()
+ #if self.offset < 0:
+ # raise Exception('Not Gzipped Properly')
+
+ if self.next_line:
+ self.offset -= len(self.next_line)
+
length = self.offset - curr_offset
+ if not self.reader.decompressor:
+ length -= empty_size
+
if compute_digest:
digest = base64.b32encode(digester.digest())
else:
digest = None
self.member_info = (curr_offset, length, digest)
- return self.member_info
+ #return self.member_info
+ #return next_line
def _next_record(self, next_line):
""" Use loader to parse the record from the reader stream
@@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options):
entry.post_query = post_query
- entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
+ #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
+ arcv_iter.read_to_end(record, compute_digest)
+ entry.set_rec_info(*arcv_iter.member_info)
entry.record = record
yield entry
diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py
index cb8dc4bb..c8584c8d 100644
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -144,7 +144,6 @@ Total: 4
from pywb import get_test_dir
-#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
from io import BytesIO
@@ -154,6 +153,9 @@ import os
import shutil
import tempfile
+from pytest import raises
+
+
TEST_CDX_DIR = get_test_dir() + 'cdx/'
TEST_WARC_DIR = get_test_dir() + 'warcs/'
@@ -231,3 +233,11 @@ def cli_lines_with_dir(input_):
print('Total: ' + str(len(lines)))
+def test_non_chunked_gzip_err():
+ with raises(Exception):
+ print_cdx_index('example-bad.warc.gz.bad')
+
+
+if __name__ == "__main__":
+ import doctest
+ doctest.testmod()
diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py
index 2c7962cc..a77f7060 100644
--- a/pywb/webapp/handlers.py
+++ b/pywb/webapp/handlers.py
@@ -40,7 +40,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
create_template(html, 'Frame Insert'))
self.banner_html = config.get('banner_html', 'banner.html')
-
+
if config.get('enable_memento', False):
self.response_class = MementoResponse
@@ -193,7 +193,11 @@ class StaticHandler(BaseHandler):
else:
reader = iter(lambda: data.read(), '')
- content_type, _ = mimetypes.guess_type(full_path)
+ content_type = 'application/octet-stream'
+
+ guessed = mimetypes.guess_type(full_path)
+ if guessed[0]:
+ content_type = guessed[0]
return WbResponse.text_stream(data,
content_type=content_type,
diff --git a/sample_archive/warcs/example-bad.warc.gz.bad b/sample_archive/warcs/example-bad.warc.gz.bad
new file mode 100644
index 00000000..95d2c415
Binary files /dev/null and b/sample_archive/warcs/example-bad.warc.gz.bad differ
diff --git a/setup.py b/setup.py
index 12225bee..b5ef2d26 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
- version='0.6.4',
+ version='0.6.5',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',