From 044792f99fcf7a279e1fe28370389e611037694b Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Thu, 6 Nov 2014 01:28:56 -0800
Subject: [PATCH 1/9] bump version to 0.6.5!

---
 README.rst | 4 ++--
 setup.py   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index 8e8fe103..30e9979e 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-PyWb 0.6.4
+PyWb 0.6.5
 ==========
 
 .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
@@ -7,7 +7,7 @@ PyWb 0.6.4
       :target: https://coveralls.io/r/ikreymer/pywb?branch=develop
 .. image:: https://img.shields.io/gratipay/ikreymer.svg
       :target: https://www.gratipay.com/ikreymer/
-      
+
 pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'.
 
 pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
diff --git a/setup.py b/setup.py
index 12225bee..b5ef2d26 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ class PyTest(TestCommand):
 
 setup(
     name='pywb',
-    version='0.6.4',
+    version='0.6.5',
     url='https://github.com/ikreymer/pywb',
     author='Ilya Kreymer',
     author_email='ikreymer@gmail.com',

From 49e98e0cdc3bab540d912fbf1e067bc73d335ed0 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Thu, 6 Nov 2014 01:29:14 -0800
Subject: [PATCH 2/9] archiveiterator/cdxindexer: cleaner load path for
 compressed and uncompressed, ability to distinguish between chunked and
 non-chunked warcs/arcs Raise error for non-chunked gzip warcs as they can not
 be indexed for replay, addressing #48 add 'bad' non-chunked gzip file for
 testing, using custom ext

---
 pywb/warc/archiveiterator.py                 |  82 ++++++++++++++-----
 pywb/warc/test/test_indexing.py              |  12 ++-
 sample_archive/warcs/example-bad.warc.gz.bad | Bin 0 -> 1950 bytes
 3 files changed, 74 insertions(+), 20 deletions(-)
 create mode 100644 sample_archive/warcs/example-bad.warc.gz.bad

diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py
index e1408432..e0994a7f 100644
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -21,6 +21,25 @@ class ArchiveIterator(object):
 
     """
 
+    GZIP_ERR_MSG = """
+    ERROR: Non-chunked gzip file detected, gzip block continues
+    beyond single record.
+
+    This file is probably not a multi-chunk gzip but a single gzip file.
+
+    To allow seek, a gzipped {1} must have each record compressed into
+    a single gzip chunk and concatenated together.
+
+    This file is likely still valid and you can use it by decompressing it:
+
+    gunzip myfile.{0}.gz
+
+    You can then also use the 'warc2warc' tool from the 'warc-tools'
+    package which will create a properly chunked gzip file:
+
+    warc2warc -Z myfile.{0} > myfile.{0}.gz
+    """
+
     def __init__(self, fileobj):
         self.fh = fileobj
 
@@ -42,27 +61,34 @@ class ArchiveIterator(object):
                                                   block_size=block_size)
         self.offset = self.fh.tell()
 
-        next_line = None
+        self.next_line = None
+
+        is_valid = True
 
         while True:
             try:
-                record = self._next_record(next_line)
+                record = self._next_record(self.next_line)
+                if not is_valid:
+                    self._raise_err()
+
                 yield record
             except EOFError:
                 break
 
             self.read_to_end(record)
 
-            # for non-compressed, consume blank lines here
-            if not self.reader.decompressor:
-                next_line = self._consume_blanklines()
-                if next_line is None:
-                    # at end of file
-                    break
+            if self.reader.decompressor:
+                is_valid = self.reader.read_next_member()
 
-            # reset reader for next member
-            else:
-                self.reader.read_next_member()
+    def _raise_err(self):
+        frmt = 'warc/arc'
+        if self.known_format:
+            frmt = self.known_format
+
+        frmt_up = frmt.upper()
+
+        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
+        raise Exception(msg)
 
     def _consume_blanklines(self):
         """ Consume blank lines that are between records
@@ -72,25 +98,31 @@ class ArchiveIterator(object):
           and are included in record length which is the full gzip envelope
         - For uncompressed, they are between records and so are NOT part of
           the record length
+
+          count empty_size so that it can be subtracted from
+          the record length for uncompressed
         """
+        empty_size = 0
         while True:
             line = self.reader.readline()
             if len(line) == 0:
-                return None
+                return None, empty_size
 
             if line.rstrip() == '':
-                self.offset = self.fh.tell() - self.reader.rem_length()
+                empty_size += len(line)
                 continue
 
-            return line
+            return line, empty_size
 
     def read_to_end(self, record, compute_digest=False):
         """ Read remainder of the stream
         If a digester is included, update it
         with the data read
         """
+
+        # already at end of this record, don't read until it is consumed
         if self.member_info:
-            return self.member_info
+            return None
 
         if compute_digest:
             digester = hashlib.sha1()
@@ -114,19 +146,29 @@ class ArchiveIterator(object):
         - For uncompressed files, blank lines are read later,
           and not included in the record length
         """
-        if self.reader.decompressor:
-            self._consume_blanklines()
+        #if self.reader.decompressor:
+        self.next_line, empty_size = self._consume_blanklines()
 
         self.offset = self.fh.tell() - self.reader.rem_length()
+        #if self.offset < 0:
+        #    raise Exception('Not Gzipped Properly')
+
+        if self.next_line:
+            self.offset -= len(self.next_line)
+
         length = self.offset - curr_offset
 
+        if not self.reader.decompressor:
+            length -= empty_size
+
         if compute_digest:
             digest = base64.b32encode(digester.digest())
         else:
             digest = None
 
         self.member_info = (curr_offset, length, digest)
-        return self.member_info
+        #return self.member_info
+        #return next_line
 
     def _next_record(self, next_line):
         """ Use loader to parse the record from the reader stream
@@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options):
 
             entry.post_query = post_query
 
-        entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
+        #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
+        arcv_iter.read_to_end(record, compute_digest)
+        entry.set_rec_info(*arcv_iter.member_info)
         entry.record = record
 
         yield entry
diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py
index cb8dc4bb..c8584c8d 100644
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -144,7 +144,6 @@ Total: 4
 
 from pywb import get_test_dir
 
-#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
 from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
 
 from io import BytesIO
@@ -154,6 +153,9 @@ import os
 import shutil
 import tempfile
 
+from pytest import raises
+
+
 TEST_CDX_DIR = get_test_dir() + 'cdx/'
 TEST_WARC_DIR = get_test_dir() + 'warcs/'
 
@@ -231,3 +233,11 @@ def cli_lines_with_dir(input_):
     print('Total: ' + str(len(lines)))
 
 
+def test_non_chunked_gzip_err():
+    with raises(Exception):
+        print_cdx_index('example-bad.warc.gz.bad')
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
diff --git a/sample_archive/warcs/example-bad.warc.gz.bad b/sample_archive/warcs/example-bad.warc.gz.bad
new file mode 100644
index 0000000000000000000000000000000000000000..95d2c415b4d792cd7fab3f0141dd224c7d9b08f6
GIT binary patch
literal 1950
zcmV;P2VwXhiwFo2Ia^c!17&z&ZE$R5E_Y#aV*u?~S##US5q@VC|HFiNu{RuGa33O>
z&=Pg5B};3OmhBhU02q)H;W7@2+SLB{^Z?)?i;`pItR$j}9L&+v{q>ET@%uwrkw~|5
z10y`W&zOtuXzueQOxwx#%un+GpB=eqQ00kRRXlJ*FEIU(P;4qD!Mb4(>}k4&6;-kA
zz$b>P>7$mw5iNnLN)#Ov1ru#bG#CC=^>ep#m?kAlN_@_edAV?r;+WgXh8>z^5%+1y
z(?k}m;gGW^C|csjJYorrVPC|T42xTcLNxJF6?}Jh=V2gVNL3-h*#+#8(tIunLel)#
zZl~KR(y)X_pl}YWH_Ssfu!wGQUa}n3;4IRj1YDjV*6r}(ist2I*f4R&yly8<^O(XR
z!6XWSNwj2n0Wvfc{WK9fifQViMOkL9EPLEbs#x-pPzrNdoU6aSoLygjQEh{eFD(V-
zO9Y4(WsuXbl(HZcQZ#Tp(q+zyEKLgboG#6P7&g%@%mSujohlmknB`!H_=@G5ni2%x
z)upmem!?9VQ>Kvn0L8_;gr(s(oL<rUD5U{D;&UdzFBVjB->B!)o13fI<)`tfH9Ncd
z#rizE`bGb6@#gx&p=OTPqSJiNN<4diwgH!|Xq-h%^3(W_?RcoP+izNXf0>DQJEzm>
z6#%aQECgM??RLh#&$1G~rwQ=xBJUow1wc@L!LWcMPa_w_bcN}h4Naok<PlCKTa^G%
z9CbSl<i|PhBch?7X;Q<F*1)5ai)ptrDe2rrJ<V`Ti<pMegQ-=<b8&xtR3T3tJ*P#9
zFVcX6Uj^%n95k-xVz-7=%{6paGn(DkEC(}m(d6(Ny(<3f%lh%*<aM|65!dp$==2PB
ztuXVkUlnB<W0vPB98mTch-x*0C(IWX;UYgug#~mwgWm<IFYE#d5JsKB`iIeA)Isn!
zD0vyN(WFr(I!a^8lY!jKcA70QD=G39G%r{=>{Vrm?H-axJBz|RF#B5Z<*>JAPh8tW
ze)BZ!ZB%ivA+?W^2tvc&lC!%k&C8whcRVN;!+<Th&v5<FM?B#rrx7lEDhzQ;swnS4
znA#zli~XYLHEUiP+@rtL!}^i>xAQ!$k^uW@BoO|6NZ_L$Y_`PU2OLl(+o_BO0Z--+
z=0e~UhcxEVJ($pC#u9W*lcLv0y;Bw~SqTn{-m!|o%x0pG#yLpPM+I!f1<QH3rC1Ms
ztG@%j++n&F-$f=N+3GQ$28$7`%5)FA#)AX<WDj@l_aMlz9OpFP6*z#x;)5NQs2lO*
zwhzBcUO<o!b{Jff(`?<gcPI*7;6OxA+E{9mCTy4JpJEnpie3q~Zl$z@l&^PH_E3X=
zizL5oMh}790rMeZ5A?og?Hg#w!_^Jm8_2dyG~WZcHqnt7cLs9fkDY-K4crr$Yz=v#
zJaSE#YSyyRbioT0)bj{N5aFp8@nQkGC{yGyR249U42gIYK^PIJFBxhhcLDwbr~N7h
z2P-7>DXl=}dvOSrK)ErzmU^;1b;rv^T9qivIZ*affPyErJ`p_A;ub`&FU?pU^FriW
zX&`4Ct_Kh=auyDIZBl`PK;kr^q6m<i`-*j@>5VSZoFSexg@7O>0WmtN2@d2Mse#<$
zCO6^jnKb^j0+61|;ttiqkWm9WhGAmeatNlT7h+$5M5Tm*={eR*vbd&dKa#|E`Giw`
zmFM-r^raQYzc`tnOr}WQQbp_JzBf-biGC+r_jjRLiJ3pFzMX|c@^wfgr_W=^|1wmI
zS^p>br+U)f6c6W6JXEmmB9`FUb)O~c4KoO~csvJGU38KD3JyYLLn7$aM@2b+8aAm`
zXl0vUO#L+)!^NVcealPP&@4$a(W|$o(~I*yRI0ZOoiP74eT@zmc^WfW1GAG9(;$*z
zK?(?I&Nru)5ZrF)y@*$^1ip9$%t78itxMw_jXYb7)&ci9E7nZK2@Q|n016qb>mI{)
zsOs1<RpQuzu3G;0YEgZ!73=>4rD(%#o{Dy3b6vv93<MHoTOFrF=OcrfZMjv4%mV-g
zkce>BN*dIXZuAsF-vJ){qrz+tJ{NEPz4f#H^7>i-Pp+T;o=oB!PN-=b*qEv-SPy-Q
zZ39AdVA_@+*oNna-iv(jpO8s@BwqYKll&KDlJCFPPy*A^Je6XwXASE_1=DaW4Qq};
z0^<3gKW)DmtM-%_u41^Fv&W2eOZms9lAk}DsyZ91+KKUbW`5E>j^Cc@=V$LO-_0gx
zhm(`1&G)$i^m!|0PH2Ue?qQ<ZV8vD#V8`)%91;>L)UXKE&4({HG@_zo(Wq==*f<j*
z3nF0Zw_VYOi8kW|PsZhKGtG+>+qU80JrNEx@>Du#R5q<xAF1|EY;G0TMB9(}1I-g?
zET9{+bVs}#w3`vJEfq)$x3IysK5z>06vSLM`=5L*adde&{d6@!!t~MA?9KVvA?jgS
zKATLBW%+1&)U2oyk>$y|-ulf&`{JT~X|cg_^S+|(q}F{Dv`X!|XqxJViETO_w4eow
kQ$QDOI+91g_eKkrGtqEJZO2iQ-t9F03f;arO8pf80CkVP_5c6?

literal 0
HcmV?d00001


From 388f31e08fa35930be6ccc02d40b09985a5e8db6 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Tue, 11 Nov 2014 15:34:14 -0800
Subject: [PATCH 3/9] rewrite: don't rewrite rel=canonical links, need to make
 rewriting more configurable (#50)

---
 pywb/rewrite/html_rewriter.py           | 6 ++++++
 pywb/rewrite/test/test_html_rewriter.py | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index f0c904c2..618c5191 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -174,6 +174,12 @@ class HTMLRewriterMixin(object):
             elif attr_name == 'crossorigin':
                 attr_name = '_crossorigin'
 
+            # special case: don't rewrite href on <link rel=canonical>
+            elif tag == 'link' and attr_name == 'href':
+                if not self.has_attr(tag_attrs, ('rel', 'canonical')):
+                    rw_mod = handler.get(attr_name)
+                    attr_value = self._rewrite_url(attr_value, rw_mod)
+
             # special case: meta tag
             elif (tag == 'meta') and (attr_name == 'content'):
                 if self.has_attr(tag_attrs, ('http-equiv', 'refresh')):
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index 45df4dfb..710fa338 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -102,6 +102,10 @@ ur"""
 >>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
 <link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
 
+# don't rewrite rel=canonical
+>>> parse('<link rel=canonical href="http://example.com/">')
+<link rel="canonical" href="http://example.com/">
+
 # doctype
 >>> parse('<!doctype html PUBLIC "public">')
 <!doctype html PUBLIC "public">

From 20070e95b67e1c21d226762398c4a46f5a87664e Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Thu, 13 Nov 2014 09:24:34 -0800
Subject: [PATCH 4/9] cookie_rewriter: add 'exact' cookie rewriter which never
 changes the path/domain

---
 pywb/rewrite/cookie_rewriter.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py
index 4724df4c..e9dd80ac 100644
--- a/pywb/rewrite/cookie_rewriter.py
+++ b/pywb/rewrite/cookie_rewriter.py
@@ -55,6 +55,24 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
         return morsel
 
 
+#=================================================================
+class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
+    """
+    Rewrite cookies only using exact path, useful for live rewrite
+    without a timestamp and to minimize cookie pollution
+
+    If path or domain present, simply remove
+    """
+
+    def rewrite_cookie(self, name, morsel):
+        if morsel.get('domain'):
+            del morsel['domain']
+        # likewise drop any explicit path
+        if morsel.get('path'):
+            del morsel['path']
+
+        self._remove_age_opts(morsel)
+        return morsel
 #=================================================================
 class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
     """
@@ -79,5 +97,7 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
 def get_cookie_rewriter(cookie_scope):
     if cookie_scope == 'root':
         return RootScopeCookieRewriter
+    elif cookie_scope == 'exact':
+        return ExactPathCookieRewriter
     else:
         return MinimalScopeCookieRewriter

From b8b8c30573f2175d2280f80893b1f4043f6bd728 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Thu, 13 Nov 2014 09:43:50 -0800
Subject: [PATCH 5/9] cookie_rewriter: add tests for exact cookie rewriter

---
 pywb/rewrite/test/test_cookie_rewriter.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py
index c20f56f9..4f57464f 100644
--- a/pywb/rewrite/test/test_cookie_rewriter.py
+++ b/pywb/rewrite/test/test_cookie_rewriter.py
@@ -1,4 +1,5 @@
 r"""
+# Default -- MinimalScopeRewriter
 # No rewriting
 >>> rewrite_cookie('a=b; c=d;')
 [('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')]
@@ -23,10 +24,17 @@ r"""
 >>> rewrite_cookie('abc@def=123')
 []
 
+# ExactCookieRewriter
+>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter)
+[('Set-Cookie', 'some=value')]
+
+>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter)
+[('Set-Cookie', 'some=value')]
+
 """
 
 
-from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
+from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter
 from pywb.rewrite.url_rewriter import UrlRewriter
 
 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
@@ -34,6 +42,6 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
 urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
 
 
-def rewrite_cookie(cookie_str, rewriter=urlrewriter):
-    return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)
+def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter):
+    return cookie_rewriter(rewriter).rewrite(cookie_str)
 

From d7eb40af20aa26e42f3b43e60f2fedd234200099 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Sun, 23 Nov 2014 18:56:49 -0800
Subject: [PATCH 6/9] rewrite: properly rewrite scheme relative JS-escaped
 urls: '\/\/example.com', '\\/\\/example.com/', treat same as '//example.com'
 adding http: prefix

---
 pywb/rewrite/test/test_regex_rewriters.py | 10 ++++++++++
 pywb/rewrite/test/test_url_rewriter.py    | 15 +++++++++++++++
 pywb/rewrite/url_rewriter.py              |  6 ++++--
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py
index 253328e5..92975a7f 100644
--- a/pywb/rewrite/test/test_regex_rewriters.py
+++ b/pywb/rewrite/test/test_regex_rewriters.py
@@ -45,6 +45,16 @@ r"""
 >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
 
+# protocol-rel escapes
+>>> _test_js('"//example.com/"')
+'"/web/20131010/http://example.com/"'
+
+>>> _test_js(r'"\/\/example.com/"')
+'"/web/20131010/http:\\/\\/example.com/"'
+
+>>> _test_js(r'"\\/\\/example.com/"')
+'"/web/20131010/http:\\\\/\\\\/example.com/"'
+
 # custom rules added
 >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
 'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */'
diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py
index be0ca7da..3d324069 100644
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@@ -50,6 +50,21 @@
 >>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
 'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
 
+>>> do_rewrite(r'//some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http://some-other-site.com'
+
+>>> do_rewrite(r'\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+
+>>> do_rewrite(r'\\/\\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+
+>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+
+>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
+'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
+
 >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
 '/2020/http://example.com/other.html'
 
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index 61a48e50..aa87260c 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -17,7 +17,9 @@ class UrlRewriter(object):
 
     PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
 
-    def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, 
+    REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
+
+    def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
                  root_path=None, cookie_scope=None):
         self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
         self.prefix = prefix
@@ -45,7 +47,7 @@ class UrlRewriter(object):
 
         is_abs = any(url.startswith(x) for x in self.PROTOCOLS)
 
-        if url.startswith('//'):
+        if url.startswith(self.REL_SCHEME):
             is_abs = True
             url = 'http:' + url
 

From c996e70a6e9c89cbf5bb0f7e29ff9c3ac043aff5 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Sat, 29 Nov 2014 11:13:57 -0800
Subject: [PATCH 7/9] wburl: detect and decode partially encoded schemes in
 url, such as http%3A//, https%3A%2F%2F before handling further; add additional
 tests for wburl

---
 pywb/framework/wbrequestresponse.py |  2 +-
 pywb/rewrite/test/test_wburl.py     | 22 ++++++++++++++++++++++
 pywb/rewrite/wburl.py               | 11 +++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index 808563ea..06970316 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -131,7 +131,7 @@ class WbRequest(object):
         if not self.wb_url:
             return
 
-        mime = self.env.get('CONTENT_TYPE').split(';')[0]
+        mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
         length = self.env.get('CONTENT_LENGTH')
         stream = self.env['wsgi.input']
 
diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py
index bcad948e..b4d15b5d 100644
--- a/pywb/rewrite/test/test_wburl.py
+++ b/pywb/rewrite/test/test_wburl.py
@@ -26,6 +26,13 @@
 >>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
 "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
 
+# Test scheme partially encoded urls
+>>> repr(WbUrl('https%3A//example.com/'))
+"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
+
+>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
+"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
+
 # Query Urls
 # ======================
 >>> repr(WbUrl('*/http://example.com/abc?def=a'))
@@ -57,6 +64,21 @@
 >>> repr(WbUrl('/example.com/'))
 "('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
 
+# Is_ Tests
+>>> u = WbUrl('*/http://example.com/abc?def=a*')
+>>> u.is_url_query()
+True
+
+>>> u.is_query()
+True
+
+>>> u2 = WbUrl('20130102im_/https:/example.com')
+>>> u2.is_embed
+True
+
+>>> u2.is_replay()
+True
+
 
 # Error Urls
 # ======================
diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py
index 91d36455..5421a1de 100644
--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl):
     REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')
 
     DEFAULT_SCHEME = 'http://'
+
+    PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
+
     # ======================
 
     def __init__(self, url):
@@ -99,6 +102,14 @@ class WbUrl(BaseWbUrl):
         # protocol agnostic url -> http://
         # no protocol -> http://
         inx = self.url.find(':/')
+        if inx < 0:
+            # check for other partially encoded variants
+            m = self.PARTIAL_ENC_RX.match(self.url)
+            if m:
+                len_ = len(m.group(0))
+                self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
+                inx = self.url.find(':/')
+
         if inx < 0:
             self.url = self.DEFAULT_SCHEME + self.url
         else:

From ea89702701a0aa5d2048f361cb95f9d5dfd89439 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Thu, 4 Dec 2014 23:02:30 -0800
Subject: [PATCH 8/9] static handler: add default 'application/octet-stream'
 and only set guessed mime if not none

---
 pywb/webapp/handlers.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py
index 2c7962cc..a77f7060 100644
--- a/pywb/webapp/handlers.py
+++ b/pywb/webapp/handlers.py
@@ -40,7 +40,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
                                       create_template(html, 'Frame Insert'))
 
             self.banner_html = config.get('banner_html', 'banner.html')
-            
+
             if config.get('enable_memento', False):
                 self.response_class = MementoResponse
 
@@ -193,7 +193,11 @@ class StaticHandler(BaseHandler):
             else:
                 reader = iter(lambda: data.read(), '')
 
-            content_type, _ = mimetypes.guess_type(full_path)
+            content_type = 'application/octet-stream'
+
+            guessed = mimetypes.guess_type(full_path)
+            if guessed[0]:
+                content_type = guessed[0]
 
             return WbResponse.text_stream(data,
                                           content_type=content_type,

From d31a4df3a66f6483d92eabd1c1a819072e17ee0c Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Thu, 4 Dec 2014 23:10:51 -0800
Subject: [PATCH 9/9] add changelist for 0.6.5

---
 CHANGES.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/CHANGES.rst b/CHANGES.rst
index 1ddaeea2..0fe9ae07 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,3 +1,20 @@
+pywb 0.6.5 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* fix static handling when content type cannot be guessed, default to 'application/octet-stream'
+
+* rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly
+
+* rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as //example.com
+
+* cookies: add exact cookie rewriter which sets cookie to exact url only, never collection or host root
+
+* don't rewrite rel=canonical links for services which rely on these
+
+* cdx-indexer: Detect non-chunked gzip .warc.gz/.arc.gz archive files and show a meaningful
+  error message explaining how to fix the issue (uncompress and possibly use warctools warc2warc to recompress)
+
+
 pywb 0.6.4 changelist
 ~~~~~~~~~~~~~~~~~~~~~