From 044792f99fcf7a279e1fe28370389e611037694b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 6 Nov 2014 01:28:56 -0800 Subject: [PATCH 1/9] bump version to 0.6.5! --- README.rst | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 8e8fe103..30e9979e 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.6.4 +PyWb 0.6.5 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop @@ -7,7 +7,7 @@ PyWb 0.6.4 :target: https://coveralls.io/r/ikreymer/pywb?branch=develop .. image:: https://img.shields.io/gratipay/ikreymer.svg :target: https://www.gratipay.com/ikreymer/ - + pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC `_ and `WARC `_. diff --git a/setup.py b/setup.py index 12225bee..b5ef2d26 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.6.4', + version='0.6.5', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', From 49e98e0cdc3bab540d912fbf1e067bc73d335ed0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 6 Nov 2014 01:29:14 -0800 Subject: [PATCH 2/9] archiveiterator/cdxindexer: cleaner load path for compressed and uncompressed, ability to distinguish between chunked and non-chunked warcs/arcs Raise error for non-chunked gzip warcs as they can not be indexed for replay, addressing #48 add 'bad' non-chunked gzip file for testing, using custom ext --- pywb/warc/archiveiterator.py | 82 ++++++++++++++----- pywb/warc/test/test_indexing.py | 12 ++- sample_archive/warcs/example-bad.warc.gz.bad | Bin 0 -> 1950 bytes 3 files changed, 74 insertions(+), 20 deletions(-) create mode 100644 sample_archive/warcs/example-bad.warc.gz.bad diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 
e1408432..e0994a7f 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -21,6 +21,25 @@ class ArchiveIterator(object): """ + GZIP_ERR_MSG = """ + ERROR: Non-chunked gzip file detected, gzip block continues + beyond single record. + + This file is probably not a multi-chunk gzip but a single gzip file. + + To allow seek, a gzipped {1} must have each record compressed into + a single gzip chunk and concatenated together. + + This file is likely still valid and you can use it by decompressing it: + + gunzip myfile.{0}.gz + + You can then also use the 'warc2warc' tool from the 'warc-tools' + package which will create a properly chunked gzip file: + + warc2warc -Z myfile.{0} > myfile.{0}.gz + """ + def __init__(self, fileobj): self.fh = fileobj @@ -42,27 +61,34 @@ class ArchiveIterator(object): block_size=block_size) self.offset = self.fh.tell() - next_line = None + self.next_line = None + + is_valid = True while True: try: - record = self._next_record(next_line) + record = self._next_record(self.next_line) + if not is_valid: + self._raise_err() + yield record except EOFError: break self.read_to_end(record) - # for non-compressed, consume blank lines here - if not self.reader.decompressor: - next_line = self._consume_blanklines() - if next_line is None: - # at end of file - break + if self.reader.decompressor: + is_valid = self.reader.read_next_member() - # reset reader for next member - else: - self.reader.read_next_member() + def _raise_err(self): + frmt = 'warc/arc' + if self.known_format: + frmt = self.known_format + + frmt_up = frmt.upper() + + msg = self.GZIP_ERR_MSG.format(frmt, frmt_up) + raise Exception(msg) def _consume_blanklines(self): """ Consume blank lines that are between records @@ -72,25 +98,31 @@ class ArchiveIterator(object): and are included in record length which is the full gzip envelope - For uncompressed, they are between records and so are NOT part of the record length + + count empty_size so that it can be 
subtracted from + the record length for uncompressed """ + empty_size = 0 while True: line = self.reader.readline() if len(line) == 0: - return None + return None, empty_size if line.rstrip() == '': - self.offset = self.fh.tell() - self.reader.rem_length() + empty_size += len(line) continue - return line + return line, empty_size def read_to_end(self, record, compute_digest=False): """ Read remainder of the stream If a digester is included, update it with the data read """ + + # already at end of this record, don't read until it is consumed if self.member_info: - return self.member_info + return None if compute_digest: digester = hashlib.sha1() @@ -114,19 +146,29 @@ class ArchiveIterator(object): - For uncompressed files, blank lines are read later, and not included in the record length """ - if self.reader.decompressor: - self._consume_blanklines() + #if self.reader.decompressor: + self.next_line, empty_size = self._consume_blanklines() self.offset = self.fh.tell() - self.reader.rem_length() + #if self.offset < 0: + # raise Exception('Not Gzipped Properly') + + if self.next_line: + self.offset -= len(self.next_line) + length = self.offset - curr_offset + if not self.reader.decompressor: + length -= empty_size + if compute_digest: digest = base64.b32encode(digester.digest()) else: digest = None self.member_info = (curr_offset, length, digest) - return self.member_info + #return self.member_info + #return next_line def _next_record(self, next_line): """ Use loader to parse the record from the reader stream @@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options): entry.post_query = post_query - entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest)) + #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest)) + arcv_iter.read_to_end(record, compute_digest) + entry.set_rec_info(*arcv_iter.member_info) entry.record = record yield entry diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index cb8dc4bb..c8584c8d
100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -144,7 +144,6 @@ Total: 4 from pywb import get_test_dir -#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename from io import BytesIO @@ -154,6 +153,9 @@ import os import shutil import tempfile +from pytest import raises + + TEST_CDX_DIR = get_test_dir() + 'cdx/' TEST_WARC_DIR = get_test_dir() + 'warcs/' @@ -231,3 +233,11 @@ def cli_lines_with_dir(input_): print('Total: ' + str(len(lines))) +def test_non_chunked_gzip_err(): + with raises(Exception): + print_cdx_index('example-bad.warc.gz.bad') + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/sample_archive/warcs/example-bad.warc.gz.bad b/sample_archive/warcs/example-bad.warc.gz.bad new file mode 100644 index 0000000000000000000000000000000000000000..95d2c415b4d792cd7fab3f0141dd224c7d9b08f6 GIT binary patch literal 1950 zcmV;P2VwXhiwFo2Ia^c!17&z&ZE$R5E_Y#aV*u?~S##US5q@VC|HFiNu{RuGa33O> z&=Pg5B};3OmhBhU02q)H;W7@2+SLB{^Z?)?i;`pItR$j}9L&+v{q>ET@%uwrkw~|5 z10y`W&zOtuXzueQOxwx#%un+GpB=eqQ00kRRXlJ*FEIU(P;4qD!Mb4(>}k4&6;-kA zz$b>P>7$mw5iNnLN)#Ov1ru#bG#CC=^>ep#m?kAlN_@_edAV?r;+WgXh8>z^5%+1y z(?k}m;gGW^C|csjJYorrVPC|T42xTcLNxJF6?}Jh=V2gVNL3-h*#+#8(tIunLel)# zZl~KR(y)X_pl}YWH_Ssfu!wGQUa}n3;4IRj1YDjV*6r}(ist2I*f4R&yly8<^O(XR z!6XWSNwj2n0Wvfc{WK9fifQViMOkL9EPLEbs#x-pPzrNdoU6aSoLygjQEh{eFD(V- zO9Y4(WsuXbl(HZcQZ#Tp(q+zyEKLgboG#6P7&g%@%mSujohlmknB`!H_=@G5ni2%x z)upmem!?9VQ>Kvn0L8_;gr(s(oLB!)o13fI<)`tfH9Ncd z#rizE`bGb6@#gx&p=OTPqSJiNN<4diwgH!|Xq-h%^3(W_?RcoP+izNXf0>DQJEzm> z6#%aQECgM??RLh#&$1G~rwQ=xBJUow1wc@L!LWcMPa_w_bcN}h4Naok{Vrm?H-axJBz|RF#B5Z<*>JAPh8tW ze)BZ!ZB%ivA+?W^2tvc&lC!%k&C8whcRVN;!+xAQ!$k^uW@BoO|6NZ_L$Y_`PU2OLl(+o_BO0Z--+ z=0e~UhcxEVJ($pC#u9W*lcLv0y;Bw~SqTn{-m!|o%x0pG#yLpPM+I!f1DXl=}dvOSrK)ErzmU^;1b;rv^T9qivIZ*affPyErJ`p_A;ub`&FU?pU^FriW zX&`4Ct_Kh=auyDIZBl`PK;kr^q6m5VSZoFSexg@7O>0WmtN2@d2Mse#<$ 
zCO6^jnKb^j0+61|;ttiqkWm9WhGAmeatNlT7h+$5M5Tm*={eR*vbd&dKa#|E`Giw` zmFM-r^raQYzc`tnOr}WQQbp_JzBf-biGC+r_jjRLiJ3pFzMX|c@^wfgr_W=^|1wmI zS^p>br+U)f6c6W6JXEmmB9`FUb)O~c4KoO~csvJGU38KD3JyYLLn7$aM@2b+8aAm` zXl0vUO#L+)!^NVcealPP&@4$a(W|$o(~I*yRI0ZOoiP74eT@zmc^WfW1GAG9(;$*z zK?(?I&Nru)5ZrF)y@*$^1ip9$%t78itxMw_jXYb7)&ci9E7nZK2@Q|n016qb>mI{) zsOs14rD(%#o{Dy3b6vv93BN*dIXZuAsF-vJ){qrz+tJ{NEPz4f#H^7>i-Pp+T;o=oB!PN-=b*qEv-SPy-Q zZ39AdVA_@+*oNna-iv(jpO8s@BwqYKll&KDlJCFPPy*A^Je6XwXASE_1=DaW4Qq}; z0^<3gKW)DmtM-%_u41^Fv&W2eOZms9lAk}DsyZ91+KKUbW`5E>j^Cc@=V$LO-_0gx zhm(`1&G)$i^m!|0PH2Ue?qQL)UXKE&4({HG@_zo(Wq==*f+qU80JrNEx@>Du#R5q06vSLM`=5L*adde&{d6@!!t~MA?9KVvA?jgS zKATLBW%+1&)U2oyk>$y|-ulf&`{JT~X|cg_^S+|(q}F{Dv`X!|XqxJViETO_w4eow kQ$QDOI+91g_eKkrGtqEJZO2iQ-t9F03f;arO8pf80CkVP_5c6? literal 0 HcmV?d00001 From 388f31e08fa35930be6ccc02d40b09985a5e8db6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 11 Nov 2014 15:34:14 -0800 Subject: [PATCH 3/9] rewrite: don't rewrite rel=canonical links, need to make rewriting more configurable (#50) --- pywb/rewrite/html_rewriter.py | 6 ++++++ pywb/rewrite/test/test_html_rewriter.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index f0c904c2..618c5191 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -174,6 +174,12 @@ class HTMLRewriterMixin(object): elif attr_name == 'crossorigin': attr_name = '_crossorigin' + # special case: link don't rewrite canonical + elif tag == 'link' and attr_name == 'href': + if not self.has_attr(tag_attrs, ('rel', 'canonical')): + rw_mod = handler.get(attr_name) + attr_value = self._rewrite_url(attr_value, rw_mod) + # special case: meta tag elif (tag == 'meta') and (attr_name == 'content'): if self.has_attr(tag_attrs, ('http-equiv', 'refresh')): diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 45df4dfb..710fa338 100644 --- 
a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -102,6 +102,10 @@ ur""" >>> parse('
SomeTest
', head_insert = '')
SomeTest
+# don't rewrite rel=canonical +>>> parse('') + + # doctype >>> parse('') From 20070e95b67e1c21d226762398c4a46f5a87664e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 13 Nov 2014 09:24:34 -0800 Subject: [PATCH 4/9] cookie_rewriter: add 'exact' cookie rewriter which never changes the path/domain --- pywb/rewrite/cookie_rewriter.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index 4724df4c..e9dd80ac 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -55,6 +55,24 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter): return morsel +#================================================================= +class ExactPathCookieRewriter(WbUrlBaseCookieRewriter): + """ + Rewrite cookies only using exact path, useful for live rewrite + without a timestamp and to minimize cookie pollution + + If path or domain present, simply remove + """ + + def rewrite_cookie(self, name, morsel): + if morsel.get('domain'): + del morsel['domain'] + # else set cookie to rewritten path + if morsel.get('path'): + del morsel['path'] + + self._remove_age_opts(morsel) + return morsel #================================================================= class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): """ @@ -79,5 +97,7 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): def get_cookie_rewriter(cookie_scope): if cookie_scope == 'root': return RootScopeCookieRewriter + elif cookie_scope == 'exact': + return ExactPathCookieRewriter else: return MinimalScopeCookieRewriter From b8b8c30573f2175d2280f80893b1f4043f6bd728 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 13 Nov 2014 09:43:50 -0800 Subject: [PATCH 5/9] cookie_rewriter: add tests for exact cookie rewriter --- pywb/rewrite/test/test_cookie_rewriter.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pywb/rewrite/test/test_cookie_rewriter.py 
b/pywb/rewrite/test/test_cookie_rewriter.py index c20f56f9..4f57464f 100644 --- a/pywb/rewrite/test/test_cookie_rewriter.py +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -1,4 +1,5 @@ r""" +# Default -- MinimalScopeRewriter # No rewriting >>> rewrite_cookie('a=b; c=d;') [('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')] @@ -23,10 +24,17 @@ r""" >>> rewrite_cookie('abc@def=123') [] +# ExactCookieRewriter +>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter) +[('Set-Cookie', 'some=value')] + +>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter) +[('Set-Cookie', 'some=value')] + """ -from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter +from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter from pywb.rewrite.url_rewriter import UrlRewriter urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') @@ -34,6 +42,6 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/') -def rewrite_cookie(cookie_str, rewriter=urlrewriter): - return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str) +def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter): + return cookie_rewriter(rewriter).rewrite(cookie_str) From d7eb40af20aa26e42f3b43e60f2fedd234200099 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 23 Nov 2014 18:56:49 -0800 Subject: [PATCH 6/9] rewrite: properly rewrite scheme relative JS-escaped urls: '\/\/example.com', '\\/\\/example.com/', treat same as '//example.com' adding http: prefix --- pywb/rewrite/test/test_regex_rewriters.py | 10 ++++++++++ pywb/rewrite/test/test_url_rewriter.py | 15 +++++++++++++++ pywb/rewrite/url_rewriter.py | 6 ++++-- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git 
a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 253328e5..92975a7f 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -45,6 +45,16 @@ r""" >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' +# protocol-rel escapes +>>> _test_js('"//example.com/"') +'"/web/20131010/http://example.com/"' + +>>> _test_js(r'"\/\/example.com/"') +'"/web/20131010/http:\\/\\/example.com/"' + +>>> _test_js(r'"\\/\\/example.com/"') +'"/web/20131010/http:\\\\/\\\\/example.com/"' + # custom rules added >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) 'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */' diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index be0ca7da..3d324069 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -50,6 +50,21 @@ >>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' +>>> do_rewrite(r'//some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http://some-other-site.com' + +>>> do_rewrite(r'\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'\\/\\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 
'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/') '/2020/http://example.com/other.html' diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 61a48e50..aa87260c 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -17,7 +17,9 @@ class UrlRewriter(object): PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] - def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, + REL_SCHEME = ('//', r'\/\/', r'\\/\\/') + + def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None, cookie_scope=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix @@ -45,7 +47,7 @@ class UrlRewriter(object): is_abs = any(url.startswith(x) for x in self.PROTOCOLS) - if url.startswith('//'): + if url.startswith(self.REL_SCHEME): is_abs = True url = 'http:' + url From c996e70a6e9c89cbf5bb0f7e29ff9c3ac043aff5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 29 Nov 2014 11:13:57 -0800 Subject: [PATCH 7/9] wburl: detect and decode partially encoded schemes in url, such as http%3A//, https%A2F2F// before handling further add additional tests for wburl --- pywb/framework/wbrequestresponse.py | 2 +- pywb/rewrite/test/test_wburl.py | 22 ++++++++++++++++++++++ pywb/rewrite/wburl.py | 11 +++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 808563ea..06970316 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -131,7 +131,7 @@ class WbRequest(object): if not self.wb_url: return - mime = 
self.env.get('CONTENT_TYPE').split(';')[0] + mime = self.env.get('CONTENT_TYPE', '').split(';')[0] length = self.env.get('CONTENT_LENGTH') stream = self.env['wsgi.input'] diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index bcad948e..b4d15b5d 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -26,6 +26,13 @@ >>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')" +# Test scheme partially encoded urls +>>> repr(WbUrl('https%3A//example.com/')) +"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')" + +>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/')) +"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')" + # Query Urls # ====================== >>> repr(WbUrl('*/http://example.com/abc?def=a')) @@ -57,6 +64,21 @@ >>> repr(WbUrl('/example.com/')) "('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" +# Is_ Tests +>>> u = WbUrl('*/http://example.com/abc?def=a*') +>>> u.is_url_query() +True + +>>> u.is_query() +True + +>>> u2 = WbUrl('20130102im_/https:/example.com') +>>> u2.is_embed +True + +>>> u2.is_replay() +True + # Error Urls # ====================== diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 91d36455..5421a1de 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl): REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$') DEFAULT_SCHEME = 'http://' + + PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I) + # ====================== def __init__(self, url): @@ -99,6 +102,14 @@ class WbUrl(BaseWbUrl): # protocol agnostic url -> http:// # no protocol -> http:// inx = self.url.find(':/') + if inx < 0: + # check for other partially encoded variants + m = self.PARTIAL_ENC_RX.match(self.url) + if m: + len_ = len(m.group(0)) + self.url = 
urllib.unquote_plus(self.url[:len_]) + self.url[len_:] + inx = self.url.find(':/') + if inx < 0: self.url = self.DEFAULT_SCHEME + self.url else: From ea89702701a0aa5d2048f361cb95f9d5dfd89439 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 4 Dec 2014 23:02:30 -0800 Subject: [PATCH 8/9] static handler: add default 'application/octet-stream' and only set guessed mime if not none --- pywb/webapp/handlers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 2c7962cc..a77f7060 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -40,7 +40,7 @@ class SearchPageWbUrlHandler(WbUrlHandler): create_template(html, 'Frame Insert')) self.banner_html = config.get('banner_html', 'banner.html') - + if config.get('enable_memento', False): self.response_class = MementoResponse @@ -193,7 +193,11 @@ class StaticHandler(BaseHandler): else: reader = iter(lambda: data.read(), '') - content_type, _ = mimetypes.guess_type(full_path) + content_type = 'application/octet-stream' + + guessed = mimetypes.guess_type(full_path) + if guessed[0]: + content_type = guessed[0] return WbResponse.text_stream(data, content_type=content_type, From d31a4df3a66f6483d92eabd1c1a819072e17ee0c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 4 Dec 2014 23:10:51 -0800 Subject: [PATCH 9/9] add changelist for 0.6.5 --- CHANGES.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 1ddaeea2..0fe9ae07 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,20 @@ +pywb 0.6.5 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* fix static handling when content type can not be guessed, default to 'application/octet-stream' + +* rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly + +* rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as \\example.com + +* cookies: add exact cookie rewriter which sets cookie to exact url 
only, never collection or host root + +* don't rewrite rel=canonical links for services which rely on these + +* cdx-indexer: Detect non-chunked gzip .warc.gz/.arc.gz archive files and show a meaningful + error message explaining how to fix the issue (uncompress and possibly use warctools warc2warc to recompress) + + pywb 0.6.4 changelist ~~~~~~~~~~~~~~~~~~~~~