From ea89702701a0aa5d2048f361cb95f9d5dfd89439 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 4 Dec 2014 23:02:30 -0800 Subject: [PATCH 1/4] static handler: add default 'application/octet-stream' and only set guessed mime if not none --- pywb/webapp/handlers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 2c7962cc..a77f7060 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -40,7 +40,7 @@ class SearchPageWbUrlHandler(WbUrlHandler): create_template(html, 'Frame Insert')) self.banner_html = config.get('banner_html', 'banner.html') - + if config.get('enable_memento', False): self.response_class = MementoResponse @@ -193,7 +193,11 @@ class StaticHandler(BaseHandler): else: reader = iter(lambda: data.read(), '') - content_type, _ = mimetypes.guess_type(full_path) + content_type = 'application/octet-stream' + + guessed = mimetypes.guess_type(full_path) + if guessed[0]: + content_type = guessed[0] return WbResponse.text_stream(data, content_type=content_type, From d31a4df3a66f6483d92eabd1c1a819072e17ee0c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 4 Dec 2014 23:10:51 -0800 Subject: [PATCH 2/4] add changelist for 0.6.5 --- CHANGES.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 1ddaeea2..0fe9ae07 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,20 @@ +pywb 0.6.5 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* fix static handling when content type can not be guessed, default to 'application/octet-stream' + +* rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly + +* rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as \\example.com + +* cookies: add exact cookie rewriter which sets cookie to exact url only, never collection or host root + +* don't rewrite rel=canonical links for services which rely on these + +* cdx-indexer: Detect non-gzip 
chunk encoded .warc.gz/arc.gz archive files and show a meaningful + error message explaining how to fix issue (uncompress and possibly use warctools warc2warc to recompress) + + pywb 0.6.4 changelist ~~~~~~~~~~~~~~~~~~~~~ From 6440e2503f3fcc4534922641dcf5b134a33bdc57 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 6 Dec 2014 15:22:57 -0800 Subject: [PATCH 3/4] bump version to 0.6.6 --- README.rst | 10 +++++----- setup.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 30e9979e..43f1bfde 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.6.5 +PyWb 0.6.6 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop @@ -44,7 +44,7 @@ This README contains a basic overview of using pywb. After reading this intro, c pywb Tools Overview ----------------------------- -In addition to the standard wayback machine (explained further below), pywb tool suite includes a +In addition to the standard wayback machine (explained further below), pywb tool suite includes a number of useful command-line and web server tools. The tools should be available to run after running ``python setup.py install``: @@ -58,10 +58,10 @@ running ``python setup.py install``: for all options. -* ``cdx-server`` -- a CDX API only server which returns a responses about CDX captures in bulk. +* ``cdx-server`` -- a CDX API only server which returns a responses about CDX captures in bulk. Includes most of the features of the `original cdx server implementation `_, updated documentation coming soon. - + * ``proxy-cert-auth`` -- a utility to support proxy mode. It can be used in CA root certificate, or per-host certificate with an existing root cert. 
@@ -151,7 +151,7 @@ If you would like to use non-SURT ordered .cdx files, simply add this field to t :: surt_ordered: false - + UI Customization """"""""""""""""""""" diff --git a/setup.py b/setup.py index b5ef2d26..4a36b078 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.6.5', + version='0.6.6', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', From 0495423e86204f461bcb0336e4f6cde0a7cda61b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 6 Dec 2014 17:16:35 -0800 Subject: [PATCH 4/4] rewrite: add per-collection rewrite options, settable in 'rewrite_opts' block in each collection. Added rewrite_base to disable rewriting <base> tag and rewrite_rel_canon to disable rewriting link rel=canon. Disabling <base> tag rewrite fixes #51 and new system addresses #50 as well. --- CHANGES.rst | 11 +++++++++++ pywb/framework/archivalrouter.py | 4 +++- pywb/framework/wbrequestresponse.py | 6 ++++-- pywb/rewrite/html_rewriter.py | 23 ++++++++++++++++------- pywb/rewrite/regex_rewriters.py | 2 +- pywb/rewrite/test/test_html_rewriter.py | 24 +++++++++++++++++++++--- pywb/rewrite/url_rewriter.py | 3 ++- 7 files changed, 58 insertions(+), 15 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 0fe9ae07..0be413f1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,14 @@ +pywb 0.6.6 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* Beginning of new rewrite options, settable per collection and stored in UrlRewriter. 
Available options: + + - `rewrite_base` - set to False to disable rewriting `<base>` tag + - `rewrite_rel_canon` - set to false to disable rewriting `<link rel=canonical>` + +* JS rewrite: Don't rewrite location if starting with '$' + + pywb 0.6.5 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 33230027..3b0b5a6d 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -62,7 +62,8 @@ class ArchivalRouter(object): use_abs_prefix=use_abs_prefix, wburl_class=route.handler.get_wburl_type(), urlrewriter_class=UrlRewriter, - cookie_scope=route.cookie_scope) + cookie_scope=route.cookie_scope, + rewrite_opts=route.rewrite_opts) # Allow for applying of additional filters route.apply_filters(wbrequest, matcher) @@ -101,6 +102,7 @@ class Route(object): # collection id from regex group (default 0) self.coll_group = coll_group self.cookie_scope = config.get('cookie_scope') + self.rewrite_opts = config.get('rewrite_opts', {}) self._custom_init(config) def is_handling(self, request_uri): diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 06970316..7c48dbb3 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -38,7 +38,8 @@ class WbRequest(object): wburl_class=None, urlrewriter_class=None, is_proxy=False, - cookie_scope=None): + cookie_scope=None, + rewrite_opts={}): self.env = env @@ -77,7 +78,8 @@ class WbRequest(object): host_prefix + rel_prefix, rel_prefix, env.get('SCRIPT_NAME', '/'), - cookie_scope) + cookie_scope, + rewrite_opts) self.urlrewriter.deprefix_url() else: diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 618c5191..cae65a89 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -92,6 +92,9 @@ class HTMLRewriterMixin(object): self.rewrite_tags = self._init_rewrite_tags(defmod) + # get opts from urlrewriter + self.opts = url_rewriter.rewrite_opts + # 
=========================== META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) @@ -174,9 +177,11 @@ class HTMLRewriterMixin(object): elif attr_name == 'crossorigin': attr_name = '_crossorigin' - # special case: link don't rewrite canonical + # special case: if rewrite_canon not set, + # don't rewrite rel=canonical elif tag == 'link' and attr_name == 'href': - if not self.has_attr(tag_attrs, ('rel', 'canonical')): + if (self.opts.get('rewrite_rel_canon', True) or + not self.has_attr(tag_attrs, ('rel', 'canonical'))): rw_mod = handler.get(attr_name) attr_value = self._rewrite_url(attr_value, rw_mod) @@ -191,17 +196,21 @@ class HTMLRewriterMixin(object): rw_mod = 'oe_' attr_value = self._rewrite_url(attr_value, rw_mod) + # special case: base tag + elif (tag == 'base') and (attr_name == 'href') and attr_value: + rw_mod = handler.get(attr_name) + base_value = self._rewrite_url(attr_value, rw_mod) + if self.opts.get('rewrite_base', True): + attr_value = base_value + self.url_rewriter = (self.url_rewriter. + rebase_rewriter(base_value)) + else: # rewrite url using tag handler rw_mod = handler.get(attr_name) if rw_mod is not None: attr_value = self._rewrite_url(attr_value, rw_mod) - # special case: base tag - if (tag == 'base') and (attr_name == 'href') and attr_value: - self.url_rewriter = (self.url_rewriter. - rebase_rewriter(attr_value)) - # write the attr! 
self._write_attr(attr_name, attr_value) diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 179e06fd..375bca08 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -130,7 +130,7 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): rules = rules + [ - (r'(?>> parse('') # -# Base Tests +# Base Tests -- w/ rewrite (default) >>> parse('') >>> parse('') +# Base Tests -- no rewrite +>>> parse('', urlrewriter=no_base_canon_rewriter) + + +>>> parse('', urlrewriter=no_base_canon_rewriter) + + + + # HTML Entities >>> parse('›   > ?') ›   > ? @@ -102,8 +111,12 @@ ur""" >>> parse('
SomeTest
', head_insert = '')
SomeTest
-# don't rewrite rel=canonical +# rel=canonical: rewrite (default) >>> parse('') + + +# rel=canonical: no_rewrite +>>> parse('', urlrewriter=no_base_canon_rewriter) # doctype @@ -143,7 +156,12 @@ import pprint urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') -def parse(data, head_insert = None): +no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', + '/web/', + rewrite_opts=dict(rewrite_rel_canon=False, + rewrite_base=False)) + +def parse(data, head_insert=None, urlrewriter=urlrewriter): parser = HTMLRewriter(urlrewriter, head_insert = head_insert) #data = data.decode('utf-8') result = parser.rewrite(data) + parser.close() diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index aa87260c..a5cc7952 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -20,13 +20,14 @@ class UrlRewriter(object): REL_SCHEME = ('//', r'\/\/', r'\\/\\/') def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, - root_path=None, cookie_scope=None): + root_path=None, cookie_scope=None, rewrite_opts={}): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix self.full_prefix = full_prefix self.rel_prefix = rel_prefix if rel_prefix else prefix self.root_path = root_path if root_path else '/' self.cookie_scope = cookie_scope + self.rewrite_opts = rewrite_opts def rewrite(self, url, mod=None): # if special protocol, no rewriting at all