1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'develop'

This commit is contained in:
Ilya Kreymer 2014-12-13 23:19:52 -08:00
commit bee70260ac
16 changed files with 160 additions and 58 deletions

View File

@ -1,3 +1,20 @@
pywb 0.6.6 changelist
~~~~~~~~~~~~~~~~~~~~~
* JS client side improvements: check for double-inits, preserve anchor in wb.js top location redirect
* JS Rewriters: add mixins for link + location (default), link only, location only rewriting by setting ``js_rewrite_location`` to ``all``, ``urls``, ``location``, respectively.
(New: location only rewriting does not change JS urls)
* Beginning of new rewrite options, settable per collection and stored in UrlRewriter. Available options:
- ``rewrite_base`` - set to False to disable rewriting ``<base href="...">`` tag
- ``rewrite_rel_canon`` - set to False to disable rewriting ``<link rel=canonical href="...">``
* JS rewrite: Don't rewrite ``location`` when it is immediately preceded by '$' (e.g. ``$location``)
pywb 0.6.5 changelist
~~~~~~~~~~~~~~~~~~~~~
@ -40,17 +57,17 @@ pywb 0.6.3 changelist
pywb 0.6.2 changelist
~~~~~~~~~~~~~~~~~~~~~
* Invert framed replay paradigm: Canonical page is always without a modifier (instead of with `mp_`), if using frames, the page redirects to `tf_`, and uses replaceState() to change url back to canonical form.
* Invert framed replay paradigm: Canonical page is always without a modifier (instead of with ``mp_``), if using frames, the page redirects to ``tf_``, and uses replaceState() to change url back to canonical form.
* Enable Memento support for framed replay, include Memento headers in top frame
* Easier to customize just the banner html, via `banner_html` setting in the config. Default banner uses ui/banner.html and inserts the script default_banner.js, which creates the banner.
* Easier to customize just the banner html, via ``banner_html`` setting in the config. Default banner uses ui/banner.html and inserts the script default_banner.js, which creates the banner.
Other implementations may create banner via custom JS or directly insert HTML, as needed. Setting `banner_html: False` will disable the banner.
Other implementations may create banner via custom JS or directly insert HTML, as needed. Setting ``banner_html: False`` will disable the banner.
* Small improvements to streaming response, read in fixed chunks to allow better streaming from live.
* Improved cookie and csrf-token rewriting, including: ability to set `cookie_scope: root` per collection to have all replayed cookies have their Path set to application root.
* Improved cookie and csrf-token rewriting, including: ability to set ``cookie_scope: root`` per collection to have all replayed cookies have their Path set to application root.
This is useful for replaying sites which share cookies amongst different pages and across archived time ranges.

View File

@ -1,4 +1,4 @@
PyWb 0.6.5
PyWb 0.6.6
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
@ -44,7 +44,7 @@ This README contains a basic overview of using pywb. After reading this intro, c
pywb Tools Overview
-----------------------------
In addition to the standard wayback machine (explained further below), pywb tool suite includes a
In addition to the standard wayback machine (explained further below), pywb tool suite includes a
number of useful command-line and web server tools. The tools should be available to run after
running ``python setup.py install``:
@ -58,10 +58,10 @@ running ``python setup.py install``:
for all options.
* ``cdx-server`` -- a CDX API only server which returns a responses about CDX captures in bulk.
* ``cdx-server`` -- a CDX API only server which returns responses about CDX captures in bulk.
Includes most of the features of the `original cdx server implementation <https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server>`_,
updated documentation coming soon.
* ``proxy-cert-auth`` -- a utility to support proxy mode. It can be used in CA root certificate, or per-host certificate with an existing root cert.
@ -151,7 +151,7 @@ If you would like to use non-SURT ordered .cdx files, simply add this field to t
::
surt_ordered: false
UI Customization
"""""""""""""""""""""

View File

@ -62,7 +62,8 @@ class ArchivalRouter(object):
use_abs_prefix=use_abs_prefix,
wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=UrlRewriter,
cookie_scope=route.cookie_scope)
cookie_scope=route.cookie_scope,
rewrite_opts=route.rewrite_opts)
# Allow for applying of additional filters
route.apply_filters(wbrequest, matcher)
@ -101,6 +102,7 @@ class Route(object):
# collection id from regex group (default 0)
self.coll_group = coll_group
self.cookie_scope = config.get('cookie_scope')
self.rewrite_opts = config.get('rewrite_opts', {})
self._custom_init(config)
def is_handling(self, request_uri):

View File

@ -38,7 +38,8 @@ class WbRequest(object):
wburl_class=None,
urlrewriter_class=None,
is_proxy=False,
cookie_scope=None):
cookie_scope=None,
rewrite_opts={}):
self.env = env
@ -77,7 +78,8 @@ class WbRequest(object):
host_prefix + rel_prefix,
rel_prefix,
env.get('SCRIPT_NAME', '/'),
cookie_scope)
cookie_scope,
rewrite_opts)
self.urlrewriter.deprefix_url()
else:

View File

@ -92,6 +92,9 @@ class HTMLRewriterMixin(object):
self.rewrite_tags = self._init_rewrite_tags(defmod)
# get opts from urlrewriter
self.opts = url_rewriter.rewrite_opts
# ===========================
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
re.IGNORECASE | re.MULTILINE)
@ -174,9 +177,11 @@ class HTMLRewriterMixin(object):
elif attr_name == 'crossorigin':
attr_name = '_crossorigin'
# special case: link don't rewrite canonical
# special case: if rewrite_canon not set,
# don't rewrite rel=canonical
elif tag == 'link' and attr_name == 'href':
if not self.has_attr(tag_attrs, ('rel', 'canonical')):
if (self.opts.get('rewrite_rel_canon', True) or
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
rw_mod = handler.get(attr_name)
attr_value = self._rewrite_url(attr_value, rw_mod)
@ -191,17 +196,21 @@ class HTMLRewriterMixin(object):
rw_mod = 'oe_'
attr_value = self._rewrite_url(attr_value, rw_mod)
# special case: base tag
elif (tag == 'base') and (attr_name == 'href') and attr_value:
rw_mod = handler.get(attr_name)
base_value = self._rewrite_url(attr_value, rw_mod)
if self.opts.get('rewrite_base', True):
attr_value = base_value
self.url_rewriter = (self.url_rewriter.
rebase_rewriter(base_value))
else:
# rewrite url using tag handler
rw_mod = handler.get(attr_name)
if rw_mod is not None:
attr_value = self._rewrite_url(attr_value, rw_mod)
# special case: base tag
if (tag == 'base') and (attr_name == 'href') and attr_value:
self.url_rewriter = (self.url_rewriter.
rebase_rewriter(attr_value))
# write the attr!
self._write_attr(attr_name, attr_value)

View File

@ -35,7 +35,7 @@ class RegexRewriter(object):
#DEFAULT_OP = add_prefix
def __init__(self, rules):
def __init__(self, rewriter, rules):
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list
@ -106,7 +106,7 @@ class RegexRewriter(object):
#=================================================================
class JSLinkOnlyRewriter(RegexRewriter):
class JSLinkRewriterMixin(object):
"""
JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string
@ -118,19 +118,20 @@ class JSLinkOnlyRewriter(RegexRewriter):
rules = rules + [
(self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0)
]
super(JSLinkOnlyRewriter, self).__init__(rules)
super(JSLinkRewriterMixin, self).__init__(rewriter, rules)
#=================================================================
class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
class JSLocationRewriterMixin(object):
#class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
"""
JS Rewriter which also rewrites location and domain to the
JS Rewriter mixin which rewrites location and domain to the
specified prefix (default: 'WB_wombat_')
"""
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
rules = rules + [
(r'(?<!/)\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
(r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
@ -148,7 +149,23 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
#RegexRewriter.add_prefix(prefix), 1),
]
super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules)
super(JSLocationRewriterMixin, self).__init__(rewriter, rules)
#=================================================================
class JSLocationOnlyRewriter(JSLocationRewriterMixin, RegexRewriter):
pass
#=================================================================
class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter):
pass
#=================================================================
class JSLinkAndLocationRewriter(JSLocationRewriterMixin,
JSLinkRewriterMixin,
RegexRewriter):
pass
#=================================================================
@ -161,7 +178,7 @@ class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra=[]):
rules = self._create_rules(rewriter)
super(XMLRewriter, self).__init__(rules)
super(XMLRewriter, self).__init__(rewriter, rules)
# custom filter to reject 'xmlns' attr
def filter(self, m):
@ -189,7 +206,7 @@ class CSSRewriter(RegexRewriter):
def __init__(self, rewriter):
rules = self._create_rules(rewriter)
super(CSSRewriter, self).__init__(rules)
super(CSSRewriter, self).__init__(rewriter, rules)
def _create_rules(self, rewriter):
return [

View File

@ -1,7 +1,7 @@
from pywb.utils.dsrules import BaseRule
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter, JSLocationOnlyRewriter
from header_rewriter import HeaderRewriter
from html_rewriter import HTMLRewriter
@ -27,12 +27,13 @@ class RewriteRules(BaseRule):
self.parse_comments = config.get('parse_comments', False)
# Custom handling for js rewriting, often the most complex
self.js_rewrite_location = config.get('js_rewrite_location', True)
self.js_rewrite_location = bool(self.js_rewrite_location)
self.js_rewrite_location = config.get('js_rewrite_location', 'all')
# ability to toggle rewriting
if self.js_rewrite_location:
if self.js_rewrite_location == 'all':
js_default_class = JSLinkAndLocationRewriter
elif self.js_rewrite_location == 'location':
js_default_class = JSLocationOnlyRewriter
else:
js_default_class = JSLinkOnlyRewriter

View File

@ -20,13 +20,22 @@ ur"""
#>>> parse('<input "selected"><img src></div>')
#<input "selected"=""><img src=""></div>
# Base Tests
# Base Tests -- w/ rewrite (default)
>>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/diff/path/file.html"/>
>>> parse('<base href="static/"/><img src="image.gif"/>')
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
# Base Tests -- no rewrite
>>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>', urlrewriter=no_base_canon_rewriter)
<html><head><base href="http://example.com/diff/path/file.html"/>
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
# HTML Entities
>>> parse('<a href="">&rsaquo; &nbsp; &#62; &#63</div>')
<a href="">&rsaquo; &nbsp; &#62; &#63</div>
@ -102,8 +111,12 @@ ur"""
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
# don't rewrite rel=canonical
# rel=canonical: rewrite (default)
>>> parse('<link rel=canonical href="http://example.com/">')
<link rel="canonical" href="/web/20131226101010oe_/http://example.com/">
# rel=canonical: no_rewrite
>>> parse('<link rel=canonical href="http://example.com/">', urlrewriter=no_base_canon_rewriter)
<link rel="canonical" href="http://example.com/">
# doctype
@ -143,7 +156,12 @@ import pprint
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, head_insert = None):
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
'/web/',
rewrite_opts=dict(rewrite_rel_canon=False,
rewrite_base=False))
def parse(data, head_insert=None, urlrewriter=urlrewriter):
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
#data = data.decode('utf-8')
result = parser.rewrite(data) + parser.close()

View File

@ -3,7 +3,7 @@ r"""
# Custom Regex
#=================================================================
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
>>> RegexRewriter(urlrewriter, [(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'

View File

@ -13,7 +13,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/')
def head_insert_func(rule, cdx):
if rule.js_rewrite_location == True:
if rule.js_rewrite_location != 'urls':
return '<script src="/static/default/wombat.js"> </script>'
else:
return ''
@ -26,10 +26,10 @@ def test_local_1():
'com,example,test)/')
# wombat insert added
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
assert '<head><script src="/static/default/wombat.js"> </script>' in buff, buff
# location rewritten
assert 'window.WB_wombat_location = "/other.html"' in buff
# JS location and JS link rewritten
assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff
# link rewritten
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
@ -65,7 +65,7 @@ def test_local_no_head_banner_only():
# link NOT rewritten
assert '"another.html"' in buff
def test_local_banner_only():
def test_local_banner_only_no_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
bn_urlrewriter,
head_insert_func,
@ -74,13 +74,13 @@ def test_local_banner_only():
# wombat insert added
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
# location NOT rewritten
assert 'window.location = "/other.html"' in buff
# JS location NOT rewritten, JS link NOT rewritten
assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff
# link NOT rewritten
assert '"another.html"' in buff
def test_local_2_no_js_location_rewrite():
def test_local_2_link_only_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
head_insert_func,
@ -89,13 +89,28 @@ def test_local_2_no_js_location_rewrite():
# no wombat insert
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
# no location rewrite
assert 'window.location = "/other.html"' in buff
# JS location NOT rewritten, JS link rewritten
assert 'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff
# still link rewrite
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_2_js_loc_only_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
head_insert_func,
'example,example,test,loconly)/')
# wombat insert added
assert '<script src="/static/default/wombat.js"> </script>' in buff
# JS location rewritten, JS link NOT rewritten
assert 'window.WB_wombat_location = "http:\/\/example.com/dynamic_page.html"' in buff
# still link rewrite in HTML
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_example_1():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'})

View File

@ -20,13 +20,14 @@ class UrlRewriter(object):
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
root_path=None, cookie_scope=None):
root_path=None, cookie_scope=None, rewrite_opts={}):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.full_prefix = full_prefix
self.rel_prefix = rel_prefix if rel_prefix else prefix
self.root_path = root_path if root_path else '/'
self.cookie_scope = cookie_scope
self.rewrite_opts = rewrite_opts
def rewrite(self, url, mod=None):
# if special protocol, no rewriting at all

View File

@ -120,6 +120,11 @@ rules:
# testing rules -- not for valid domain
#=================================================================
# this rule block is a non-existent prefix merely for testing
- url_prefix: 'example,example,test,loconly)/'
rewrite:
js_rewrite_location: location
- url_prefix: 'example,example,test)/'
canonicalize:
@ -131,10 +136,10 @@ rules:
- id
rewrite:
js_rewrite_location: False
js_rewrite_location: urls
# all domain rules -- fallback to this dataset
# all domain rules -- fallback to this dataset
#=================================================================
# Applies to all urls -- should be last
- url_prefix: ''

View File

@ -35,10 +35,14 @@ function init_banner() {
bid = PLAIN_BANNER_ID;
}
if (!document || !document.body) {
return;
}
if (document.getElementById(bid) != null) {
return;
}
_wb_js.create_banner_element(bid);
}
@ -56,14 +60,14 @@ this.ts_to_date = function(ts, is_gmt)
if (ts.length < 14) {
return ts;
}
var datestr = (ts.substring(0, 4) + "-" +
var datestr = (ts.substring(0, 4) + "-" +
ts.substring(4, 6) + "-" +
ts.substring(6, 8) + "T" +
ts.substring(8, 10) + ":" +
ts.substring(10, 12) + ":" +
ts.substring(12, 14) + "-00:00");
var date = new Date(datestr);
if (is_gmt) {
return date.toGMTString();
@ -117,10 +121,21 @@ function notify_top() {
}
this.load = function() {
if (window._wb_js_inited) {
return;
}
window._wb_js_inited = true;
if ((window.self == window.top) && wbinfo) {
if (wbinfo.top_url && (window.location.href != wbinfo.top_url) && wbinfo.mod != "bn_") {
var hash = window.location.hash;
var loc = window.location.href.replace(window.location.hash, "");
if (wbinfo.top_url && (loc != wbinfo.top_url) && wbinfo.mod != "bn_") {
// Auto-redirect to top frame
window.location.replace(wbinfo.top_url);
window.location.replace(wbinfo.top_url + hash);
} else {
// Init Banner (no frame or top frame)
add_event("readystatechange", init_banner, document);

View File

@ -1,5 +1,5 @@
<!-- WB Insert -->
{% if rule.js_rewrite_location and include_wombat %}
{% if rule.js_rewrite_location != 'urls' and include_wombat %}
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
<script>
{% set urlsplit = cdx.original | urlsplit %}

View File

@ -6,7 +6,7 @@
<script>
var some_val = false;
if (some_val) {
window.location = "/other.html";
window.location = "http:\/\/example.com/dynamic_page.html";
}
</script>
Test Content

View File

@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.6.5',
version='0.6.6',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',