diff --git a/CHANGES.rst b/CHANGES.rst index 547a8f52..f06b6263 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,4 +1,42 @@ -pywb 0.2.2 changelist +pywb 0.4.0 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* Improved test coverage throughout the project. + +* live-rewrite-server: A new web server for checking rewriting rules against live content. A white-list of request headers is sent to + the destination server. See `rewrite_live.py `_ for more details. + +* Cookie Rewriting in Archival Mode: HTTP Set-Cookie header rewritten to remove Expires, rewrite Path and Domain. If Domain is used, Path is set to / to ensure cookie is visible + from all archival urls. + +* Much improved handling of chunk encoded responses, better handling of zero-length chunks and fix bug where not enough gzip data was read for a full chunk to be decoded. Support for chunk-decoding w/o gzip decompression + (for example, for binary data). + +* Redis CDX: Initial support for reading entire CDX 'file' from a redis key via ZRANGEBYLEX, though needs more testing. + +* Jinja templates: additional keyword args added to most templates for customization, export 'urlsplit' for use by templates. + +* Remove SeekableTextFileReader, just using standard file-like object for binary search. + +* Proper handling of js_ cs_ modifiers to select content-type. + +* New, experimental support for top-level 'frame mode', used by live-rewrite-server, to display rewritten content in a frame. The mp_ modifier is used + to indicate the main page when top-level page is a frame. + +* cdx-indexer: Support for creation of non-SURT, url-ordered as well as SURT-ordered CDX files. + +* Further rewrite of wombat.js: support for window.open, postMessage overrides, additional rewriting at Node creation time, better hash change detection. + Use ``Object.defineProperty`` whenever possible to better override assignment to various JS properties. + See `wombat.js `_ for more info. 
+ +* Update wombat.js to support: scheme-relative urls rewriting, dom manipulation rewriting, disable web Worker api which could leak to live requests + +* Fixed support for empty arc/warc records. Indexed with '-', replay with '204 No Content' + +* Improve lxml rewriting, letting lxml handle parsing and decoding from bytestream directly (to address #36) + + +pywb 0.3.0 changelist ~~~~~~~~~~~~~~~~~~~~~ * Generate cdx indexs via command-line `cdx-indexer` script. Optionally sorting, and output to either a single combined file or a file per-directory. diff --git a/README.rst b/README.rst index 844be635..7f381fdc 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ -PyWb 0.2.2 -============= +PyWb 0.4.0 +========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master :target: https://travis-ci.org/ikreymer/pywb @@ -9,7 +9,31 @@ PyWb 0.2.2 pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. -pywb allows high-fidelity replay (browsing) of archived web data stored in standardized `ARC `_ and `WARC `_. +pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC `_ and `WARC `_. + +*For an example of a deployed service using pywb, please see the https://webrecorder.io project* + +pywb Tools +----------------------------- + +In addition to the standard wayback machine (explained further below), the pywb tool suite includes a +number of useful command-line and web server tools. The tools should be available to run after +running ``python setup.py install``. + +``live-rewrite-server`` -- a demo live rewriting web server which accepts requests using wayback machine url format at ``/rewrite/`` path, e.g. ``/rewrite/http://example.com/`` +and applies the same url rewriting rules as are used for archived content. +This is useful for checking how live content will appear when archived before actually creating any archive files, or for recording data. 
+The `webrecorder.io `_ service is built using this tool. + +``cdx-indexer`` -- a command-line tool for creating CDX indexes from WARC and ARC files. Supports SURT and +non-SURT based cdx files and optional sorting. See ``cdx-indexer -h`` +for all options. + +``cdx-server`` -- a CDX API only server which returns responses about CDX captures in bulk. +Includes most of the features of the `original cdx server implementation `_, +updated documentation coming soon. + +``wayback`` -- The full Wayback Machine application, further explained below. Latest Changes diff --git a/pywb/apps/live_rewrite_server.py b/pywb/apps/live_rewrite_server.py new file mode 100644 index 00000000..9b29e42b --- /dev/null +++ b/pywb/apps/live_rewrite_server.py @@ -0,0 +1,16 @@ +from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server + +from pywb.webapp.live_rewrite_handler import create_live_rewriter_app + +#================================================================= +# init live rewriter app +#================================================================= + +application = init_app(create_live_rewriter_app, load_yaml=False) + + +def main(): # pragma: no cover + start_wsgi_server(application, 'Live Rewriter App', default_port=8090) + +if __name__ == "__main__": + main() diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 83b4d1ee..fd830c17 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -25,7 +25,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): ds_rules_file=ds_rules_file) if not surt_ordered: - for rule in rules: + for rule in rules.rules: rule.unsurt() if rules: @@ -36,7 +36,7 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): ds_rules_file=ds_rules_file) if not surt_ordered: - for rule in rules: + for rule in rules.rules: rule.unsurt() if rules: @@ -108,11 +108,12 @@ class FuzzyQuery: params.update({'url': url, 'matchType': 'prefix', 'filter': filter_}) - 
try: + + if 'reverse' in params: del params['reverse'] + + if 'closest' in params: del params['closest'] - except KeyError: - pass return params @@ -141,7 +142,7 @@ class CDXDomainSpecificRule(BaseRule): """ self.url_prefix = map(unsurt, self.url_prefix) if self.regex: - self.regex = unsurt(self.regex) + self.regex = re.compile(unsurt(self.regex.pattern)) if self.replace: self.replace = unsurt(self.replace) diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index bf57209d..daeedc34 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -1,5 +1,4 @@ from pywb.utils.binsearch import iter_range -from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.wbexception import AccessException, NotFoundException from pywb.utils.wbexception import BadRequestException, WbException @@ -29,7 +28,7 @@ class CDXFile(CDXSource): self.filename = filename def load_cdx(self, query): - source = SeekableTextFileReader(self.filename) + source = open(self.filename) return iter_range(source, query.key, query.end_key) def __str__(self): @@ -94,22 +93,42 @@ class RedisCDXSource(CDXSource): def __init__(self, redis_url, config=None): import redis + + parts = redis_url.split('/') + if len(parts) > 4: + self.cdx_key = parts[4] + else: + self.cdx_key = None + self.redis_url = redis_url self.redis = redis.StrictRedis.from_url(redis_url) self.key_prefix = self.DEFAULT_KEY_PREFIX - if config: - self.key_prefix = config.get('redis_key_prefix', self.key_prefix) def load_cdx(self, query): """ Load cdx from redis cache, from an ordered list - Currently, there is no support for range queries - Only 'exact' matchType is supported - """ - key = query.key + If cdx_key is set, treat it as cdx file and load use + zrangebylex! (Supports all match types!) + Otherwise, assume a key per-url and load all entries for that key. 
+ (Only exact match supported) + """ + + if self.cdx_key: + return self.load_sorted_range(query) + else: + return self.load_single_key(query.key) + + def load_sorted_range(self, query): + cdx_list = self.redis.zrangebylex(self.cdx_key, + '[' + query.key, + '(' + query.end_key) + + return cdx_list + + def load_single_key(self, key): # ensure only url/surt is part of key key = key.split(' ')[0] cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1) diff --git a/pywb/cdx/test/test_cdxserver.py b/pywb/cdx/test/test_cdxserver.py index 3e4cdf3e..f90ef8aa 100644 --- a/pywb/cdx/test/test_cdxserver.py +++ b/pywb/cdx/test/test_cdxserver.py @@ -128,6 +128,36 @@ def test_fuzzy_match(): assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL, ds_rules_file=DEFAULT_RULES_FILE)) +def test_fuzzy_no_match_1(): + # no match, no fuzzy + with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) + with raises(NotFoundException): + server.load_cdx(url='http://notfound.example.com/', + output='cdxobject', + reverse=True, + allowFuzzy=True) + +def test_fuzzy_no_match_2(): + # fuzzy rule, but no actual match + with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) + with raises(NotFoundException): + server.load_cdx(url='http://notfound.example.com/?_=1234', + closest='2014', + reverse=True, + output='cdxobject', + allowFuzzy=True) + +def test2_fuzzy_no_match_3(): + # special fuzzy rule, matches prefix test.example.example., + # but doesn't match rule regex + with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) + with raises(NotFoundException): + server.load_cdx(url='http://test.example.example/', + allowFuzzy=True) + def assert_error(func, exception): with raises(exception): func(CDXServer(CDX_SERVER_URL)) diff --git a/pywb/cdx/test/test_redis_source.py 
b/pywb/cdx/test/test_redis_source.py index e620811c..9f5daa8d 100644 --- a/pywb/cdx/test/test_redis_source.py +++ b/pywb/cdx/test/test_redis_source.py @@ -1,9 +1,12 @@ """ ->>> redis_cdx('http://example.com') +>>> redis_cdx(redis_cdx_server, 'http://example.com') com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz +# TODO: enable when FakeRedis supports zrangebylex! +#>>> redis_cdx(redis_cdx_server_key, 'http://example.com') + """ from fakeredis import FakeStrictRedis @@ -21,13 +24,17 @@ import os test_cdx_dir = get_test_dir() + 'cdx/' -def load_cdx_into_redis(source, filename): +def load_cdx_into_redis(source, filename, key=None): # load a cdx into mock redis with open(test_cdx_dir + filename) as fh: for line in fh: - zadd_cdx(source, line) + zadd_cdx(source, line, key) + +def zadd_cdx(source, cdx, key): + if key: + source.redis.zadd(key, 0, cdx) + return -def zadd_cdx(source, cdx): parts = cdx.split(' ', 2) key = parts[0] @@ -49,9 +56,22 @@ def init_redis_server(): return CDXServer([source]) -def redis_cdx(url, **params): +@patch('redis.StrictRedis', FakeStrictRedis) +def init_redis_server_key_file(): + source = RedisCDXSource('redis://127.0.0.1:6379/0/key') + + for f in os.listdir(test_cdx_dir): + if f.endswith('.cdx'): + load_cdx_into_redis(source, f, source.cdx_key) + + return CDXServer([source]) + + +def redis_cdx(cdx_server, url, **params): cdx_iter = cdx_server.load_cdx(url=url, **params) for cdx in cdx_iter: sys.stdout.write(cdx) -cdx_server = init_redis_server() +redis_cdx_server = init_redis_server() +redis_cdx_server_key = init_redis_server_key_file() + diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 
e282dfc0..071319a5 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -9,7 +9,6 @@ from cdxsource import CDXSource from cdxobject import IDXObject from pywb.utils.loaders import BlockLoader -from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.bufferedreaders import gzip_decompressor from pywb.utils.binsearch import iter_range, linearsearch @@ -113,7 +112,7 @@ class ZipNumCluster(CDXSource): def load_cdx(self, query): self.load_loc() - reader = SeekableTextFileReader(self.summary) + reader = open(self.summary) idx_iter = iter_range(reader, query.key, diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 4f5278de..dce54949 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -192,4 +192,4 @@ class ReferRedirect: '', '')) - return WbResponse.redir_response(final_url) + return WbResponse.redir_response(final_url, status='307 Temp Redirect') diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index f090a6ae..e066d4d1 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -21,10 +21,20 @@ >>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'} -# No Scheme, so stick to relative +# No Scheme, default to http (shouldn't happen per WSGI standard) >>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) -{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} +{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 
'http://localhost:8080/2010/', 'request_uri': '/2010/example.com'} +# Referrer extraction +>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url +'http://blah.example.com/' + +# incorrect referer +>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://other.example.com/web/2011/blah.example.com/'}).extract_referrer_wburl_str() + + +# no referer +>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}).extract_referrer_wburl_str() # WbResponse Tests diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index ba1f6a02..446aa88a 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -23,7 +23,7 @@ class WbRequest(object): if not host: host = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] - return env['wsgi.url_scheme'] + '://' + host + return env.get('wsgi.url_scheme', 'http') + '://' + host except KeyError: return '' @@ -66,7 +66,8 @@ class WbRequest(object): # wb_url present and not root page if wb_url_str != '/' and wburl_class: self.wb_url = wburl_class(wb_url_str) - self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix) + self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix, + host_prefix + rel_prefix) else: # no wb_url, just store blank wb_url self.wb_url = None @@ -87,17 +88,6 @@ class WbRequest(object): self._parse_extra() - @property - def is_embed(self): - return (self.wb_url and - self.wb_url.mod and - self.wb_url.mod != 'id_') - - @property - def is_identity(self): - return (self.wb_url and - self.wb_url.mod == 'id_') - def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') if value and value.lower() == 'xmlhttprequest': @@ -116,6 +106,16 @@ class WbRequest(object): def 
_parse_extra(self): pass + def extract_referrer_wburl_str(self): + if not self.referrer: + return None + + if not self.referrer.startswith(self.host_prefix + self.rel_prefix): + return None + + wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):] + return wburl_str + #================================================================= class WbResponse(object): diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 7401f89e..837a7c74 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -62,45 +62,50 @@ class WSGIApp(object): response = wb_router(env) if not response: - msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI']) + msg = 'No handler for "{0}".'.format(env['REL_REQUEST_URI']) raise NotFoundException(msg) except WbException as e: - response = handle_exception(env, wb_router, e, False) + response = self.handle_exception(env, e, False) except Exception as e: - response = handle_exception(env, wb_router, e, True) + response = self.handle_exception(env, e, True) return response(env, start_response) + def handle_exception(self, env, exc, print_trace): + error_view = None -#================================================================= -def handle_exception(env, wb_router, exc, print_trace): - error_view = None - if hasattr(wb_router, 'error_view'): - error_view = wb_router.error_view + if hasattr(self.wb_router, 'error_view'): + error_view = self.wb_router.error_view - if hasattr(exc, 'status'): - status = exc.status() - else: - status = '400 Bad Request' + if hasattr(exc, 'status'): + status = exc.status() + else: + status = '400 Bad Request' - if print_trace: - import traceback - err_details = traceback.format_exc(exc) - print err_details - else: - logging.info(str(exc)) - err_details = None + if hasattr(exc, 'url'): + err_url = exc.url + else: + err_url = None - if error_view: - import traceback - return error_view.render_response(err_msg=str(exc), - err_details=err_details, - 
status=status) - else: - return WbResponse.text_response(status + ' Error: ' + str(exc), - status=status) + if print_trace: + import traceback + err_details = traceback.format_exc(exc) + print err_details + else: + logging.info(str(exc)) + err_details = None + + if error_view: + return error_view.render_response(exc_type=type(exc).__name__, + err_msg=str(exc), + err_details=err_details, + status=status, + err_url=err_url) + else: + return WbResponse.text_response(status + ' Error: ' + str(exc), + status=status) #================================================================= DEFAULT_CONFIG_FILE = 'config.yaml' diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py new file mode 100644 index 00000000..78e3c9c6 --- /dev/null +++ b/pywb/rewrite/cookie_rewriter.py @@ -0,0 +1,35 @@ +from Cookie import SimpleCookie, CookieError + + +#================================================================= +class WbUrlCookieRewriter(object): + """ Cookie rewriter for wburl-based requests + Remove the domain and rewrite path, if any, to match + given WbUrl using the url rewriter. 
+ """ + def __init__(self, url_rewriter): + self.url_rewriter = url_rewriter + + def rewrite(self, cookie_str, header='Set-Cookie'): + results = [] + cookie = SimpleCookie() + try: + cookie.load(cookie_str) + except CookieError: + return results + + for name, morsel in cookie.iteritems(): + # if domain set, no choice but to expand cookie path to root + if morsel.get('domain'): + del morsel['domain'] + morsel['path'] = self.url_rewriter.prefix + # else set cookie to rewritten path + elif morsel.get('path'): + morsel['path'] = self.url_rewriter.rewrite(morsel['path']) + # remove expires as it refers to archived time + if morsel.get('expires'): + del morsel['expires'] + + results.append((header, morsel.OutputString())) + + return results diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 93b007de..25b27de4 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -39,6 +39,8 @@ class HeaderRewriter: PROXY_NO_REWRITE_HEADERS = ['content-length'] + COOKIE_HEADERS = ['set-cookie', 'cookie'] + def __init__(self, header_prefix='X-Archive-Orig-'): self.header_prefix = header_prefix @@ -86,6 +88,8 @@ class HeaderRewriter: new_headers = [] removed_header_dict = {} + cookie_rewriter = urlrewriter.get_cookie_rewriter() + for (name, value) in headers: lowername = name.lower() @@ -109,6 +113,11 @@ class HeaderRewriter: not content_rewritten): new_headers.append((name, value)) + elif (lowername in self.COOKIE_HEADERS and + cookie_rewriter): + cookie_list = cookie_rewriter.rewrite(value) + new_headers.extend(cookie_list) + else: new_headers.append((self.header_prefix + name, value)) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 5a10d651..d33f9d46 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -19,35 +19,40 @@ class HTMLRewriterMixin(object): to rewriters for script and css """ - REWRITE_TAGS = { - 'a': {'href': ''}, - 'applet': {'codebase': 'oe_', - 
'archive': 'oe_'}, - 'area': {'href': ''}, - 'base': {'href': ''}, - 'blockquote': {'cite': ''}, - 'body': {'background': 'im_'}, - 'del': {'cite': ''}, - 'embed': {'src': 'oe_'}, - 'head': {'': ''}, # for head rewriting - 'iframe': {'src': 'if_'}, - 'img': {'src': 'im_'}, - 'ins': {'cite': ''}, - 'input': {'src': 'im_'}, - 'form': {'action': ''}, - 'frame': {'src': 'fr_'}, - 'link': {'href': 'oe_'}, - 'meta': {'content': ''}, - 'object': {'codebase': 'oe_', - 'data': 'oe_'}, - 'q': {'cite': ''}, - 'ref': {'href': 'oe_'}, - 'script': {'src': 'js_'}, - 'div': {'data-src': '', - 'data-uri': ''}, - 'li': {'data-src': '', - 'data-uri': ''}, - } + @staticmethod + def _init_rewrite_tags(defmod): + rewrite_tags = { + 'a': {'href': defmod}, + 'applet': {'codebase': 'oe_', + 'archive': 'oe_'}, + 'area': {'href': defmod}, + 'base': {'href': defmod}, + 'blockquote': {'cite': defmod}, + 'body': {'background': 'im_'}, + 'del': {'cite': defmod}, + 'embed': {'src': 'oe_'}, + 'head': {'': defmod}, # for head rewriting + 'iframe': {'src': 'if_'}, + 'img': {'src': 'im_'}, + 'ins': {'cite': defmod}, + 'input': {'src': 'im_'}, + 'form': {'action': defmod}, + 'frame': {'src': 'fr_'}, + 'link': {'href': 'oe_'}, + 'meta': {'content': defmod}, + 'object': {'codebase': 'oe_', + 'data': 'oe_'}, + 'q': {'cite': defmod}, + 'ref': {'href': 'oe_'}, + 'script': {'src': 'js_'}, + 'source': {'src': 'oe_'}, + 'div': {'data-src': defmod, + 'data-uri': defmod}, + 'li': {'data-src': defmod, + 'data-uri': defmod}, + } + + return rewrite_tags STATE_TAGS = ['script', 'style'] @@ -55,7 +60,9 @@ class HTMLRewriterMixin(object): HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta', 'title', 'style', 'script', 'object', 'bgsound'] - # =========================== + DATA_RW_PROTOCOLS = ('http://', 'https://', '//') + + #=========================== class AccumBuff: def __init__(self): self.ls = [] @@ -70,7 +77,8 @@ class HTMLRewriterMixin(object): def __init__(self, url_rewriter, head_insert=None, 
js_rewriter_class=JSRewriter, - css_rewriter_class=CSSRewriter): + css_rewriter_class=CSSRewriter, + defmod=''): self.url_rewriter = url_rewriter self._wb_parse_context = None @@ -79,6 +87,7 @@ class HTMLRewriterMixin(object): self.css_rewriter = css_rewriter_class(url_rewriter) self.head_insert = head_insert + self.rewrite_tags = self._init_rewrite_tags(defmod) # =========================== META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', @@ -140,9 +149,9 @@ class HTMLRewriterMixin(object): self.head_insert = None # attr rewriting - handler = self.REWRITE_TAGS.get(tag) + handler = self.rewrite_tags.get(tag) if not handler: - handler = self.REWRITE_TAGS.get('') + handler = self.rewrite_tags.get('') if not handler: return False @@ -160,11 +169,22 @@ class HTMLRewriterMixin(object): elif attr_name == 'style': attr_value = self._rewrite_css(attr_value) + # special case: disable crossorigin attr + # as they may interfere with rewriting semantics + elif attr_name == 'crossorigin': + attr_name = '_crossorigin' + # special case: meta tag elif (tag == 'meta') and (attr_name == 'content'): if self.has_attr(tag_attrs, ('http-equiv', 'refresh')): attr_value = self._rewrite_meta_refresh(attr_value) + # special case: data- attrs + elif attr_name and attr_value and attr_name.startswith('data-'): + if attr_value.startswith(self.DATA_RW_PROTOCOLS): + rw_mod = 'oe_' + attr_value = self._rewrite_url(attr_value, rw_mod) + else: # special case: base tag if (tag == 'base') and (attr_name == 'href') and attr_value: @@ -245,16 +265,9 @@ class HTMLRewriterMixin(object): #================================================================= class HTMLRewriter(HTMLRewriterMixin, HTMLParser): - def __init__(self, url_rewriter, - head_insert=None, - js_rewriter_class=JSRewriter, - css_rewriter_class=CSSRewriter): - + def __init__(self, *args, **kwargs): HTMLParser.__init__(self) - super(HTMLRewriter, self).__init__(url_rewriter, - head_insert, - js_rewriter_class, - 
css_rewriter_class) + super(HTMLRewriter, self).__init__(*args, **kwargs) def feed(self, string): try: diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py index 2c8a8b8a..29355be4 100644 --- a/pywb/rewrite/lxml_html_rewriter.py +++ b/pywb/rewrite/lxml_html_rewriter.py @@ -17,15 +17,8 @@ from html_rewriter import HTMLRewriterMixin class LXMLHTMLRewriter(HTMLRewriterMixin): END_HTML = re.compile(r'', re.IGNORECASE) - def __init__(self, url_rewriter, - head_insert=None, - js_rewriter_class=JSRewriter, - css_rewriter_class=CSSRewriter): - - super(LXMLHTMLRewriter, self).__init__(url_rewriter, - head_insert, - js_rewriter_class, - css_rewriter_class) + def __init__(self, *args, **kwargs): + super(LXMLHTMLRewriter, self).__init__(*args, **kwargs) self.target = RewriterTarget(self) self.parser = lxml.etree.HTMLParser(remove_pis=False, @@ -45,6 +38,18 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): #string = string.replace(u'', u'') self.parser.feed(string) + def parse(self, stream): + self.out = self.AccumBuff() + + lxml.etree.parse(stream, self.parser) + + result = self.out.getvalue() + + # Clear buffer to create new one for next rewrite() + self.out = None + + return result + def _internal_close(self): if self.started: self.parser.close() @@ -79,7 +84,8 @@ class RewriterTarget(object): def data(self, data): if not self.rewriter._wb_parse_context: data = cgi.escape(data, quote=True) - + if isinstance(data, unicode): + data = data.replace(u'\xa0', ' ') self.rewriter.parse_data(data) def comment(self, data): diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 5f429339..aba9462a 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -126,9 +126,18 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): rules = rules + [ (r'(? 
1: + url = 'http://' + ts_err[1] + + if url.startswith('//'): + url = 'http:' + url + + if is_http(url): + (status_headers, stream) = self.fetch_http(url, env, req_headers, + follow_redirects, + proxies) + else: + (status_headers, stream) = self.fetch_local_file(url) + + # explicit urlkey may be passed in (say for testing) + if not urlkey: + urlkey = canonicalize(url) + + if timestamp is None: + timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) + + cdx = {'urlkey': urlkey, + 'timestamp': timestamp, + 'original': url, + 'statuscode': status_headers.get_statuscode(), + 'mimetype': status_headers.get_header('Content-Type') + } + + result = (self.rewriter. + rewrite_content(urlrewriter, + status_headers, + stream, + head_insert_func=head_insert_func, + urlkey=urlkey, + cdx=cdx, + mod=mod)) + + return result + + def get_rewritten(self, *args, **kwargs): + + result = self.fetch_request(*args, **kwargs) + + status_headers, gen, is_rewritten = result + + buff = ''.join(gen) + + return (status_headers, buff) #================================================================= def main(): # pragma: no cover + import sys + if len(sys.argv) < 2: msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]' print msg.format(sys.argv[0]) @@ -94,7 +199,9 @@ def main(): # pragma: no cover urlrewriter = UrlRewriter(wburl_str, prefix) - status_headers, buff = get_rewritten(url, urlrewriter) + liverewriter = LiveRewriter() + + status_headers, buff = liverewriter.get_rewritten(url, urlrewriter) sys.stdout.write(buff) return 0 diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 03a23653..f9eae0b9 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -9,6 +9,7 @@ from html_rewriter import HTMLRewriter import itertools HTML = HTMLRewriter +_is_lxml = False #================================================================= @@ -18,12 +19,20 @@ def use_lxml_parser(): if LXML_SUPPORTED: global HTML + global _is_lxml HTML = 
LXMLHTMLRewriter logging.debug('Using LXML Parser') - return True + _is_lxml = True else: # pragma: no cover logging.debug('LXML Parser not available') - return False + _is_lxml = False + + return _is_lxml + + +#================================================================= +def is_lxml(): + return _is_lxml #================================================================= diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py new file mode 100644 index 00000000..e5979fd4 --- /dev/null +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -0,0 +1,33 @@ +r""" +# No rewriting +>>> rewrite_cookie('a=b; c=d;') +[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')] + +>>> rewrite_cookie('some=value; Path=/;') +[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/')] + +>>> rewrite_cookie('some=value; Path=/diff/path/;') +[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/diff/path/')] + +# if domain set, set path to root +>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/;') +[('Set-Cookie', 'some=value; Path=/pywb/')] + +>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT') +[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')] + +# Cookie with invalid chars, not parsed +>>> rewrite_cookie('abc@def=123') +[] + +""" + + +from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter +from pywb.rewrite.url_rewriter import UrlRewriter + +urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') + +def rewrite_cookie(cookie_str): + return WbUrlCookieRewriter(urlrewriter).rewrite(cookie_str) + diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py new file mode 100644 index 00000000..1a2b2cea --- /dev/null +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -0,0 +1,80 @@ +""" 
+#================================================================= +HTTP Headers Rewriting +#================================================================= + +# Text with charset +>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) +{'charset': 'utf-8', + 'removed_header_dict': {}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), + ('X-Archive-Orig-Content-Length', '5'), + ('Content-Type', 'text/html;charset=UTF-8')]), + 'text_type': 'html'} + +# Redirect +>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect') +{'charset': None, + 'removed_header_dict': {}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), + ('Location', '/web/20131010/http://example.com/other.html')]), + 'text_type': None} + +# cookie, host/origin rewriting +>>> _test_headers([('Connection', 'close'), ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=somefile.html'), ('Host', 'example.com'), ('Origin', 'https://example.com')]) +{'charset': None, + 'removed_header_dict': {}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Connection', 'close'), + ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), + ( 'Set-Cookie', + 'abc=def; Path=/web/20131010/http://example.com/somefile.html'), + ('X-Archive-Orig-Host', 'example.com'), + ('X-Archive-Orig-Origin', 'https://example.com')]), + 'text_type': None} + + + +# gzip +>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) +{'charset': None, + 'removed_header_dict': {'content-encoding': 'gzip', + 'transfer-encoding': 'chunked'}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', 
headers = [ ('X-Archive-Orig-Content-Length', '199999'), + ('Content-Type', 'text/javascript')]), + 'text_type': 'js'} + +# Binary -- transfer-encoding removed +>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) +{'charset': None, + 'removed_header_dict': {'transfer-encoding': 'chunked'}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), + ('Content-Type', 'image/png'), + ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), + ('Content-Encoding', 'gzip')]), + 'text_type': None} + +""" + + + +from pywb.rewrite.header_rewriter import HeaderRewriter +from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.utils.statusandheaders import StatusAndHeaders + +import pprint + +urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/') + + +headerrewriter = HeaderRewriter() + +def _test_headers(headers, status = '200 OK'): + rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) + return pprint.pprint(vars(rewritten)) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 6236ae1e..1cae626b 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -52,10 +52,18 @@ ur""" >>> parse('') +# Custom -data attribs +>>> parse('
') +
+ # Script tag >>> parse('') +# Script tag + crossorigin +>>> parse('') + + # Unterminated script tag, handle and auto-terminate >>> parse(' diff --git a/pywb/rewrite/test/test_lxml_html_rewriter.py b/pywb/rewrite/test/test_lxml_html_rewriter.py index 125977e7..e9af9b8c 100644 --- a/pywb/rewrite/test/test_lxml_html_rewriter.py +++ b/pywb/rewrite/test/test_lxml_html_rewriter.py @@ -47,10 +47,18 @@ ur""" >>> parse('') +# Custom -data attribs +>>> parse('
') +
+ # Script tag >>> parse('') +# Script tag + crossorigin +>>> parse('') + + # Unterminated script tag, will auto-terminate >>> parse(' @@ -119,6 +127,15 @@ ur""" >>> p = LXMLHTMLRewriter(urlrewriter) >>> p.close() '' + +# test   +>>> parse(' ') +

 

+ +# test multiple rewrites:   extra >, split comment +>>> p = LXMLHTMLRewriter(urlrewriter) +>>> p.rewrite('
    >
') + p.close() +u'
    >
' """ from pywb.rewrite.url_rewriter import UrlRewriter diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index fac38789..cbd2cb21 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -51,7 +51,7 @@ r""" # scheme-agnostic >>> _test_js('cool_Location = "//example.com/abc.html" //comment') -'cool_Location = "/web/20131010em_///example.com/abc.html" //comment' +'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment' #================================================================= @@ -116,61 +116,13 @@ r""" >>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") '@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)' -#================================================================= -HTTP Headers Rewriting -#================================================================= - -# Text with charset ->>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) -{'charset': 'utf-8', - 'removed_header_dict': {}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), - ('X-Archive-Orig-Content-Length', '5'), - ('Content-Type', 'text/html;charset=UTF-8')]), - 'text_type': 'html'} - -# Redirect ->>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect') -{'charset': None, - 'removed_header_dict': {}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), - ('Location', '/web/20131010/http://example.com/other.html')]), - 'text_type': None} - -# gzip ->>> _test_headers([('Content-Length', '199999'), ('Content-Type', 
'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) -{'charset': None, - 'removed_header_dict': {'content-encoding': 'gzip', - 'transfer-encoding': 'chunked'}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), - ('Content-Type', 'text/javascript')]), - 'text_type': 'js'} - -# Binary ->>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) -{'charset': None, - 'removed_header_dict': {'transfer-encoding': 'chunked'}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), - ('Content-Type', 'image/png'), - ('X-Archive-Orig-Cookie', 'blah'), - ('Content-Encoding', 'gzip')]), - 'text_type': None} - -Removing Transfer-Encoding always, Was: - ('Content-Encoding', 'gzip'), - ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}} - - """ + #================================================================= from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter -from pywb.rewrite.header_rewriter import HeaderRewriter -from pywb.utils.statusandheaders import StatusAndHeaders - -import pprint urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/') @@ -184,12 +136,6 @@ def _test_xml(string): def _test_css(string): return CSSRewriter(urlrewriter).rewrite(string) -headerrewriter = HeaderRewriter() - -def _test_headers(headers, status = '200 OK'): - rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) - return pprint.pprint(vars(rewritten)) - if __name__ == "__main__": import doctest diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 13a941ea..938c9ee1 100644 --- 
a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -1,14 +1,16 @@ -from pywb.rewrite.rewrite_live import get_rewritten +from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.url_rewriter import UrlRewriter from pywb import get_test_dir +from io import BytesIO + # This module has some rewriting tests against the 'live web' # As such, the content may change and the test may break urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') -def head_insert_func(rule): +def head_insert_func(rule, cdx): if rule.js_rewrite_location == True: return '' else: @@ -18,8 +20,8 @@ def head_insert_func(rule): def test_local_1(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, - 'com,example,test)/', - head_insert_func) + head_insert_func, + 'com,example,test)/') # wombat insert added assert '' in buff @@ -34,8 +36,8 @@ def test_local_1(): def test_local_2_no_js_location_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, - 'example,example,test)/nolocation_rewrite', - head_insert_func) + head_insert_func, + 'example,example,test)/nolocation_rewrite') # no wombat insert assert '' not in buff @@ -46,28 +48,52 @@ def test_local_2_no_js_location_rewrite(): # still link rewrite assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + def test_example_1(): - status_headers, buff = get_rewritten('http://example.com/', urlrewriter) - - # verify header rewriting - assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers - - -def test_example_2(): - status_headers, buff = get_rewritten('http://example.com/', urlrewriter) + status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'}) # verify header rewriting assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers 
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff +def test_example_2_redirect(): + status_headers, buff = get_rewritten('http://facebook.com/', urlrewriter) + # redirect, no content + assert status_headers.get_statuscode() == '301' + assert len(buff) == 0 + + +def test_example_3_rel(): + status_headers, buff = get_rewritten('//example.com/', urlrewriter) + assert status_headers.get_statuscode() == '200' + + +def test_example_4_rewrite_err(): + # may occur in case of rewrite mismatch, the /// gets stripped off + status_headers, buff = get_rewritten('http://localhost:8080///example.com/', urlrewriter) + assert status_headers.get_statuscode() == '200' def test_example_domain_specific_3(): urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') - status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2) + status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2, follow_redirects=True) # comment out bootloader assert '/* Bootloader.configurePage' in buff +def test_post(): + buff = BytesIO('ABCDEF') + + env = {'REQUEST_METHOD': 'POST', + 'HTTP_ORIGIN': 'http://example.com', + 'HTTP_HOST': 'example.com', + 'wsgi.input': buff} + + status_headers, resp_buff = get_rewritten('http://example.com/', urlrewriter, env=env) + assert status_headers.get_statuscode() == '200', status_headers + + +def get_rewritten(*args, **kwargs): + return LiveRewriter().get_rewritten(*args, **kwargs) diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index cc28a660..59669b96 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -24,6 +24,12 @@ >>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http://some-other-site.com' +>>> 
do_rewrite('http://localhost:8080/web/2014im_/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/') +'http://localhost:8080/web/2014im_/http://some-other-site.com' + +>>> do_rewrite('/web/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/') +'/web/http://some-other-site.com' + >>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' @@ -62,8 +68,8 @@ from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter -def do_rewrite(rel_url, base_url, prefix, mod = None): - rewriter = UrlRewriter(base_url, prefix) +def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None): + rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix) return rewriter.rewrite(rel_url, mod) diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index 955e24df..bcad948e 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -60,13 +60,14 @@ # Error Urls # ====================== ->>> x = WbUrl('/#$%#/') +# no longer rejecting this here +#>>> x = WbUrl('/#$%#/') Traceback (most recent call last): Exception: Bad Request Url: http://#$%#/ ->>> x = WbUrl('/http://example.com:abc/') -Traceback (most recent call last): -Exception: Bad Request Url: http://example.com:abc/ +#>>> x = WbUrl('/http://example.com:abc/') +#Traceback (most recent call last): +#Exception: Bad Request Url: http://example.com:abc/ >>> x = WbUrl('') Traceback (most recent call last): diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 9545a040..843e665e 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -2,6 +2,7 @@ import copy import urlparse from wburl import WbUrl +from cookie_rewriter import WbUrlCookieRewriter 
#================================================================= @@ -14,11 +15,12 @@ class UrlRewriter(object): NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:'] - PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:'] + PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] - def __init__(self, wburl, prefix): + def __init__(self, wburl, prefix, full_prefix=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix + self.full_prefix = full_prefix #if self.prefix.endswith('/'): # self.prefix = self.prefix[:-1] @@ -28,29 +30,43 @@ class UrlRewriter(object): if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX): return url + if (self.prefix and + self.prefix != '/' and + url.startswith(self.prefix)): + return url + + if (self.full_prefix and + self.full_prefix != self.prefix and + url.startswith(self.full_prefix)): + return url + wburl = self.wburl - isAbs = any(url.startswith(x) for x in self.PROTOCOLS) + is_abs = any(url.startswith(x) for x in self.PROTOCOLS) + + if url.startswith('//'): + is_abs = True + url = 'http:' + url # Optimized rewriter for # -rel urls that don't start with / and # do not contain ../ and no special mod - if not (isAbs or mod or url.startswith('/') or ('../' in url)): - finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url) + if not (is_abs or mod or url.startswith('/') or ('../' in url)): + final_url = urlparse.urljoin(self.prefix + wburl.original_url, url) else: # optimize: join if not absolute url, otherwise just use that - if not isAbs: - newUrl = urlparse.urljoin(wburl.url, url).replace('../', '') + if not is_abs: + new_url = urlparse.urljoin(wburl.url, url).replace('../', '') else: - newUrl = url + new_url = url if mod is None: mod = wburl.mod - finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl) + final_url = self.prefix + wburl.to_str(mod=mod, url=new_url) - return finalUrl + return final_url def get_abs_url(self, 
url=''): return self.prefix + self.wburl.to_str(url=url) @@ -67,6 +83,9 @@ class UrlRewriter(object): new_wburl.url = new_url return UrlRewriter(new_wburl, self.prefix) + def get_cookie_rewriter(self): + return WbUrlCookieRewriter(self) + def __repr__(self): return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix) @@ -81,7 +100,7 @@ class HttpsUrlRewriter(object): HTTP = 'http://' HTTPS = 'https://' - def __init__(self, wburl, prefix): + def __init__(self, wburl, prefix, full_prefix=None): pass def rewrite(self, url, mod=None): @@ -99,3 +118,6 @@ class HttpsUrlRewriter(object): def rebase_rewriter(self, new_url): return self + + def get_cookie_rewriter(self): + return None diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 67bab4fb..3cd9ad72 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -39,7 +39,6 @@ wayback url format. """ import re -import rfc3987 #================================================================= @@ -64,6 +63,9 @@ class BaseWbUrl(object): def is_query(self): return self.is_query_type(self.type) + def is_url_query(self): + return (self.type == BaseWbUrl.URL_QUERY) + @staticmethod def is_replay_type(type_): return (type_ == BaseWbUrl.REPLAY or @@ -104,14 +106,6 @@ class WbUrl(BaseWbUrl): if inx < len(self.url) and self.url[inx] != '/': self.url = self.url[:inx] + '/' + self.url[inx:] - # BUG?: adding upper() because rfc3987 lib - # rejects lower case %-encoding - # %2F is fine, but %2f -- standard supports either - matcher = rfc3987.match(self.url.upper(), 'IRI') - - if not matcher: - raise Exception('Bad Request Url: ' + self.url) - # Match query regex # ====================== def _init_query(self, url): @@ -194,6 +188,21 @@ class WbUrl(BaseWbUrl): else: return url + @property + def is_mainpage(self): + return (not self.mod or + self.mod == 'mp_') + + @property + def is_embed(self): + return (self.mod and + self.mod != 'id_' and + self.mod != 'mp_') + + @property + def is_identity(self): + return 
(self.mod == 'id_') + def __str__(self): return self.to_str() diff --git a/pywb/rules.yaml b/pywb/rules.yaml index cd7325eb..04327c92 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -29,8 +29,7 @@ rules: # flickr rules #================================================================= - - url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] - + - url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo'] fuzzy_lookup: '([^/]+(?:\.css|\.js))' @@ -61,3 +60,4 @@ rules: fuzzy_lookup: match: '(.*)[&?](?:_|uncache)=[\d]+[&]?' filter: '=urlkey:{0}' + replace: '?' diff --git a/pywb/static/wb.css b/pywb/static/wb.css index 1367a2fe..880f0890 100644 --- a/pywb/static/wb.css +++ b/pywb/static/wb.css @@ -1,15 +1,12 @@ -#_wayback_banner +#_wb_plain_banner, #_wb_frame_top_banner { display: block !important; top: 0px !important; left: 0px !important; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif !important; - position: absolute !important; - padding: 4px !important; width: 100% !important; font-size: 24px !important; - border: 1px solid !important; background-color: lightYellow !important; color: black !important; text-align: center !important; @@ -17,3 +14,34 @@ line-height: normal !important; } +#_wb_plain_banner +{ + position: absolute !important; + padding: 4px !important; + border: 1px solid !important; +} + +#_wb_frame_top_banner +{ + position: fixed !important; + border: 0px; + height: 40px !important; +} + +.wb_iframe_div +{ + width: 100%; + height: 100%; + padding: 40px 4px 4px 0px; + border: none; + box-sizing: border-box; + -moz-box-sizing: border-box; + -webkit-box-sizing: border-box; +} + +.wb_iframe +{ + width: 100%; + height: 100%; + border: 2px solid tan; +} diff --git a/pywb/static/wb.js b/pywb/static/wb.js index ae5b586c..e10a522e 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -18,17 +18,28 @@ This file is part of pywb. 
*/ function init_banner() { - var BANNER_ID = "_wayback_banner"; - - var banner = document.getElementById(BANNER_ID); + var PLAIN_BANNER_ID = "_wb_plain_banner"; + var FRAME_BANNER_ID = "_wb_frame_top_banner"; if (wbinfo.is_embed) { return; } + if (window.top != window.self) { + return; + } + + if (wbinfo.is_frame) { + bid = FRAME_BANNER_ID; + } else { + bid = PLAIN_BANNER_ID; + } + + var banner = document.getElementById(bid); + if (!banner) { banner = document.createElement("wb_div"); - banner.setAttribute("id", BANNER_ID); + banner.setAttribute("id", bid); banner.setAttribute("lang", "en"); text = "This is an archived page "; @@ -41,12 +52,56 @@ function init_banner() { } } -var readyStateCheckInterval = setInterval(function() { +function add_event(name, func, object) { + if (object.addEventListener) { + object.addEventListener(name, func); + return true; + } else if (object.attachEvent) { + object.attachEvent("on" + name, func); + return true; + } else { + return false; + } +} + +function remove_event(name, func, object) { + if (object.removeEventListener) { + object.removeEventListener(name, func); + return true; + } else if (object.detachEvent) { + object.detachEvent("on" + name, func); + return true; + } else { + return false; + } +} + +var notified_top = false; + +var detect_on_init = function() { + if (!notified_top && window && window.top && (window.self != window.top) && window.WB_wombat_location) { + if (!wbinfo.is_embed) { + window.top.postMessage(window.WB_wombat_location.href, "*"); + } + notified_top = true; + } + if (document.readyState === "interactive" || document.readyState === "complete") { init_banner(); - - clearInterval(readyStateCheckInterval); + + remove_event("readystatechange", detect_on_init, document); } -}, 10); +} + +add_event("readystatechange", detect_on_init, document); + + +if (wbinfo.is_frame_mp && wbinfo.canon_url && + (window.self == window.top) && + window.location.href != wbinfo.canon_url) { + + console.log('frame'); + 
window.location.replace(wbinfo.canon_url); +} diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 112d6d37..78e4f7ea 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -18,7 +18,7 @@ This file is part of pywb. */ //============================================ -// Wombat JS-Rewriting Library +// Wombat JS-Rewriting Library v2.0 //============================================ WB_wombat_init = (function() { @@ -26,6 +26,7 @@ WB_wombat_init = (function() { var wb_replay_prefix; var wb_replay_date_prefix; var wb_capture_date_part; + var wb_orig_scheme; var wb_orig_host; var wb_wombat_updating = false; @@ -53,27 +54,93 @@ WB_wombat_init = (function() { } //============================================ - function rewrite_url(url) { - var http_prefix = "http://"; - var https_prefix = "https://"; + function starts_with(string, arr_or_prefix) { + if (arr_or_prefix instanceof Array) { + for (var i = 0; i < arr_or_prefix.length; i++) { + if (string.indexOf(arr_or_prefix[i]) == 0) { + return arr_or_prefix[i]; + } + } + } else if (string.indexOf(arr_or_prefix) == 0) { + return arr_or_prefix; + } + + return undefined; + } - // If not dealing with a string, just return it - if (!url || (typeof url) != "string") { + //============================================ + function ends_with(str, suffix) { + if (str.indexOf(suffix, str.length - suffix.length) !== -1) { + return suffix; + } else { + return undefined; + } + } + + //============================================ + var rewrite_url = rewrite_url_; + + function rewrite_url_debug(url) { + var rewritten = rewrite_url_(url); + if (url != rewritten) { + console.log('REWRITE: ' + url + ' -> ' + rewritten); + } else { + console.log('NOT REWRITTEN ' + url); + } + return rewritten; + } + + //============================================ + var HTTP_PREFIX = "http://"; + var HTTPS_PREFIX = "https://"; + var REL_PREFIX = "//"; + + var VALID_PREFIXES = [HTTP_PREFIX, HTTPS_PREFIX, REL_PREFIX]; + var IGNORE_PREFIXES = 
["#", "about:", "data:", "mailto:", "javascript:"]; + + var BAD_PREFIXES; + + function init_bad_prefixes(prefix) { + BAD_PREFIXES = ["http:" + prefix, "https:" + prefix, + "http:/" + prefix, "https:/" + prefix]; + } + + //============================================ + function rewrite_url_(url) { + // If undefined, just return it + if (!url) { + return url; + } + + var urltype_ = (typeof url); + + // If object, use toString + if (urltype_ == "object") { + url = url.toString(); + } else if (urltype_ != "string") { + return url; + } + + // just in case wombat reference made it into url! + url = url.replace("WB_wombat_", ""); + + // ignore anchors, about, data + if (starts_with(url, IGNORE_PREFIXES)) { return url; } // If starts with prefix, no rewriting needed // Only check replay prefix (no date) as date may be different for each // capture - if (url.indexOf(wb_replay_prefix) == 0) { + if (starts_with(url, wb_replay_prefix) || starts_with(url, window.location.origin + wb_replay_prefix)) { return url; } // If server relative url, add prefix and original host - if (url.charAt(0) == "/") { + if (url.charAt(0) == "/" && !starts_with(url, REL_PREFIX)) { // Already a relative url, don't make any changes! 
- if (url.indexOf(wb_capture_date_part) >= 0) { + if (wb_capture_date_part && url.indexOf(wb_capture_date_part) >= 0) { return url; } @@ -81,109 +148,236 @@ WB_wombat_init = (function() { } // If full url starting with http://, add prefix - if (url.indexOf(http_prefix) == 0 || url.indexOf(https_prefix) == 0) { + + var prefix = starts_with(url, VALID_PREFIXES); + + if (prefix) { + if (starts_with(url, prefix + window.location.host + '/')) { + return url; + } + return wb_replay_date_prefix + url; + } + + // Check for common bad prefixes and remove them + prefix = starts_with(url, BAD_PREFIXES); + + if (prefix) { + url = extract_orig(url); return wb_replay_date_prefix + url; } // May or may not be a hostname, call function to determine // If it is, add the prefix and make sure port is removed - if (is_host_url(url)) { - return wb_replay_date_prefix + http_prefix + url; + if (is_host_url(url) && !starts_with(url, window.location.host + '/')) { + return wb_replay_date_prefix + wb_orig_scheme + url; } return url; } - //============================================ - function copy_object_fields(obj) { - var new_obj = {}; - - for (prop in obj) { - if ((typeof obj[prop]) != "function") { - new_obj[prop] = obj[prop]; - } - } - - return new_obj; - } - //============================================ function extract_orig(href) { if (!href) { return ""; } + href = href.toString(); + var index = href.indexOf("/http", 1); + + // extract original url from wburl if (index > 0) { - return href.substr(index + 1); + href = href.substr(index + 1); } else { - return href; + index = href.indexOf(wb_replay_prefix); + if (index >= 0) { + href = href.substr(index + wb_replay_prefix.length); + } + if ((href.length > 4) && + (href.charAt(2) == "_") && + (href.charAt(3) == "/")) { + href = href.substr(4); + } + + if (!starts_with(href, "http")) { + href = HTTP_PREFIX + href; + } } + + // remove trailing slash + if (ends_with(href, "/")) { + href = href.substring(0, href.length - 1); + } + + 
return href; } - + //============================================ - function copy_location_obj(loc) { - var new_loc = copy_object_fields(loc); - - new_loc._orig_loc = loc; - new_loc._orig_href = loc.href; + // Define custom property + function def_prop(obj, prop, value, set_func, get_func) { + var key = "_" + prop; + obj[key] = value; + + try { + Object.defineProperty(obj, prop, { + configurable: false, + enumerable: true, + set: function(newval) { + var result = set_func.call(obj, newval); + if (result != undefined) { + obj[key] = result; + } + }, + get: function() { + if (get_func) { + return get_func.call(obj, obj[key]); + } else { + return obj[key]; + } + } + }); + return true; + } catch (e) { + console.log(e); + obj[prop] = value; + return false; + } + } + + //============================================ + //Define WombatLocation + + function WombatLocation(loc) { + this._orig_loc = loc; + this._orig_href = loc.href; // Rewrite replace and assign functions - new_loc.replace = function(url) { - this._orig_loc.replace(rewrite_url(url)); + this.replace = function(url) { + return this._orig_loc.replace(rewrite_url(url)); } - new_loc.assign = function(url) { - this._orig_loc.assign(rewrite_url(url)); + this.assign = function(url) { + return this._orig_loc.assign(rewrite_url(url)); } - new_loc.reload = loc.reload; - + this.reload = loc.reload; + // Adapted from: // https://gist.github.com/jlong/2428561 var parser = document.createElement('a'); - parser.href = extract_orig(new_loc._orig_href); + var href = extract_orig(this._orig_href); + parser.href = href; + + //console.log(this._orig_href + " -> " + tmp_href); + this._autooverride = false; + + var _set_hash = function(hash) { + this._orig_loc.hash = hash; + return this._orig_loc.hash; + } + + var _get_hash = function() { + return this._orig_loc.hash; + } + + var _get_url_with_hash = function(url) { + return url + this._orig_loc.hash; + } + + href = parser.href; + var hash = parser.hash; + + if (hash) { + var hidx 
= href.lastIndexOf("#"); + if (hidx > 0) { + href = href.substring(0, hidx); + } + } + + if (Object.defineProperty) { + var res1 = def_prop(this, "href", href, + this.assign, + _get_url_with_hash); + + var res2 = def_prop(this, "hash", parser.hash, + _set_hash, + _get_hash); + + this._autooverride = res1 && res2; + } else { + this.href = href; + this.hash = parser.hash; + } + + this.host = parser.host; + this.hostname = parser.hostname; - new_loc.hash = parser.hash; - new_loc.host = parser.host; - new_loc.hostname = parser.hostname; - new_loc.href = parser.href; - - if (new_loc.origin) { - new_loc.origin = parser.origin; + if (parser.origin) { + this.origin = parser.origin; } - new_loc.pathname = parser.pathname; - new_loc.port = parser.port - new_loc.protocol = parser.protocol; - new_loc.search = parser.search; + this.pathname = parser.pathname; + this.port = parser.port + this.protocol = parser.protocol; + this.search = parser.search; - new_loc.toString = function() { + this.toString = function() { return this.href; } - - return new_loc; + + // Copy any remaining properties + for (prop in loc) { + if (this.hasOwnProperty(prop)) { + continue; + } + + if ((typeof loc[prop]) != "function") { + this[prop] = loc[prop]; + } + } } //============================================ - function update_location(req_href, orig_href, location) { - if (req_href && (extract_orig(orig_href) != extract_orig(req_href))) { - var final_href = rewrite_url(req_href); - - location.href = final_href; + function update_location(req_href, orig_href, actual_location, wombat_loc) { + if (!req_href) { + return; } + + if (req_href == orig_href) { + // Reset wombat loc to the unrewritten version + //if (wombat_loc) { + // wombat_loc.href = extract_orig(orig_href); + //} + return; + } + + + var ext_orig = extract_orig(orig_href); + var ext_req = extract_orig(req_href); + + if (!ext_orig || ext_orig == ext_req) { + return; + } + + var final_href = rewrite_url(req_href); + + 
console.log(actual_location.href + ' -> ' + final_href); + + actual_location.href = final_href; } //============================================ - function check_location_change(loc, is_top) { - var locType = (typeof loc); + function check_location_change(wombat_loc, is_top) { + var locType = (typeof wombat_loc); - var location = (is_top ? window.top.location : window.location); + var actual_location = (is_top ? window.top.location : window.location); // String has been assigned to location, so assign it if (locType == "string") { - update_location(loc, location.href, location) - + update_location(wombat_loc, actual_location.href, actual_location); + } else if (locType == "object") { - update_location(loc.href, loc._orig_href, location); + update_location(wombat_loc.href, + wombat_loc._orig_href, + actual_location); } } @@ -197,10 +391,21 @@ WB_wombat_init = (function() { check_location_change(window.WB_wombat_location, false); - if (window.self.location != window.top.location) { + // Only check top if its a different window + if (window.self.WB_wombat_location != window.top.WB_wombat_location) { check_location_change(window.top.WB_wombat_location, true); } +// lochash = window.WB_wombat_location.hash; +// +// if (lochash) { +// window.location.hash = lochash; +// +// //if (window.top.update_wb_url) { +// // window.top.location.hash = lochash; +// //} +// } + wb_wombat_updating = false; } @@ -222,7 +427,7 @@ WB_wombat_init = (function() { //============================================ function copy_history_func(history, func_name) { - orig_func = history[func_name]; + var orig_func = history[func_name]; if (!orig_func) { return; @@ -252,6 +457,12 @@ WB_wombat_init = (function() { function open_rewritten(method, url, async, user, password) { url = rewrite_url(url); + + // defaults to true + if (async != false) { + async = true; + } + return orig.call(this, method, url, async, user, password); } @@ -259,45 +470,262 @@ WB_wombat_init = (function() { } 
//============================================ - function wombat_init(replay_prefix, capture_date, orig_host, timestamp) { - wb_replay_prefix = replay_prefix; - wb_replay_date_prefix = replay_prefix + capture_date + "/"; - wb_capture_date_part = "/" + capture_date + "/"; + function init_worker_override() { + if (!window.Worker) { + return; + } - wb_orig_host = "http://" + orig_host; + // for now, disabling workers until override of worker content can be supported + // hopefully, pages depending on workers will have a fallback + window.Worker = undefined; + } + + //============================================ + function rewrite_attr(elem, name) { + if (!elem || !elem.getAttribute) { + return; + } + + var value = elem.getAttribute(name); + + if (!value) { + return; + } + + if (starts_with(value, "javascript:")) { + return; + } + + //var orig_value = value; + value = rewrite_url(value); + + elem.setAttribute(name, value); + } + + //============================================ + function rewrite_elem(elem) + { + rewrite_attr(elem, "src"); + rewrite_attr(elem, "href"); + + if (elem && elem.getAttribute && elem.getAttribute("crossorigin")) { + elem.removeAttribute("crossorigin"); + } + } + + //============================================ + function init_dom_override() { + if (!Node || !Node.prototype) { + return; + } + + function override_attr(obj, attr) { + var setter = function(orig) { + var val = rewrite_url(orig); + //console.log(orig + " -> " + val); + this.setAttribute(attr, val); + return val; + } + + var getter = function(val) { + var res = this.getAttribute(attr); + return res; + } + + var curr_src = obj.getAttribute(attr); + + def_prop(obj, attr, curr_src, setter, getter); + } + + function replace_dom_func(funcname) { + var orig = Node.prototype[funcname]; + + Node.prototype[funcname] = function() { + var child = arguments[0]; + + rewrite_elem(child); + + var desc; + + if (child instanceof DocumentFragment) { + // desc = 
child.querySelectorAll("*[href],*[src]"); + } else if (child.getElementsByTagName) { + // desc = child.getElementsByTagName("*"); + } + + if (desc) { + for (var i = 0; i < desc.length; i++) { + rewrite_elem(desc[i]); + } + } + + var created = orig.apply(this, arguments); + + if (created.tagName == "IFRAME" || + created.tagName == "IMG" || + created.tagName == "SCRIPT") { + + override_attr(created, "src"); + + } else if (created.tagName == "A") { + override_attr(created, "href"); + } + + return created; + } + } + + replace_dom_func("appendChild"); + replace_dom_func("insertBefore"); + replace_dom_func("replaceChild"); + } + + var postmessage_rewritten; + + //============================================ + function init_postmessage_override() + { + if (!Window.prototype.postMessage) { + return; + } + + var orig = Window.prototype.postMessage; + + postmessage_rewritten = function(message, targetOrigin, transfer) { + if (targetOrigin && targetOrigin != "*") { + targetOrigin = window.location.origin; + } + + return orig.call(this, message, targetOrigin, transfer); + } + + window.postMessage = postmessage_rewritten; + window.Window.prototype.postMessage = postmessage_rewritten; + + for (var i = 0; i < window.frames.length; i++) { + try { + window.frames[i].postMessage = postmessage_rewritten; + } catch (e) { + console.log(e); + } + } + } + + //============================================ + function init_open_override() + { + if (!Window.prototype.open) { + return; + } + + var orig = Window.prototype.open; + + var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) { + strUrl = rewrite_url(strUrl); + return orig.call(this, strUrl, strWindowName, strWindowFeatures); + } + + window.open = open_rewritten; + window.Window.prototype.open = open_rewritten; + + for (var i = 0; i < window.frames.length; i++) { + try { + window.frames[i].open = open_rewritten; + } catch (e) { + console.log(e); + } + } + } + + //============================================ + function 
wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp) { + wb_replay_prefix = replay_prefix; + + wb_replay_date_prefix = replay_prefix + capture_date + "em_/"; + + if (capture_date.length > 0) { + wb_capture_date_part = "/" + capture_date + "/"; + } else { + wb_capture_date_part = ""; + } + + wb_orig_scheme = orig_scheme + '://'; + + wb_orig_host = wb_orig_scheme + orig_host; + + init_bad_prefixes(replay_prefix); // Location - window.WB_wombat_location = copy_location_obj(window.self.location); - document.WB_wombat_location = window.WB_wombat_location; + var wombat_location = new WombatLocation(window.self.location); + + if (wombat_location._autooverride) { + + var setter = function(val) { + if (typeof(val) == "string") { + if (starts_with(val, "about:")) { + return undefined; + } + this._WB_wombat_location.href = val; + } + } + + def_prop(window, "WB_wombat_location", wombat_location, setter); + def_prop(document, "WB_wombat_location", wombat_location, setter); + } else { + window.WB_wombat_location = wombat_location; + document.WB_wombat_location = wombat_location; + + // Check quickly after page load + setTimeout(check_all_locations, 500); + + // Check periodically every few seconds + setInterval(check_all_locations, 500); + } + + var is_framed = (window.top.wbinfo && window.top.wbinfo.is_frame); if (window.self.location != window.top.location) { - window.top.WB_wombat_location = copy_location_obj(window.top.location); + if (is_framed) { + window.top.WB_wombat_location = window.WB_wombat_location; + window.WB_wombat_top = window.self; + } else { + window.top.WB_wombat_location = new WombatLocation(window.top.location); + + window.WB_wombat_top = window.top; + } + } else { + window.WB_wombat_top = window.top; } - if (window.opener) { - window.opener.WB_wombat_location = copy_location_obj(window.opener.location); - } + //if (window.opener) { + // window.opener.WB_wombat_location = copy_location_obj(window.opener.location); + //} // Domain 
document.WB_wombat_domain = orig_host; + document.WB_wombat_referrer = extract_orig(document.referrer); // History copy_history_func(window.history, 'pushState'); copy_history_func(window.history, 'replaceState'); + + // open + init_open_override(); + // postMessage + init_postmessage_override(); + // Ajax init_ajax_rewrite(); + init_worker_override(); + + // DOM + init_dom_override(); // Random - init_seeded_random(timestamp); + init_seeded_random(timestamp); } - // Check quickly after page load - setTimeout(check_all_locations, 100); - - // Check periodically every few seconds - setInterval(check_all_locations, 500); - return wombat_init; })(this); diff --git a/pywb/ui/frame_insert.html b/pywb/ui/frame_insert.html new file mode 100644 index 00000000..3ba9a406 --- /dev/null +++ b/pywb/ui/frame_insert.html @@ -0,0 +1,55 @@ + + + + + + + + + +
+