1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-26 07:49:24 +01:00
pywb/pywb/rewrite/default_rewriter.py
John Berlin 94784d6e5d wombat overhaul! fixes #449 (#451)
wombat:
 - I: function overrides applied by wombat now better appear to be the original new function name same as originals when possible
 - I: WombatLocation now looks and behaves more like the original Location interface
 - I: The custom storage class now looks and behaves more like the original Storage
 - I: SVG image rewriting has been improved: both the href and xlink:href deprecated since SVG2 now rewritten always
 - I: document.open now handles the case of creation of a new window
 - I: Request object rewriting of the readonly href property is now correctly handled
 - I: EventTarget.addEventListener, removeEventListener overrides now preserve the original this argument of the wrapped listener
 - A: document.close override to ensure wombat is initialized after write or writeln usage
 - A: reconstruction of <doctype...> in rewriteHTMLComplete IFF it was included in the original string of HTML
 - A: document.body setter override to ensure rewriting of the new body or frameset
 - A: Attr.[value, nodeValue, textContent] added setter override to perform URL rewrites
 - A: SVGElements rewriting of the filter, style, xlink:href, href, and src attributes
 - A: HTMLTrackElement rewriting of the src attribute of the
 - A: HTMLQuoteElement and HTMLModElement rewriting of the cite attribute
 - A: Worklet.addModule: Loads JS module specified by a URL.
 - A: HTMLHyperlinkElementUtils overrides to the areaelement
 - A: ShadowRoot overrides to: innerHTML, because even though it inherits from DocumentFragment and Node it still has an innerHTML getter and setter.
 - A: ShadowRoot, Element, DocumentFragment append, prepend: adds strings of HTML or a new Node inherited from ParentNode
 - A: StylePropertyMap override: New way to access and set CSS properties.
 - A: Response.redirecthttps rewriting of the URL argument.
 - A: UIEvent, MouseEvent, TouchEvent, KeyboardEvent, WheelEvent, InputEvent, and CompositionEvent constructor and init{event-name} overrides in order to ensure that wombat's JS Proxy usage does not affect their defined behaviors
 - A: XSLTProcessor override to ensure its usage is not affected by wombats JS Proxy usage.
 - A: navigator.unregisterProtocolHandler: Same override as existing navigator.registerProtocolHandler but from the inverse operation
 - A: PresentationRequest: Constructor takes a URL or an array of URLs.
 - A: EventSource and WebSocket override in order to ensure that they do not cause live leaks
 - A: overrides for the child node interface
 - Fix: autofetch worker creation of the backing worker when it is operating within an execution context with a null origin
tests:
  - A: 559 tests specific to wombat and client-side rewriting
pywb:
  - Fix: a few broken tests due to iana.org requiring a user agent in its requests
rewrite:
  - introduced a new JSWorkerRewriter class in order to support rewriting via wombat workers in the context of all supported worker variants via
  - ensured rewriter app correctly sets the static prefix
ci:
 - Modified travis.yml to specifically enumerate jobs
documentation:
  - Documented new wombat, wombat proxy mode, wombat workers
auto-fetch:
 - switched to mutation observer when in proxy mode so that the behaviors can operate in tandem with the autofetcher
2019-05-15 11:42:51 -07:00

147 lines
4.0 KiB
Python

from pywb.rewrite.content_rewriter import BaseContentRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter, JSWombatProxyRewriter
from pywb.rewrite.header_rewriter import DefaultHeaderRewriter
from pywb.rewrite.cookie_rewriter import HostScopeCookieRewriter
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
from pywb.rewrite.rewrite_dash import RewriteDASH
from pywb.rewrite.rewrite_hls import RewriteHLS
from pywb.rewrite.rewrite_amf import RewriteAMF
from pywb.rewrite.rewrite_js_workers import JSWorkerRewriter
from pywb import DEFAULT_RULES_FILE
import copy
from werkzeug.useragents import UserAgent
# ============================================================================
class DefaultRewriter(BaseContentRewriter):
    """Content rewriter wired up with the stock set of rewriter classes.

    The class-level tables below define which rewriter class handles each
    rewriter key and which rewriter key is selected for a given MIME type.
    """

    # Rewriter key -> rewriter class. Each instance receives its own shallow
    # copy of this table as ``self.all_rewriters``.
    DEFAULT_REWRITERS = {
        'header': DefaultHeaderRewriter,
        'cookie': HostScopeCookieRewriter,

        'html': HTMLRewriter,
        'html-banner-only': HTMLInsertOnlyRewriter,

        'css': CSSRewriter,

        'js': JSLocationOnlyRewriter,
        'js-proxy': JSNoneRewriter,
        'js-worker': JSWorkerRewriter,

        'json': JSONPRewriter,

        'xml': XMLRewriter,

        'dash': RewriteDASH,

        'hls': RewriteHLS,

        'amf': RewriteAMF,
    }

    # MIME type -> rewriter key.
    # NOTE(review): the 'guess-*' values are not keys of DEFAULT_REWRITERS;
    # presumably the base class resolves them by sniffing the content --
    # confirm against BaseContentRewriter.
    rewrite_types = {
        # HTML
        'text/html': 'guess-html',
        'application/xhtml': 'html',
        'application/xhtml+xml': 'html',

        # CSS
        'text/css': 'css',

        # JS
        'text/javascript': 'js',
        'application/javascript': 'js',
        'application/x-javascript': 'js',

        # JSON
        'application/json': 'json',

        # HLS
        'application/x-mpegURL': 'hls',
        'application/vnd.apple.mpegurl': 'hls',

        # DASH
        'application/dash+xml': 'dash',

        # AMF
        'application/x-amf': 'amf',

        # XML -- don't rewrite xml
        #'text/xml': 'xml',
        #'application/xml': 'xml',
        #'application/rss+xml': 'xml',

        # PLAIN
        'text/plain': 'guess-text',

        # DEFAULT or octet-stream
        '': 'guess-text',
        'application/octet-stream': 'guess-bin'
    }

    # Rewriter key -> MIME type associated with that kind of content.
    default_content_types = {
        'html': 'text/html',
        'css': 'text/css',
        'js': 'text/javascript'
    }

    def __init__(self, replay_mod='', config=None):
        """Set up the rewriter.

        :param replay_mod: replay modifier forwarded to the base class
        :param config: optional dict; its ``rules_file`` entry, when present,
            replaces ``DEFAULT_RULES_FILE``
        """
        if not config:
            config = {}

        super(DefaultRewriter, self).__init__(
            config.get('rules_file', DEFAULT_RULES_FILE),
            replay_mod,
        )

        # Shallow copy so per-instance changes never touch the class table.
        self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS)

    def init_js_regex(self, regexs):
        """Turn the configured JS regex rules into parsed rewrite rules."""
        parsed_rules = RegexRewriter.parse_rules_from_config(regexs)
        return parsed_rules

    def get_rewrite_types(self):
        """Return the MIME type -> rewriter key table."""
        return self.rewrite_types
# ============================================================================
class RewriterWithJSProxy(DefaultRewriter):
    """DefaultRewriter that serves ``JSWombatProxyRewriter`` for JS content
    whenever the requesting user agent passes ``ua_allows_obj_proxy``.
    """

    # Minimum browser version, keyed by the browser name exposed by the
    # parsed user agent, for which the proxy JS rewriter is selected.
    # ``None`` means the browser never gets it.
    _PROXY_MIN_VERSIONS = {
        'chrome': '49.0',
        'firefox': '44.0',
        'safari': '10.0',
        'opera': '36.0',
        'edge': '12.0',
        'msie': None,
    }

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``DefaultRewriter``."""
        super(RewriterWithJSProxy, self).__init__(*args, **kwargs)

    def get_rewriter(self, rw_type, rwinfo=None):
        """Return the rewriter class for ``rw_type``.

        For ``'js'`` with rewrite info available, the proxy rewriter is
        returned when the user agent allows it; every other case defers to
        the parent implementation.
        """
        if rw_type == 'js' and rwinfo:
            # check if UA allows this
            if self.ua_allows_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
                return JSWombatProxyRewriter

        # otherwise, return default rewriter
        return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)

    @staticmethod
    def _version_key(version, width=4):
        """Convert a dotted version string into a fixed-width tuple of ints.

        Parsing stops at the first component that is not an integer; missing
        components are padded with zeros so that ``'10'`` and ``'10.0'``
        compare equal.
        """
        numbers = []
        for piece in str(version).split('.')[:width]:
            try:
                numbers.append(int(piece))
            except ValueError:
                break

        numbers.extend([0] * (width - len(numbers)))
        return tuple(numbers)

    def ua_allows_obj_proxy(self, opts):
        """Decide whether the user agent described by ``opts`` may receive
        the proxy JS rewriter.

        :param opts: dict that may carry a parsed user agent under ``'ua'``
            or a raw user-agent string under ``'ua_string'``
        :return: ``True`` when no user agent information is available or the
            browser meets its minimum version, ``False`` otherwise
        :rtype: bool
        """
        ua = opts.get('ua')
        if not ua:
            ua_string = opts.get('ua_string')
            if ua_string:
                ua = UserAgent(ua_string)

        # Nothing to judge by: allow.
        if ua is None:
            return True

        min_vers = self._PROXY_MIN_VERSIONS.get(ua.browser)
        version = ua.version

        # Unknown/unsupported browser, or a UA without a detectable version.
        if not min_vers or not version:
            return False

        # Compare numerically: as plain strings '100.0' sorts below '49.0'
        # and '9.0' sorts above '10.0'.
        return self._version_key(version) >= self._version_key(min_vers)