diff --git a/pywb/wbarchivalurl.py b/pywb/wbarchivalurl.py
index 85bb4cbe..89345f02 100644
--- a/pywb/wbarchivalurl.py
+++ b/pywb/wbarchivalurl.py
@@ -129,24 +129,28 @@ class ArchivalUrl:
# Str Representation
# ====================
- def __str__(self):
- if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY:
+ @staticmethod
+ def to_str(atype, mod, timestamp, url):
+ # Build the archival url path from its parts, without needing an instance.
+ # Query types give "/" [mod "/"] [timestamp] "*/" url (URL_QUERY appends "*").
+ # Every other type gives "/" timestamp mod "/" url, or just "/" url when
+ # both timestamp and mod are empty.
+ if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY:
tsmod = "/"
- if self.mod:
- tsmod += self.mod + "/"
- if self.timestamp:
- tsmod += self.timestamp
+ if mod:
+ tsmod += mod + "/"
+ if timestamp:
+ tsmod += timestamp
- tsmod += "*/" + self.url
- if self.type == ArchivalUrl.URL_QUERY:
+ tsmod += "*/" + url
+ if atype == ArchivalUrl.URL_QUERY:
tsmod += "*"
return tsmod
else:
- tsmod = self.timestamp + self.mod
+ # tolerate None/empty parts here as well, like the query branch above,
+ # so callers of this static helper can't trigger a TypeError on None
+ tsmod = (timestamp or "") + (mod or "")
if len(tsmod) > 0:
- return "/" + tsmod + "/" + self.url
+ return "/" + tsmod + "/" + url
else:
- return "/" + self.url
+ return "/" + url
+
+ def __str__(self):
+ # string form of this instance, delegating to the shared static builder
+ return ArchivalUrl.to_str(self.type, self.mod, self.timestamp, self.url)
def __repr__(self):
return str((self.type, self.timestamp, self.mod, self.url, str(self)))
diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py
index 7574cf77..d707eef3 100644
--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@@ -1,12 +1,10 @@
import sys
+import re
from HTMLParser import HTMLParser
from wburlrewriter import ArchivalUrlRewriter
-tag_list = {
- 'a': {'href': ''},
- 'img': {'src': 'im_'}
-}
+
# create a subclass and override the handler methods
class WBHtml(HTMLParser):
@@ -14,57 +12,164 @@ class WBHtml(HTMLParser):
>>> WBHtml(rewriter).feed('Text')
Text
- >>> WBHtml(rewriter).feed('
')
- 
+ >>> WBHtml(rewriter).feed('

')
+ 
+ >>> WBHtml(rewriter).feed('
')
+ 
+
+ >>> WBHtml(rewriter).feed('')
+
"""
+ REWRITE_TAGS = {
+ 'a': {'href': ''},
+ 'applet': {'codebase': 'oe_',
+ 'archive': 'oe_'},
+ 'area': {'href': ''},
+ 'base': {'href': ''},
+ 'blockquote': {'cite': ''},
+ 'body': {'background': 'im_'},
+ 'del': {'cite': ''},
+ 'embed': {'src': 'oe_'},
+ 'iframe': {'src': 'if_'},
+ 'img': {'src': 'im_'},
+ 'ins': {'cite': ''},
+ 'input': {'src': 'im_'},
+ 'form': {'action': ''},
+ 'frame': {'src': 'fr_'},
+ 'link': {'href': 'oe_'},
+ 'meta': {'content': ''},
+ 'object': {'codebase': 'oe_',
+ 'data': 'oe_'},
+ 'q': {'cite': ''},
+ 'script': {'src': 'js_'},
+ 'div': {'data-src' : '',
+ 'data-uri' : ''},
+ 'li': {'data-src' : '',
+ 'data-uri' : ''},
+ }
+
+ STATE_TAGS = ['head', 'body', 'script', 'style']
+
+
def __init__(self, rewriter, outstream = None):
HTMLParser.__init__(self)
self.rewriter = rewriter
+ self._wbParseContext = None
self.out = outstream if outstream else sys.stdout
- def _rewriteAttr(self, mod, value):
+
+ # ===========================
+ META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
+
+ def _rewriteMetaRefresh(self, metaRefresh):
+ # Rewrite the target url inside a meta refresh content value of the form
+ # "<delay>; url=<target>". Values that do not match META_REFRESH_REGEX
+ # are returned unchanged.
+ m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
+ if not m:
+ return metaRefresh
+
+ try:
+ # splice the rewritten url in place of group 1, keeping the text around it
+ metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):]
+ except Exception:
+ # best effort: if the rewrite fails, fall through with the original value
+ pass
+
+ return metaRefresh
+ # ===========================
+
+ NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']
+
+ def _rewriteURL(self, value, mod = None):
+ # Rewrite one url-valued attribute through the rewriter and return the result.
+ # Values whose scheme is listed in NO_REWRITE_PREFIX are returned untouched.
+ # mod is passed straight to rewriter.rewrite().
+ #
+ # URL schemes are case-insensitive and leading whitespace is not part of
+ # the url, so normalize a copy before testing the prefix; otherwise
+ # "JavaScript:..." or " data:..." would be rewritten into an archival url.
+ probe = value.lstrip().lower()
+ if any (probe.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX):
+ return value
+
return self.rewriter.rewrite(value, mod)
- def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
- rwAttrs = tag_list.get(tag)
- if not rwAttrs:
- rwAttrs = tag_list.get('')
- if not rwAttrs:
+ def _rewriteCSS(self, cssContent):
+ return cssContent
+
+ def _rewriteScript(self, scriptContent):
+ return scriptContent
+
+ def hasAttr(self, tagAttrs, attr):
+ # Return True if tagAttrs (a list of (name, value) pairs) contains an
+ # attribute named attr[0] whose value equals attr[1], ignoring case.
+ # Only the first attribute with that name is examined.
+ name, value = attr
+ for attrName, attrValue in tagAttrs:
+ if attrName == name:
+ # a valueless attribute has value None and can never match
+ return attrValue is not None and value.lower() == attrValue.lower()
+ return False
+
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
+ # Write a start tag to self.out with its url-bearing attributes rewritten.
+ # tag: tag name; tagAttrs: list of (name, value) pairs as passed to the
+ # HTMLParser handlers (value is None for a valueless attribute);
+ # isStartEnd: True to close the tag with '/>' instead of '>'.
+ # Returns False, writing nothing, when the tag has no REWRITE_TAGS entry,
+ # so the caller can emit the original tag text instead.
+
+ # special case: script or style parse context
+ # (tracked before the handler lookup: 'style' has no REWRITE_TAGS entry,
+ # so checking after the early return below would never see it)
+ if ((tag == 'script') or (tag == 'style')) and (self._wbParseContext is None):
+ self._wbParseContext = tag
+
+ handler = WBHtml.REWRITE_TAGS.get(tag)
+ if not handler:
+ handler = WBHtml.REWRITE_TAGS.get('')
+
+ if not handler:
return False
+ # special case: base tag
+ if (tag == 'base'):
+ # tagAttrs is a list of pairs, not a dict, so find the href by scanning
+ newBase = next((v for n, v in tagAttrs if n == 'href'), None)
+ if newBase:
+ self.rewriter.setBaseUrl(newBase)
+
self.out.write('<' + tag)
+
for attr in tagAttrs:
- name, value = attr
- rwMod = rwAttrs.get(name)
+ attrName, attrValue = attr
- if rwMod is not None:
- value = self._rewriteAttr(rwMod, value)
+ # valueless attribute (value is None): write the bare name back,
+ # there is nothing to rewrite and str methods would fail on None
+ if attrValue is None:
+ self.out.write(' ' + attrName)
+ continue
+
+ # special case: inline JS/event handler
+ if attrValue.startswith('javascript:') or attrName.startswith("on"):
+ attrValue = self._rewriteScript(attrValue)
- self.out.write(' {0}="{1}"'.format(name, value))
+ # special case: inline CSS/style attribute
+ elif attrName == 'style':
+ attrValue = self._rewriteCSS(attrValue)
+
+ # special case: meta tag
+ elif (tag == 'meta') and (attrName == 'content') and self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
+ attrValue = self._rewriteMetaRefresh(attrValue)
+
+ else:
+ # url attribute listed for this tag: rewrite with its modifier
+ rwMod = handler.get(attrName)
+ if rwMod is not None:
+ attrValue = self._rewriteURL(attrValue, rwMod)
+
+ self.out.write(' {0}="{1}"'.format(attrName, attrValue))
self.out.write('/>' if isStartEnd else '>')
+
return True
def handle_starttag(self, tag, attrs):
-
if not self.rewriteTagAttrs(tag, attrs, False):
self.out.write(self.get_starttag_text())
def handle_startendtag(self, tag, attrs):
-
if not self.rewriteTagAttrs(tag, attrs, True):
self.out.write(self.get_starttag_text())
def handle_endtag(self, tag):
+ if (tag == self._wbParseContext):
+ self._wbParseContext = None
+
self.out.write('' + tag + '>')
- def handle_data(self, data):
+ def parseData(self, data):
+ if self._wbParseContext == 'script':
+ data = self._rewriteScript(data)
+ elif self._wbParseContext == 'style':
+ data = self._rewriteCSS(data)
+
self.out.write(data)
+ def handle_data(self, data):
+ self.parseData(data)
+
def handle_entityref(self, data):
self.out.write('&' + data)
@@ -72,7 +177,9 @@ class WBHtml(HTMLParser):
self.out.write('' + data)
def handle_comment(self, data):
- self.out.write('')
+ self.out.write('')
def handle_decl(self, data):
self.out.write('')
@@ -81,9 +188,9 @@ class WBHtml(HTMLParser):
self.out.write('' + data + '>')
def unknown_decl(self, data):
- self.out.write('')
-
-
+ self.out.write('')
# instantiate the parser and fed it some HTML
diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py
index 0ff3ef97..041dc753 100644
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@@ -20,6 +20,12 @@ class ArchivalUrlRewriter:
>>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
+ >>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
+ 'localhost:8080/*/http://example.com/other.html'
+
+ >>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/')
+ 'localhost:8080/20101226101112/http://some-other-site.com'
+
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
'/2020/http://example.com/other.html'
@@ -28,25 +34,33 @@ class ArchivalUrlRewriter:
"""
def __init__(self, wburl_str, prefix):
- self.wburl_str = wburl_str
+ self.wburl = ArchivalUrl(wburl_str)
self.prefix = prefix
+
if self.prefix.endswith('/'):
self.prefix = self.prefix[:-1]
def rewrite(self, rel_url, mod = None):
- if '../' in rel_url or mod:
- wburl = ArchivalUrl(self.wburl_str)
- wburl.url = urlparse.urljoin(wburl.url, rel_url)
- wburl.url = wburl.url.replace('../', '')
- if mod is not None:
- wburl.mod = mod
+ # Resolve rel_url against the archived page url and return it as a
+ # prefixed archival url.
+ # mod: modifier for the new url. None (the default) keeps the page's own
+ # modifier; an empty string is passed through as-is, i.e. no modifier.
+ wburl = self.wburl
- final_url = self.prefix + str(wburl)
- else:
- final_url = urlparse.urljoin(self.prefix + self.wburl_str, rel_url)
+ # Always rebuild from the split-up ArchivalUrl. The earlier shortcut of
+ # joining directly against the full prefixed url is disabled because it
+ # doesn't work for external links.
+ # Any '../' still present after the join is dropped.
+ newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
+
+ if mod is None:
+ mod = wburl.mod
+
+ final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
return final_url
+ def setBaseUrl(self, newUrl):
+ # Change the url that later rewrite() calls resolve relative links against
+ # (called for a base tag's href). The href may itself be relative, so
+ # resolve it against the current url rather than storing it verbatim;
+ # an absolute newUrl comes out of urljoin unchanged.
+ self.wburl.url = urlparse.urljoin(self.wburl.url, newUrl)
+
if __name__ == "__main__":
import doctest