diff --git a/pywb/indexreader.py b/pywb/indexreader.py
index 1a4f8709..95453c9f 100644
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@@ -48,19 +48,19 @@ class RemoteCDXServer:
return response
@staticmethod
- def getQueryParams(wburl):
+ def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '10'):
return {
ArchivalUrl.QUERY:
- {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
+ {'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
ArchivalUrl.URL_QUERY:
- {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
+ {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
},
ArchivalUrl.REPLAY:
- {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
+ {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
ArchivalUrl.LATEST_REPLAY:
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
@@ -82,7 +82,7 @@ class CDXCaptureResult:
cdxformat = i
if not cdxformat:
- raise InvalidCDXException('unknown %d-field cdx format' % len(fields))
+ raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
for header, field in zip(cdxformat, fields):
setattr(self, header, field)
diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py
new file mode 100644
index 00000000..7574cf77
--- /dev/null
+++ b/pywb/wbhtml.py
@@ -0,0 +1,101 @@
+import sys
+
+from HTMLParser import HTMLParser
+from wburlrewriter import ArchivalUrlRewriter
+
+tag_list = {
+ 'a': {'href': ''},
+ 'img': {'src': 'im_'}
+}
+
+# create a subclass and override the handler methods
+class WBHtml(HTMLParser):
+ """
+ >>> WBHtml(rewriter).feed('Text')
+ Text
+
+ >>> WBHtml(rewriter).feed('
')
+ 
+
+ """
+
+ def __init__(self, rewriter, outstream = None):
+ HTMLParser.__init__(self)
+
+ self.rewriter = rewriter
+ self.out = outstream if outstream else sys.stdout
+
+ def _rewriteAttr(self, mod, value):
+ return self.rewriter.rewrite(value, mod)
+
+ def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
+ rwAttrs = tag_list.get(tag)
+ if not rwAttrs:
+ rwAttrs = tag_list.get('')
+
+ if not rwAttrs:
+ return False
+
+ self.out.write('<' + tag)
+ for attr in tagAttrs:
+ name, value = attr
+ rwMod = rwAttrs.get(name)
+
+ if rwMod is not None:
+ value = self._rewriteAttr(rwMod, value)
+
+ self.out.write(' {0}="{1}"'.format(name, value))
+
+ self.out.write('/>' if isStartEnd else '>')
+ return True
+
+ def handle_starttag(self, tag, attrs):
+
+ if not self.rewriteTagAttrs(tag, attrs, False):
+ self.out.write(self.get_starttag_text())
+
+ def handle_startendtag(self, tag, attrs):
+
+ if not self.rewriteTagAttrs(tag, attrs, True):
+ self.out.write(self.get_starttag_text())
+
+ def handle_endtag(self, tag):
+ self.out.write('' + tag + '>')
+
+ def handle_data(self, data):
+ self.out.write(data)
+
+ def handle_entityref(self, data):
+ self.out.write('&' + data)
+
+ def handle_charref(self, data):
+ self.out.write('' + data)
+
+ def handle_comment(self, data):
+ self.out.write('')
+
+ def handle_decl(self, data):
+ self.out.write('')
+
+ def handle_pi(self, data):
+ self.out.write('' + data + '>')
+
+ def unknown_decl(self, data):
+ self.out.write('')
+
+
+
+
+# instantiate the parser and feed it some HTML
+#parser = WBHtml()
+#instr = '
Test\nParse me!
'
+#print instr
+#print
+#parser.feed(instr)
+#print
+if __name__ == "__main__":
+ import doctest
+
+ rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+
+ doctest.testmod()
diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py
index 8e587404..0ff3ef97 100644
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@@ -8,6 +8,9 @@ class ArchivalUrlRewriter:
>>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
+ >>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
+ 'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
+
>>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/path/other.html'
@@ -30,11 +33,13 @@ class ArchivalUrlRewriter:
if self.prefix.endswith('/'):
self.prefix = self.prefix[:-1]
- def rewrite(self, rel_url):
- if '../' in rel_url:
+ def rewrite(self, rel_url, mod = None):
+ if '../' in rel_url or mod:
wburl = ArchivalUrl(self.wburl_str)
wburl.url = urlparse.urljoin(wburl.url, rel_url)
wburl.url = wburl.url.replace('../', '')
+ if mod is not None:
+ wburl.mod = mod
final_url = self.prefix + str(wburl)
else:
@@ -45,8 +50,8 @@ class ArchivalUrlRewriter:
if __name__ == "__main__":
import doctest
- def test_rewrite(rel_url, base_url, prefix):
+ def test_rewrite(rel_url, base_url, prefix, mod = None):
rewriter = ArchivalUrlRewriter(base_url, prefix)
- return rewriter.rewrite(rel_url)
+ return rewriter.rewrite(rel_url, mod)
doctest.testmod()