diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 1a4f8709..95453c9f 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -48,19 +48,19 @@ class RemoteCDXServer: return response @staticmethod - def getQueryParams(wburl): + def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '10'): return { ArchivalUrl.QUERY: - {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'}, + {'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit}, ArchivalUrl.URL_QUERY: - {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100', + {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit, 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', }, ArchivalUrl.REPLAY: - {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True}, + {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True}, ArchivalUrl.LATEST_REPLAY: {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True} @@ -82,7 +82,7 @@ class CDXCaptureResult: cdxformat = i if not cdxformat: - raise InvalidCDXException('unknown %d-field cdx format' % len(fields)) + raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields))) for header, field in zip(cdxformat, fields): setattr(self, header, field) diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py new file mode 100644 index 00000000..7574cf77 --- /dev/null +++ b/pywb/wbhtml.py @@ -0,0 +1,101 @@ +import sys + +from HTMLParser import HTMLParser +from wburlrewriter import ArchivalUrlRewriter + +tag_list = { + 'a': {'href': ''}, + 'img': {'src': 'im_'} +} + +# create a subclass and override the handler methods +class WBHtml(HTMLParser): + 
""" + >>> WBHtml(rewriter).feed('Text') + Text + + >>> WBHtml(rewriter).feed('
') +
+ + """ + + def __init__(self, rewriter, outstream = None): + HTMLParser.__init__(self) + + self.rewriter = rewriter + self.out = outstream if outstream else sys.stdout + + def _rewriteAttr(self, mod, value): + return self.rewriter.rewrite(value, mod) + + def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd): + rwAttrs = tag_list.get(tag) + if not rwAttrs: + rwAttrs = tag_list.get('') + + if not rwAttrs: + return False + + self.out.write('<' + tag) + for attr in tagAttrs: + name, value = attr + rwMod = rwAttrs.get(name) + + if rwMod is not None: + value = self._rewriteAttr(rwMod, value) + + self.out.write(' {0}="{1}"'.format(name, value)) + + self.out.write('/>' if isStartEnd else '>') + return True + + def handle_starttag(self, tag, attrs): + + if not self.rewriteTagAttrs(tag, attrs, False): + self.out.write(self.get_starttag_text()) + + def handle_startendtag(self, tag, attrs): + + if not self.rewriteTagAttrs(tag, attrs, True): + self.out.write(self.get_starttag_text()) + + def handle_endtag(self, tag): + self.out.write('</' + tag + '>') + + def handle_data(self, data): + self.out.write(data) + + def handle_entityref(self, data): + self.out.write('&' + data) + + def handle_charref(self, data): + self.out.write('&#' + data) + + def handle_comment(self, data): + self.out.write('<!--' + data + '-->') + + def handle_decl(self, data): + self.out.write('<!' + data + '>') + + def handle_pi(self, data): + self.out.write('<?' + data + '>') + + def unknown_decl(self, data): + self.out.write('<![' + data + ']>') + + + + +# instantiate the parser and feed it some HTML +#parser = WBHtml() +#instr = '
Test\n

Parse me!

' +#print instr +#print +#parser.feed(instr) +#print +if __name__ == "__main__": + import doctest + + rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/') + + doctest.testmod() diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py index 8e587404..0ff3ef97 100644 --- a/pywb/wburlrewriter.py +++ b/pywb/wburlrewriter.py @@ -8,6 +8,9 @@ class ArchivalUrlRewriter: >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') 'https://web.archive.org/web/20131010/http://example.com/path/other.html' + >>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_') + 'https://web.archive.org/web/20131010js_/http://example.com/path/file.js' + >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/') '/coll/20130907*/http://example.com/path/other.html' @@ -30,11 +33,13 @@ class ArchivalUrlRewriter: if self.prefix.endswith('/'): self.prefix = self.prefix[:-1] - def rewrite(self, rel_url): - if '../' in rel_url: + def rewrite(self, rel_url, mod = None): + if '../' in rel_url or mod: wburl = ArchivalUrl(self.wburl_str) wburl.url = urlparse.urljoin(wburl.url, rel_url) wburl.url = wburl.url.replace('../', '') + if mod is not None: + wburl.mod = mod final_url = self.prefix + str(wburl) else: @@ -45,8 +50,8 @@ class ArchivalUrlRewriter: if __name__ == "__main__": import doctest - def test_rewrite(rel_url, base_url, prefix): + def test_rewrite(rel_url, base_url, prefix, mod = None): rewriter = ArchivalUrlRewriter(base_url, prefix) - return rewriter.rewrite(rel_url) + return rewriter.rewrite(rel_url, mod) doctest.testmod()