1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge develop and lxml

This commit is contained in:
Ilya Kreymer 2014-03-18 17:14:27 -07:00
commit 53590537e0
8 changed files with 67 additions and 49 deletions

View File

@ -87,6 +87,17 @@ class WbRequest(object):
self._parse_extra() self._parse_extra()
@property
def is_embed(self):
return (self.wb_url and
self.wb_url.mod and
self.wb_url.mod != 'id_')
@property
def is_identity(self):
return (self.wb_url and
self.wb_url.mod == 'id_')
def _is_ajax(self): def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH') value = self.env.get('HTTP_X_REQUESTED_WITH')
if not value: if not value:

View File

@ -25,7 +25,7 @@ class RegexRewriter(object):
@staticmethod @staticmethod
def archival_rewrite(rewriter): def archival_rewrite(rewriter):
return lambda string: rewriter.rewrite(string) return lambda string: rewriter.rewrite(string, 'em_')
#@staticmethod #@staticmethod
#def replacer(other): #def replacer(other):
@ -33,7 +33,7 @@ class RegexRewriter(object):
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix #DEFAULT_OP = add_prefix
def __init__(self, rules): def __init__(self, rules):
#rules = self.create_rules(http_prefix) #rules = self.create_rules(http_prefix)
@ -74,8 +74,8 @@ class RegexRewriter(object):
return m.group(0) return m.group(0)
# Custom func # Custom func
if not hasattr(op, '__call__'): #if not hasattr(op, '__call__'):
op = RegexRewriter.DEFAULT_OP(op) # op = RegexRewriter.DEFAULT_OP(op)
result = op(m.group(i)) result = op(m.group(i))
final_str = result final_str = result
@ -124,8 +124,8 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
rules = rules + [ rules = rules + [
(r'(?<!/)\blocation\b', prefix, 0), (r'(?<!/)\blocation\b', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)domain', prefix, 0), (r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
] ]
#import sys #import sys
#sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n') #sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
@ -140,7 +140,7 @@ JSRewriter = JSLinkAndLocationRewriter
#================================================================= #=================================================================
class XMLRewriter(RegexRewriter): class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra=[]): def __init__(self, rewriter, extra=[]):
rules = self._create_rules(rewriter.get_abs_url()) rules = self._create_rules(rewriter)
super(XMLRewriter, self).__init__(rules) super(XMLRewriter, self).__init__(rules)
@ -152,10 +152,11 @@ class XMLRewriter(RegexRewriter):
return True return True
def _create_rules(self, http_prefix): def _create_rules(self, rewriter):
return [ return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + ('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2), RegexRewriter.HTTPX_MATCH_STR + ')',
RegexRewriter.archival_rewrite(rewriter), 2),
] ]

View File

@ -51,24 +51,24 @@ ur"""
# Script tag # Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>') >>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script> <script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script>
# Unterminated script tag, handle and auto-terminate # Unterminated script tag, handle and auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>') >>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script> <script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>') >>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script> <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>') >>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div> <div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>') >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style> <style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag, handle and auto-terminate # Unterminated style tag, handle and auto-terminate
>>> parse('<style>@import url(styles.css)') >>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style> <style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style>
# Head Insertion # Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>') >>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')

View File

@ -49,24 +49,24 @@ ur"""
# Script tag # Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>') >>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script></head></html> <html><head><script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script></head></html>
# Unterminated script tag, will auto-terminate # Unterminated script tag, will auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>') >>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script></head></html> <html><head><script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script></head></html>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>') >>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script></head></html> <html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>') >>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<html><body><div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html> <html><body><div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>') >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<html><head><style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style></head></html> <html><head><style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style></head></html>
# Unterminated style tag, handle but don't auto-terminate # Unterminated style tag, handle but don't auto-terminate
>>> parse('<style>@import url(styles.css)') >>> parse('<style>@import url(styles.css)')
<html><head><style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style></head></html> <html><head><style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style></head></html>
# Head Insertion # Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>') >>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')

View File

@ -12,16 +12,16 @@ r"""
#================================================================= #=================================================================
>>> _test_js('location = "http://example.com/abc.html"') >>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"' 'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"'
>>> _test_js(r'location = "http:\/\/example.com/abc.html"') >>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\/\\/example.com/abc.html"' 'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"'
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"') >>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"' 'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"'
>>> _test_js(r"location = 'http://example.com/abc.html/'") >>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'" "WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'"
>>> _test_js(r'location = http://example.com/abc.html/') >>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/' 'WB_wombat_location = http://example.com/abc.html/'
@ -37,21 +37,21 @@ r"""
'"/location" == some_location_val; locations = WB_wombat_location;' '"/location" == some_location_val; locations = WB_wombat_location;'
>>> _test_js('cool_Location = "http://example.com/abc.html"') >>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010im_/http://example.com/abc.html"' 'cool_Location = "/web/20131010em_/http://example.com/abc.html"'
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"') >>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' 'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added # custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' 'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic # scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment') >>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment' 'cool_Location = "/web/20131010em_///example.com/abc.html" //comment'
#================================================================= #=================================================================
@ -59,62 +59,62 @@ r"""
#================================================================= #=================================================================
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>') >>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>' '<tag xmlns="http://www.example.com/ns" attr="/web/20131010em_/http://example.com"></tag>'
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>') >>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>' '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010em_/http://example.com"></tag>'
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>') >>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>' '<tag> /web/20131010em_/http://example.com<other>abchttp://example.com</other></tag>'
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>') >>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>' '<main> /web/20131010em_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010em_/http://example.com </main>'
#================================================================= #=================================================================
# CSS Rewriting # CSS Rewriting
#================================================================= #=================================================================
>>> _test_css("background: url('/some/path.html')") >>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010im_/http://example.com/some/path.html')" "background: url('/web/20131010em_/http://example.com/some/path.html')"
>>> _test_css("background: url('../path.html')") >>> _test_css("background: url('../path.html')")
"background: url('/web/20131010im_/http://example.com/path.html')" "background: url('/web/20131010em_/http://example.com/path.html')"
>>> _test_css("background: url(\"http://domain.com/path.html\")") >>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010im_/http://domain.com/path.html")' 'background: url("/web/20131010em_/http://domain.com/path.html")'
>>> _test_css("background: url(file.jpeg)") >>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010im_/http://example.com/file.jpeg)' 'background: url(/web/20131010em_/http://example.com/file.jpeg)'
>>> _test_css("background: url('')") >>> _test_css("background: url('')")
"background: url('')" "background: url('')"
>>> _test_css("background: url (\"weirdpath\')") >>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010im_/http://example.com/weirdpath\')' 'background: url ("/web/20131010em_/http://example.com/weirdpath\')'
>>> _test_css("@import url ('path.css')") >>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010im_/http://example.com/path.css')" "@import url ('/web/20131010em_/http://example.com/path.css')"
>>> _test_css("@import url('path.css')") >>> _test_css("@import url('path.css')")
"@import url('/web/20131010im_/http://example.com/path.css')" "@import url('/web/20131010em_/http://example.com/path.css')"
>>> _test_css("@import ( 'path.css')") >>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010im_/http://example.com/path.css')" "@import ( '/web/20131010em_/http://example.com/path.css')"
>>> _test_css("@import \"path.css\"") >>> _test_css("@import \"path.css\"")
'@import "/web/20131010im_/http://example.com/path.css"' '@import "/web/20131010em_/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"") >>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010im_/http://example.com/path.css"' '@import (\'/web/20131010em_/http://example.com/path.css"'
>>> _test_css("@import ('../url.css\"") >>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010im_/http://example.com/url.css"' '@import (\'/web/20131010em_/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")") >>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010im_/http://example.com/url.css")' '@import ("/web/20131010em_/http://example.com/url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") >>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)' '@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
#================================================================= #=================================================================
HTTP Headers Rewriting HTTP Headers Rewriting
@ -134,7 +134,7 @@ HTTP Headers Rewriting
{'charset': None, {'charset': None,
'removed_header_dict': {}, 'removed_header_dict': {},
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131010im_/http://example.com/other.html')]), ('Location', '/web/20131010/http://example.com/other.html')]),
'text_type': None} 'text_type': None}
# gzip # gzip
@ -172,7 +172,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
import pprint import pprint
urlrewriter = UrlRewriter('20131010im_/http://example.com/', '/web/') urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
def _test_js(string, extra = []): def _test_js(string, extra = []):

View File

@ -22,6 +22,10 @@ function init_banner() {
var banner = document.getElementById(BANNER_ID); var banner = document.getElementById(BANNER_ID);
if (wbinfo.is_embed) {
return;
}
if (!banner) { if (!banner) {
banner = document.createElement("wb_div"); banner = document.createElement("wb_div");
banner.setAttribute("id", BANNER_ID); banner.setAttribute("id", BANNER_ID);

View File

@ -11,7 +11,9 @@
<script> <script>
wbinfo = {} wbinfo = {}
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}"; wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
wbinfo.is_embed = {{"true" if wbrequest.is_embed else "false"}};
</script> </script>
<script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script> <script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/> <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
<!-- End WB Insert --> <!-- End WB Insert -->

View File

@ -32,7 +32,7 @@ class PyTest(TestCommand):
setup( setup(
name='pywb', name='pywb',
version='0.2.0', version='0.2.2',
url='https://github.com/ikreymer/pywb', url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer', author='Ilya Kreymer',
author_email='ikreymer@gmail.com', author_email='ikreymer@gmail.com',