mirror of https://github.com/webrecorder/pywb.git

style fixes: convert camelCase func and var names to 'not_camel_case'

WbHtml -> HTMLRewriter
ArchivalUrl -> WbUrl
Ilya Kreymer 2014-01-28 19:37:37 -08:00
parent c0f8edf517
commit 6de794a4e1
12 changed files with 385 additions and 397 deletions
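
The renames in this commit are mechanical (hasAttr -> has_attr, chunkSize -> chunk_size, and so on). For reference, a minimal sketch of the conversion rule, assuming plain camelCase identifiers; this is an illustration, not a tool used by the commit:

import re

def to_snake_case(name):
    # insert '_' before an uppercase letter that follows a lowercase letter
    # or digit, then lowercase everything: 'chunkSize' -> 'chunk_size'
    return re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name).lower()

assert to_snake_case('hasAttr') == 'has_attr'
assert to_snake_case('removedHeaderDict') == 'removed_header_dict'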

View File

@ -2,14 +2,14 @@ import urlparse
import re
from wbrequestresponse import WbRequest, WbResponse
from url_rewriter import ArchivalUrlRewriter
from wbarchivalurl import ArchivalUrl
from url_rewriter import UrlRewriter
from wburl import WbUrl
#=================================================================
# ArchivalRequestRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRequestRouter:
def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = ArchivalUrl):
def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = WbUrl):
self.handlers = handlers
self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path
@ -46,7 +46,7 @@ class Route:
self.coll_group = coll_group
def __call__(self, env, useAbsPrefix, archivalurl_class):
def __call__(self, env, use_abs_prefix, archivalurl_class):
request_uri = env['REL_REQUEST_URI']
matcher = self.regex.match(request_uri[1:])
if not matcher:
@ -68,19 +68,19 @@ class Route:
coll = coll,
wb_url = wb_url,
wb_prefix = wb_prefix,
use_abs_prefix = useAbsPrefix,
use_abs_prefix = use_abs_prefix,
archivalurl_class = archivalurl_class)
# Allow for setup of additional filters
self._addFilters(wbrequest, matcher)
self._add_filters(wbrequest, matcher)
return self._handleRequest(wbrequest)
return self._handle_request(wbrequest)
def _addFilters(self, wbrequest, matcher):
def _add_filters(self, wbrequest, matcher):
pass
def _handleRequest(self, wbrequest):
def _handle_request(self, wbrequest):
return self.handler(wbrequest)
@ -90,10 +90,10 @@ class Route:
class ReferRedirect:
"""
>>> ReferRedirect('http://localhost:8080/').matchPrefixs
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']
>>> ReferRedirect(['http://example:9090/']).matchPrefixs
>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']
>>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
@ -118,18 +118,18 @@ class ReferRedirect:
"""
def __init__(self, matchPrefixs):
if isinstance(matchPrefixs, list):
self.matchPrefixs = matchPrefixs
def __init__(self, match_prefixs):
if isinstance(match_prefixs, list):
self.match_prefixs = match_prefixs
else:
self.matchPrefixs = [matchPrefixs]
self.match_prefixs = [match_prefixs]
def __call__(self, wbrequest):
if wbrequest.referrer is None:
return None
if not any (wbrequest.referrer.startswith(i) for i in self.matchPrefixs):
if not any (wbrequest.referrer.startswith(i) for i in self.match_prefixs):
return None
try:
@ -145,7 +145,7 @@ class ReferRedirect:
# No match on any exception
try:
rewriter = ArchivalUrlRewriter('/' + ref_path[1], script_name + '/' + ref_path[0])
rewriter = UrlRewriter('/' + ref_path[1], script_name + '/' + ref_path[0])
except Exception:
return None
@ -167,16 +167,16 @@ class ReferRedirect:
import utils
if __name__ == "__main__" or utils.enable_doctests():
def test_redir(matchHost, request_uri, referrer, script_name = ''):
def test_redir(match_host, request_uri, referrer, script_name = ''):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
redir = ReferRedirect(matchHost)
redir = ReferRedirect(match_host)
req = WbRequest.from_uri(request_uri, env)
rep = redir(req)
if not rep:
return False
return rep.status_headers.getHeader('Location')
return rep.status_headers.get_header('Location')
import doctest
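
The ReferRedirect doctests above show the recovery trick: a bare request like /other.html arriving with an archival referer is re-anchored under the referer's collection/timestamp prefix. A simplified standalone sketch of that idea (not pywb's actual implementation, which also handles SCRIPT_NAME and URL-parsing edge cases):

def refer_redirect(request_uri, referrer, match_prefixes):
    # only act when the referer comes from one of our own hosts
    if not any(referrer.startswith(p) for p in match_prefixes):
        return None
    # drop the referer's final path segment ('page.html') and re-anchor
    # the bare request underneath the same archival prefix
    return referrer.rsplit('/', 1)[0] + request_uri

expected = 'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
assert refer_redirect(
    '/other.html',
    'http://localhost:8080/coll/20131010/http://example.com/path/page.html',
    ['http://localhost:8080/']) == expected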

View File

@ -10,21 +10,21 @@ from wbrequestresponse import StatusAndHeaders
#=================================================================
class HttpReader:
def __init__(self, hmac = None, hmacDuration = 30):
def __init__(self, hmac = None, hmac_duration = 30):
self.hmac = hmac
self.hmacDuration = hmacDuration
self.hmac_duration = hmac_duration
def load(self, url, offset, length):
if length > 0:
rangeHeader = 'bytes={0}-{1}'.format(offset, offset + length - 1)
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
rangeHeader = 'bytes={0}-'.format(offset)
range_header = 'bytes={0}-'.format(offset)
headers = {}
headers['Range'] = rangeHeader
headers['Range'] = range_header
if self.hmac:
headers['Cookie'] = self.hmac(self.hmacDuration)
headers['Cookie'] = self.hmac(self.hmac_duration)
request = urllib2.Request(url, headers = headers)
return urllib2.urlopen(request)
@ -50,7 +50,7 @@ WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers,
class ArchiveLoader:
"""
>>> loadTestArchive('example.warc.gz', '333', '1043')
>>> load_test_archive('example.warc.gz', '333', '1043')
(('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
@ -74,7 +74,7 @@ class ArchiveLoader:
('Connection', 'close')]))
>>> loadTestArchive('example.warc.gz', '1864', '553')
>>> load_test_archive('example.warc.gz', '1864', '553')
(('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
@ -114,7 +114,7 @@ class ArchiveLoader:
}
@staticmethod
def createDefaultLoaders():
def create_default_loaders():
http = HttpReader()
file = FileReader()
return {
@ -125,35 +125,35 @@ class ArchiveLoader:
}
def __init__(self, loaders = {}, chunkSize = 8192):
self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
self.chunkSize = chunkSize
def __init__(self, loaders = {}, chunk_size = 8192):
self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders()
self.chunk_size = chunk_size
self.arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
self.warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
self.httpParser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
self.warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
def load(self, url, offset, length):
urlParts = urlparse.urlsplit(url)
url_parts = urlparse.urlsplit(url)
try:
loader = self.loaders.get(urlParts.scheme)
loader = self.loaders.get(url_parts.scheme)
except Exception:
raise wbexceptions.UnknownLoaderProtocolException(url)
theFormat = None
the_format = None
for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
if url.endswith(ext):
theFormat = iformat
the_format = iformat
break
if theFormat is None:
if the_format is None:
raise wbexceptions.UnknownArchiveFormatException(url)
(aFormat, isGzip) = theFormat
(a_format, is_gzip) = the_format
decomp = utils.create_decompressor() if isGzip else None
decomp = utils.create_decompressor() if is_gzip else None
try:
length = int(length)
@ -163,17 +163,17 @@ class ArchiveLoader:
raw = loader.load(url, long(offset), length)
stream = LineReader(raw, length, self.chunkSize, decomp)
stream = LineReader(raw, length, self.chunk_size, decomp)
if aFormat == 'arc':
rec_headers = self.arcParser.parse(stream)
recType = 'response'
empty = (rec_headers.getHeader('length') == 0)
if a_format == 'arc':
rec_headers = self.arc_parser.parse(stream)
rec_type = 'response'
empty = (rec_headers.get_header('length') == 0)
elif aFormat == 'warc':
rec_headers = self.warcParser.parse(stream)
recType = rec_headers.getHeader('WARC-Type')
empty = (rec_headers.getHeader('Content-Length') == '0')
elif a_format == 'warc':
rec_headers = self.warc_parser.parse(stream)
rec_type = rec_headers.get_header('WARC-Type')
empty = (rec_headers.get_header('Content-Length') == '0')
# special case: empty w/arc record (hopefully a revisit)
if empty:
@ -181,21 +181,21 @@ class ArchiveLoader:
# special case: warc records that are not expected to have http headers
# attempt to add 200 status and content-type
elif recType == 'metadata' or recType == 'resource':
status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.getHeader('Content-Type'))])
elif rec_type == 'metadata' or rec_type == 'resource':
status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.get_header('Content-Type'))])
# special case: http 0.9 response, no status or headers
#elif recType == 'response':
# contentType = rec_headers.getHeader('Content-Type')
# if contentType and (';version=0.9' in contentType):
#elif rec_type == 'response':
# content_type = rec_headers.get_header('Content-Type')
# if content_type and (';version=0.9' in content_type):
# status_headers = StatusAndHeaders('200 OK', [])
# response record: parse HTTP status and headers!
else:
#(statusline, http_headers) = self.parseHttpHeaders(stream)
status_headers = self.httpParser.parse(stream)
#(statusline, http_headers) = self.parse_http_headers(stream)
status_headers = self.http_parser.parse(stream)
return WBArchiveRecord((aFormat, recType), rec_headers, stream, status_headers)
return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers)
#=================================================================
@ -206,9 +206,9 @@ class StatusAndHeadersParser:
def parse(self, stream):
statusline = stream.readline().rstrip()
protocolStatus = utils.split_prefix(statusline, self.statuslist)
protocol_status = utils.split_prefix(statusline, self.statuslist)
if not protocolStatus:
if not protocol_status:
raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)
headers = []
@ -220,7 +220,7 @@ class StatusAndHeadersParser:
headers.append(header)
line = stream.readline().rstrip()
return StatusAndHeaders(statusline = protocolStatus[1].strip(), headers = headers, protocol = protocolStatus[0])
return StatusAndHeaders(statusline = protocol_status[1].strip(), headers = headers, protocol = protocol_status[0])
#=================================================================
class ARCHeadersParser:
@ -247,25 +247,25 @@ class ARCHeadersParser:
#=================================================================
class LineReader:
def __init__(self, stream, maxLen = 0, chunkSize = 1024, decomp = None):
def __init__(self, stream, max_len = 0, chunk_size = 1024, decomp = None):
self.stream = stream
self.chunkSize = chunkSize
self.chunk_size = chunk_size
self.decomp = decomp
self.buff = None
self.numRead = 0
self.maxLen = maxLen
self.num_read = 0
self.max_len = max_len
def _fillbuff(self, chunkSize = None):
if not chunkSize:
chunkSize = self.chunkSize
def _fillbuff(self, chunk_size = None):
if not chunk_size:
chunk_size = self.chunk_size
if not self.buff or self.buff.pos >= self.buff.len:
toRead = min(self.maxLen - self.numRead, self.chunkSize) if (self.maxLen > 0) else self.chunkSize
data = self.stream.read(toRead)
to_read = min(self.max_len - self.num_read, self.chunk_size) if (self.max_len > 0) else self.chunk_size
data = self.stream.read(to_read)
self._process_read(data)
def _process_read(self, data):
self.numRead += len(data)
self.num_read += len(data)
if self.decomp and data:
data = self.decomp.decompress(data)
@ -310,45 +310,45 @@ class ChunkedLineReader(LineReader):
'123412'
"""
allChunksRead = False
notChunked = False
raiseChunkedDataExceptions = False # if False, we'll use best-guess fallback for parse errors
all_chunks_read = False
not_chunked = False
raise_chunked_data_exceptions = False # if False, we'll use best-guess fallback for parse errors
def _fillbuff(self, chunkSize = None):
if self.notChunked:
return LineReader._fillbuff(self, chunkSize)
def _fillbuff(self, chunk_size = None):
if self.not_chunked:
return LineReader._fillbuff(self, chunk_size)
if self.allChunksRead:
if self.all_chunks_read:
return
if not self.buff or self.buff.pos >= self.buff.len:
lengthHeader = self.stream.readline(64)
length_header = self.stream.readline(64)
data = ''
try:
# decode length header
try:
chunkSize = int(lengthHeader.strip().split(';')[0], 16)
chunk_size = int(length_header.strip().split(';')[0], 16)
except ValueError:
raise ChunkedDataException("Couldn't decode length header '%s'" % lengthHeader)
raise ChunkedDataException("Couldn't decode length header '%s'" % length_header)
if chunkSize:
if chunk_size:
# read chunk
while len(data) < chunkSize:
newData = self.stream.read(chunkSize - len(data))
while len(data) < chunk_size:
new_data = self.stream.read(chunk_size - len(data))
# if we unexpectedly run out of data, either raise an exception or just stop reading, assuming file was cut off
if not newData:
if self.raiseChunkedDataExceptions:
if not new_data:
if self.raise_chunked_data_exceptions:
raise ChunkedDataException("Ran out of data before end of chunk")
else:
chunkSize = len(data)
self.allChunksRead = True
chunk_size = len(data)
self.all_chunks_read = True
data += newData
data += new_data
# if we successfully read a block without running out, it should end in \r\n
if not self.allChunksRead:
if not self.all_chunks_read:
clrf = self.stream.read(2)
if clrf != '\r\n':
raise ChunkedDataException("Chunk terminator not found.")
@ -356,19 +356,19 @@ class ChunkedLineReader(LineReader):
if self.decomp:
data = self.decomp.decompress(data)
else:
# chunkSize 0 indicates end of file
self.allChunksRead = True
# chunk_size 0 indicates end of file
self.all_chunks_read = True
data = ''
self._process_read(data)
except ChunkedDataException:
if self.raiseChunkedDataExceptions:
if self.raise_chunked_data_exceptions:
raise
# Can't parse the data as chunked.
# It's possible that non-chunked data is set with a Transfer-Encoding: chunked
# Treat this as non-chunk encoded from here on
self._process_read(lengthHeader+data)
self.notChunked = True
self._process_read(length_header + data)
self.not_chunked = True
#=================================================================
@ -379,7 +379,7 @@ if __name__ == "__main__" or utils.enable_doctests():
testloader = ArchiveLoader()
def loadTestArchive(test_file, offset, length):
def load_test_archive(test_file, offset, length):
path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file
archive = testloader.load(path, offset, length)
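
ChunkedLineReader above decodes HTTP chunked transfer encoding by hand: a hex length line, that many bytes, a CRLF terminator, repeated until a zero-length chunk. A compact standalone sketch of the same wire format (Python 3 bytes I/O; the real class additionally falls back to non-chunked parsing on errors):

import io

def dechunk(stream):
    out = b''
    while True:
        # length line: hex chunk size, optionally followed by ';extensions'
        size = int(stream.readline().strip().split(b';')[0], 16)
        if size == 0:            # a zero-length chunk marks end of body
            break
        out += stream.read(size)
        if stream.read(2) != b'\r\n':    # every chunk ends with CRLF
            raise ValueError('chunk terminator not found')
    return out

assert dechunk(io.BytesIO(b'4\r\n1234\r\n2\r\n12\r\n0\r\n\r\n')) == b'123412'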

View File

@ -2,14 +2,14 @@ from wbrequestresponse import StatusAndHeaders
#=================================================================
class RewrittenStatusAndHeaders:
def __init__(self, statusline, headers, removedHeaderDict, textType, charset):
def __init__(self, statusline, headers, removed_header_dict, text_type, charset):
self.status_headers = StatusAndHeaders(statusline, headers)
self.removedHeaderDict = removedHeaderDict
self.textType = textType
self.removed_header_dict = removed_header_dict
self.text_type = text_type
self.charset = charset
def containsRemovedHeader(self, name, value):
return self.removedHeaderDict.get(name) == value
def contains_removed_header(self, name, value):
return self.removed_header_dict.get(name) == value
#=================================================================
@ -17,30 +17,30 @@ class HeaderRewriter:
"""
# Text with charset
>>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]), 'charset': 'utf-8', 'textType': 'html', 'removedHeaderDict': {}}
('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}
# Redirect
>>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131226101010/http://example.com/other.html')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}
# gzip
>>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]), 'charset': None, 'textType': 'js', 'removedHeaderDict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}}
{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}
# Binary
>>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('X-Archive-Orig-Cookie', 'blah'),
('Content-Encoding', 'gzip')]), 'charset': None, 'textType': None, 'removedHeaderDict': {'transfer-encoding': 'chunked'}}
('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}
Removing Transfer-Encoding always, Was:
('Content-Encoding', 'gzip'),
('Transfer-Encoding', 'chunked')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
"""
@ -63,64 +63,64 @@ class HeaderRewriter:
PROXY_NO_REWRITE_HEADERS = ['content-length']
def __init__(self, headerPrefix = 'X-Archive-Orig-'):
self.headerPrefix = headerPrefix
def __init__(self, header_prefix = 'X-Archive-Orig-'):
self.header_prefix = header_prefix
def rewrite(self, status_headers, urlrewriter):
contentType = status_headers.getHeader('Content-Type')
textType = None
content_type = status_headers.get_header('Content-Type')
text_type = None
charset = None
stripEncoding = False
strip_encoding = False
if contentType:
textType = self._extractTextType(contentType)
if textType:
charset = self._extractCharSet(contentType)
stripEncoding = True
if content_type:
text_type = self._extract_text_type(content_type)
if text_type:
charset = self._extract_char_set(content_type)
strip_encoding = True
(newHeaders, removedHeaderDict) = self._rewriteHeaders(status_headers.headers, urlrewriter, stripEncoding)
(new_headers, removed_header_dict) = self._rewrite_headers(status_headers.headers, urlrewriter, strip_encoding)
return RewrittenStatusAndHeaders(status_headers.statusline, newHeaders, removedHeaderDict, textType, charset)
return RewrittenStatusAndHeaders(status_headers.statusline, new_headers, removed_header_dict, text_type, charset)
def _extractTextType(self, contentType):
def _extract_text_type(self, content_type):
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
if any ((mime in contentType) for mime in mimelist):
if any ((mime in content_type) for mime in mimelist):
return ctype
return None
def _extractCharSet(self, contentType):
def _extract_char_set(self, content_type):
CHARSET_TOKEN = 'charset='
idx = contentType.find(CHARSET_TOKEN)
idx = content_type.find(CHARSET_TOKEN)
if idx < 0:
return None
return contentType[idx + len(CHARSET_TOKEN):].lower()
return content_type[idx + len(CHARSET_TOKEN):].lower()
def _rewriteHeaders(self, headers, urlrewriter, contentRewritten = False):
newHeaders = []
removedHeaderDict = {}
def _rewrite_headers(self, headers, urlrewriter, content_rewritten = False):
new_headers = []
removed_header_dict = {}
for (name, value) in headers:
lowername = name.lower()
if lowername in self.PROXY_HEADERS:
newHeaders.append((name, value))
new_headers.append((name, value))
elif lowername in self.URL_REWRITE_HEADERS:
newHeaders.append((name, urlrewriter.rewrite(value)))
new_headers.append((name, urlrewriter.rewrite(value)))
elif lowername in self.ENCODING_HEADERS:
if contentRewritten:
removedHeaderDict[lowername] = value
if content_rewritten:
removed_header_dict[lowername] = value
else:
newHeaders.append((name, value))
new_headers.append((name, value))
elif lowername in self.REMOVE_HEADERS:
removedHeaderDict[lowername] = value
elif lowername in self.PROXY_NO_REWRITE_HEADERS and not contentRewritten:
newHeaders.append((name, value))
removed_header_dict[lowername] = value
elif lowername in self.PROXY_NO_REWRITE_HEADERS and not content_rewritten:
new_headers.append((name, value))
else:
newHeaders.append((self.headerPrefix + name, value))
new_headers.append((self.header_prefix + name, value))
return (newHeaders, removedHeaderDict)
return (new_headers, removed_header_dict)
import utils
if __name__ == "__main__" or utils.enable_doctests():
@ -128,7 +128,7 @@ if __name__ == "__main__" or utils.enable_doctests():
import pprint
import url_rewriter
urlrewriter = url_rewriter.ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
urlrewriter = url_rewriter.UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
headerrewriter = HeaderRewriter()
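
The doctests above encode the header policy: URL headers such as Location are rewritten, encoding headers are dropped when the body will be transformed, and everything else is preserved under the X-Archive-Orig- prefix so original values survive replay. A toy illustration of the prefixing rule alone (hypothetical helper, not pywb's API):

def prefix_headers(headers, passthrough, prefix='X-Archive-Orig-'):
    # keep replay-safe headers as-is; namespace the rest so the original
    # values are retained without affecting the rewritten response
    return [(name, value) if name.lower() in passthrough
            else (prefix + name, value)
            for name, value in headers]

expected = [('Content-Type', 'text/html'),
            ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT')]
assert prefix_headers([('Content-Type', 'text/html'),
                       ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT')],
                      passthrough={'content-type'}) == expected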

View File

@ -5,13 +5,13 @@ import sys
import re
from HTMLParser import HTMLParser
from url_rewriter import ArchivalUrlRewriter
from url_rewriter import UrlRewriter
from regex_rewriters import JSRewriter, CSSRewriter
#=================================================================
# WBHtml --html parser for custom rewriting, also handlers for script and css
# HTMLRewriter -- html parser for custom rewriting, also handlers for script and css
#=================================================================
class WBHtml(HTMLParser):
class HTMLRewriter(HTMLParser):
r"""
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
@ -72,13 +72,13 @@ class WBHtml(HTMLParser):
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', headInsert = '<script src="cool.js"></script>')
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
>>> parse('<body><div>SomeTest</div>', headInsert = '/* Insert */')
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div>SomeTest</div>
>>> parse('<link href="abc.txt"><div>SomeTest</div>', headInsert = '<script>load_stuff();</script>')
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
"""
@ -125,128 +125,128 @@ class WBHtml(HTMLParser):
self.buff += string
def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
def __init__(self, url_rewriter, outstream = None, head_insert = None, js_rewriter_class = JSRewriter, css_rewriter_class = CSSRewriter):
HTMLParser.__init__(self)
self.url_rewriter = url_rewriter
self._wbParseContext = None
self.out = outstream if outstream else WBHtml.AccumBuff()
self._wb_parse_context = None
self.out = outstream if outstream else self.AccumBuff()
self.jsRewriter = jsRewriterClass(url_rewriter)
self.cssRewriter = cssRewriterClass(url_rewriter)
self.js_rewriter = js_rewriter_class(url_rewriter)
self.css_rewriter = css_rewriter_class(url_rewriter)
self.headInsert = headInsert
self.head_insert = head_insert
# ===========================
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
def _rewriteMetaRefresh(self, metaRefresh):
if not metaRefresh:
def _rewrite_meta_refresh(self, meta_refresh):
if not meta_refresh:
return None
m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
m = self.META_REFRESH_REGEX.match(meta_refresh)
if not m:
return metaRefresh
return meta_refresh
try:
metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):]
meta_refresh = meta_refresh[:m.start(1)] + self._rewrite_url(m.group(1)) + meta_refresh[m.end(1):]
except Exception:
pass
return metaRefresh
return meta_refresh
# ===========================
def _rewriteURL(self, value, mod = None):
def _rewrite_url(self, value, mod = None):
return self.url_rewriter.rewrite(value, mod) if value else None
def _rewriteCSS(self, cssContent):
return self.cssRewriter.rewrite(cssContent) if cssContent else None
def _rewrite_css(self, css_content):
return self.css_rewriter.rewrite(css_content) if css_content else None
def _rewriteScript(self, scriptContent):
return self.jsRewriter.rewrite(scriptContent) if scriptContent else None
def _rewrite_script(self, script_content):
return self.js_rewriter.rewrite(script_content) if script_content else None
def hasAttr(self, tagAttrs, attr):
def has_attr(self, tag_attrs, attr):
name, value = attr
for attrName, attrValue in tagAttrs:
if attrName == name:
return value.lower() == attrValue.lower()
for attr_name, attr_value in tag_attrs:
if attr_name == name:
return value.lower() == attr_value.lower()
return False
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
def rewrite_tag_attrs(self, tag, tag_attrs, is_start_end):
# special case: script or style parse context
if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
self._wbParseContext = tag
if (tag in self.STATE_TAGS) and (self._wb_parse_context == None):
self._wb_parse_context = tag
# special case: head insertion, non-head tags
elif (self.headInsert and (self._wbParseContext == None) and (tag not in WBHtml.HEAD_TAGS)):
self.out.write(self.headInsert)
self.headInsert = None
elif (self.head_insert and (self._wb_parse_context == None) and (tag not in self.HEAD_TAGS)):
self.out.write(self.head_insert)
self.head_insert = None
# attr rewriting
handler = WBHtml.REWRITE_TAGS.get(tag)
handler = self.REWRITE_TAGS.get(tag)
if not handler:
handler = WBHtml.REWRITE_TAGS.get('')
handler = self.REWRITE_TAGS.get('')
if not handler:
return False
self.out.write('<' + tag)
for attr in tagAttrs:
attrName, attrValue = attr
for attr in tag_attrs:
attr_name, attr_value = attr
# special case: inline JS/event handler
if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith('on'):
attrValue = self._rewriteScript(attrValue)
if (attr_value and attr_value.startswith('javascript:')) or attr_name.startswith('on'):
attr_value = self._rewrite_script(attr_value)
# special case: inline CSS/style attribute
elif attrName == 'style':
attrValue = self._rewriteCSS(attrValue)
elif attr_name == 'style':
attr_value = self._rewrite_css(attr_value)
# special case: meta tag
elif (tag == 'meta') and (attrName == 'content'):
if self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
attrValue = self._rewriteMetaRefresh(attrValue)
elif (tag == 'meta') and (attr_name == 'content'):
if self.has_attr(tag_attrs, ('http-equiv', 'refresh')):
attr_value = self._rewrite_meta_refresh(attr_value)
else:
# special case: base tag
if (tag == 'base') and (attrName == 'href') and attrValue:
self.url_rewriter.setBaseUrl(attrValue)
if (tag == 'base') and (attr_name == 'href') and attr_value:
self.url_rewriter.set_base_url(attr_value)
rwMod = handler.get(attrName)
if rwMod is not None:
attrValue = self._rewriteURL(attrValue, rwMod)
rw_mod = handler.get(attr_name)
if rw_mod is not None:
attr_value = self._rewrite_url(attr_value, rw_mod)
# parser doesn't differentiate between 'attr=""' and just 'attr'
# 'attr=""' is more common, so use that form
if attrValue:
self.out.write(' ' + attrName + '="' + attrValue + '"')
if attr_value:
self.out.write(' ' + attr_name + '="' + attr_value + '"')
else:
self.out.write(' ' + attrName + '=""')
self.out.write(' ' + attr_name + '=""')
self.out.write('/>' if isStartEnd else '>')
self.out.write('/>' if is_start_end else '>')
# special case: head tag
if (self.headInsert) and (self._wbParseContext == None) and (tag == 'head'):
self.out.write(self.headInsert)
self.headInsert = None
if (self.head_insert) and (self._wb_parse_context == None) and (tag == 'head'):
self.out.write(self.head_insert)
self.head_insert = None
return True
def parseData(self, data):
if self._wbParseContext == 'script':
data = self._rewriteScript(data)
elif self._wbParseContext == 'style':
data = self._rewriteCSS(data)
def parse_data(self, data):
if self._wb_parse_context == 'script':
data = self._rewrite_script(data)
elif self._wb_parse_context == 'style':
data = self._rewrite_css(data)
self.out.write(data)
def rewrite(self, string):
if not self.out:
self.out = WBHtml.AccumBuff()
self.out = self.AccumBuff()
self.feed(string)
@ -258,9 +258,9 @@ class WBHtml(HTMLParser):
# HTMLParser overrides below
def close(self):
if (self._wbParseContext):
result = self.rewrite('</' + self._wbParseContext + '>')
self._wbParseContext = None
if (self._wb_parse_context):
result = self.rewrite('</' + self._wb_parse_context + '>')
self._wb_parse_context = None
else:
result = ''
@ -268,21 +268,21 @@ class WBHtml(HTMLParser):
return result
def handle_starttag(self, tag, attrs):
if not self.rewriteTagAttrs(tag, attrs, False):
if not self.rewrite_tag_attrs(tag, attrs, False):
self.out.write(self.get_starttag_text())
def handle_startendtag(self, tag, attrs):
if not self.rewriteTagAttrs(tag, attrs, True):
if not self.rewrite_tag_attrs(tag, attrs, True):
self.out.write(self.get_starttag_text())
def handle_endtag(self, tag):
if (tag == self._wbParseContext):
self._wbParseContext = None
if (tag == self._wb_parse_context):
self._wb_parse_context = None
self.out.write('</' + tag + '>')
def handle_data(self, data):
self.parseData(data)
self.parse_data(data)
def handle_entityref(self, data):
self.out.write('&' + data + ';')
@ -292,7 +292,7 @@ class WBHtml(HTMLParser):
def handle_comment(self, data):
self.out.write('<!--')
self.parseData(data)
self.parse_data(data)
self.out.write('-->')
def handle_decl(self, data):
@ -303,24 +303,17 @@ class WBHtml(HTMLParser):
def unknown_decl(self, data):
self.out.write('<![')
self.parseData(data)
self.parse_data(data)
self.out.write(']>')
# instantiate the parser and fed it some HTML
#parser = WBHtml()
#instr = '<HTML X=\'a\' B=\'234\' some="other"><a href="Test"><BR/><head><title>Test</title></head>\n<body><h1>Parse me!</h1></body></HTML>'
#print instr
#print
#parser.feed(instr)
#print
import utils
if __name__ == "__main__" or utils.enable_doctests():
url_rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
url_rewriter = UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, headInsert = None):
parser = WBHtml(url_rewriter, headInsert = headInsert)
def parse(data, head_insert = None):
parser = HTMLRewriter(url_rewriter, head_insert = head_insert)
print parser.rewrite(data) + parser.close()
import doctest
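
Among the rewrites above, the meta-refresh case is the subtlest: the URL is embedded in a content attribute like '10; url=page.html' and only that span may be touched. A standalone sketch of the extraction (Python 3; rewrite_url is a stand-in for the UrlRewriter call):

import re

META_REFRESH = re.compile(r'^[\d.]+\s*;\s*url\s*=\s*(.+?)\s*$',
                          re.IGNORECASE | re.MULTILINE)

def rewrite_meta_refresh(value, rewrite_url):
    m = META_REFRESH.match(value)
    if not m:
        return value
    # splice the rewritten URL between the untouched prefix and suffix
    return value[:m.start(1)] + rewrite_url(m.group(1)) + value[m.end(1):]

assert rewrite_meta_refresh('10; url=page.html',
                            lambda u: '/web/20131010/' + u) == \
    '10; url=/web/20131010/page.html'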

View File

@ -6,8 +6,6 @@ import wbrequestresponse
import surt
from collections import OrderedDict
from wbarchivalurl import ArchivalUrl
import binsearch
import cdxserve
import logging
@ -22,11 +20,11 @@ class IndexReader:
params = self.get_query_params(wburl)
# add any custom filter from the request
if wbrequest.queryFilter:
params['filter'] = wbrequest.queryFilter
if wbrequest.query_filter:
params['filter'] = wbrequest.query_filter
if wbrequest.customParams:
params.update(wbrequest.customParams)
if wbrequest.custom_params:
params.update(wbrequest.custom_params)
cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)
@ -133,9 +131,9 @@ class RemoteCDXServer(IndexReader):
('length', '1792')]
"""
def __init__(self, serverUrl, cookie = None):
self.serverUrl = serverUrl
self.authCookie = cookie
def __init__(self, server_url, cookie = None):
self.server_url = server_url
self.auth_cookie = cookie
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
#url is required, must be passed explicitly!
@ -145,10 +143,10 @@ class RemoteCDXServer(IndexReader):
urlparams = urllib.urlencode(params, True)
try:
request = urllib2.Request(self.serverUrl, urlparams)
request = urllib2.Request(self.server_url, urlparams)
if self.authCookie:
request.add_header('Cookie', self.authCookie)
if self.auth_cookie:
request.add_header('Cookie', self.auth_cookie)
response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
@ -168,7 +166,7 @@ class RemoteCDXServer(IndexReader):
# with lower values if there are too many captures. Ideally, should be around 10-20
# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
def get_query_params(self, wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'):
return {
wburl.QUERY:
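
RemoteCDXServer above POSTs form-encoded query parameters to a CDX server, optionally attaching an auth cookie. A hedged Python 3 equivalent of that request (urllib2 became urllib.request; the endpoint URL below is hypothetical):

import urllib.parse
import urllib.request

def load_cdx(server_url, url, params=None, auth_cookie=None):
    # 'url' is required by the CDX API; extras (filter, limit, ...) optional
    query = dict(params or {}, url=url)
    data = urllib.parse.urlencode(query, doseq=True).encode()
    request = urllib.request.Request(server_url, data)   # POST, as upstream
    if auth_cookie:
        request.add_header('Cookie', auth_cookie)
    return urllib.request.urlopen(request)

# usage sketch (hypothetical endpoint):
# response = load_cdx('http://localhost:8080/cdx', 'example.com', {'limit': '10'})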

View File

@ -25,7 +25,7 @@ def pywb_config(head_insert = ''):
prefixes = [replay_resolvers.PrefixResolver(test_dir)]
# Create rewriting replay handler to rewrite records
replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True)
replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, head_insert = head_insert, buffer_response = True)
# Create Jinja2 based html query view
html_view = views.J2QueryView('./ui/', 'query.html')

View File

@ -2,30 +2,30 @@ import re
import sys
import itertools
from url_rewriter import ArchivalUrlRewriter
from url_rewriter import UrlRewriter
#=================================================================
class RegexRewriter:
"""
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
"""
@staticmethod
def commentOut(string):
def comment_out(string):
return '/*' + string + '*/'
@staticmethod
def removeHttps(string):
def remove_https(string):
return string.replace("https", "http")
@staticmethod
def addPrefix(prefix):
def add_prefix(prefix):
return lambda string: prefix + string
@staticmethod
def archivalRewrite(rewriter):
def archival_rewrite(rewriter):
return lambda x: rewriter.rewrite(x)
@staticmethod
@ -34,19 +34,19 @@ class RegexRewriter:
HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = addPrefix
DEFAULT_OP = add_prefix
def __init__(self, rules):
#rules = self.createRules(httpPrefix)
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list
regexStr = '|'.join(['(' + rx + ')' for rx, op, count in rules])
regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])
# ensure it's not middle of a word, wrap in non-capture group
regexStr = '(?<!\w)(?:' + regexStr + ')'
regex_str = '(?<!\w)(?:' + regex_str + ')'
self.regex = re.compile(regexStr, re.M)
self.regex = re.compile(regex_str, re.M)
self.rules = rules
def filter(self, m):
@ -63,7 +63,7 @@ class RegexRewriter:
for _, op, count in self.rules:
i += 1
fullM = i
full_m = i
while count > 0:
i += 1
count -= 1
@ -82,8 +82,8 @@ class RegexRewriter:
result = op(m.group(i))
# if extracting partial match
if i != fullM:
result = m.string[m.start(fullM):m.start(i)] + result + m.string[m.end(i):m.end(fullM)]
if i != full_m:
result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
return result
@ -105,21 +105,21 @@ class JSRewriter(RegexRewriter):
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
"""
def __init__(self, rewriter, extra = []):
rules = self._createRules(rewriter.getAbsUrl())
rules = self._create_rules(rewriter.get_abs_url())
rules.extend(extra)
RegexRewriter.__init__(self, rules)
def _createRules(self, httpPrefix):
def _create_rules(self, http_prefix):
return [
(RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
(RegexRewriter.HTTPX_MATCH_STR, http_prefix, 0),
('location', 'WB_wombat_', 0),
('(?<=document\.)domain', 'WB_wombat_', 0),
]
@ -143,7 +143,7 @@ class XMLRewriter(RegexRewriter):
"""
def __init__(self, rewriter, extra = []):
rules = self._createRules(rewriter.getAbsUrl())
rules = self._create_rules(rewriter.get_abs_url())
RegexRewriter.__init__(self, rules)
@ -155,9 +155,9 @@ class XMLRewriter(RegexRewriter):
return True
def _createRules(self, httpPrefix):
def _create_rules(self, http_prefix):
return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', httpPrefix, 2),
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
]
#=================================================================
@ -211,20 +211,20 @@ class CSSRewriter(RegexRewriter):
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
def __init__(self, rewriter):
rules = self._createRules(rewriter)
rules = self._create_rules(rewriter)
RegexRewriter.__init__(self, rules)
def _createRules(self, rewriter):
def _create_rules(self, rewriter):
return [
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
]
import utils
if __name__ == "__main__" or utils.enable_doctests():
arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/')
def test_js(string, extra = []):
return JSRewriter(arcrw, extra).rewrite(string)
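
RegexRewriter above concatenates all rules into one alternation and dispatches on whichever group matched, so the input is scanned only once. A condensed sketch of that dispatch (Python 3, simplified to one capture group per rule; the real class also supports multi-group rules and the (?<!\w) word-boundary guard):

import re

def make_rewriter(rules):
    # rules: list of (pattern, op) pairs combined into a single regex
    combined = re.compile('|'.join('({0})'.format(rx) for rx, _ in rules))
    def rewrite(text):
        def dispatch(m):
            for i, (_, op) in enumerate(rules, start=1):
                if m.group(i) is not None:   # which alternative fired?
                    return op(m.group(i))
            return m.group(0)
        return combined.sub(dispatch, text)
    return rewrite

rw = make_rewriter([(r'https?://[\w.:-]+', lambda s: '/web/20131010im_/' + s),
                    (r'location', lambda s: 'WB_wombat_' + s)])
expected = 'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
assert rw('location = "http://example.com/abc.html"') == expected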

View File

@ -6,10 +6,9 @@ import itertools
import archiveloader
from wbrequestresponse import WbResponse, StatusAndHeaders
from wbarchivalurl import ArchivalUrl
import utils
from url_rewriter import ArchivalUrlRewriter
from url_rewriter import UrlRewriter
from header_rewriter import HeaderRewriter
import html_rewriter
import regex_rewriters
@ -28,7 +27,7 @@ class ReplayView:
first = True
# List of already failed w/arcs
failedFiles = []
failed_files = []
# Iterate over the cdx until find one that works
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
@ -36,10 +35,10 @@ class ReplayView:
try:
# ability to intercept and redirect
if first:
self._checkRedir(wbrequest, cdx)
self._check_redir(wbrequest, cdx)
first = False
response = self.doReplay(cdx, wbrequest, cdx_reader, failedFiles)
response = self.do_replay(cdx, wbrequest, cdx_reader, failed_files)
if response:
response.cdx = cdx
@ -56,17 +55,17 @@ class ReplayView:
else:
raise wbexceptions.UnresolvedArchiveFileException()
def _checkRedir(self, wbrequest, cdx):
def _check_redir(self, wbrequest, cdx):
return None
def _load(self, cdx, revisit, failedFiles):
def _load(self, cdx, revisit, failed_files):
if revisit:
(filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
else:
(filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])
#optimization: if same file already failed this request, don't try again
if failedFiles and filename in failedFiles:
if failed_files and filename in failed_files:
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
any_found = False
@ -86,8 +85,8 @@ class ReplayView:
pass
# Unsuccessful if reached here
if failedFiles:
failedFiles.append(filename)
if failed_files:
failed_files.append(filename)
if not any_found:
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
@ -95,45 +94,45 @@ class ReplayView:
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
def doReplay(self, cdx, wbrequest, cdx_reader, failedFiles):
hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx.get('orig.filename','-') != '-')
def do_replay(self, cdx, wbrequest, cdx_reader, failed_files):
has_curr = (cdx['filename'] != '-')
has_orig = (cdx.get('orig.filename','-') != '-')
# load headers record from cdx['filename'] unless it is '-' (rare)
headersRecord = self._load(cdx, False, failedFiles) if hasCurr else None
headers_record = self._load(cdx, False, failed_files) if has_curr else None
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx['mimetype'] == 'warc/revisit' and headersRecord:
payloadRecord = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headersRecord, failedFiles)
if cdx['mimetype'] == 'warc/revisit' and headers_record:
payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files)
# single lookup cases
# case 2: non-revisit
elif (hasCurr and not hasOrig):
payloadRecord = headersRecord
elif (has_curr and not has_orig):
payload_record = headers_record
# case 3: identical url revisit, load payload from orig.filename
elif (hasOrig):
payloadRecord = self._load(cdx, True, failedFiles)
elif (has_orig):
payload_record = self._load(cdx, True, failed_files)
# special case: set header to payload if old-style revisit with missing header
if not headersRecord:
headersRecord = payloadRecord
elif headersRecord != payloadRecord:
if not headers_record:
headers_record = payload_record
elif headers_record != payload_record:
# close remainder of stream as this record only used for (already parsed) headers
headersRecord.stream.close()
headers_record.stream.close()
# special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit
if not headersRecord.status_headers.headers:
headersRecord = payloadRecord
if not headers_record.status_headers.headers:
headers_record = payload_record
if not headersRecord or not payloadRecord:
if not headers_record or not payload_record:
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
response = WbResponse(headersRecord.status_headers, self.create_stream_gen(payloadRecord.stream))
response._stream = payloadRecord.stream
response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream))
response._stream = payload_record.stream
return response
@ -141,14 +140,14 @@ class ReplayView:
# Handle the case where a duplicate of a capture with same digest exists at a different url
# Must query the index at that url filtering by matching digest
# Raise exception if no matches found
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headersRecord, failedFiles):
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files):
ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI')
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
if not ref_target_uri or (ref_target_uri == headersRecord.rec_headers.getHeader('WARC-Target-URI')):
if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')):
raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))
ref_target_date = headersRecord.rec_headers.getHeader('WARC-Refers-To-Date')
ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date')
if not ref_target_date:
ref_target_date = cdx['timestamp']
@ -163,7 +162,7 @@ class ReplayView:
orig_wbreq.wb_url.timestamp = ref_target_date
# Must also match digest
orig_wbreq.queryFilter.append('digest:' + cdx['digest'])
orig_wbreq.query_filter.append('digest:' + cdx['digest'])
orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
@ -171,8 +170,8 @@ class ReplayView:
try:
#cdx = cdx_reader.CDXCaptureResult(cdx)
#print cdx
payloadRecord = self._load(cdx, False, failedFiles)
return payloadRecord
payload_record = self._load(cdx, False, failed_files)
return payload_record
except wbexceptions.CaptureException as e:
pass
@ -180,13 +179,13 @@ class ReplayView:
raise wbexceptions.CaptureException('Original for revisit could not be loaded')
def resolveFull(self, filename):
def resolve_full(self, filename):
# Attempt to resolve cdx file to full path
fullUrl = None
full_url = None
for resolver in self.resolvers:
fullUrl = resolver(filename)
if fullUrl:
return fullUrl
full_url = resolver(filename)
if full_url:
return full_url
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
@ -214,36 +213,34 @@ class ReplayView:
#=================================================================
class RewritingReplayView(ReplayView):
def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None, redir_to_exact = True, buffer_response = False):
def __init__(self, resolvers, archiveloader, head_insert = None, header_rewriter = None, redir_to_exact = True, buffer_response = False):
ReplayView.__init__(self, resolvers, archiveloader)
self.headInsert = headInsert
if not headerRewriter:
headerRewriter = HeaderRewriter()
self.headerRewriter = headerRewriter
self.head_insert = head_insert
self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
self.redir_to_exact = redir_to_exact
# buffer or stream rewritten response
self.buffer_response = buffer_response
def _textContentType(self, contentType):
def _text_content_type(self, content_type):
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
if any ((mime in contentType) for mime in mimelist):
if any ((mime in content_type) for mime in mimelist):
return ctype
return None
def __call__(self, wbrequest, index, cdx_reader):
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
urlrewriter = UrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
wbrequest.urlrewriter = urlrewriter
response = ReplayView.__call__(self, wbrequest, index, cdx_reader)
if response and response.cdx:
self._checkRedir(wbrequest, response.cdx)
self._check_redir(wbrequest, response.cdx)
rewrittenHeaders = self.headerRewriter.rewrite(response.status_headers, urlrewriter)
rewritten_headers = self.header_rewriter.rewrite(response.status_headers, urlrewriter)
# TODO: better way to pass this?
stream = response._stream
@ -253,7 +250,7 @@ class RewritingReplayView(ReplayView):
de_chunk = False
# handle transfer-encoding: chunked
if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
stream = archiveloader.ChunkedLineReader(stream)
de_chunk = True
@ -267,8 +264,8 @@ class RewritingReplayView(ReplayView):
# non-text content type, just send through with rewritten headers
# but may need to dechunk
if rewrittenHeaders.textType is None:
response.status_headers = rewrittenHeaders.status_headers
if rewritten_headers.text_type is None:
response.status_headers = rewritten_headers.status_headers
if de_chunk:
response.body = self.create_stream_gen(stream)
@ -278,15 +275,15 @@ class RewritingReplayView(ReplayView):
# Handle text rewriting
# special case -- need to ungzip the body
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
# TODO: is this right?
if rewrittenHeaders.charset:
encoding = rewrittenHeaders.charset
if rewritten_headers.charset:
encoding = rewritten_headers.charset
first_buff = None
else:
(encoding, first_buff) = self._detectCharset(stream)
(encoding, first_buff) = self._detect_charset(stream)
# if chardet thinks its ascii, use utf-8
if encoding == 'ascii':
@ -294,24 +291,24 @@ class RewritingReplayView(ReplayView):
encoding = 'utf-8'
# Buffering response for html, streaming for others?
#if rewrittenHeaders.textType == 'html':
# return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
#if rewritten_headers.text_type == 'html':
# return self._rewrite_html(encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
#else:
# return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
# return self._rewrite_other(rewritten_headers.text_type, encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
textType = rewrittenHeaders.textType
status_headers = rewrittenHeaders.status_headers
text_type = rewritten_headers.text_type
status_headers = rewritten_headers.status_headers
if textType == 'html':
rewriter = html_rewriter.WBHtml(urlrewriter, outstream = None, headInsert = self.headInsert)
elif textType == 'css':
if text_type == 'html':
rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = self.head_insert)
elif text_type == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter)
elif textType == 'js':
elif text_type == 'js':
rewriter = regex_rewriters.JSRewriter(urlrewriter)
elif textType == 'xml':
elif text_type == 'xml':
rewriter = regex_rewriters.XMLRewriter(urlrewriter)
else:
raise Exception('Unknown Text Type for Rewrite: ' + textType)
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
# Create generator for response
response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)
@ -333,17 +330,17 @@ class RewritingReplayView(ReplayView):
finally:
content = out.getvalue()
contentLengthStr = str(len(content))
status_headers.headers.append(('Content-Length', contentLengthStr))
content_length_str = str(len(content))
status_headers.headers.append(('Content-Length', content_length_str))
out.close()
return WbResponse(status_headers, value = [content])
# Create rewrite response from record (no Content-Length), may even be chunked by front-end
def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None):
def doRewrite(buff):
def do_rewrite(buff):
if encoding:
buff = self._decodeBuff(buff, stream, encoding)
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
@ -352,13 +349,13 @@ class RewritingReplayView(ReplayView):
return buff
def doFinish():
def do_finish():
return rewriter.close()
return self.create_stream_gen(stream, rewrite_func = doRewrite, final_read_func = doFinish, first_buff = first_buff)
return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)
def _decodeBuff(self, buff, stream, encoding):
def _decode_buff(self, buff, stream, encoding):
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
@ -376,37 +373,37 @@ class RewritingReplayView(ReplayView):
return buff
def _detectCharset(self, stream):
def _detect_charset(self, stream):
buff = stream.read(8192)
result = chardet.detect(buff)
print "chardet result: " + str(result)
return (result['encoding'], buff)
def _checkRedir(self, wbrequest, cdx):
def _check_redir(self, wbrequest, cdx):
if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original'])
raise wbexceptions.InternalRedirect(newUrl)
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
raise wbexceptions.InternalRedirect(new_url)
#return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])
return None
def doReplay(self, cdx, wbrequest, index, failedFiles):
wbresponse = ReplayView.doReplay(self, cdx, wbrequest, index, failedFiles)
def do_replay(self, cdx, wbrequest, index, failed_files):
wbresponse = ReplayView.do_replay(self, cdx, wbrequest, index, failed_files)
# Check for self redirect
if wbresponse.status_headers.statusline.startswith('3'):
if self.isSelfRedirect(wbrequest, wbresponse.status_headers):
if self.is_self_redirect(wbrequest, wbresponse.status_headers):
raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))
return wbresponse
def isSelfRedirect(self, wbrequest, status_headers):
requestUrl = wbrequest.wb_url.url.lower()
locationUrl = status_headers.getHeader('Location').lower()
#return requestUrl == locationUrl
return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
def is_self_redirect(self, wbrequest, status_headers):
request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location').lower()
#return request_url == location_url
return (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url))
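
The self-redirect guard above compares the request URL with the Location header after case-folding and protocol stripping, since replaying a 3xx that points back at itself would loop. A standalone sketch (mirrors strip_protocol from url_rewriter; protocol list shortened):

def is_self_redirect(request_url, location_url,
                     protocols=('http://', 'https://', '//')):
    def strip(url):
        for p in protocols:
            if url.startswith(p):
                return url[len(p):]
        return url
    # a scheme change (http -> https) still counts as a self redirect
    return strip(request_url.lower()) == strip(location_url.lower())

assert is_self_redirect('http://example.com/', 'HTTPS://example.com/')
assert not is_self_redirect('http://example.com/', 'http://example.com/other')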

View File

@ -1,10 +1,10 @@
import copy
import urlparse
from wbarchivalurl import ArchivalUrl
from wburl import WbUrl
class ArchivalUrlRewriter:
class UrlRewriter:
"""
>>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
@ -42,13 +42,13 @@ class ArchivalUrlRewriter:
>>> test_rewrite('mailto:example@example.com', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'mailto:example@example.com'
>>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
>>> UrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
'/abc/19960708im_/'
>>> ArchivalUrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').getTimestampUrl('20131024')
>>> UrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024')
'/123/20131024id_/http://example.com/file/path/blah.html'
>>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
>>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com')
True
"""
@ -57,7 +57,7 @@ class ArchivalUrlRewriter:
PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']
def __init__(self, wburl, prefix):
self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.archivalurl_class = self.wburl.__class__
@ -66,12 +66,12 @@ class ArchivalUrlRewriter:
def rewrite(self, url, mod = None):
# if special protocol, no rewriting at all
if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX):
if any (url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
return url
wburl = self.wburl
isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS)
isAbs = any (url.startswith(x) for x in self.PROTOCOLS)
# Optimized rewriter for
# -rel urls that don't start with / and don't contain ../ and no special mod
@ -92,22 +92,22 @@ class ArchivalUrlRewriter:
return finalUrl
def getAbsUrl(self, url = ''):
def get_abs_url(self, url = ''):
return self.prefix + self.wburl.to_str(url=url)
def getTimestampUrl(self, timestamp, url = None):
def get_timestamp_url(self, timestamp, url = None):
if url is None:
url = self.wburl.url
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def setBaseUrl(self, newUrl):
def set_base_url(self, newUrl):
self.wburl.url = newUrl
@staticmethod
def stripProtocol(url):
for protocol in ArchivalUrlRewriter.PROTOCOLS:
def strip_protocol(url):
for protocol in UrlRewriter.PROTOCOLS:
if url.startswith(protocol):
return url[len(protocol):]
@ -117,7 +117,7 @@ class ArchivalUrlRewriter:
import utils
if __name__ == "__main__" or utils.enable_doctests():
def test_rewrite(rel_url, base_url, prefix, mod = None):
rewriter = ArchivalUrlRewriter(base_url, prefix)
rewriter = UrlRewriter(base_url, prefix)
return rewriter.rewrite(rel_url, mod)
import doctest
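
The core of UrlRewriter.rewrite, as exercised by the doctests above, is a urljoin against the original page URL followed by re-prefixing. A compressed sketch of that composition (Python 3; urlparse.urljoin became urllib.parse.urljoin; mod handling simplified):

from urllib.parse import urljoin

def archival_rewrite(rel_url, timestamp, page_url, prefix, mod=''):
    # resolve relative to the originally captured page, then wrap the
    # result back under the archival prefix and timestamp
    return '{0}{1}{2}/{3}'.format(prefix, timestamp, mod,
                                  urljoin(page_url, rel_url))

expected = 'https://web.archive.org/web/20131010/http://example.com/path/other.html'
assert archival_rewrite('other.html', '20131010',
                        'http://example.com/path/page.html',
                        'https://web.archive.org/web/') == expected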

View File

@ -36,19 +36,19 @@ class HMACCookieMaker:
self.name = name
def __call__(self, duration, extraId = ''):
def __call__(self, duration, extra_id = ''):
expire = str(long(time.time() + duration))
if extraId:
msg = extraId + '-' + expire
if extra_id:
msg = extra_id + '-' + expire
else:
msg = expire
hmacdigest = hmac.new(self.key, msg)
hexdigest = hmacdigest.hexdigest()
if extraId:
cookie = '{0}-{1}={2}-{3}'.format(self.name, extraId, expire, hexdigest)
if extra_id:
cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
else:
cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
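
HMACCookieMaker above signs an expiry timestamp with a shared key so the loading endpoint can authorize ranged reads. A minimal Python 3 sketch of the make/verify pair (hashlib.md5 stands in for Python 2's hmac.new default; function names are hypothetical):

import hashlib
import hmac
import time

def make_hmac_cookie(key, name, duration):
    # cookie value is '<expiry>-<digest>'; the digest keys on the expiry
    expire = str(int(time.time() + duration))
    digest = hmac.new(key, expire.encode(), hashlib.md5).hexdigest()
    return '{0}={1}-{2}'.format(name, expire, digest)

def verify_hmac_cookie(key, cookie):
    _, _, value = cookie.partition('=')
    expire, _, digest = value.partition('-')
    expected = hmac.new(key, expire.encode(), hashlib.md5).hexdigest()
    # constant-time compare, and reject once the expiry has passed
    return hmac.compare_digest(digest, expected) and int(expire) >= time.time()

assert verify_hmac_cookie(b'secret', make_hmac_cookie(b'secret', 'wb', 30))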

View File

@ -1,4 +1,4 @@
from wbarchivalurl import ArchivalUrl
from wburl import WbUrl
import utils
import pprint
@ -54,19 +54,19 @@ class WbRequest:
@staticmethod
def makeAbsPrefix(env, rel_prefix):
def make_abs_prefix(env, rel_prefix):
try:
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + rel_prefix
except KeyError:
return rel_prefix
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = ArchivalUrl):
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = WbUrl):
self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.makeAbsPrefix(env, wb_prefix)
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)
self.wb_url = archivalurl_class(wb_url)
@ -76,9 +76,9 @@ class WbRequest:
self.is_ajax = self._is_ajax()
self.queryFilter = []
self.query_filter = []
self.customParams = {}
self.custom_params = {}
# PERF
env['X_PERF'] = {}
@ -165,16 +165,16 @@ class StatusAndHeaders:
self.headers = headers
self.protocol = protocol
def getHeader(self, name):
nameLower = name.lower()
def get_header(self, name):
name_lower = name.lower()
for value in self.headers:
if (value[0].lower() == nameLower):
if (value[0].lower() == name_lower):
return value[1]
def remove_header(self, name):
nameLower = name.lower()
name_lower = name.lower()
for x in xrange(len(self.headers) - 1, -1, -1):
if self.headers[x][0].lower() == nameLower:
if self.headers[x][0].lower() == name_lower:
del self.headers[x]
break

View File

@ -5,57 +5,57 @@ import rfc3987
import wbexceptions
# ArchivalUrl : archivalurl representation for WB
# WbUrl : wb archival url representation for WB
class ArchivalUrl:
class WbUrl:
"""
# Replay Urls
# ======================
>>> repr(ArchivalUrl('/20131010000506/example.com'))
>>> repr(WbUrl('/20131010000506/example.com'))
"('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
>>> repr(ArchivalUrl('/20130102im_/https://example.com'))
>>> repr(WbUrl('/20130102im_/https://example.com'))
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
# Protocol agnostic convert to http
>>> repr(ArchivalUrl('/20130102im_///example.com'))
>>> repr(WbUrl('/20130102im_///example.com'))
"('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')"
>>> repr(ArchivalUrl('/cs_/example.com'))
>>> repr(WbUrl('/cs_/example.com'))
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
>>> repr(ArchivalUrl('/https://example.com/xyz'))
>>> repr(WbUrl('/https://example.com/xyz'))
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
>>> repr(ArchivalUrl('/https://example.com/xyz?a=%2f&b=%2E'))
>>> repr(WbUrl('/https://example.com/xyz?a=%2f&b=%2E'))
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', '/https://example.com/xyz?a=%2f&b=%2E')"
# Query Urls
# ======================
>>> repr(ArchivalUrl('/*/http://example.com/abc?def=a'))
>>> repr(WbUrl('/*/http://example.com/abc?def=a'))
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
>>> repr(ArchivalUrl('/*/http://example.com/abc?def=a*'))
>>> repr(WbUrl('/*/http://example.com/abc?def=a*'))
"('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')"
>>> repr(ArchivalUrl('/json/*/http://example.com/abc?def=a'))
>>> repr(WbUrl('/json/*/http://example.com/abc?def=a'))
"('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')"
>>> repr(ArchivalUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
>>> repr(WbUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
"('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')"
# Error Urls
# ======================
>>> x = ArchivalUrl('abc')
>>> x = WbUrl('abc')
Traceback (most recent call last):
RequestParseException: Invalid WB Request Url: abc
>>> x = ArchivalUrl('/#$%#/')
>>> x = WbUrl('/#$%#/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: http://#$%#/
>>> x = ArchivalUrl('/http://example.com:abc/')
>>> x = WbUrl('/http://example.com:abc/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: http://example.com:abc/
"""