mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

style fixes: convert camelCase func and var names to 'not_camel_case'

WbHtml -> HTMLRewriter
ArchivalUrl -> WbUrl
This commit is contained in:
Ilya Kreymer 2014-01-28 19:37:37 -08:00
parent c0f8edf517
commit 6de794a4e1
12 changed files with 385 additions and 397 deletions
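The rename is mechanical: break each camelCase identifier at its upper-case letters and join the words with underscores. A minimal sketch of the conversion rule this commit applies by hand (the helper below is illustrative, not part of the commit):

import re

def to_snake_case(name):
    # insert '_' before an upper-case letter that follows a lower-case
    # letter or digit, then lower-case everything:
    # 'chunkSize' -> 'chunk_size', 'removedHeaderDict' -> 'removed_header_dict'
    return re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name).lower()

assert to_snake_case('hmacDuration') == 'hmac_duration'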

View File

@@ -2,14 +2,14 @@ import urlparse
 import re
 from wbrequestresponse import WbRequest, WbResponse
-from url_rewriter import ArchivalUrlRewriter
-from wbarchivalurl import ArchivalUrl
+from url_rewriter import UrlRewriter
+from wburl import WbUrl

 #=================================================================
 # ArchivalRequestRouter -- route WB requests in archival mode
 #=================================================================
 class ArchivalRequestRouter:
-    def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = ArchivalUrl):
+    def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = WbUrl):
         self.handlers = handlers
         self.fallback = ReferRedirect(hostpaths)
         self.abs_path = abs_path
@@ -46,7 +46,7 @@ class Route:
         self.coll_group = coll_group

-    def __call__(self, env, useAbsPrefix, archivalurl_class):
+    def __call__(self, env, use_abs_prefix, archivalurl_class):
         request_uri = env['REL_REQUEST_URI']
         matcher = self.regex.match(request_uri[1:])
         if not matcher:
@@ -68,19 +68,19 @@ class Route:
             coll = coll,
             wb_url = wb_url,
             wb_prefix = wb_prefix,
-            use_abs_prefix = useAbsPrefix,
+            use_abs_prefix = use_abs_prefix,
             archivalurl_class = archivalurl_class)

         # Allow for setup of additional filters
-        self._addFilters(wbrequest, matcher)
-        return self._handleRequest(wbrequest)
+        self._add_filters(wbrequest, matcher)
+        return self._handle_request(wbrequest)

-    def _addFilters(self, wbrequest, matcher):
+    def _add_filters(self, wbrequest, matcher):
         pass

-    def _handleRequest(self, wbrequest):
+    def _handle_request(self, wbrequest):
         return self.handler(wbrequest)

@@ -90,10 +90,10 @@ class Route:
 class ReferRedirect:
     """
-    >>> ReferRedirect('http://localhost:8080/').matchPrefixs
+    >>> ReferRedirect('http://localhost:8080/').match_prefixs
    ['http://localhost:8080/']

-    >>> ReferRedirect(['http://example:9090/']).matchPrefixs
+    >>> ReferRedirect(['http://example:9090/']).match_prefixs
    ['http://example:9090/']

     >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
@@ -118,18 +118,18 @@ class ReferRedirect:
     """

-    def __init__(self, matchPrefixs):
-        if isinstance(matchPrefixs, list):
-            self.matchPrefixs = matchPrefixs
+    def __init__(self, match_prefixs):
+        if isinstance(match_prefixs, list):
+            self.match_prefixs = match_prefixs
         else:
-            self.matchPrefixs = [matchPrefixs]
+            self.match_prefixs = [match_prefixs]

     def __call__(self, wbrequest):
         if wbrequest.referrer is None:
             return None

-        if not any (wbrequest.referrer.startswith(i) for i in self.matchPrefixs):
+        if not any (wbrequest.referrer.startswith(i) for i in self.match_prefixs):
             return None

         try:
@@ -145,7 +145,7 @@ class ReferRedirect:
         # No match on any exception
         try:
-            rewriter = ArchivalUrlRewriter('/' + ref_path[1], script_name + '/' + ref_path[0])
+            rewriter = UrlRewriter('/' + ref_path[1], script_name + '/' + ref_path[0])
         except Exception:
             return None

@@ -167,16 +167,16 @@ class ReferRedirect:
 import utils
 if __name__ == "__main__" or utils.enable_doctests():

-    def test_redir(matchHost, request_uri, referrer, script_name = ''):
+    def test_redir(match_host, request_uri, referrer, script_name = ''):
         env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}

-        redir = ReferRedirect(matchHost)
+        redir = ReferRedirect(match_host)
         req = WbRequest.from_uri(request_uri, env)
         rep = redir(req)
         if not rep:
             return False

-        return rep.status_headers.getHeader('Location')
+        return rep.status_headers.get_header('Location')

     import doctest
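Read together, the doctests above amount to this usage: a request whose Referer points into an archived page is redirected back under that page's archival prefix. A sketch reassembled from the test_redir helper (it assumes the same test setup as the doctests):

env = {'REL_REQUEST_URI': '/other.html',
       'HTTP_REFERER': 'http://localhost:8080/coll/20131010/http://example.com/path/page.html',
       'SCRIPT_NAME': ''}

redir = ReferRedirect('http://localhost:8080/')
rep = redir(WbRequest.from_uri('/other.html', env))

# per the doctest above, the Location header becomes:
# http://localhost:8080/coll/20131010/http://example.com/other.html
print rep.status_headers.get_header('Location')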

View File

@@ -10,21 +10,21 @@ from wbrequestresponse import StatusAndHeaders
 #=================================================================
 class HttpReader:
-    def __init__(self, hmac = None, hmacDuration = 30):
+    def __init__(self, hmac = None, hmac_duration = 30):
         self.hmac = hmac
-        self.hmacDuration = hmacDuration
+        self.hmac_duration = hmac_duration

     def load(self, url, offset, length):
         if length > 0:
-            rangeHeader = 'bytes={0}-{1}'.format(offset, offset + length - 1)
+            range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
         else:
-            rangeHeader = 'bytes={0}-'.format(offset)
+            range_header = 'bytes={0}-'.format(offset)

         headers = {}
-        headers['Range'] = rangeHeader
+        headers['Range'] = range_header

         if self.hmac:
-            headers['Cookie'] = self.hmac(self.hmacDuration)
+            headers['Cookie'] = self.hmac(self.hmac_duration)

         request = urllib2.Request(url, headers = headers)
         return urllib2.urlopen(request)
@@ -50,7 +50,7 @@ WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers,
 class ArchiveLoader:
     """
-    >>> loadTestArchive('example.warc.gz', '333', '1043')
+    >>> load_test_archive('example.warc.gz', '333', '1043')
    (('warc', 'response'),
    StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
    ('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
@@ -74,7 +74,7 @@ class ArchiveLoader:
    ('Connection', 'close')]))

-    >>> loadTestArchive('example.warc.gz', '1864', '553')
+    >>> load_test_archive('example.warc.gz', '1864', '553')
    (('warc', 'revisit'),
    StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
    ('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
@@ -114,7 +114,7 @@ class ArchiveLoader:
     }

     @staticmethod
-    def createDefaultLoaders():
+    def create_default_loaders():
         http = HttpReader()
         file = FileReader()
         return {
@@ -125,35 +125,35 @@ class ArchiveLoader:
         }

-    def __init__(self, loaders = {}, chunkSize = 8192):
-        self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
-        self.chunkSize = chunkSize
+    def __init__(self, loaders = {}, chunk_size = 8192):
+        self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders()
+        self.chunk_size = chunk_size

-        self.arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
-        self.warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
-        self.httpParser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
+        self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
+        self.warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
+        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])

     def load(self, url, offset, length):
-        urlParts = urlparse.urlsplit(url)
+        url_parts = urlparse.urlsplit(url)

         try:
-            loader = self.loaders.get(urlParts.scheme)
+            loader = self.loaders.get(url_parts.scheme)
         except Exception:
             raise wbexceptions.UnknownLoaderProtocolException(url)

-        theFormat = None
+        the_format = None

         for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
             if url.endswith(ext):
-                theFormat = iformat
+                the_format = iformat
                 break

-        if theFormat is None:
+        if the_format is None:
             raise wbexceptions.UnknownArchiveFormatException(url)

-        (aFormat, isGzip) = theFormat
-        decomp = utils.create_decompressor() if isGzip else None
+        (a_format, is_gzip) = the_format
+        decomp = utils.create_decompressor() if is_gzip else None

         try:
             length = int(length)
@@ -163,17 +163,17 @@ class ArchiveLoader:
         raw = loader.load(url, long(offset), length)

-        stream = LineReader(raw, length, self.chunkSize, decomp)
+        stream = LineReader(raw, length, self.chunk_size, decomp)

-        if aFormat == 'arc':
-            rec_headers = self.arcParser.parse(stream)
-            recType = 'response'
-            empty = (rec_headers.getHeader('length') == 0)
-        elif aFormat == 'warc':
-            rec_headers = self.warcParser.parse(stream)
-            recType = rec_headers.getHeader('WARC-Type')
-            empty = (rec_headers.getHeader('Content-Length') == '0')
+        if a_format == 'arc':
+            rec_headers = self.arc_parser.parse(stream)
+            rec_type = 'response'
+            empty = (rec_headers.get_header('length') == 0)
+        elif a_format == 'warc':
+            rec_headers = self.warc_parser.parse(stream)
+            rec_type = rec_headers.get_header('WARC-Type')
+            empty = (rec_headers.get_header('Content-Length') == '0')

         # special case: empty w/arc record (hopefully a revisit)
         if empty:
@@ -181,21 +181,21 @@ class ArchiveLoader:
         # special case: warc records that are not expected to have http headers
         # attempt to add 200 status and content-type
-        elif recType == 'metadata' or recType == 'resource':
-            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.getHeader('Content-Type'))])
+        elif rec_type == 'metadata' or rec_type == 'resource':
+            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.get_header('Content-Type'))])

         # special case: http 0.9 response, no status or headers
-        #elif recType == 'response':
-        #    contentType = rec_headers.getHeader('Content-Type')
-        #    if contentType and (';version=0.9' in contentType):
+        #elif rec_type == 'response':
+        #    content_type = rec_headers.get_header('Content-Type')
+        #    if content_type and (';version=0.9' in content_type):
         #        status_headers = StatusAndHeaders('200 OK', [])

         # response record: parse HTTP status and headers!
         else:
-            #(statusline, http_headers) = self.parseHttpHeaders(stream)
-            status_headers = self.httpParser.parse(stream)
+            #(statusline, http_headers) = self.parse_http_headers(stream)
+            status_headers = self.http_parser.parse(stream)

-        return WBArchiveRecord((aFormat, recType), rec_headers, stream, status_headers)
+        return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers)

 #=================================================================
@@ -206,9 +206,9 @@ class StatusAndHeadersParser:
     def parse(self, stream):
         statusline = stream.readline().rstrip()
-        protocolStatus = utils.split_prefix(statusline, self.statuslist)
+        protocol_status = utils.split_prefix(statusline, self.statuslist)

-        if not protocolStatus:
+        if not protocol_status:
             raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)

         headers = []
@@ -220,7 +220,7 @@ class StatusAndHeadersParser:
             headers.append(header)
             line = stream.readline().rstrip()

-        return StatusAndHeaders(statusline = protocolStatus[1].strip(), headers = headers, protocol = protocolStatus[0])
+        return StatusAndHeaders(statusline = protocol_status[1].strip(), headers = headers, protocol = protocol_status[0])

 #=================================================================
 class ARCHeadersParser:
@@ -247,25 +247,25 @@ class ARCHeadersParser:
 #=================================================================
 class LineReader:
-    def __init__(self, stream, maxLen = 0, chunkSize = 1024, decomp = None):
+    def __init__(self, stream, max_len = 0, chunk_size = 1024, decomp = None):
         self.stream = stream
-        self.chunkSize = chunkSize
+        self.chunk_size = chunk_size
         self.decomp = decomp
         self.buff = None
-        self.numRead = 0
-        self.maxLen = maxLen
+        self.num_read = 0
+        self.max_len = max_len

-    def _fillbuff(self, chunkSize = None):
-        if not chunkSize:
-            chunkSize = self.chunkSize
+    def _fillbuff(self, chunk_size = None):
+        if not chunk_size:
+            chunk_size = self.chunk_size

         if not self.buff or self.buff.pos >= self.buff.len:
-            toRead = min(self.maxLen - self.numRead, self.chunkSize) if (self.maxLen > 0) else self.chunkSize
-            data = self.stream.read(toRead)
+            to_read = min(self.max_len - self.num_read, self.chunk_size) if (self.max_len > 0) else self.chunk_size
+            data = self.stream.read(to_read)
             self._process_read(data)

     def _process_read(self, data):
-        self.numRead += len(data)
+        self.num_read += len(data)
         if self.decomp and data:
             data = self.decomp.decompress(data)
@@ -310,45 +310,45 @@ class ChunkedLineReader(LineReader):
    '123412'
     """

-    allChunksRead = False
-    notChunked = False
-    raiseChunkedDataExceptions = False # if False, we'll use best-guess fallback for parse errors
+    all_chunks_read = False
+    not_chunked = False
+    raise_chunked_data_exceptions = False # if False, we'll use best-guess fallback for parse errors

-    def _fillbuff(self, chunkSize = None):
-        if self.notChunked:
-            return LineReader._fillbuff(self, chunkSize)
+    def _fillbuff(self, chunk_size = None):
+        if self.not_chunked:
+            return LineReader._fillbuff(self, chunk_size)

-        if self.allChunksRead:
+        if self.all_chunks_read:
             return

         if not self.buff or self.buff.pos >= self.buff.len:
-            lengthHeader = self.stream.readline(64)
+            length_header = self.stream.readline(64)
             data = ''

             try:
                 # decode length header
                 try:
-                    chunkSize = int(lengthHeader.strip().split(';')[0], 16)
+                    chunk_size = int(length_header.strip().split(';')[0], 16)
                 except ValueError:
-                    raise ChunkedDataException("Couldn't decode length header '%s'" % lengthHeader)
+                    raise ChunkedDataException("Couldn't decode length header '%s'" % length_header)

-                if chunkSize:
+                if chunk_size:
                     # read chunk
-                    while len(data) < chunkSize:
-                        newData = self.stream.read(chunkSize - len(data))
+                    while len(data) < chunk_size:
+                        new_data = self.stream.read(chunk_size - len(data))

                         # if we unexpectedly run out of data, either raise an exception or just stop reading, assuming file was cut off
-                        if not newData:
-                            if self.raiseChunkedDataExceptions:
+                        if not new_data:
+                            if self.raise_chunked_data_exceptions:
                                 raise ChunkedDataException("Ran out of data before end of chunk")
                             else:
-                                chunkSize = len(data)
-                                self.allChunksRead = True
+                                chunk_size = len(data)
+                                self.all_chunks_read = True

-                        data += newData
+                        data += new_data

                     # if we successfully read a block without running out, it should end in \r\n
-                    if not self.allChunksRead:
+                    if not self.all_chunks_read:
                         clrf = self.stream.read(2)
                         if clrf != '\r\n':
                             raise ChunkedDataException("Chunk terminator not found.")
@@ -356,19 +356,19 @@ class ChunkedLineReader(LineReader):
                     if self.decomp:
                         data = self.decomp.decompress(data)
                 else:
-                    # chunkSize 0 indicates end of file
-                    self.allChunksRead = True
+                    # chunk_size 0 indicates end of file
+                    self.all_chunks_read = True
                     data = ''

                 self._process_read(data)
             except ChunkedDataException:
-                if self.raiseChunkedDataExceptions:
+                if self.raise_chunked_data_exceptions:
                     raise
                 # Can't parse the data as chunked.
                 # It's possible that non-chunked data is set with a Transfer-Encoding: chunked
                 # Treat this as non-chunk encoded from here on
-                self._process_read(lengthHeader+data)
-                self.notChunked = True
+                self._process_read(length_header + data)
+                self.not_chunked = True

 #=================================================================
@@ -379,7 +379,7 @@ if __name__ == "__main__" or utils.enable_doctests():
     testloader = ArchiveLoader()

-    def loadTestArchive(test_file, offset, length):
+    def load_test_archive(test_file, offset, length):
         path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file
         archive = testloader.load(path, offset, length)
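ChunkedLineReader above implements HTTP/1.1 chunked transfer decoding: a hex length line, that many bytes of data, a trailing CRLF, and a zero-length chunk to terminate; if the length line will not parse, it falls back to treating the stream as unchunked. A condensed, standalone sketch of the happy path only (no fallback, no gzip; Python 2 to match the codebase):

from StringIO import StringIO

def decode_chunked(stream):
    body = ''
    while True:
        # hex chunk length, possibly followed by ';extensions'
        size = int(stream.readline().strip().split(';')[0], 16)
        if size == 0:          # zero-length chunk marks the end of the body
            return body
        body += stream.read(size)
        stream.read(2)         # consume the trailing '\r\n'

print decode_chunked(StringIO('4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n'))
# -> 'Wikipedia'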

View File

@@ -2,14 +2,14 @@ from wbrequestresponse import StatusAndHeaders
 #=================================================================
 class RewrittenStatusAndHeaders:
-    def __init__(self, statusline, headers, removedHeaderDict, textType, charset):
+    def __init__(self, statusline, headers, removed_header_dict, text_type, charset):
         self.status_headers = StatusAndHeaders(statusline, headers)
-        self.removedHeaderDict = removedHeaderDict
-        self.textType = textType
+        self.removed_header_dict = removed_header_dict
+        self.text_type = text_type
         self.charset = charset

-    def containsRemovedHeader(self, name, value):
-        return self.removedHeaderDict.get(name) == value
+    def contains_removed_header(self, name, value):
+        return self.removed_header_dict.get(name) == value

 #=================================================================
@@ -17,30 +17,30 @@ class HeaderRewriter:
     """
     # Text with charset
     >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
-    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+    {'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
    ('X-Archive-Orig-Content-Length', '5'),
-    ('Content-Type', 'text/html;charset=UTF-8')]), 'charset': 'utf-8', 'textType': 'html', 'removedHeaderDict': {}}
+    ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}

     # Redirect
     >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
-    {'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
-    ('Location', '/web/20131226101010/http://example.com/other.html')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
+    {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
+    ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}

     # gzip
     >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
-    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
-    ('Content-Type', 'text/javascript')]), 'charset': None, 'textType': 'js', 'removedHeaderDict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}}
+    {'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
+    ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}

     # Binary
     >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
-    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
+    {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
    ('Content-Type', 'image/png'),
    ('X-Archive-Orig-Cookie', 'blah'),
-    ('Content-Encoding', 'gzip')]), 'charset': None, 'textType': None, 'removedHeaderDict': {'transfer-encoding': 'chunked'}}
+    ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}

     Removing Transfer-Encoding always, Was:
    ('Content-Encoding', 'gzip'),
-    ('Transfer-Encoding', 'chunked')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
+    ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
     """
@@ -63,64 +63,64 @@ class HeaderRewriter:
     PROXY_NO_REWRITE_HEADERS = ['content-length']

-    def __init__(self, headerPrefix = 'X-Archive-Orig-'):
-        self.headerPrefix = headerPrefix
+    def __init__(self, header_prefix = 'X-Archive-Orig-'):
+        self.header_prefix = header_prefix

     def rewrite(self, status_headers, urlrewriter):
-        contentType = status_headers.getHeader('Content-Type')
-        textType = None
+        content_type = status_headers.get_header('Content-Type')
+        text_type = None
         charset = None
-        stripEncoding = False
+        strip_encoding = False

-        if contentType:
-            textType = self._extractTextType(contentType)
-            if textType:
-                charset = self._extractCharSet(contentType)
-                stripEncoding = True
+        if content_type:
+            text_type = self._extract_text_type(content_type)
+            if text_type:
+                charset = self._extract_char_set(content_type)
+                strip_encoding = True

-        (newHeaders, removedHeaderDict) = self._rewriteHeaders(status_headers.headers, urlrewriter, stripEncoding)
+        (new_headers, removed_header_dict) = self._rewrite_headers(status_headers.headers, urlrewriter, strip_encoding)

-        return RewrittenStatusAndHeaders(status_headers.statusline, newHeaders, removedHeaderDict, textType, charset)
+        return RewrittenStatusAndHeaders(status_headers.statusline, new_headers, removed_header_dict, text_type, charset)

-    def _extractTextType(self, contentType):
+    def _extract_text_type(self, content_type):
         for ctype, mimelist in self.REWRITE_TYPES.iteritems():
-            if any ((mime in contentType) for mime in mimelist):
+            if any ((mime in content_type) for mime in mimelist):
                 return ctype

         return None

-    def _extractCharSet(self, contentType):
+    def _extract_char_set(self, content_type):
         CHARSET_TOKEN = 'charset='
-        idx = contentType.find(CHARSET_TOKEN)
+        idx = content_type.find(CHARSET_TOKEN)
         if idx < 0:
             return None

-        return contentType[idx + len(CHARSET_TOKEN):].lower()
+        return content_type[idx + len(CHARSET_TOKEN):].lower()

-    def _rewriteHeaders(self, headers, urlrewriter, contentRewritten = False):
-        newHeaders = []
-        removedHeaderDict = {}
+    def _rewrite_headers(self, headers, urlrewriter, content_rewritten = False):
+        new_headers = []
+        removed_header_dict = {}

         for (name, value) in headers:
             lowername = name.lower()
             if lowername in self.PROXY_HEADERS:
-                newHeaders.append((name, value))
+                new_headers.append((name, value))
             elif lowername in self.URL_REWRITE_HEADERS:
-                newHeaders.append((name, urlrewriter.rewrite(value)))
+                new_headers.append((name, urlrewriter.rewrite(value)))
             elif lowername in self.ENCODING_HEADERS:
-                if contentRewritten:
-                    removedHeaderDict[lowername] = value
+                if content_rewritten:
+                    removed_header_dict[lowername] = value
                 else:
-                    newHeaders.append((name, value))
+                    new_headers.append((name, value))
             elif lowername in self.REMOVE_HEADERS:
-                removedHeaderDict[lowername] = value
+                removed_header_dict[lowername] = value
-            elif lowername in self.PROXY_NO_REWRITE_HEADERS and not contentRewritten:
-                newHeaders.append((name, value))
+            elif lowername in self.PROXY_NO_REWRITE_HEADERS and not content_rewritten:
+                new_headers.append((name, value))
             else:
-                newHeaders.append((self.headerPrefix + name, value))
+                new_headers.append((self.header_prefix + name, value))

-        return (newHeaders, removedHeaderDict)
+        return (new_headers, removed_header_dict)

 import utils
 if __name__ == "__main__" or utils.enable_doctests():
@@ -128,7 +128,7 @@ if __name__ == "__main__" or utils.enable_doctests():
     import pprint
     import url_rewriter

-    urlrewriter = url_rewriter.ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+    urlrewriter = url_rewriter.UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

     headerrewriter = HeaderRewriter()
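The heart of _rewrite_headers is a single policy: pass a header through only if replay still wants the browser to act on it, otherwise park the original value under the X-Archive-Orig- prefix so nothing is lost. A stripped-down sketch of just that prefixing step (the keep set here is illustrative; the real class consults several header lists and also URL-rewrites Location):

def prefix_headers(headers, keep = ('content-type',), prefix = 'X-Archive-Orig-'):
    new_headers = []
    for (name, value) in headers:
        if name.lower() in keep:
            new_headers.append((name, value))            # pass through as-is
        else:
            new_headers.append((prefix + name, value))   # preserve original
    return new_headers

print prefix_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Type', 'text/html')])
# -> [('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Type', 'text/html')]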

View File

@@ -5,13 +5,13 @@ import sys
 import re
 from HTMLParser import HTMLParser

-from url_rewriter import ArchivalUrlRewriter
+from url_rewriter import UrlRewriter
 from regex_rewriters import JSRewriter, CSSRewriter

 #=================================================================
-# WBHtml -- html parser for custom rewriting, also handlers for script and css
+# HTMLRewriter -- html parser for custom rewriting, also handlers for script and css
 #=================================================================
-class WBHtml(HTMLParser):
+class HTMLRewriter(HTMLParser):
     r"""
     >>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
@@ -72,13 +72,13 @@ class WBHtml(HTMLParser):
    <style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>

     # Head Insertion
-    >>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', headInsert = '<script src="cool.js"></script>')
+    >>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
    <html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>

-    >>> parse('<body><div>SomeTest</div>', headInsert = '/* Insert */')
+    >>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
    /* Insert */<body><div>SomeTest</div>

-    >>> parse('<link href="abc.txt"><div>SomeTest</div>', headInsert = '<script>load_stuff();</script>')
+    >>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
    <link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
     """
@@ -125,128 +125,128 @@ class WBHtml(HTMLParser):
             self.buff += string

-    def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
+    def __init__(self, url_rewriter, outstream = None, head_insert = None, js_rewriter_class = JSRewriter, css_rewriter_class = CSSRewriter):
         HTMLParser.__init__(self)

         self.url_rewriter = url_rewriter
-        self._wbParseContext = None
-        self.out = outstream if outstream else WBHtml.AccumBuff()
+        self._wb_parse_context = None
+        self.out = outstream if outstream else self.AccumBuff()

-        self.jsRewriter = jsRewriterClass(url_rewriter)
-        self.cssRewriter = cssRewriterClass(url_rewriter)
+        self.js_rewriter = js_rewriter_class(url_rewriter)
+        self.css_rewriter = css_rewriter_class(url_rewriter)

-        self.headInsert = headInsert
+        self.head_insert = head_insert

     # ===========================
     META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)

-    def _rewriteMetaRefresh(self, metaRefresh):
-        if not metaRefresh:
+    def _rewrite_meta_refresh(self, meta_refresh):
+        if not meta_refresh:
             return None

-        m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
+        m = self.META_REFRESH_REGEX.match(meta_refresh)
         if not m:
-            return metaRefresh
+            return meta_refresh

         try:
-            metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):]
+            meta_refresh = meta_refresh[:m.start(1)] + self._rewrite_url(m.group(1)) + meta_refresh[m.end(1):]
         except Exception:
             pass

-        return metaRefresh
+        return meta_refresh
     # ===========================

-    def _rewriteURL(self, value, mod = None):
+    def _rewrite_url(self, value, mod = None):
         return self.url_rewriter.rewrite(value, mod) if value else None

-    def _rewriteCSS(self, cssContent):
-        return self.cssRewriter.rewrite(cssContent) if cssContent else None
+    def _rewrite_css(self, css_content):
+        return self.css_rewriter.rewrite(css_content) if css_content else None

-    def _rewriteScript(self, scriptContent):
-        return self.jsRewriter.rewrite(scriptContent) if scriptContent else None
+    def _rewrite_script(self, script_content):
+        return self.js_rewriter.rewrite(script_content) if script_content else None

-    def hasAttr(self, tagAttrs, attr):
+    def has_attr(self, tag_attrs, attr):
         name, value = attr
-        for attrName, attrValue in tagAttrs:
-            if attrName == name:
-                return value.lower() == attrValue.lower()
+        for attr_name, attr_value in tag_attrs:
+            if attr_name == name:
+                return value.lower() == attr_value.lower()
         return False

-    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
+    def rewrite_tag_attrs(self, tag, tag_attrs, is_start_end):
         # special case: script or style parse context
-        if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
-            self._wbParseContext = tag
+        if (tag in self.STATE_TAGS) and (self._wb_parse_context == None):
+            self._wb_parse_context = tag

         # special case: head insertion, non-head tags
-        elif (self.headInsert and (self._wbParseContext == None) and (tag not in WBHtml.HEAD_TAGS)):
-            self.out.write(self.headInsert)
-            self.headInsert = None
+        elif (self.head_insert and (self._wb_parse_context == None) and (tag not in self.HEAD_TAGS)):
+            self.out.write(self.head_insert)
+            self.head_insert = None

         # attr rewriting
-        handler = WBHtml.REWRITE_TAGS.get(tag)
+        handler = self.REWRITE_TAGS.get(tag)
         if not handler:
-            handler = WBHtml.REWRITE_TAGS.get('')
+            handler = self.REWRITE_TAGS.get('')

         if not handler:
             return False

         self.out.write('<' + tag)

-        for attr in tagAttrs:
-            attrName, attrValue = attr
+        for attr in tag_attrs:
+            attr_name, attr_value = attr

             # special case: inline JS/event handler
-            if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith('on'):
-                attrValue = self._rewriteScript(attrValue)
+            if (attr_value and attr_value.startswith('javascript:')) or attr_name.startswith('on'):
+                attr_value = self._rewrite_script(attr_value)

             # special case: inline CSS/style attribute
-            elif attrName == 'style':
-                attrValue = self._rewriteCSS(attrValue)
+            elif attr_name == 'style':
+                attr_value = self._rewrite_css(attr_value)

             # special case: meta tag
-            elif (tag == 'meta') and (attrName == 'content'):
-                if self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
-                    attrValue = self._rewriteMetaRefresh(attrValue)
+            elif (tag == 'meta') and (attr_name == 'content'):
+                if self.has_attr(tag_attrs, ('http-equiv', 'refresh')):
+                    attr_value = self._rewrite_meta_refresh(attr_value)

             else:
                 # special case: base tag
-                if (tag == 'base') and (attrName == 'href') and attrValue:
-                    self.url_rewriter.setBaseUrl(attrValue)
+                if (tag == 'base') and (attr_name == 'href') and attr_value:
+                    self.url_rewriter.set_base_url(attr_value)

-                rwMod = handler.get(attrName)
-                if rwMod is not None:
-                    attrValue = self._rewriteURL(attrValue, rwMod)
+                rw_mod = handler.get(attr_name)
+                if rw_mod is not None:
+                    attr_value = self._rewrite_url(attr_value, rw_mod)

             # parser doesn't differentiate between 'attr=""' and just 'attr'
             # 'attr=""' is more common, so use that form
-            if attrValue:
-                self.out.write(' ' + attrName + '="' + attrValue + '"')
+            if attr_value:
+                self.out.write(' ' + attr_name + '="' + attr_value + '"')
             else:
-                self.out.write(' ' + attrName + '=""')
+                self.out.write(' ' + attr_name + '=""')

-        self.out.write('/>' if isStartEnd else '>')
+        self.out.write('/>' if is_start_end else '>')

         # special case: head tag
-        if (self.headInsert) and (self._wbParseContext == None) and (tag == 'head'):
-            self.out.write(self.headInsert)
-            self.headInsert = None
+        if (self.head_insert) and (self._wb_parse_context == None) and (tag == 'head'):
+            self.out.write(self.head_insert)
+            self.head_insert = None

         return True

-    def parseData(self, data):
-        if self._wbParseContext == 'script':
-            data = self._rewriteScript(data)
-        elif self._wbParseContext == 'style':
-            data = self._rewriteCSS(data)
+    def parse_data(self, data):
+        if self._wb_parse_context == 'script':
+            data = self._rewrite_script(data)
+        elif self._wb_parse_context == 'style':
+            data = self._rewrite_css(data)

         self.out.write(data)

     def rewrite(self, string):
         if not self.out:
-            self.out = WBHtml.AccumBuff()
+            self.out = self.AccumBuff()

         self.feed(string)
@@ -258,9 +258,9 @@ class WBHtml(HTMLParser):
     # HTMLParser overrides below
     def close(self):
-        if (self._wbParseContext):
-            result = self.rewrite('</' + self._wbParseContext + '>')
-            self._wbParseContext = None
+        if (self._wb_parse_context):
+            result = self.rewrite('</' + self._wb_parse_context + '>')
+            self._wb_parse_context = None
         else:
             result = ''
@@ -268,21 +268,21 @@ class WBHtml(HTMLParser):
         return result

     def handle_starttag(self, tag, attrs):
-        if not self.rewriteTagAttrs(tag, attrs, False):
+        if not self.rewrite_tag_attrs(tag, attrs, False):
             self.out.write(self.get_starttag_text())

     def handle_startendtag(self, tag, attrs):
-        if not self.rewriteTagAttrs(tag, attrs, True):
+        if not self.rewrite_tag_attrs(tag, attrs, True):
             self.out.write(self.get_starttag_text())

     def handle_endtag(self, tag):
-        if (tag == self._wbParseContext):
-            self._wbParseContext = None
+        if (tag == self._wb_parse_context):
+            self._wb_parse_context = None

         self.out.write('</' + tag + '>')

     def handle_data(self, data):
-        self.parseData(data)
+        self.parse_data(data)

     def handle_entityref(self, data):
         self.out.write('&' + data + ';')
@@ -292,7 +292,7 @@ class WBHtml(HTMLParser):
     def handle_comment(self, data):
         self.out.write('<!--')
-        self.parseData(data)
+        self.parse_data(data)
         self.out.write('-->')

     def handle_decl(self, data):
@@ -303,24 +303,17 @@ class WBHtml(HTMLParser):
     def unknown_decl(self, data):
         self.out.write('<![')
-        self.parseData(data)
+        self.parse_data(data)
         self.out.write(']>')

-# instantiate the parser and fed it some HTML
-#parser = WBHtml()
-#instr = '<HTML X=\'a\' B=\'234\' some="other"><a href="Test"><BR/><head><title>Test</title></head>\n<body><h1>Parse me!</h1></body></HTML>'
-#print instr
-#print
-#parser.feed(instr)
-#print

 import utils
 if __name__ == "__main__" or utils.enable_doctests():
-    url_rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+    url_rewriter = UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

-    def parse(data, headInsert = None):
-        parser = WBHtml(url_rewriter, headInsert = headInsert)
+    def parse(data, head_insert = None):
+        parser = HTMLRewriter(url_rewriter, head_insert = head_insert)
         print parser.rewrite(data) + parser.close()

     import doctest
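Outside of doctests, the renamed class is driven the same way the parse helper drives it: construct with a UrlRewriter and an optional head_insert, feed HTML through rewrite, then flush with close. Reassembled from the test code in this diff (the banner script URL is a placeholder):

from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriter

url_rewriter = UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
parser = HTMLRewriter(url_rewriter, head_insert = '<script src="banner.js"></script>')

# hrefs are rewritten under the archival prefix and the head insert is
# injected at the first opportunity
print parser.rewrite('<html><head></head><body><a href="page.html">Text</a></body></html>') + parser.close()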

View File

@@ -6,8 +6,6 @@ import wbrequestresponse
 import surt
 from collections import OrderedDict

-from wbarchivalurl import ArchivalUrl
-
 import binsearch
 import cdxserve
 import logging
@@ -22,11 +20,11 @@ class IndexReader:
         params = self.get_query_params(wburl)

         # add any custom filter from the request
-        if wbrequest.queryFilter:
-            params['filter'] = wbrequest.queryFilter
+        if wbrequest.query_filter:
+            params['filter'] = wbrequest.query_filter

-        if wbrequest.customParams:
-            params.update(wbrequest.customParams)
+        if wbrequest.custom_params:
+            params.update(wbrequest.custom_params)

         cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)
@@ -133,9 +131,9 @@ class RemoteCDXServer(IndexReader):
    ('length', '1792')]
     """

-    def __init__(self, serverUrl, cookie = None):
-        self.serverUrl = serverUrl
-        self.authCookie = cookie
+    def __init__(self, server_url, cookie = None):
+        self.server_url = server_url
+        self.auth_cookie = cookie

     def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
         #url is required, must be passed explicitly!
@@ -145,10 +143,10 @@ class RemoteCDXServer(IndexReader):
         urlparams = urllib.urlencode(params, True)

         try:
-            request = urllib2.Request(self.serverUrl, urlparams)
-            if self.authCookie:
-                request.add_header('Cookie', self.authCookie)
+            request = urllib2.Request(self.server_url, urlparams)
+            if self.auth_cookie:
+                request.add_header('Cookie', self.auth_cookie)

             response = urllib2.urlopen(request)
         except urllib2.HTTPError, e:
@@ -168,7 +166,7 @@ class RemoteCDXServer(IndexReader):
     # with lower values if there are too many captures. Ideally, should be around 10-20
     # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
-    def get_query_params(self, wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
+    def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'):
         return {
             wburl.QUERY:

View File

@@ -25,7 +25,7 @@ def pywb_config(head_insert = ''):
     prefixes = [replay_resolvers.PrefixResolver(test_dir)]

     # Create rewriting replay handler to rewrite records
-    replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True)
+    replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, head_insert = head_insert, buffer_response = True)

     # Create Jinja2 based html query view
     html_view = views.J2QueryView('./ui/', 'query.html')

View File

@@ -2,30 +2,30 @@ import re
 import sys
 import itertools

-from url_rewriter import ArchivalUrlRewriter
+from url_rewriter import UrlRewriter

 #=================================================================
 class RegexRewriter:
     """
     # Test https->http converter (other tests below in subclasses)
-    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
+    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
    'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
     """

     @staticmethod
-    def commentOut(string):
+    def comment_out(string):
         return '/*' + string + '*/'

     @staticmethod
-    def removeHttps(string):
+    def remove_https(string):
         return string.replace("https", "http")

     @staticmethod
-    def addPrefix(prefix):
+    def add_prefix(prefix):
         return lambda string: prefix + string

     @staticmethod
-    def archivalRewrite(rewriter):
+    def archival_rewrite(rewriter):
         return lambda x: rewriter.rewrite(x)

     @staticmethod
@@ -34,19 +34,19 @@ class RegexRewriter:
     HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

-    DEFAULT_OP = addPrefix
+    DEFAULT_OP = add_prefix

     def __init__(self, rules):
-        #rules = self.createRules(httpPrefix)
+        #rules = self.create_rules(http_prefix)

         # Build regexstr, concatenating regex list
-        regexStr = '|'.join(['(' + rx + ')' for rx, op, count in rules])
+        regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])

         # ensure it's not middle of a word, wrap in non-capture group
-        regexStr = '(?<!\w)(?:' + regexStr + ')'
+        regex_str = '(?<!\w)(?:' + regex_str + ')'

-        self.regex = re.compile(regexStr, re.M)
+        self.regex = re.compile(regex_str, re.M)
         self.rules = rules

     def filter(self, m):
@@ -63,7 +63,7 @@ class RegexRewriter:
         for _, op, count in self.rules:
             i += 1
-            fullM = i
+            full_m = i
             while count > 0:
                 i += 1
                 count -= 1
@@ -82,8 +82,8 @@ class RegexRewriter:
             result = op(m.group(i))

             # if extracting partial match
-            if i != fullM:
-                result = m.string[m.start(fullM):m.start(i)] + result + m.string[m.end(i):m.end(fullM)]
+            if i != full_m:
+                result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]

             return result

@@ -105,21 +105,21 @@ class JSRewriter(RegexRewriter):
    'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

     # custom rules added
-    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
+    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
     """

     def __init__(self, rewriter, extra = []):
-        rules = self._createRules(rewriter.getAbsUrl())
+        rules = self._create_rules(rewriter.get_abs_url())
         rules.extend(extra)

         RegexRewriter.__init__(self, rules)

-    def _createRules(self, httpPrefix):
+    def _create_rules(self, http_prefix):
         return [
-            (RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
+            (RegexRewriter.HTTPX_MATCH_STR, http_prefix, 0),
             ('location', 'WB_wombat_', 0),
             ('(?<=document\.)domain', 'WB_wombat_', 0),
         ]
@@ -143,7 +143,7 @@ class XMLRewriter(RegexRewriter):
     """

     def __init__(self, rewriter, extra = []):
-        rules = self._createRules(rewriter.getAbsUrl())
+        rules = self._create_rules(rewriter.get_abs_url())

         RegexRewriter.__init__(self, rules)
@@ -155,9 +155,9 @@ class XMLRewriter(RegexRewriter):
         return True

-    def _createRules(self, httpPrefix):
+    def _create_rules(self, http_prefix):
         return [
-            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', httpPrefix, 2),
+            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
         ]

 #=================================================================
@@ -211,20 +211,20 @@ class CSSRewriter(RegexRewriter):
     CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"

     def __init__(self, rewriter):
-        rules = self._createRules(rewriter)
+        rules = self._create_rules(rewriter)

         RegexRewriter.__init__(self, rules)

-    def _createRules(self, rewriter):
+    def _create_rules(self, rewriter):
         return [
-            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
-            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
+            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
+            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
         ]

 import utils
 if __name__ == "__main__" or utils.enable_doctests():
-    arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
+    arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/')

     def test_js(string, extra = []):
         return JSRewriter(arcrw, extra).rewrite(string)
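Each rule is a (regex, operation, group-count) tuple, and the operations are the static helpers renamed above. Reassembled from the JSRewriter doctest, with a custom rule that comments out a function call:

from url_rewriter import UrlRewriter
from regex_rewriters import JSRewriter, RegexRewriter

arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/')
extra = [('some_func\(\).*', RegexRewriter.comment_out, 0)]

print JSRewriter(arcrw, extra).rewrite('window.location = "http://example.com/abc.html"; some_func(); ')
# -> 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'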

View File

@@ -6,10 +6,9 @@ import itertools
 import archiveloader
 from wbrequestresponse import WbResponse, StatusAndHeaders
-from wbarchivalurl import ArchivalUrl
 import utils

-from url_rewriter import ArchivalUrlRewriter
+from url_rewriter import UrlRewriter
 from header_rewriter import HeaderRewriter
 import html_rewriter
 import regex_rewriters
@@ -28,7 +27,7 @@ class ReplayView:
         first = True

         # List of already failed w/arcs
-        failedFiles = []
+        failed_files = []

         # Iterate over the cdx until find one that works
         # The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
@@ -36,10 +35,10 @@ class ReplayView:
             try:
                 # ability to intercept and redirect
                 if first:
-                    self._checkRedir(wbrequest, cdx)
+                    self._check_redir(wbrequest, cdx)
                     first = False

-                response = self.doReplay(cdx, wbrequest, cdx_reader, failedFiles)
+                response = self.do_replay(cdx, wbrequest, cdx_reader, failed_files)

                 if response:
                     response.cdx = cdx
@@ -56,17 +55,17 @@ class ReplayView:
         else:
             raise wbexceptions.UnresolvedArchiveFileException()

-    def _checkRedir(self, wbrequest, cdx):
+    def _check_redir(self, wbrequest, cdx):
         return None

-    def _load(self, cdx, revisit, failedFiles):
+    def _load(self, cdx, revisit, failed_files):
         if revisit:
             (filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
         else:
             (filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])

         #optimization: if same file already failed this request, don't try again
-        if failedFiles and filename in failedFiles:
+        if failed_files and filename in failed_files:
             raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')

         any_found = False
@@ -86,8 +85,8 @@ class ReplayView:
                 pass

         # Unsuccessful if reached here
-        if failedFiles:
-            failedFiles.append(filename)
+        if failed_files:
+            failed_files.append(filename)

         if not any_found:
             raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
@@ -95,45 +94,45 @@ class ReplayView:
             raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')

-    def doReplay(self, cdx, wbrequest, cdx_reader, failedFiles):
-        hasCurr = (cdx['filename'] != '-')
-        hasOrig = (cdx.get('orig.filename','-') != '-')
+    def do_replay(self, cdx, wbrequest, cdx_reader, failed_files):
+        has_curr = (cdx['filename'] != '-')
+        has_orig = (cdx.get('orig.filename','-') != '-')

         # load headers record from cdx['filename'] unless it is '-' (rare)
-        headersRecord = self._load(cdx, False, failedFiles) if hasCurr else None
+        headers_record = self._load(cdx, False, failed_files) if has_curr else None

         # two index lookups
         # Case 1: if mimetype is still warc/revisit
-        if cdx['mimetype'] == 'warc/revisit' and headersRecord:
-            payloadRecord = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headersRecord, failedFiles)
+        if cdx['mimetype'] == 'warc/revisit' and headers_record:
+            payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files)

         # single lookup cases
         # case 2: non-revisit
-        elif (hasCurr and not hasOrig):
-            payloadRecord = headersRecord
+        elif (has_curr and not has_orig):
+            payload_record = headers_record

         # case 3: identical url revisit, load payload from orig.filename
-        elif (hasOrig):
-            payloadRecord = self._load(cdx, True, failedFiles)
+        elif (has_orig):
+            payload_record = self._load(cdx, True, failed_files)

             # special case: set header to payload if old-style revisit with missing header
-            if not headersRecord:
-                headersRecord = payloadRecord
-            elif headersRecord != payloadRecord:
+            if not headers_record:
+                headers_record = payload_record
+            elif headers_record != payload_record:
                 # close remainder of stream as this record only used for (already parsed) headers
-                headersRecord.stream.close()
+                headers_record.stream.close()

                 # special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit
-                if not headersRecord.status_headers.headers:
-                    headersRecord = payloadRecord
+                if not headers_record.status_headers.headers:
+                    headers_record = payload_record

-        if not headersRecord or not payloadRecord:
+        if not headers_record or not payload_record:
             raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))

-        response = WbResponse(headersRecord.status_headers, self.create_stream_gen(payloadRecord.stream))
-        response._stream = payloadRecord.stream
+        response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream))
+        response._stream = payload_record.stream
         return response

@@ -141,14 +140,14 @@ class ReplayView:
     # Handle the case where a duplicate of a capture with same digest exists at a different url
     # Must query the index at that url filtering by matching digest
     # Raise exception if no matches found
-    def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headersRecord, failedFiles):
-        ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
+    def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files):
+        ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI')

         # Check for unresolved revisit error, if refers to target uri not present or same as the current url
-        if not ref_target_uri or (ref_target_uri == headersRecord.rec_headers.getHeader('WARC-Target-URI')):
+        if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')):
             raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))

-        ref_target_date = headersRecord.rec_headers.getHeader('WARC-Refers-To-Date')
+        ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date')

         if not ref_target_date:
             ref_target_date = cdx['timestamp']
@@ -163,7 +162,7 @@ class ReplayView:
         orig_wbreq.wb_url.timestamp = ref_target_date

         # Must also match digest
-        orig_wbreq.queryFilter.append('digest:' + cdx['digest'])
+        orig_wbreq.query_filter.append('digest:' + cdx['digest'])

         orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
@@ -171,8 +170,8 @@ class ReplayView:
             try:
                 #cdx = cdx_reader.CDXCaptureResult(cdx)
                 #print cdx
-                payloadRecord = self._load(cdx, False, failedFiles)
-                return payloadRecord
+                payload_record = self._load(cdx, False, failed_files)
+                return payload_record
             except wbexceptions.CaptureException as e:
                 pass
@@ -180,13 +179,13 @@ class ReplayView:
         raise wbexceptions.CaptureException('Original for revisit could not be loaded')

-    def resolveFull(self, filename):
+    def resolve_full(self, filename):
         # Attempt to resolve cdx file to full path
-        fullUrl = None
+        full_url = None
         for resolver in self.resolvers:
-            fullUrl = resolver(filename)
-            if fullUrl:
-                return fullUrl
+            full_url = resolver(filename)
+            if full_url:
+                return full_url

         raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
@@ -214,36 +213,34 @@ class ReplayView:
 #=================================================================
 class RewritingReplayView(ReplayView):

-    def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None, redir_to_exact = True, buffer_response = False):
+    def __init__(self, resolvers, archiveloader, head_insert = None, header_rewriter = None, redir_to_exact = True, buffer_response = False):
         ReplayView.__init__(self, resolvers, archiveloader)
-        self.headInsert = headInsert
-        if not headerRewriter:
+        self.head_insert = head_insert
+        self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
headerRewriter = HeaderRewriter()
self.headerRewriter = headerRewriter
self.redir_to_exact = redir_to_exact self.redir_to_exact = redir_to_exact
# buffer or stream rewritten response # buffer or stream rewritten response
self.buffer_response = buffer_response self.buffer_response = buffer_response
def _textContentType(self, contentType): def _text_content_type(self, content_type):
for ctype, mimelist in self.REWRITE_TYPES.iteritems(): for ctype, mimelist in self.REWRITE_TYPES.iteritems():
if any ((mime in contentType) for mime in mimelist): if any ((mime in content_type) for mime in mimelist):
return ctype return ctype
return None return None
def __call__(self, wbrequest, index, cdx_reader): def __call__(self, wbrequest, index, cdx_reader):
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix) urlrewriter = UrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
wbrequest.urlrewriter = urlrewriter wbrequest.urlrewriter = urlrewriter
response = ReplayView.__call__(self, wbrequest, index, cdx_reader) response = ReplayView.__call__(self, wbrequest, index, cdx_reader)
if response and response.cdx: if response and response.cdx:
self._checkRedir(wbrequest, response.cdx) self._check_redir(wbrequest, response.cdx)
rewrittenHeaders = self.headerRewriter.rewrite(response.status_headers, urlrewriter) rewritten_headers = self.header_rewriter.rewrite(response.status_headers, urlrewriter)
# TODO: better way to pass this? # TODO: better way to pass this?
stream = response._stream stream = response._stream
@ -253,7 +250,7 @@ class RewritingReplayView(ReplayView):
de_chunk = False de_chunk = False
# handle transfer-encoding: chunked # handle transfer-encoding: chunked
if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')): if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
stream = archiveloader.ChunkedLineReader(stream) stream = archiveloader.ChunkedLineReader(stream)
de_chunk = True de_chunk = True
@ -267,8 +264,8 @@ class RewritingReplayView(ReplayView):
# non-text content type, just send through with rewritten headers # non-text content type, just send through with rewritten headers
# but may need to dechunk # but may need to dechunk
if rewrittenHeaders.textType is None: if rewritten_headers.text_type is None:
response.status_headers = rewrittenHeaders.status_headers response.status_headers = rewritten_headers.status_headers
if de_chunk: if de_chunk:
response.body = self.create_stream_gen(stream) response.body = self.create_stream_gen(stream)
@ -278,15 +275,15 @@ class RewritingReplayView(ReplayView):
# Handle text rewriting # Handle text rewriting
# special case -- need to ungzip the body # special case -- need to ungzip the body
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')): if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor()) stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
# TODO: is this right? # TODO: is this right?
if rewrittenHeaders.charset: if rewritten_headers.charset:
encoding = rewrittenHeaders.charset encoding = rewritten_headers.charset
first_buff = None first_buff = None
else: else:
(encoding, first_buff) = self._detectCharset(stream) (encoding, first_buff) = self._detect_charset(stream)
# if chardet thinks its ascii, use utf-8 # if chardet thinks its ascii, use utf-8
if encoding == 'ascii': if encoding == 'ascii':
@ -294,24 +291,24 @@ class RewritingReplayView(ReplayView):
encoding = 'utf-8' encoding = 'utf-8'
# Buffering response for html, streaming for others? # Buffering response for html, streaming for others?
#if rewrittenHeaders.textType == 'html': #if rewritten_headers.text_type == 'html':
# return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff) # return self._rewrite_html(encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
#else: #else:
# return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff) # return self._rewrite_other(rewritten_headers.text_type, encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
textType = rewrittenHeaders.textType text_type = rewritten_headers.text_type
status_headers = rewrittenHeaders.status_headers status_headers = rewritten_headers.status_headers
if textType == 'html': if text_type == 'html':
rewriter = html_rewriter.WBHtml(urlrewriter, outstream = None, headInsert = self.headInsert) rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = self.head_insert)
elif textType == 'css': elif text_type == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter) rewriter = regex_rewriters.CSSRewriter(urlrewriter)
elif textType == 'js': elif text_type == 'js':
rewriter = regex_rewriters.JSRewriter(urlrewriter) rewriter = regex_rewriters.JSRewriter(urlrewriter)
elif textType == 'xml': elif text_type == 'xml':
rewriter = regex_rewriters.XMLRewriter(urlrewriter) rewriter = regex_rewriters.XMLRewriter(urlrewriter)
else: else:
raise Exception('Unknown Text Type for Rewrite: ' + textType) raise Exception('Unknown Text Type for Rewrite: ' + text_type)
# Create generator for response # Create generator for response
response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff) response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)
@ -333,17 +330,17 @@ class RewritingReplayView(ReplayView):
finally: finally:
content = out.getvalue() content = out.getvalue()
contentLengthStr = str(len(content)) content_length_str = str(len(content))
status_headers.headers.append(('Content-Length', contentLengthStr)) status_headers.headers.append(('Content-Length', content_length_str))
out.close() out.close()
return WbResponse(status_headers, value = [content]) return WbResponse(status_headers, value = [content])
# Create rewrite response from record (no Content-Length), may even be chunked by front-end # Create rewrite response from record (no Content-Length), may even be chunked by front-end
def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None): def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None):
def doRewrite(buff): def do_rewrite(buff):
if encoding: if encoding:
buff = self._decodeBuff(buff, stream, encoding) buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff) buff = rewriter.rewrite(buff)
@ -352,13 +349,13 @@ class RewritingReplayView(ReplayView):
return buff return buff
def doFinish(): def do_finish():
return rewriter.close() return rewriter.close()
return self.create_stream_gen(stream, rewrite_func = doRewrite, final_read_func = doFinish, first_buff = first_buff) return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)
def _decodeBuff(self, buff, stream, encoding): def _decode_buff(self, buff, stream, encoding):
try: try:
buff = buff.decode(encoding) buff = buff.decode(encoding)
except UnicodeDecodeError, e: except UnicodeDecodeError, e:
@ -376,37 +373,37 @@ class RewritingReplayView(ReplayView):
return buff return buff
def _detectCharset(self, stream): def _detect_charset(self, stream):
buff = stream.read(8192) buff = stream.read(8192)
result = chardet.detect(buff) result = chardet.detect(buff)
print "chardet result: " + str(result) print "chardet result: " + str(result)
return (result['encoding'], buff) return (result['encoding'], buff)
def _checkRedir(self, wbrequest, cdx): def _check_redir(self, wbrequest, cdx):
if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original']) new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
raise wbexceptions.InternalRedirect(newUrl) raise wbexceptions.InternalRedirect(new_url)
#return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp']) #return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])
return None return None
def doReplay(self, cdx, wbrequest, index, failedFiles): def do_replay(self, cdx, wbrequest, index, failed_files):
wbresponse = ReplayView.doReplay(self, cdx, wbrequest, index, failedFiles) wbresponse = ReplayView.do_replay(self, cdx, wbrequest, index, failed_files)
# Check for self redirect # Check for self redirect
if wbresponse.status_headers.statusline.startswith('3'): if wbresponse.status_headers.statusline.startswith('3'):
if self.isSelfRedirect(wbrequest, wbresponse.status_headers): if self.is_self_redirect(wbrequest, wbresponse.status_headers):
raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx)) raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))
return wbresponse return wbresponse
def isSelfRedirect(self, wbrequest, status_headers): def is_self_redirect(self, wbrequest, status_headers):
requestUrl = wbrequest.wb_url.url.lower() request_url = wbrequest.wb_url.url.lower()
locationUrl = status_headers.getHeader('Location').lower() location_url = status_headers.get_header('Location').lower()
#return requestUrl == locationUrl #return request_url == location_url
return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl)) return (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url))
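
The do_replay case analysis above is the core of revisit handling, and the three cases are easy to lose in the rename noise. Below is a minimal standalone sketch (illustrative only, not part of this commit; pick_records and the 'lookup-by-digest' marker are hypothetical names) of how the dispatch decides where headers and payload come from:

def pick_records(cdx):
    # Sketch of the three cases in do_replay above.
    # Returns (headers_source, payload_source) for a given CDX row.
    has_curr = cdx['filename'] != '-'
    has_orig = cdx.get('orig.filename', '-') != '-'

    if cdx['mimetype'] == 'warc/revisit' and has_curr:
        # case 1: revisit, possibly of a different url -- the payload must be
        # found via a second index lookup filtered by matching digest
        return (cdx['filename'], 'lookup-by-digest')
    elif has_curr and not has_orig:
        # case 2: ordinary capture -- headers and payload in the same record
        return (cdx['filename'], cdx['filename'])
    elif has_orig:
        # case 3: identical-url revisit -- payload lives in the original record
        return (cdx['filename'], cdx['orig.filename'])
    return (None, None)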
View File
@@ -1,10 +1,10 @@
 import copy
 import urlparse
 
-from wbarchivalurl import ArchivalUrl
+from wburl import WbUrl
 
-class ArchivalUrlRewriter:
+class UrlRewriter:
     """
     >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
     'https://web.archive.org/web/20131010/http://example.com/path/other.html'
@@ -42,13 +42,13 @@ class ArchivalUrlRewriter:
     >>> test_rewrite('mailto:example@example.com', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
     'mailto:example@example.com'
 
-    >>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
+    >>> UrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
     '/abc/19960708im_/'
 
-    >>> ArchivalUrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').getTimestampUrl('20131024')
+    >>> UrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024')
     '/123/20131024id_/http://example.com/file/path/blah.html'
 
-    >>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
+    >>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com')
     True
     """
@@ -57,7 +57,7 @@ class ArchivalUrlRewriter:
     PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']
 
     def __init__(self, wburl, prefix):
-        self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
+        self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
         self.prefix = prefix
         self.archivalurl_class = self.wburl.__class__
@@ -66,12 +66,12 @@ class ArchivalUrlRewriter:
     def rewrite(self, url, mod = None):
         # if special protocol, no rewriting at all
-        if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX):
+        if any (url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
             return url
 
         wburl = self.wburl
 
-        isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS)
+        isAbs = any (url.startswith(x) for x in self.PROTOCOLS)
 
         # Optimized rewriter for
         # -rel urls that don't start with / and don't contain ../ and no special mod
@@ -92,22 +92,22 @@ class ArchivalUrlRewriter:
         return finalUrl
 
-    def getAbsUrl(self, url = ''):
+    def get_abs_url(self, url = ''):
         return self.prefix + self.wburl.to_str(url=url)
 
-    def getTimestampUrl(self, timestamp, url = None):
+    def get_timestamp_url(self, timestamp, url = None):
         if url is None:
             url = self.wburl.url
 
         return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
 
-    def setBaseUrl(self, newUrl):
+    def set_base_url(self, newUrl):
         self.wburl.url = newUrl
 
     @staticmethod
-    def stripProtocol(url):
-        for protocol in ArchivalUrlRewriter.PROTOCOLS:
+    def strip_protocol(url):
+        for protocol in UrlRewriter.PROTOCOLS:
             if url.startswith(protocol):
                 return url[len(protocol):]
@@ -117,7 +117,7 @@ class ArchivalUrlRewriter:
 import utils
 if __name__ == "__main__" or utils.enable_doctests():
     def test_rewrite(rel_url, base_url, prefix, mod = None):
-        rewriter = ArchivalUrlRewriter(base_url, prefix)
+        rewriter = UrlRewriter(base_url, prefix)
         return rewriter.rewrite(rel_url, mod)
 
     import doctest
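
As a quick orientation to the renamed UrlRewriter API, a usage sketch (illustrative only; the expected outputs follow the doctests above, and the module name matches this commit's import):

from url_rewriter import UrlRewriter

rewriter = UrlRewriter('/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')

# relative urls are resolved against the captured page url
print rewriter.rewrite('other.html')
# -> https://web.archive.org/web/20131010/http://example.com/path/other.html

# get_timestamp_url swaps in a new timestamp, keeping prefix and url
print rewriter.get_timestamp_url('20131024')
# -> https://web.archive.org/web/20131024/http://example.com/path/page.html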
View File
@@ -36,19 +36,19 @@ class HMACCookieMaker:
         self.name = name
 
-    def __call__(self, duration, extraId = ''):
+    def __call__(self, duration, extra_id = ''):
         expire = str(long(time.time() + duration))
 
-        if extraId:
-            msg = extraId + '-' + expire
+        if extra_id:
+            msg = extra_id + '-' + expire
         else:
             msg = expire
 
         hmacdigest = hmac.new(self.key, msg)
         hexdigest = hmacdigest.hexdigest()
 
-        if extraId:
-            cookie = '{0}-{1}={2}-{3}'.format(self.name, extraId, expire, hexdigest)
+        if extra_id:
+            cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
         else:
             cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
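
For reference, the cookie produced above has the shape name=expire-hmac, or name-extra_id=expire-hmac when extra_id is set. A minimal sketch of the same construction (illustrative only; key, name, and duration are assumed values, not taken from this commit):

import hmac
import time

key = 'secret-key'   # assumed signing key
name = 'wbsession'   # assumed cookie name
duration = 60        # lifetime in seconds

expire = str(long(time.time() + duration))
# Python 2 hmac.new defaults to MD5, matching the bare hmac.new call above
hexdigest = hmac.new(key, expire).hexdigest()
cookie = '{0}={1}-{2}'.format(name, expire, hexdigest)
print cookie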
View File
@@ -1,4 +1,4 @@
-from wbarchivalurl import ArchivalUrl
+from wburl import WbUrl
 
 import utils
 import pprint
@@ -54,19 +54,19 @@ class WbRequest:
     @staticmethod
-    def makeAbsPrefix(env, rel_prefix):
+    def make_abs_prefix(env, rel_prefix):
         try:
             return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + rel_prefix
         except KeyError:
             return rel_prefix
 
-    def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = ArchivalUrl):
+    def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = WbUrl):
         self.env = env
 
         self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
 
-        self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.makeAbsPrefix(env, wb_prefix)
+        self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)
 
         self.wb_url = archivalurl_class(wb_url)
@@ -76,9 +76,9 @@ class WbRequest:
         self.is_ajax = self._is_ajax()
 
-        self.queryFilter = []
-        self.customParams = {}
+        self.query_filter = []
+        self.custom_params = {}
 
         # PERF
         env['X_PERF'] = {}
@@ -165,16 +165,16 @@ class StatusAndHeaders:
         self.headers = headers
         self.protocol = protocol
 
-    def getHeader(self, name):
-        nameLower = name.lower()
+    def get_header(self, name):
+        name_lower = name.lower()
         for value in self.headers:
-            if (value[0].lower() == nameLower):
+            if (value[0].lower() == name_lower):
                 return value[1]
 
     def remove_header(self, name):
-        nameLower = name.lower()
+        name_lower = name.lower()
         for x in xrange(len(self.headers) - 1, -1, -1):
-            if self.headers[x][0].lower() == nameLower:
+            if self.headers[x][0].lower() == name_lower:
                 del self.headers[x]
                 break
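
The renamed get_header / remove_header pair above does a case-insensitive scan over a list of (name, value) tuples. A standalone sketch of the lookup (illustrative only, lifted out of the method above for clarity):

headers = [('Content-Type', 'text/html'), ('Content-Length', '123')]

def get_header(headers, name):
    # case-insensitive linear scan, as in StatusAndHeaders.get_header
    name_lower = name.lower()
    for value in headers:
        if value[0].lower() == name_lower:
            return value[1]

assert get_header(headers, 'content-type') == 'text/html'
assert get_header(headers, 'X-Missing') is None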
View File
@@ -5,57 +5,57 @@ import rfc3987
 import wbexceptions
 
-# ArchivalUrl : archivalurl representation for WB
+# WbUrl : wb archival url representation for WB
 
-class ArchivalUrl:
+class WbUrl:
     """
     # Replay Urls
     # ======================
-    >>> repr(ArchivalUrl('/20131010000506/example.com'))
+    >>> repr(WbUrl('/20131010000506/example.com'))
     "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
 
-    >>> repr(ArchivalUrl('/20130102im_/https://example.com'))
+    >>> repr(WbUrl('/20130102im_/https://example.com'))
     "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
 
     # Protocol agnostic convert to http
-    >>> repr(ArchivalUrl('/20130102im_///example.com'))
+    >>> repr(WbUrl('/20130102im_///example.com'))
     "('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')"
 
-    >>> repr(ArchivalUrl('/cs_/example.com'))
+    >>> repr(WbUrl('/cs_/example.com'))
     "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
 
-    >>> repr(ArchivalUrl('/https://example.com/xyz'))
+    >>> repr(WbUrl('/https://example.com/xyz'))
     "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
 
-    >>> repr(ArchivalUrl('/https://example.com/xyz?a=%2f&b=%2E'))
+    >>> repr(WbUrl('/https://example.com/xyz?a=%2f&b=%2E'))
     "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', '/https://example.com/xyz?a=%2f&b=%2E')"
 
     # Query Urls
     # ======================
-    >>> repr(ArchivalUrl('/*/http://example.com/abc?def=a'))
+    >>> repr(WbUrl('/*/http://example.com/abc?def=a'))
     "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
 
-    >>> repr(ArchivalUrl('/*/http://example.com/abc?def=a*'))
+    >>> repr(WbUrl('/*/http://example.com/abc?def=a*'))
     "('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')"
 
-    >>> repr(ArchivalUrl('/json/*/http://example.com/abc?def=a'))
+    >>> repr(WbUrl('/json/*/http://example.com/abc?def=a'))
     "('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')"
 
-    >>> repr(ArchivalUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
+    >>> repr(WbUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
     "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')"
 
     # Error Urls
     # ======================
-    >>> x = ArchivalUrl('abc')
+    >>> x = WbUrl('abc')
     Traceback (most recent call last):
     RequestParseException: Invalid WB Request Url: abc
 
-    >>> x = ArchivalUrl('/#$%#/')
+    >>> x = WbUrl('/#$%#/')
     Traceback (most recent call last):
     BadUrlException: Bad Request Url: http://#$%#/
 
-    >>> x = ArchivalUrl('/http://example.com:abc/')
+    >>> x = WbUrl('/http://example.com:abc/')
     Traceback (most recent call last):
     BadUrlException: Bad Request Url: http://example.com:abc/
     """