diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py
index 6c10dff9..869408df 100644
--- a/pywb/archivalrouter.py
+++ b/pywb/archivalrouter.py
@@ -2,14 +2,14 @@
 import urlparse
 import re
 from wbrequestresponse import WbRequest, WbResponse
-from url_rewriter import ArchivalUrlRewriter
-from wbarchivalurl import ArchivalUrl
+from url_rewriter import UrlRewriter
+from wburl import WbUrl

 #=================================================================
 # ArchivalRequestRouter -- route WB requests in archival mode
 #=================================================================
 class ArchivalRequestRouter:
-    def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = ArchivalUrl):
+    def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = WbUrl):
         self.handlers = handlers
         self.fallback = ReferRedirect(hostpaths)
         self.abs_path = abs_path
@@ -46,7 +46,7 @@ class Route:
         self.coll_group = coll_group

-    def __call__(self, env, useAbsPrefix, archivalurl_class):
+    def __call__(self, env, use_abs_prefix, archivalurl_class):
         request_uri = env['REL_REQUEST_URI']
         matcher = self.regex.match(request_uri[1:])
         if not matcher:
@@ -68,19 +68,19 @@ class Route:
                               coll = coll,
                               wb_url = wb_url,
                               wb_prefix = wb_prefix,
-                              use_abs_prefix = useAbsPrefix,
+                              use_abs_prefix = use_abs_prefix,
                               archivalurl_class = archivalurl_class)

         # Allow for setup of additional filters
-        self._addFilters(wbrequest, matcher)
+        self._add_filters(wbrequest, matcher)

-        return self._handleRequest(wbrequest)
+        return self._handle_request(wbrequest)

-    def _addFilters(self, wbrequest, matcher):
+    def _add_filters(self, wbrequest, matcher):
         pass

-    def _handleRequest(self, wbrequest):
+    def _handle_request(self, wbrequest):
         return self.handler(wbrequest)

@@ -90,10 +90,10 @@ class Route:
 class ReferRedirect:
     """
-    >>> ReferRedirect('http://localhost:8080/').matchPrefixs
+    >>> ReferRedirect('http://localhost:8080/').match_prefixs
     ['http://localhost:8080/']

-    >>> ReferRedirect(['http://example:9090/']).matchPrefixs
+    >>> ReferRedirect(['http://example:9090/']).match_prefixs
     ['http://example:9090/']

     >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
@@ -118,18 +118,18 @@ class ReferRedirect:
     """

-    def __init__(self, matchPrefixs):
-        if isinstance(matchPrefixs, list):
-            self.matchPrefixs = matchPrefixs
+    def __init__(self, match_prefixs):
+        if isinstance(match_prefixs, list):
+            self.match_prefixs = match_prefixs
         else:
-            self.matchPrefixs = [matchPrefixs]
+            self.match_prefixs = [match_prefixs]

     def __call__(self, wbrequest):
         if wbrequest.referrer is None:
             return None

-        if not any (wbrequest.referrer.startswith(i) for i in self.matchPrefixs):
+        if not any (wbrequest.referrer.startswith(i) for i in self.match_prefixs):
             return None

         try:
@@ -145,7 +145,7 @@ class ReferRedirect:
         # No match on any exception
         try:
-            rewriter = ArchivalUrlRewriter('/' + ref_path[1], script_name + '/' + ref_path[0])
+            rewriter = UrlRewriter('/' + ref_path[1], script_name + '/' + ref_path[0])
         except Exception:
             return None

@@ -167,16 +167,16 @@ class ReferRedirect:
 import utils
 if __name__ == "__main__" or utils.enable_doctests():

-    def test_redir(matchHost, request_uri, referrer, script_name = ''):
+    def test_redir(match_host, request_uri, referrer, script_name = ''):
         env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}

-        redir = ReferRedirect(matchHost)
+        redir = ReferRedirect(match_host)
         req = WbRequest.from_uri(request_uri, env)
         rep = redir(req)
         if not rep:
             return False

-        return rep.status_headers.getHeader('Location')
+        return rep.status_headers.get_header('Location')

     import doctest
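A minimal usage sketch of the renamed `ReferRedirect` API, modeled directly on the `test_redir` helper above. The env keys and URLs are the doctest's own; the expected `Location` value is inferred, since the doctest's output line is cut off at the hunk boundary:

```python
from archivalrouter import ReferRedirect
from wbrequestresponse import WbRequest

# Resolve a root-relative request against the referrer's archival prefix
redir = ReferRedirect('http://localhost:8080/')

env = {'REL_REQUEST_URI': '/other.html',
       'HTTP_REFERER': 'http://localhost:8080/coll/20131010/http://example.com/path/page.html',
       'SCRIPT_NAME': ''}

rep = redir(WbRequest.from_uri('/other.html', env))
if rep:
    # presumably .../coll/20131010/http://example.com/path/other.html
    print rep.status_headers.get_header('Location')
```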
diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py
index 03ca7111..586c359a 100644
--- a/pywb/archiveloader.py
+++ b/pywb/archiveloader.py
@@ -10,21 +10,21 @@ from wbrequestresponse import StatusAndHeaders

 #=================================================================
 class HttpReader:
-    def __init__(self, hmac = None, hmacDuration = 30):
+    def __init__(self, hmac = None, hmac_duration = 30):
         self.hmac = hmac
-        self.hmacDuration = hmacDuration
+        self.hmac_duration = hmac_duration

     def load(self, url, offset, length):
         if length > 0:
-            rangeHeader = 'bytes={0}-{1}'.format(offset, offset + length - 1)
+            range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
         else:
-            rangeHeader = 'bytes={0}-'.format(offset)
+            range_header = 'bytes={0}-'.format(offset)

         headers = {}
-        headers['Range'] = rangeHeader
+        headers['Range'] = range_header

         if self.hmac:
-            headers['Cookie'] = self.hmac(self.hmacDuration)
+            headers['Cookie'] = self.hmac(self.hmac_duration)

         request = urllib2.Request(url, headers = headers)
         return urllib2.urlopen(request)
@@ -50,7 +50,7 @@ WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers,

 class ArchiveLoader:
     """
-    >>> loadTestArchive('example.warc.gz', '333', '1043')
+    >>> load_test_archive('example.warc.gz', '333', '1043')
     (('warc', 'response'), StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
      ('WARC-Record-ID', ''),
@@ -74,7 +74,7 @@ class ArchiveLoader:
      ('Connection', 'close')]))

-    >>> loadTestArchive('example.warc.gz', '1864', '553')
+    >>> load_test_archive('example.warc.gz', '1864', '553')
     (('warc', 'revisit'), StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
      ('WARC-Record-ID', ''),
@@ -114,7 +114,7 @@ class ArchiveLoader:
     }

     @staticmethod
-    def createDefaultLoaders():
+    def create_default_loaders():
         http = HttpReader()
         file = FileReader()
         return {
@@ -125,35 +125,35 @@ class ArchiveLoader:
         }

-    def __init__(self, loaders = {}, chunkSize = 8192):
-        self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
-        self.chunkSize = chunkSize
+    def __init__(self, loaders = {}, chunk_size = 8192):
+        self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders()
+        self.chunk_size = chunk_size

-        self.arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
-        self.warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
-        self.httpParser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
+        self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
+        self.warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
+        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])

     def load(self, url, offset, length):
-        urlParts = urlparse.urlsplit(url)
+        url_parts = urlparse.urlsplit(url)

         try:
-            loader = self.loaders.get(urlParts.scheme)
+            loader = self.loaders.get(url_parts.scheme)
         except Exception:
             raise wbexceptions.UnknownLoaderProtocolException(url)

-        theFormat = None
+        the_format = None

         for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
             if url.endswith(ext):
-                theFormat = iformat
+                the_format = iformat
                 break

-        if theFormat is None:
+        if the_format is None:
             raise wbexceptions.UnknownArchiveFormatException(url)

-        (aFormat, isGzip) = theFormat
+        (a_format, is_gzip) = the_format

-        decomp = utils.create_decompressor() if isGzip else None
+        decomp = utils.create_decompressor() if is_gzip else None

         try:
             length = int(length)
@@ -163,17 +163,17 @@ class ArchiveLoader:
         raw = loader.load(url, long(offset), length)

-        stream = LineReader(raw, length, self.chunkSize, decomp)
+        stream = LineReader(raw, length, self.chunk_size, decomp)

-        if aFormat == 'arc':
-            rec_headers = self.arcParser.parse(stream)
-            recType = 'response'
-            empty = (rec_headers.getHeader('length') == 0)
+        if a_format == 'arc':
+            rec_headers = self.arc_parser.parse(stream)
+            rec_type = 'response'
+            empty = (rec_headers.get_header('length') == 0)

-        elif aFormat == 'warc':
-            rec_headers = self.warcParser.parse(stream)
-            recType = rec_headers.getHeader('WARC-Type')
-            empty = (rec_headers.getHeader('Content-Length') == '0')
+        elif a_format == 'warc':
+            rec_headers = self.warc_parser.parse(stream)
+            rec_type = rec_headers.get_header('WARC-Type')
+            empty = (rec_headers.get_header('Content-Length') == '0')

         # special case: empty w/arc record (hopefully a revisit)
         if empty:
@@ -181,21 +181,21 @@ class ArchiveLoader:

         # special case: warc records that are not expected to have http headers
         # attempt to add 200 status and content-type
-        elif recType == 'metadata' or recType == 'resource':
-            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.getHeader('Content-Type'))])
+        elif rec_type == 'metadata' or rec_type == 'resource':
+            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.get_header('Content-Type'))])

         # special case: http 0.9 response, no status or headers
-        #elif recType == 'response':
-        #    contentType = rec_headers.getHeader('Content-Type')
-        #    if contentType and (';version=0.9' in contentType):
+        #elif rec_type == 'response':
+        #    content_type = rec_headers.get_header('Content-Type')
+        #    if content_type and (';version=0.9' in content_type):
         #        status_headers = StatusAndHeaders('200 OK', [])

         # response record: parse HTTP status and headers!
         else:
-            #(statusline, http_headers) = self.parseHttpHeaders(stream)
-            status_headers = self.httpParser.parse(stream)
+            #(statusline, http_headers) = self.parse_http_headers(stream)
+            status_headers = self.http_parser.parse(stream)

-        return WBArchiveRecord((aFormat, recType), rec_headers, stream, status_headers)
+        return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers)

@@ -206,9 +206,9 @@ class StatusAndHeadersParser:

     def parse(self, stream):
         statusline = stream.readline().rstrip()
-        protocolStatus = utils.split_prefix(statusline, self.statuslist)
+        protocol_status = utils.split_prefix(statusline, self.statuslist)

-        if not protocolStatus:
+        if not protocol_status:
             raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)

         headers = []
@@ -220,7 +220,7 @@ class StatusAndHeadersParser:
             headers.append(header)
             line = stream.readline().rstrip()

-        return StatusAndHeaders(statusline = protocolStatus[1].strip(), headers = headers, protocol = protocolStatus[0])
+        return StatusAndHeaders(statusline = protocol_status[1].strip(), headers = headers, protocol = protocol_status[0])

 #=================================================================
 class ARCHeadersParser:
@@ -247,25 +247,25 @@ class ARCHeadersParser:

 #=================================================================
 class LineReader:
-    def __init__(self, stream, maxLen = 0, chunkSize = 1024, decomp = None):
+    def __init__(self, stream, max_len = 0, chunk_size = 1024, decomp = None):
         self.stream = stream
-        self.chunkSize = chunkSize
+        self.chunk_size = chunk_size
         self.decomp = decomp
         self.buff = None
-        self.numRead = 0
-        self.maxLen = maxLen
+        self.num_read = 0
+        self.max_len = max_len

-    def _fillbuff(self, chunkSize = None):
-        if not chunkSize:
-            chunkSize = self.chunkSize
+    def _fillbuff(self, chunk_size = None):
+        if not chunk_size:
+            chunk_size = self.chunk_size

         if not self.buff or self.buff.pos >= self.buff.len:
-            toRead = min(self.maxLen - self.numRead, self.chunkSize) if (self.maxLen > 0) else self.chunkSize
-            data = self.stream.read(toRead)
+            to_read = min(self.max_len - self.num_read, self.chunk_size) if (self.max_len > 0) else self.chunk_size
+            data = self.stream.read(to_read)
             self._process_read(data)

     def _process_read(self, data):
-        self.numRead += len(data)
+        self.num_read += len(data)
         if self.decomp and data:
             data = self.decomp.decompress(data)
@@ -310,45 +310,45 @@ class ChunkedLineReader(LineReader):
     '123412'
     """

-    allChunksRead = False
-    notChunked = False
-    raiseChunkedDataExceptions = False # if False, we'll use best-guess fallback for parse errors
+    all_chunks_read = False
+    not_chunked = False
+    raise_chunked_data_exceptions = False # if False, we'll use best-guess fallback for parse errors

-    def _fillbuff(self, chunkSize = None):
-        if self.notChunked:
-            return LineReader._fillbuff(self, chunkSize)
+    def _fillbuff(self, chunk_size = None):
+        if self.not_chunked:
+            return LineReader._fillbuff(self, chunk_size)

-        if self.allChunksRead:
+        if self.all_chunks_read:
             return

         if not self.buff or self.buff.pos >= self.buff.len:
-            lengthHeader = self.stream.readline(64)
+            length_header = self.stream.readline(64)
             data = ''

             try:
                 # decode length header
                 try:
-                    chunkSize = int(lengthHeader.strip().split(';')[0], 16)
+                    chunk_size = int(length_header.strip().split(';')[0], 16)
                 except ValueError:
-                    raise ChunkedDataException("Couldn't decode length header '%s'" % lengthHeader)
+                    raise ChunkedDataException("Couldn't decode length header '%s'" % length_header)

-                if chunkSize:
+                if chunk_size:
                     # read chunk
-                    while len(data) < chunkSize:
-                        newData = self.stream.read(chunkSize - len(data))
+                    while len(data) < chunk_size:
+                        new_data = self.stream.read(chunk_size - len(data))

                         # if we unexpectedly run out of data, either raise an exception or just stop reading, assuming file was cut off
-                        if not newData:
-                            if self.raiseChunkedDataExceptions:
+                        if not new_data:
+                            if self.raise_chunked_data_exceptions:
                                 raise ChunkedDataException("Ran out of data before end of chunk")
                             else:
-                                chunkSize = len(data)
-                                self.allChunksRead = True
+                                chunk_size = len(data)
+                                self.all_chunks_read = True

-                        data += newData
+                        data += new_data

                     # if we successfully read a block without running out, it should end in \r\n
-                    if not self.allChunksRead:
+                    if not self.all_chunks_read:
                         clrf = self.stream.read(2)
                         if clrf != '\r\n':
                             raise ChunkedDataException("Chunk terminator not found.")
@@ -356,19 +356,19 @@ class ChunkedLineReader(LineReader):
                     if self.decomp:
                         data = self.decomp.decompress(data)
                 else:
-                    # chunkSize 0 indicates end of file
-                    self.allChunksRead = True
+                    # chunk_size 0 indicates end of file
+                    self.all_chunks_read = True
                     data = ''

                 self._process_read(data)
             except ChunkedDataException:
-                if self.raiseChunkedDataExceptions:
+                if self.raise_chunked_data_exceptions:
                     raise
                 # Can't parse the data as chunked.
                 # It's possible that non-chunked data is set with a Transfer-Encoding: chunked
                 # Treat this as non-chunk encoded from here on
-                self._process_read(lengthHeader+data)
-                self.notChunked = True
+                self._process_read(length_header + data)
+                self.not_chunked = True

 #=================================================================
@@ -379,7 +379,7 @@ if __name__ == "__main__" or utils.enable_doctests():

     testloader = ArchiveLoader()

-    def loadTestArchive(test_file, offset, length):
+    def load_test_archive(test_file, offset, length):
         path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file

         archive = testloader.load(path, offset, length)
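A sketch of the loader after the rename, following the `load_test_archive` doctest above. The warc path is a placeholder; offset and length would normally come from a CDX index line (the '333'/'1043' pair is the doctest's own):

```python
from archiveloader import ArchiveLoader

loader = ArchiveLoader()  # falls back to create_default_loaders(): file + http readers

record = loader.load('path/to/example.warc.gz', '333', '1043')

print record.type                              # e.g. ('warc', 'response')
print record.rec_headers.get_header('WARC-Type')
print record.status_headers.statusline         # parsed HTTP status for response records
payload = record.stream.read()                 # LineReader positioned at the payload
```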
diff --git a/pywb/header_rewriter.py b/pywb/header_rewriter.py
index 5e4a70e2..1a6b65b0 100644
--- a/pywb/header_rewriter.py
+++ b/pywb/header_rewriter.py
@@ -2,14 +2,14 @@ from wbrequestresponse import StatusAndHeaders

 #=================================================================
 class RewrittenStatusAndHeaders:
-    def __init__(self, statusline, headers, removedHeaderDict, textType, charset):
+    def __init__(self, statusline, headers, removed_header_dict, text_type, charset):
         self.status_headers = StatusAndHeaders(statusline, headers)
-        self.removedHeaderDict = removedHeaderDict
-        self.textType = textType
+        self.removed_header_dict = removed_header_dict
+        self.text_type = text_type
         self.charset = charset

-    def containsRemovedHeader(self, name, value):
-        return self.removedHeaderDict.get(name) == value
+    def contains_removed_header(self, name, value):
+        return self.removed_header_dict.get(name) == value

 #=================================================================
@@ -17,30 +17,30 @@ class HeaderRewriter:
     """
     # Text with charset
     >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
-    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+    {'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
      ('X-Archive-Orig-Content-Length', '5'),
-     ('Content-Type', 'text/html;charset=UTF-8')]), 'charset': 'utf-8', 'textType': 'html', 'removedHeaderDict': {}}
+     ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}

     # Redirect
     >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
-    {'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
-     ('Location', '/web/20131226101010/http://example.com/other.html')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
+    {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
+     ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}

     # gzip
     >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
-    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
-     ('Content-Type', 'text/javascript')]), 'charset': None, 'textType': 'js', 'removedHeaderDict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}}
+    {'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
+     ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}

     # Binary
     >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
-    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
+    {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
      ('Content-Type', 'image/png'),
      ('X-Archive-Orig-Cookie', 'blah'),
-     ('Content-Encoding', 'gzip')]), 'charset': None, 'textType': None, 'removedHeaderDict': {'transfer-encoding': 'chunked'}}
+     ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}

     Removing Transfer-Encoding always, Was:
     ('Content-Encoding', 'gzip'),
-    ('Transfer-Encoding', 'chunked')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
+    ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
     """

@@ -63,64 +63,64 @@
     PROXY_NO_REWRITE_HEADERS = ['content-length']

-    def __init__(self, headerPrefix = 'X-Archive-Orig-'):
-        self.headerPrefix = headerPrefix
+    def __init__(self, header_prefix = 'X-Archive-Orig-'):
+        self.header_prefix = header_prefix

     def rewrite(self, status_headers, urlrewriter):
-        contentType = status_headers.getHeader('Content-Type')
-        textType = None
+        content_type = status_headers.get_header('Content-Type')
+        text_type = None
         charset = None
-        stripEncoding = False
+        strip_encoding = False

-        if contentType:
-            textType = self._extractTextType(contentType)
-            if textType:
-                charset = self._extractCharSet(contentType)
-                stripEncoding = True
+        if content_type:
+            text_type = self._extract_text_type(content_type)
+            if text_type:
+                charset = self._extract_char_set(content_type)
+                strip_encoding = True

-        (newHeaders, removedHeaderDict) = self._rewriteHeaders(status_headers.headers, urlrewriter, stripEncoding)
+        (new_headers, removed_header_dict) = self._rewrite_headers(status_headers.headers, urlrewriter, strip_encoding)

-        return RewrittenStatusAndHeaders(status_headers.statusline, newHeaders, removedHeaderDict, textType, charset)
+        return RewrittenStatusAndHeaders(status_headers.statusline, new_headers, removed_header_dict, text_type, charset)

-    def _extractTextType(self, contentType):
+    def _extract_text_type(self, content_type):
         for ctype, mimelist in self.REWRITE_TYPES.iteritems():
-            if any ((mime in contentType) for mime in mimelist):
+            if any ((mime in content_type) for mime in mimelist):
                 return ctype

         return None

-    def _extractCharSet(self, contentType):
+    def _extract_char_set(self, content_type):
         CHARSET_TOKEN = 'charset='
-        idx = contentType.find(CHARSET_TOKEN)
+        idx = content_type.find(CHARSET_TOKEN)
         if idx < 0:
             return None

-        return contentType[idx + len(CHARSET_TOKEN):].lower()
+        return content_type[idx + len(CHARSET_TOKEN):].lower()

-    def _rewriteHeaders(self, headers, urlrewriter, contentRewritten = False):
-        newHeaders = []
-        removedHeaderDict = {}
+    def _rewrite_headers(self, headers, urlrewriter, content_rewritten = False):
+        new_headers = []
+        removed_header_dict = {}

         for (name, value) in headers:
             lowername = name.lower()
             if lowername in self.PROXY_HEADERS:
-                newHeaders.append((name, value))
+                new_headers.append((name, value))
             elif lowername in self.URL_REWRITE_HEADERS:
-                newHeaders.append((name, urlrewriter.rewrite(value)))
+                new_headers.append((name, urlrewriter.rewrite(value)))
             elif lowername in self.ENCODING_HEADERS:
-                if contentRewritten:
-                    removedHeaderDict[lowername] = value
+                if content_rewritten:
+                    removed_header_dict[lowername] = value
                 else:
-                    newHeaders.append((name, value))
+                    new_headers.append((name, value))
             elif lowername in self.REMOVE_HEADERS:
-                removedHeaderDict[lowername] = value
-            elif lowername in self.PROXY_NO_REWRITE_HEADERS and not contentRewritten:
-                newHeaders.append((name, value))
+                removed_header_dict[lowername] = value
+            elif lowername in self.PROXY_NO_REWRITE_HEADERS and not content_rewritten:
+                new_headers.append((name, value))
             else:
-                newHeaders.append((self.headerPrefix + name, value))
+                new_headers.append((self.header_prefix + name, value))

-        return (newHeaders, removedHeaderDict)
+        return (new_headers, removed_header_dict)

 import utils
 if __name__ == "__main__" or utils.enable_doctests():
@@ -128,7 +128,7 @@
     import pprint
     import url_rewriter

-    urlrewriter = url_rewriter.ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+    urlrewriter = url_rewriter.UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

     headerrewriter = HeaderRewriter()
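The rewrite flow condensed into a runnable sketch; the UrlRewriter prefix and header values are taken from the doctests above:

```python
import url_rewriter
from header_rewriter import HeaderRewriter
from wbrequestresponse import StatusAndHeaders

urlrewriter = url_rewriter.UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

status_headers = StatusAndHeaders('200 OK', [('Content-Type', 'text/html;charset=UTF-8'),
                                             ('Content-Encoding', 'gzip')])

rewritten = HeaderRewriter().rewrite(status_headers, urlrewriter)

print rewritten.text_type                                            # 'html'
print rewritten.charset                                              # 'utf-8'
print rewritten.contains_removed_header('content-encoding', 'gzip')  # True: body will be re-encoded
```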
diff --git a/pywb/html_rewriter.py b/pywb/html_rewriter.py
index 0ee6bf2a..f5228bc5 100644
--- a/pywb/html_rewriter.py
+++ b/pywb/html_rewriter.py
@@ -5,13 +5,13 @@ import sys
 import re

 from HTMLParser import HTMLParser
-from url_rewriter import ArchivalUrlRewriter
+from url_rewriter import UrlRewriter
 from regex_rewriters import JSRewriter, CSSRewriter

 #=================================================================
-# WBHtml --html parser for custom rewriting, also handlers for script and css
+# HTMLRewriter -- html parser for custom rewriting, also handlers for script and css
 #=================================================================
-class WBHtml(HTMLParser):
+class HTMLRewriter(HTMLParser):
     r"""
     >>> parse('Text')
     Text
@@ -72,13 +72,13 @@ class WBHtml(HTMLParser):

     # Head Insertion
-    >>> parse('Test', headInsert = '')
+    >>> parse('Test', head_insert = '')
     Test

-    >>> parse('SomeTest', headInsert = '/* Insert */')
+    >>> parse('SomeTest', head_insert = '/* Insert */')
     /* Insert */
     SomeTest

-    >>> parse('SomeTest', headInsert = '')
+    >>> parse('SomeTest', head_insert = '')
     SomeTest
     """

@@ -125,128 +125,128 @@ class WBHtml(HTMLParser):
             self.buff += string

-    def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
+    def __init__(self, url_rewriter, outstream = None, head_insert = None, js_rewriter_class = JSRewriter, css_rewriter_class = CSSRewriter):
         HTMLParser.__init__(self)

         self.url_rewriter = url_rewriter
-        self._wbParseContext = None
-        self.out = outstream if outstream else WBHtml.AccumBuff()
+        self._wb_parse_context = None
+        self.out = outstream if outstream else self.AccumBuff()

-        self.jsRewriter = jsRewriterClass(url_rewriter)
-        self.cssRewriter = cssRewriterClass(url_rewriter)
+        self.js_rewriter = js_rewriter_class(url_rewriter)
+        self.css_rewriter = css_rewriter_class(url_rewriter)

-        self.headInsert = headInsert
+        self.head_insert = head_insert

     # ===========================
     META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)

-    def _rewriteMetaRefresh(self, metaRefresh):
-        if not metaRefresh:
+    def _rewrite_meta_refresh(self, meta_refresh):
+        if not meta_refresh:
             return None

-        m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
+        m = self.META_REFRESH_REGEX.match(meta_refresh)
         if not m:
-            return metaRefresh
+            return meta_refresh

         try:
-            metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):]
+            meta_refresh = meta_refresh[:m.start(1)] + self._rewrite_url(m.group(1)) + meta_refresh[m.end(1):]
         except Exception:
             pass

-        return metaRefresh
+        return meta_refresh

     # ===========================
-    def _rewriteURL(self, value, mod = None):
+    def _rewrite_url(self, value, mod = None):
         return self.url_rewriter.rewrite(value, mod) if value else None

-    def _rewriteCSS(self, cssContent):
-        return self.cssRewriter.rewrite(cssContent) if cssContent else None
+    def _rewrite_css(self, css_content):
+        return self.css_rewriter.rewrite(css_content) if css_content else None

-    def _rewriteScript(self, scriptContent):
-        return self.jsRewriter.rewrite(scriptContent) if scriptContent else None
+    def _rewrite_script(self, script_content):
+        return self.js_rewriter.rewrite(script_content) if script_content else None

-    def hasAttr(self, tagAttrs, attr):
+    def has_attr(self, tag_attrs, attr):
         name, value = attr
-        for attrName, attrValue in tagAttrs:
-            if attrName == name:
-                return value.lower() == attrValue.lower()
+        for attr_name, attr_value in tag_attrs:
+            if attr_name == name:
+                return value.lower() == attr_value.lower()
         return False

-    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
+    def rewrite_tag_attrs(self, tag, tag_attrs, is_start_end):
         # special case: script or style parse context
-        if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
-            self._wbParseContext = tag
+        if (tag in self.STATE_TAGS) and (self._wb_parse_context == None):
+            self._wb_parse_context = tag

         # special case: head insertion, non-head tags
-        elif (self.headInsert and (self._wbParseContext == None) and (tag not in WBHtml.HEAD_TAGS)):
-            self.out.write(self.headInsert)
-            self.headInsert = None
+        elif (self.head_insert and (self._wb_parse_context == None) and (tag not in self.HEAD_TAGS)):
+            self.out.write(self.head_insert)
+            self.head_insert = None

         # attr rewriting
-        handler = WBHtml.REWRITE_TAGS.get(tag)
+        handler = self.REWRITE_TAGS.get(tag)
         if not handler:
-            handler = WBHtml.REWRITE_TAGS.get('')
+            handler = self.REWRITE_TAGS.get('')

         if not handler:
             return False

         self.out.write('<' + tag)

-        for attr in tagAttrs:
-            attrName, attrValue = attr
+        for attr in tag_attrs:
+            attr_name, attr_value = attr

             # special case: inline JS/event handler
-            if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith('on'):
-                attrValue = self._rewriteScript(attrValue)
+            if (attr_value and attr_value.startswith('javascript:')) or attr_name.startswith('on'):
+                attr_value = self._rewrite_script(attr_value)

             # special case: inline CSS/style attribute
-            elif attrName == 'style':
-                attrValue = self._rewriteCSS(attrValue)
+            elif attr_name == 'style':
+                attr_value = self._rewrite_css(attr_value)

             # special case: meta tag
-            elif (tag == 'meta') and (attrName == 'content'):
-                if self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
-                    attrValue = self._rewriteMetaRefresh(attrValue)
+            elif (tag == 'meta') and (attr_name == 'content'):
+                if self.has_attr(tag_attrs, ('http-equiv', 'refresh')):
+                    attr_value = self._rewrite_meta_refresh(attr_value)

             else:
                 # special case: base tag
-                if (tag == 'base') and (attrName == 'href') and attrValue:
-                    self.url_rewriter.setBaseUrl(attrValue)
+                if (tag == 'base') and (attr_name == 'href') and attr_value:
+                    self.url_rewriter.set_base_url(attr_value)

-                rwMod = handler.get(attrName)
-                if rwMod is not None:
-                    attrValue = self._rewriteURL(attrValue, rwMod)
+                rw_mod = handler.get(attr_name)
+                if rw_mod is not None:
+                    attr_value = self._rewrite_url(attr_value, rw_mod)

             # parser doesn't differentiate between 'attr=""' and just 'attr'
             # 'attr=""' is more common, so use that form
-            if attrValue:
-                self.out.write(' ' + attrName + '="' + attrValue + '"')
+            if attr_value:
+                self.out.write(' ' + attr_name + '="' + attr_value + '"')
             else:
-                self.out.write(' ' + attrName + '=""')
+                self.out.write(' ' + attr_name + '=""')

-        self.out.write('/>' if isStartEnd else '>')
+        self.out.write('/>' if is_start_end else '>')

         # special case: head tag
-        if (self.headInsert) and (self._wbParseContext == None) and (tag == 'head'):
-            self.out.write(self.headInsert)
-            self.headInsert = None
+        if (self.head_insert) and (self._wb_parse_context == None) and (tag == 'head'):
+            self.out.write(self.head_insert)
+            self.head_insert = None

         return True

-    def parseData(self, data):
-        if self._wbParseContext == 'script':
-            data = self._rewriteScript(data)
-        elif self._wbParseContext == 'style':
-            data = self._rewriteCSS(data)
+    def parse_data(self, data):
+        if self._wb_parse_context == 'script':
+            data = self._rewrite_script(data)
+        elif self._wb_parse_context == 'style':
+            data = self._rewrite_css(data)

         self.out.write(data)

     def rewrite(self, string):
         if not self.out:
-            self.out = WBHtml.AccumBuff()
+            self.out = self.AccumBuff()

         self.feed(string)

@@ -258,9 +258,9 @@ class WBHtml(HTMLParser):
     # HTMLParser overrides below
     def close(self):
-        if (self._wbParseContext):
-            result = self.rewrite('</' + self._wbParseContext + '>')
-            self._wbParseContext = None
+        if (self._wb_parse_context):
+            result = self.rewrite('</' + self._wb_parse_context + '>')
+            self._wb_parse_context = None
         else:
             result = ''

@@ -268,21 +268,21 @@ class WBHtml(HTMLParser):
         return result

     def handle_starttag(self, tag, attrs):
-        if not self.rewriteTagAttrs(tag, attrs, False):
+        if not self.rewrite_tag_attrs(tag, attrs, False):
             self.out.write(self.get_starttag_text())

     def handle_startendtag(self, tag, attrs):
-        if not self.rewriteTagAttrs(tag, attrs, True):
+        if not self.rewrite_tag_attrs(tag, attrs, True):
             self.out.write(self.get_starttag_text())

     def handle_endtag(self, tag):
-        if (tag == self._wbParseContext):
-            self._wbParseContext = None
+        if (tag == self._wb_parse_context):
+            self._wb_parse_context = None

         self.out.write('</' + tag + '>')

     def handle_data(self, data):
-        self.parseData(data)
+        self.parse_data(data)

     def handle_entityref(self, data):
         self.out.write('&' + data + ';')
@@ -292,7 +292,7 @@
     def handle_comment(self, data):
         self.out.write('<!--')
-        self.parseData(data)
+        self.parse_data(data)
         self.out.write('-->')

     def handle_decl(self, data):
@@ -303,24 +303,17 @@

     def unknown_decl(self, data):
         self.out.write('<![' + data + ']>')

-# instantiate the parser and fed it some HTML
-#parser = WBHtml()
-#instr = '<...>Test\n<...>Parse me!<...>'
-#print instr
-#print
-#parser.feed(instr)
-#print

 import utils
 if __name__ == "__main__" or utils.enable_doctests():

-    url_rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+    url_rewriter = UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

-    def parse(data, headInsert = None):
-        parser = WBHtml(url_rewriter, headInsert = headInsert)
+    def parse(data, head_insert = None):
+        parser = HTMLRewriter(url_rewriter, head_insert = head_insert)
         print parser.rewrite(data)
+        parser.close()

     import doctest
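A usage sketch of the renamed `HTMLRewriter`, following the `parse` helper above. The `head_insert` value and input markup are illustrative; in pywb the head insert carries the replay banner script:

```python
from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriter

urlrewriter = UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

rewriter = HTMLRewriter(urlrewriter, head_insert = '<script src="banner.js"></script>')

# relative hrefs are rewritten against the /web/ archival prefix,
# and the head insert is emitted right after <head>
print rewriter.rewrite('<html><head></head><body><a href="page.html">link</a></body></html>')
print rewriter.close()   # flushes an unterminated script/style parse context, per the close() fix above
```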
diff --git a/pywb/indexreader.py b/pywb/indexreader.py
index 919c4594..00a1050c 100644
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@@ -6,8 +6,6 @@ import wbrequestresponse
 import surt
 from collections import OrderedDict

-from wbarchivalurl import ArchivalUrl
-
 import binsearch
 import cdxserve
 import logging
@@ -22,11 +20,11 @@ class IndexReader:
         params = self.get_query_params(wburl)

         # add any custom filter from the request
-        if wbrequest.queryFilter:
-            params['filter'] = wbrequest.queryFilter
+        if wbrequest.query_filter:
+            params['filter'] = wbrequest.query_filter

-        if wbrequest.customParams:
-            params.update(wbrequest.customParams)
+        if wbrequest.custom_params:
+            params.update(wbrequest.custom_params)

         cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)

@@ -133,9 +131,9 @@ class RemoteCDXServer(IndexReader):
      ('length', '1792')]
     """

-    def __init__(self, serverUrl, cookie = None):
-        self.serverUrl = serverUrl
-        self.authCookie = cookie
+    def __init__(self, server_url, cookie = None):
+        self.server_url = server_url
+        self.auth_cookie = cookie

     def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
         #url is required, must be passed explicitly!
@@ -145,10 +143,10 @@ class RemoteCDXServer(IndexReader):
         urlparams = urllib.urlencode(params, True)

         try:
-            request = urllib2.Request(self.serverUrl, urlparams)
+            request = urllib2.Request(self.server_url, urlparams)

-            if self.authCookie:
-                request.add_header('Cookie', self.authCookie)
+            if self.auth_cookie:
+                request.add_header('Cookie', self.auth_cookie)

             response = urllib2.urlopen(request)
         except urllib2.HTTPError, e:
@@ -168,7 +166,7 @@ class RemoteCDXServer(IndexReader):
     # with lower values if there are too many captures. Ideally, should be around 10-20
     # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make

-    def get_query_params(self, wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
+    def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'):
         return {
             wburl.QUERY:
diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py
index c7eff2b6..87f485ef 100644
--- a/pywb/pywb_init.py
+++ b/pywb/pywb_init.py
@@ -25,7 +25,7 @@ def pywb_config(head_insert = ''):
     prefixes = [replay_resolvers.PrefixResolver(test_dir)]

     # Create rewriting replay handler to rewrite records
-    replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True)
+    replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, head_insert = head_insert, buffer_response = True)

     # Create Jinja2 based html query view
     html_view = views.J2QueryView('./ui/', 'query.html')
diff --git a/pywb/regex_rewriters.py b/pywb/regex_rewriters.py
index 7ef33926..bf9d0361 100644
--- a/pywb/regex_rewriters.py
+++ b/pywb/regex_rewriters.py
@@ -2,30 +2,30 @@ import re
 import sys
 import itertools

-from url_rewriter import ArchivalUrlRewriter
+from url_rewriter import UrlRewriter

 #=================================================================
 class RegexRewriter:
     """
     # Test https->http converter (other tests below in subclasses)
-    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
+    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
     'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
     """

     @staticmethod
-    def commentOut(string):
+    def comment_out(string):
         return '/*' + string + '*/'

     @staticmethod
-    def removeHttps(string):
+    def remove_https(string):
         return string.replace("https", "http")

     @staticmethod
-    def addPrefix(prefix):
+    def add_prefix(prefix):
         return lambda string: prefix + string

     @staticmethod
-    def archivalRewrite(rewriter):
+    def archival_rewrite(rewriter):
         return lambda x: rewriter.rewrite(x)

     @staticmethod
@@ -34,19 +34,19 @@
     HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

-    DEFAULT_OP = addPrefix
+    DEFAULT_OP = add_prefix

     def __init__(self, rules):
-        #rules = self.createRules(httpPrefix)
+        #rules = self.create_rules(http_prefix)

         # Build regexstr, concatenating regex list
-        regexStr = '|'.join(['(' + rx + ')' for rx, op, count in rules])
+        regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])

         # ensure it's not middle of a word, wrap in non-capture group
-        regexStr = '(?<!\w)(?:' + regexStr + ')'
+        regex_str = '(?<!\w)(?:' + regex_str + ')'
 [...]
             while count > 0:
                 i += 1
                 count -= 1
@@ -82,8 +82,8 @@ class RegexRewriter:
                 result = op(m.group(i))

                 # if extracting partial match
-                if i != fullM:
-                    result = m.string[m.start(fullM):m.start(i)] + result + m.string[m.end(i):m.end(fullM)]
+                if i != full_m:
+                    result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]

                 return result

@@ -105,21 +105,21 @@ class JSRewriter(RegexRewriter):
     'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

     # custom rules added
-    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
+    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
     'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'

     """

     def __init__(self, rewriter, extra = []):
-        rules = self._createRules(rewriter.getAbsUrl())
+        rules = self._create_rules(rewriter.get_abs_url())
         rules.extend(extra)

         RegexRewriter.__init__(self, rules)

-    def _createRules(self, httpPrefix):
+    def _create_rules(self, http_prefix):
         return [
-            (RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
+            (RegexRewriter.HTTPX_MATCH_STR, http_prefix, 0),
             ('location', 'WB_wombat_', 0),
             ('(?<=document\.)domain', 'WB_wombat_', 0),
         ]
@@ -143,7 +143,7 @@ class XMLRewriter(RegexRewriter):
     """

     def __init__(self, rewriter, extra = []):
-        rules = self._createRules(rewriter.getAbsUrl())
+        rules = self._create_rules(rewriter.get_abs_url())

         RegexRewriter.__init__(self, rules)

@@ -155,9 +155,9 @@ class XMLRewriter(RegexRewriter):

         return True

-    def _createRules(self, httpPrefix):
+    def _create_rules(self, http_prefix):
         return [
-            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', httpPrefix, 2),
+            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
         ]

 #=================================================================
@@ -211,20 +211,20 @@ class CSSRewriter(RegexRewriter):
     CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"

     def __init__(self, rewriter):
-        rules = self._createRules(rewriter)
+        rules = self._create_rules(rewriter)

         RegexRewriter.__init__(self, rules)

-    def _createRules(self, rewriter):
+    def _create_rules(self, rewriter):
         return [
-            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
-            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
+            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
+            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
         ]

 import utils
 if __name__ == "__main__" or utils.enable_doctests():

-    arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
+    arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/')

     def test_js(string, extra = []):
         return JSRewriter(arcrw, extra).rewrite(string)
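The regex rewriters after the rename, condensed from the doctests above into a runnable sketch (the archival URL and prefix are the test fixture's own values):

```python
from url_rewriter import UrlRewriter
from regex_rewriters import JSRewriter, RegexRewriter

arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/')

print JSRewriter(arcrw).rewrite('window.location = "http://example.com/abc.html"')
# 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

# extra rules plug into the same rule table; comment_out is used just as in the doctest
extra = [('some_func\(\).*', RegexRewriter.comment_out, 0)]
print JSRewriter(arcrw, extra).rewrite('some_func();')   # '/*some_func(); */'
```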
diff --git a/pywb/replay_views.py b/pywb/replay_views.py
index 52fadf55..eff7f440 100644
--- a/pywb/replay_views.py
+++ b/pywb/replay_views.py
@@ -6,10 +6,9 @@ import itertools
 import archiveloader
 from wbrequestresponse import WbResponse, StatusAndHeaders
-from wbarchivalurl import ArchivalUrl
 import utils

-from url_rewriter import ArchivalUrlRewriter
+from url_rewriter import UrlRewriter
 from header_rewriter import HeaderRewriter
 import html_rewriter
 import regex_rewriters
@@ -28,7 +27,7 @@ class ReplayView:
         first = True

         # List of already failed w/arcs
-        failedFiles = []
+        failed_files = []

         # Iterate over the cdx until find one that works
         # The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
@@ -36,10 +35,10 @@ class ReplayView:
             try:
                 # ability to intercept and redirect
                 if first:
-                    self._checkRedir(wbrequest, cdx)
+                    self._check_redir(wbrequest, cdx)
                     first = False

-                response = self.doReplay(cdx, wbrequest, cdx_reader, failedFiles)
+                response = self.do_replay(cdx, wbrequest, cdx_reader, failed_files)

                 if response:
                     response.cdx = cdx
@@ -56,17 +55,17 @@ class ReplayView:
         else:
             raise wbexceptions.UnresolvedArchiveFileException()

-    def _checkRedir(self, wbrequest, cdx):
+    def _check_redir(self, wbrequest, cdx):
         return None

-    def _load(self, cdx, revisit, failedFiles):
+    def _load(self, cdx, revisit, failed_files):
         if revisit:
             (filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
         else:
             (filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])

         #optimization: if same file already failed this request, don't try again
-        if failedFiles and filename in failedFiles:
+        if failed_files and filename in failed_files:
             raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')

         any_found = False
@@ -86,8 +85,8 @@ class ReplayView:
                 pass

         # Unsuccessful if reached here
-        if failedFiles:
-            failedFiles.append(filename)
+        if failed_files:
+            failed_files.append(filename)

         if not any_found:
             raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
@@ -95,45 +94,45 @@ class ReplayView:
             raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')

-    def doReplay(self, cdx, wbrequest, cdx_reader, failedFiles):
-        hasCurr = (cdx['filename'] != '-')
-        hasOrig = (cdx.get('orig.filename','-') != '-')
+    def do_replay(self, cdx, wbrequest, cdx_reader, failed_files):
+        has_curr = (cdx['filename'] != '-')
+        has_orig = (cdx.get('orig.filename','-') != '-')

         # load headers record from cdx['filename'] unless it is '-' (rare)
-        headersRecord = self._load(cdx, False, failedFiles) if hasCurr else None
+        headers_record = self._load(cdx, False, failed_files) if has_curr else None

         # two index lookups
         # Case 1: if mimetype is still warc/revisit
-        if cdx['mimetype'] == 'warc/revisit' and headersRecord:
-            payloadRecord = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headersRecord, failedFiles)
+        if cdx['mimetype'] == 'warc/revisit' and headers_record:
+            payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files)

         # single lookup cases
         # case 2: non-revisit
-        elif (hasCurr and not hasOrig):
-            payloadRecord = headersRecord
+        elif (has_curr and not has_orig):
+            payload_record = headers_record

         # case 3: identical url revisit, load payload from orig.filename
-        elif (hasOrig):
-            payloadRecord = self._load(cdx, True, failedFiles)
+        elif (has_orig):
+            payload_record = self._load(cdx, True, failed_files)

             # special case: set header to payload if old-style revisit with missing header
-            if not headersRecord:
-                headersRecord = payloadRecord
-            elif headersRecord != payloadRecord:
+            if not headers_record:
+                headers_record = payload_record
+            elif headers_record != payload_record:
                 # close remainder of stream as this record only used for (already parsed) headers
-                headersRecord.stream.close()
+                headers_record.stream.close()

                 # special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit
-                if not headersRecord.status_headers.headers:
-                    headersRecord = payloadRecord
+                if not headers_record.status_headers.headers:
+                    headers_record = payload_record

-        if not headersRecord or not payloadRecord:
+        if not headers_record or not payload_record:
             raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))

-        response = WbResponse(headersRecord.status_headers, self.create_stream_gen(payloadRecord.stream))
-        response._stream = payloadRecord.stream
+        response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream))
+        response._stream = payload_record.stream
         return response

@@ -141,14 +140,14 @@ class ReplayView:
     # Handle the case where a duplicate of a capture with same digest exists at a different url
     # Must query the index at that url filtering by matching digest
     # Raise exception if no matches found
-    def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headersRecord, failedFiles):
-        ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
+    def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files):
+        ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI')

         # Check for unresolved revisit error, if refers to target uri not present or same as the current url
-        if not ref_target_uri or (ref_target_uri == headersRecord.rec_headers.getHeader('WARC-Target-URI')):
+        if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')):
             raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))

-        ref_target_date = headersRecord.rec_headers.getHeader('WARC-Refers-To-Date')
+        ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date')

         if not ref_target_date:
             ref_target_date = cdx['timestamp']
@@ -163,7 +162,7 @@ class ReplayView:
         orig_wbreq.wb_url.timestamp = ref_target_date

         # Must also match digest
-        orig_wbreq.queryFilter.append('digest:' + cdx['digest'])
+        orig_wbreq.query_filter.append('digest:' + cdx['digest'])

         orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)

@@ -171,8 +170,8 @@ class ReplayView:
             try:
                 #cdx = cdx_reader.CDXCaptureResult(cdx)
                 #print cdx
-                payloadRecord = self._load(cdx, False, failedFiles)
-                return payloadRecord
+                payload_record = self._load(cdx, False, failed_files)
+                return payload_record
             except wbexceptions.CaptureException as e:
                 pass

@@ -180,13 +179,13 @@ class ReplayView:
         raise wbexceptions.CaptureException('Original for revisit could not be loaded')

-    def resolveFull(self, filename):
+    def resolve_full(self, filename):
         # Attempt to resolve cdx file to full path
-        fullUrl = None
+        full_url = None
         for resolver in self.resolvers:
-            fullUrl = resolver(filename)
-            if fullUrl:
-                return fullUrl
+            full_url = resolver(filename)
+            if full_url:
+                return full_url

         raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)

@@ -214,36 +213,34 @@ class ReplayView:

 #=================================================================
 class RewritingReplayView(ReplayView):

-    def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None, redir_to_exact = True, buffer_response = False):
+    def __init__(self, resolvers, archiveloader, head_insert = None, header_rewriter = None, redir_to_exact = True, buffer_response = False):
         ReplayView.__init__(self, resolvers, archiveloader)
-        self.headInsert = headInsert
-        if not headerRewriter:
-            headerRewriter = HeaderRewriter()
-        self.headerRewriter = headerRewriter
+        self.head_insert = head_insert
+        self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
         self.redir_to_exact = redir_to_exact

         # buffer or stream rewritten response
         self.buffer_response = buffer_response

-    def _textContentType(self, contentType):
+    def _text_content_type(self, content_type):
         for ctype, mimelist in self.REWRITE_TYPES.iteritems():
-            if any ((mime in contentType) for mime in mimelist):
+            if any ((mime in content_type) for mime in mimelist):
                 return ctype

         return None

     def __call__(self, wbrequest, index, cdx_reader):
-        urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
+        urlrewriter = UrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
         wbrequest.urlrewriter = urlrewriter

         response = ReplayView.__call__(self, wbrequest, index, cdx_reader)

         if response and response.cdx:
-            self._checkRedir(wbrequest, response.cdx)
+            self._check_redir(wbrequest, response.cdx)

-        rewrittenHeaders = self.headerRewriter.rewrite(response.status_headers, urlrewriter)
+        rewritten_headers = self.header_rewriter.rewrite(response.status_headers, urlrewriter)

         # TODO: better way to pass this?
         stream = response._stream
@@ -253,7 +250,7 @@ class RewritingReplayView(ReplayView):
         de_chunk = False

         # handle transfer-encoding: chunked
-        if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
+        if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
             stream = archiveloader.ChunkedLineReader(stream)
             de_chunk = True

@@ -267,8 +264,8 @@ class RewritingReplayView(ReplayView):

         # non-text content type, just send through with rewritten headers
         # but may need to dechunk
-        if rewrittenHeaders.textType is None:
-            response.status_headers = rewrittenHeaders.status_headers
+        if rewritten_headers.text_type is None:
+            response.status_headers = rewritten_headers.status_headers

             if de_chunk:
                 response.body = self.create_stream_gen(stream)
@@ -278,15 +275,15 @@ class RewritingReplayView(ReplayView):

         # Handle text rewriting
         # special case -- need to ungzip the body
-        if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
+        if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
             stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())

         # TODO: is this right?
-        if rewrittenHeaders.charset:
-            encoding = rewrittenHeaders.charset
+        if rewritten_headers.charset:
+            encoding = rewritten_headers.charset
             first_buff = None
         else:
-            (encoding, first_buff) = self._detectCharset(stream)
+            (encoding, first_buff) = self._detect_charset(stream)

             # if chardet thinks its ascii, use utf-8
             if encoding == 'ascii':
@@ -294,24 +291,24 @@ class RewritingReplayView(ReplayView):
                 encoding = 'utf-8'

         # Buffering response for html, streaming for others?
-        #if rewrittenHeaders.textType == 'html':
-        #    return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
+        #if rewritten_headers.text_type == 'html':
+        #    return self._rewrite_html(encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
         #else:
-        #    return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
+        #    return self._rewrite_other(rewritten_headers.text_type, encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)

-        textType = rewrittenHeaders.textType
-        status_headers = rewrittenHeaders.status_headers
+        text_type = rewritten_headers.text_type
+        status_headers = rewritten_headers.status_headers

-        if textType == 'html':
-            rewriter = html_rewriter.WBHtml(urlrewriter, outstream = None, headInsert = self.headInsert)
-        elif textType == 'css':
+        if text_type == 'html':
+            rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = self.head_insert)
+        elif text_type == 'css':
             rewriter = regex_rewriters.CSSRewriter(urlrewriter)
-        elif textType == 'js':
+        elif text_type == 'js':
             rewriter = regex_rewriters.JSRewriter(urlrewriter)
-        elif textType == 'xml':
+        elif text_type == 'xml':
             rewriter = regex_rewriters.XMLRewriter(urlrewriter)
         else:
-            raise Exception('Unknown Text Type for Rewrite: ' + textType)
+            raise Exception('Unknown Text Type for Rewrite: ' + text_type)

         # Create generator for response
         response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)
@@ -333,17 +330,17 @@ class RewritingReplayView(ReplayView):
         finally:
             content = out.getvalue()

-            contentLengthStr = str(len(content))
-            status_headers.headers.append(('Content-Length', contentLengthStr))
+            content_length_str = str(len(content))
+            status_headers.headers.append(('Content-Length', content_length_str))
             out.close()

         return WbResponse(status_headers, value = [content])

     # Create rewrite response from record (no Content-Length), may even be chunked by front-end
     def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None):
-        def doRewrite(buff):
+        def do_rewrite(buff):
             if encoding:
-                buff = self._decodeBuff(buff, stream, encoding)
+                buff = self._decode_buff(buff, stream, encoding)

             buff = rewriter.rewrite(buff)

@@ -352,13 +349,13 @@ class RewritingReplayView(ReplayView):

             return buff

-        def doFinish():
+        def do_finish():
             return rewriter.close()

-        return self.create_stream_gen(stream, rewrite_func = doRewrite, final_read_func = doFinish, first_buff = first_buff)
+        return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)

-    def _decodeBuff(self, buff, stream, encoding):
+    def _decode_buff(self, buff, stream, encoding):
         try:
             buff = buff.decode(encoding)
         except UnicodeDecodeError, e:
@@ -376,37 +373,37 @@ class RewritingReplayView(ReplayView):

         return buff

-    def _detectCharset(self, stream):
+    def _detect_charset(self, stream):
         buff = stream.read(8192)
         result = chardet.detect(buff)
         print "chardet result: " + str(result)
         return (result['encoding'], buff)

-    def _checkRedir(self, wbrequest, cdx):
+    def _check_redir(self, wbrequest, cdx):
         if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
-            newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original'])
-            raise wbexceptions.InternalRedirect(newUrl)
+            new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
+            raise wbexceptions.InternalRedirect(new_url)
             #return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])

         return None

-    def doReplay(self, cdx, wbrequest, index, failedFiles):
-        wbresponse = ReplayView.doReplay(self, cdx, wbrequest, index, failedFiles)
+    def do_replay(self, cdx, wbrequest, index, failed_files):
+        wbresponse = ReplayView.do_replay(self, cdx, wbrequest, index, failed_files)

         # Check for self redirect
         if wbresponse.status_headers.statusline.startswith('3'):
-            if self.isSelfRedirect(wbrequest, wbresponse.status_headers):
+            if self.is_self_redirect(wbrequest, wbresponse.status_headers):
                 raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

         return wbresponse

-    def isSelfRedirect(self, wbrequest, status_headers):
-        requestUrl = wbrequest.wb_url.url.lower()
-        locationUrl = status_headers.getHeader('Location').lower()
-        #return requestUrl == locationUrl
-        return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
+    def is_self_redirect(self, wbrequest, status_headers):
+        request_url = wbrequest.wb_url.url.lower()
+        location_url = status_headers.get_header('Location').lower()
+        #return request_url == location_url
+        return (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url))
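How the self-redirect guard above compares URLs, as a small sketch: `strip_protocol` drops the scheme before comparing, so an http-to-https redirect to the same page still counts as a self redirect (this mirrors the `strip_protocol` doctest in url_rewriter.py below):

```python
from url_rewriter import UrlRewriter

request_url = 'http://example.com/path/'
location_url = 'https://example.com/path/'

print UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)  # True
```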
diff --git a/pywb/url_rewriter.py b/pywb/url_rewriter.py
index cec3abf2..18f016eb 100644
--- a/pywb/url_rewriter.py
+++ b/pywb/url_rewriter.py
@@ -1,10 +1,10 @@
 import copy
 import urlparse

-from wbarchivalurl import ArchivalUrl
+from wburl import WbUrl

-class ArchivalUrlRewriter:
+class UrlRewriter:
     """
     >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
     'https://web.archive.org/web/20131010/http://example.com/path/other.html'
@@ -42,13 +42,13 @@ class ArchivalUrlRewriter:
     >>> test_rewrite('mailto:example@example.com', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
     'mailto:example@example.com'

-    >>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
+    >>> UrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
     '/abc/19960708im_/'

-    >>> ArchivalUrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').getTimestampUrl('20131024')
+    >>> UrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024')
     '/123/20131024id_/http://example.com/file/path/blah.html'

-    >>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
+    >>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com')
     True
     """

@@ -57,7 +57,7 @@ class ArchivalUrlRewriter:
     PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']

     def __init__(self, wburl, prefix):
-        self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
+        self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
         self.prefix = prefix
         self.archivalurl_class = self.wburl.__class__

@@ -66,12 +66,12 @@
     def rewrite(self, url, mod = None):
         # if special protocol, no rewriting at all
-        if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX):
+        if any (url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
             return url

         wburl = self.wburl

-        isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS)
+        isAbs = any (url.startswith(x) for x in self.PROTOCOLS)

         # Optimized rewriter for
         # -rel urls that don't start with / and don't contain ../ and no special mod
@@ -92,22 +92,22 @@
         return finalUrl

-    def getAbsUrl(self, url = ''):
+    def get_abs_url(self, url = ''):
         return self.prefix + self.wburl.to_str(url=url)

-    def getTimestampUrl(self, timestamp, url = None):
+    def get_timestamp_url(self, timestamp, url = None):
         if url is None:
             url = self.wburl.url

         return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)

-    def setBaseUrl(self, newUrl):
+    def set_base_url(self, newUrl):
         self.wburl.url = newUrl

     @staticmethod
-    def stripProtocol(url):
-        for protocol in ArchivalUrlRewriter.PROTOCOLS:
+    def strip_protocol(url):
+        for protocol in UrlRewriter.PROTOCOLS:
             if url.startswith(protocol):
                 return url[len(protocol):]

@@ -117,7 +117,7 @@
 import utils
 if __name__ == "__main__" or utils.enable_doctests():
     def test_rewrite(rel_url, base_url, prefix, mod = None):
-        rewriter = ArchivalUrlRewriter(base_url, prefix)
+        rewriter = UrlRewriter(base_url, prefix)
         return rewriter.rewrite(rel_url, mod)

     import doctest
diff --git a/pywb/utils.py b/pywb/utils.py
index ea5f5179..02375376 100644
--- a/pywb/utils.py
+++ b/pywb/utils.py
@@ -36,19 +36,19 @@ class HMACCookieMaker:
         self.name = name

-    def __call__(self, duration, extraId = ''):
+    def __call__(self, duration, extra_id = ''):
         expire = str(long(time.time() + duration))

-        if extraId:
-            msg = extraId + '-' + expire
+        if extra_id:
+            msg = extra_id + '-' + expire
         else:
             msg = expire

         hmacdigest = hmac.new(self.key, msg)
         hexdigest = hmacdigest.hexdigest()

-        if extraId:
-            cookie = '{0}-{1}={2}-{3}'.format(self.name, extraId, expire, hexdigest)
+        if extra_id:
+            cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
         else:
             cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py
index 59a5a5b4..e19ae361 100644
--- a/pywb/wbrequestresponse.py
+++ b/pywb/wbrequestresponse.py
@@ -1,4 +1,4 @@
-from wbarchivalurl import ArchivalUrl
+from wburl import WbUrl

 import utils
 import pprint
@@ -54,19 +54,19 @@ class WbRequest:

     @staticmethod
-    def makeAbsPrefix(env, rel_prefix):
+    def make_abs_prefix(env, rel_prefix):
         try:
             return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + rel_prefix
         except KeyError:
             return rel_prefix

-    def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = ArchivalUrl):
+    def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = WbUrl):
         self.env = env

         self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')

-        self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.makeAbsPrefix(env, wb_prefix)
+        self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)

         self.wb_url = archivalurl_class(wb_url)

@@ -76,9 +76,9 @@ class WbRequest:

         self.is_ajax = self._is_ajax()

-        self.queryFilter = []
+        self.query_filter = []

-        self.customParams = {}
+        self.custom_params = {}

         # PERF
         env['X_PERF'] = {}
@@ -165,16 +165,16 @@ class StatusAndHeaders:
         self.headers = headers
         self.protocol = protocol

-    def getHeader(self, name):
-        nameLower = name.lower()
+    def get_header(self, name):
+        name_lower = name.lower()
         for value in self.headers:
-            if (value[0].lower() == nameLower):
+            if (value[0].lower() == name_lower):
                 return value[1]

     def remove_header(self, name):
-        nameLower = name.lower()
+        name_lower = name.lower()
         for x in xrange(len(self.headers) - 1, -1, -1):
-            if self.headers[x][0].lower() == nameLower:
+            if self.headers[x][0].lower() == name_lower:
                 del self.headers[x]
                 break
diff --git a/pywb/wbarchivalurl.py b/pywb/wburl.py
similarity index 86%
rename from pywb/wbarchivalurl.py
rename to pywb/wburl.py
index 1cba4182..431dc786 100644
--- a/pywb/wbarchivalurl.py
+++ b/pywb/wburl.py
@@ -5,57 +5,57 @@
 import rfc3987

 import wbexceptions

-# ArchivalUrl : archivalurl representation for WB
+# WbUrl : wb archival url representation for WB

-class ArchivalUrl:
+class WbUrl:
     """
     # Replay Urls
     # ======================
-    >>> repr(ArchivalUrl('/20131010000506/example.com'))
+    >>> repr(WbUrl('/20131010000506/example.com'))
     "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"

-    >>> repr(ArchivalUrl('/20130102im_/https://example.com'))
+    >>> repr(WbUrl('/20130102im_/https://example.com'))
     "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"

     # Protocol agnostic convert to http
-    >>> repr(ArchivalUrl('/20130102im_///example.com'))
+    >>> repr(WbUrl('/20130102im_///example.com'))
     "('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')"

-    >>> repr(ArchivalUrl('/cs_/example.com'))
+    >>> repr(WbUrl('/cs_/example.com'))
     "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"

-    >>> repr(ArchivalUrl('/https://example.com/xyz'))
+    >>> repr(WbUrl('/https://example.com/xyz'))
     "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"

-    >>> repr(ArchivalUrl('/https://example.com/xyz?a=%2f&b=%2E'))
+    >>> repr(WbUrl('/https://example.com/xyz?a=%2f&b=%2E'))
     "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', '/https://example.com/xyz?a=%2f&b=%2E')"

     # Query Urls
     # ======================
-    >>> repr(ArchivalUrl('/*/http://example.com/abc?def=a'))
+    >>> repr(WbUrl('/*/http://example.com/abc?def=a'))
     "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"

-    >>> repr(ArchivalUrl('/*/http://example.com/abc?def=a*'))
+    >>> repr(WbUrl('/*/http://example.com/abc?def=a*'))
     "('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')"

-    >>> repr(ArchivalUrl('/json/*/http://example.com/abc?def=a'))
+    >>> repr(WbUrl('/json/*/http://example.com/abc?def=a'))
     "('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')"

-    >>> repr(ArchivalUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
+    >>> repr(WbUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
     "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')"

     # Error Urls
     # ======================
-    >>> x = ArchivalUrl('abc')
+    >>> x = WbUrl('abc')
     Traceback (most recent call last):
     RequestParseException: Invalid WB Request Url: abc

-    >>> x = ArchivalUrl('/#$%#/')
+    >>> x = WbUrl('/#$%#/')
     Traceback (most recent call last):
     BadUrlException: Bad Request Url: http://#$%#/

-    >>> x = ArchivalUrl('/http://example.com:abc/')
+    >>> x = WbUrl('/http://example.com:abc/')
     Traceback (most recent call last):
     BadUrlException: Bad Request Url: http://example.com:abc/
     """
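Finally, the renamed `WbUrl` in use — a sketch built from the doctests above (`timestamp` and `url` are the fields that `WbRequest` and `ReplayView` read directly):

```python
from wburl import WbUrl

url = WbUrl('/20131010000506/example.com')
print repr(url)
# "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
print url.timestamp, url.url

print repr(WbUrl('/*/http://example.com/abc?def=a'))   # the 'query' form
```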