diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index da122a9e..9af5dd32 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -1,7 +1,7 @@ import urlparse from wbrequestresponse import WbRequest, WbResponse -from wburlrewriter import ArchivalUrlRewriter +from url_rewriter import ArchivalUrlRewriter #================================================================= # ArchivalRequestRouter -- route WB requests in archival mode @@ -122,7 +122,7 @@ if __name__ == "__main__": if not rep: return False - return rep.get_header('Location') + return rep.status_headers.getHeader('Location') doctest.testmod() diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py index 365492fd..cb7456fa 100644 --- a/pywb/archiveloader.py +++ b/pywb/archiveloader.py @@ -1,16 +1,15 @@ -import hanzo.warctools - -import re +import itertools import utils -import zlib import urllib2 import StringIO import urlparse import collections import wbexceptions +from wbrequestresponse import StatusAndHeaders + #================================================================= -class HttpStreamLoader: +class HttpReader: def __init__(self, hmac = None, hmacDuration = 30): self.hmac = hmac self.hmacDuration = hmacDuration @@ -33,7 +32,7 @@ class HttpStreamLoader: #================================================================= # Untested, but for completeness -class FileStreamLoader: +class FileReader: def load(self, url, offset, length): if url.startswith('file://'): url = url[len('file://'):] @@ -45,27 +44,79 @@ class FileStreamLoader: #================================================================= -WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, record, stream, statusline, httpHeaders') +WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers') #================================================================= + class ArchiveLoader: + """ + >>> loadTestArchive('example.warc.gz', '333', '1043') + (('warc', 
'response'), + StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'), + ('WARC-Record-ID', ''), + ('WARC-Date', '2014-01-03T03:03:21Z'), + ('Content-Length', '1610'), + ('Content-Type', 'application/http; msgtype=response'), + ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'), + ('WARC-Target-URI', 'http://example.com?example=1'), + ('WARC-Warcinfo-ID', '')]), + StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Cache-Control', 'max-age=604800'), + ('Content-Type', 'text/html'), + ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), + ('Etag', '"359670651"'), + ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'), + ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('x-ec-custom-error', '1'), + ('Content-Length', '1270'), + ('Connection', 'close')])) + + + >>> loadTestArchive('example.warc.gz', '1864', '553') + (('warc', 'revisit'), + StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'), + ('WARC-Record-ID', ''), + ('WARC-Date', '2014-01-03T03:03:41Z'), + ('Content-Length', '340'), + ('Content-Type', 'application/http; msgtype=response'), + ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'), + ('WARC-Target-URI', 'http://example.com?example=1'), + ('WARC-Warcinfo-ID', ''), + ( 'WARC-Profile', + 'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'), + ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'), + ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]), + StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Cache-Control', 'max-age=604800'), + ('Content-Type', 'text/html'), + ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'), + ('Etag', '"359670651"'), + ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'), + ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + 
('X-Cache', 'HIT'), + ('x-ec-custom-error', '1'), + ('Content-Length', '1270'), + ('Connection', 'close')])) + """ + # Standard ARC headers ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"] # Since loading a range request, can only determine gzip-ness based on file extension FORMAT_MAP = { - '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True), - '.arc.gz': (hanzo.warctools.ArcRecord, 'arc', True), - '.warc': (hanzo.warctools.WarcRecord, 'warc', False), - '.arc': (hanzo.warctools.ArcRecord, 'arc', False), + '.warc.gz': ('warc', True), + '.arc.gz': ('arc', True), + '.warc': ('warc', False), + '.arc': ('arc', False), } - HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ (\d+.*)$') - @staticmethod def createDefaultLoaders(): - http = HttpStreamLoader() - file = FileStreamLoader() + http = HttpReader() + file = FileReader() return { 'http': http, 'https': http, @@ -78,6 +129,10 @@ class ArchiveLoader: self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders() self.chunkSize = chunkSize + self.arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS) + self.warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18']) + self.httpParser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1']) + def load(self, url, offset, length): urlParts = urlparse.urlsplit(url) @@ -86,22 +141,19 @@ class ArchiveLoader: except Exception: raise wbexceptions.UnknownLoaderProtocolException(url) - loaderCls = None + theFormat = None - for ext, (loaderCls, aFormat, gzip) in ArchiveLoader.FORMAT_MAP.iteritems(): + for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems(): if url.endswith(ext): - loaderCls = loaderCls - aFormat = aFormat - isGzip = gzip + theFormat = iformat break - if loaderCls is None: + if theFormat is None: raise wbexceptions.UnknownArchiveFormatException(url) - if isGzip: - decomp = zlib.decompressobj(16+zlib.MAX_WBITS) - else: - decomp = None + (aFormat, isGzip) = theFormat + + decomp = utils.create_decompressor() if 
isGzip else None try: length = int(length) @@ -111,73 +163,87 @@ class ArchiveLoader: raw = loader.load(url, long(offset), length) - reader = LineReader(raw, length, self.chunkSize, decomp) - - parser = loaderCls.make_parser() - - if aFormat == 'arc': - parser.headers = ArchiveLoader.ARC_HEADERS - - (parsed, errors, _) = parser.parse(reader, 0) - - if errors: - reader.close() - raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors) - + stream = LineReader(raw, length, self.chunkSize, decomp) if aFormat == 'arc': + rec_headers = self.arcParser.parse(stream) recType = 'response' - empty = (utils.get_header(parsed.headers, 'length') == 0) - else: - recType = utils.get_header(parsed.headers, 'WARC-Type') - empty = (utils.get_header(parsed.headers, 'Content-Length') == '0') + empty = (rec_headers.getHeader('length') == 0) + + elif aFormat == 'warc': + rec_headers = self.warcParser.parse(stream) + recType = rec_headers.getHeader('WARC-Type') + empty = (rec_headers.getHeader('Content-Length') == '0') # special case: empty w/arc record (hopefully a revisit) if empty: - statusline = '204 No Content' - headers = [] + status_headers = StatusAndHeaders('204 No Content', []) # special case: warc records that are not expected to have http headers # attempt to add 200 status and content-type elif recType == 'metadata' or recType == 'resource': - statusline = '200 OK' - headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))] + status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.getHeader('Content-Type'))]) # special case: http 0.9 response, no status or headers - #elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')): - # statusline = '200 OK' - # headers = [] + #elif recType == 'response': + # contentType = rec_headers.getHeader('Content-Type') + # if contentType and (';version=0.9' in contentType): + # status_headers = StatusAndHeaders('200 OK', []) # response 
record: parse HTTP status and headers! else: - (statusline, headers) = self.parseHttpHeaders(reader) + #(statusline, http_headers) = self.parseHttpHeaders(stream) + status_headers = self.httpParser.parse(stream) - return WBArchiveRecord((aFormat, recType), parsed, reader, statusline, headers) + return WBArchiveRecord((aFormat, recType), rec_headers, stream, status_headers) - def parseHttpHeaders(self, stream): - def nextHeaderLine(stream): - return stream.readline().rstrip() +#================================================================= +class StatusAndHeadersParser: + def __init__(self, statuslist): + self.statuslist = statuslist - line = nextHeaderLine(stream) - matched = self.HTTP_STATUS_REGEX.match(line) + def parse(self, stream): + statusline = stream.readline().rstrip() - if not matched: - raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line) + protocolStatus = utils.split_prefix(statusline, self.statuslist) + + if not protocolStatus: + raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline) - #status = int(matched.group(2)) - statusline = matched.group(1) headers = [] - line = nextHeaderLine(stream) - + line = stream.readline().rstrip() while line and line != '\r\n': name, value = line.split(':', 1) - value = value.strip() - headers.append((name, value)) - line = nextHeaderLine(stream) + header = (name, value.strip()) + headers.append(header) + line = stream.readline().rstrip() - return (statusline, headers) + return StatusAndHeaders(statusline = protocolStatus[1].strip(), headers = headers, protocol = protocolStatus[0]) + +#================================================================= +class ARCHeadersParser: + def __init__(self, headernames): + self.headernames = headernames + + + def parse(self, stream): + headerline = stream.readline().rstrip() + + parts = headerline.split() + + headernames = self.headernames + + if len(parts) != len(headernames): + raise 
wbexceptions.InvalidArchiveRecordException('Wrong # of heaeders, expected arc headers {0}, Found {1}'.format(headernames, parts)) + + headers = [] + + for name, value in itertools.izip(headernames, parts): + headers.append((name, value)) + + return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0') #================================================================= class LineReader: @@ -217,4 +283,19 @@ class LineReader: self.stream = None +#================================================================= +if __name__ == "__main__": + import doctest + import os + import pprint + + testloader = ArchiveLoader() + + def loadTestArchive(test_file, offset, length): + path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file + + archive = testloader.load(path, offset, length) + pprint.pprint((archive.type, archive.rec_headers, archive.status_headers)) + + doctest.testmod() diff --git a/pywb/header_rewriter.py b/pywb/header_rewriter.py new file mode 100644 index 00000000..3b539082 --- /dev/null +++ b/pywb/header_rewriter.py @@ -0,0 +1,133 @@ +from wbrequestresponse import StatusAndHeaders + +#================================================================= +class RewrittenStatusAndHeaders: + def __init__(self, statusline, headers, removedHeaderDict, textType, charset): + self.status_headers = StatusAndHeaders(statusline, headers) + self.removedHeaderDict = removedHeaderDict + self.textType = textType + self.charset = charset + + def containsRemovedHeader(self, name, value): + return self.removedHeaderDict.get(name) == value + + +#================================================================= +class HeaderRewriter: + """ + # Text with charset + >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=utf-8')]) + {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), + 
('X-Archive-Orig-Content-Length', '5'), + ('Content-Type', 'text/html;charset=utf-8')]), 'charset': 'utf-8', 'textType': 'html', 'removedHeaderDict': {}} + + # Redirect + >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect') + {'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), + ('Location', '/web/20131226101010/http://example.com/other.html')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}} + + # gzip + >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) + {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), + ('Content-Type', 'text/javascript')]), 'charset': None, 'textType': 'js', 'removedHeaderDict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}} + + # Binary + >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) + {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), + ('Content-Type', 'image/png'), + ('X-Archive-Orig-Cookie', 'blah'), + ('Content-Encoding', 'gzip'), + ('Transfer-Encoding', 'chunked')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}} + + """ + + + REWRITE_TYPES = { + 'html': ['text/html', 'application/xhtml'], + 'css': ['text/css'], + 'js': ['text/javascript', 'application/javascript', 'application/x-javascript'], + 'xml': ['/xml', '+xml', '.xml', '.rss'], + } + + + PROXY_HEADERS = ('content-type', 'content-disposition') + + URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base') + + ENCODING_HEADERS = ('content-encoding', 'transfer-encoding') + + PROXY_NO_REWRITE_HEADERS = ('content-length') + + def __init__(self, headerPrefix = 
'X-Archive-Orig-'): + self.headerPrefix = headerPrefix + + def rewrite(self, status_headers, urlrewriter): + contentType = status_headers.getHeader('Content-Type') + textType = None + charset = None + stripEncoding = False + + if contentType: + textType = self._extractTextType(contentType) + if textType: + charset = self._extractCharSet(contentType) + stripEncoding = True + + (newHeaders, removedHeaderDict) = self._rewriteHeaders(status_headers.headers, urlrewriter, stripEncoding) + + return RewrittenStatusAndHeaders(status_headers.statusline, newHeaders, removedHeaderDict, textType, charset) + + + def _extractTextType(self, contentType): + for ctype, mimelist in self.REWRITE_TYPES.iteritems(): + if any ((mime in contentType) for mime in mimelist): + return ctype + + return None + + def _extractCharSet(self, contentType): + CHARSET_TOKEN = 'charset=' + idx = contentType.find(CHARSET_TOKEN) + if idx < 0: + return None + + return contentType[idx + len(CHARSET_TOKEN):] + + def _rewriteHeaders(self, headers, urlrewriter, contentRewritten = False): + newHeaders = [] + removedHeaderDict = {} + + for (name, value) in headers: + lowername = name.lower() + if lowername in self.PROXY_HEADERS: + newHeaders.append((name, value)) + elif lowername in self.URL_REWRITE_HEADERS: + newHeaders.append((name, urlrewriter.rewrite(value))) + elif lowername in self.ENCODING_HEADERS: + if contentRewritten: + removedHeaderDict[lowername] = value + else: + newHeaders.append((name, value)) + elif lowername in self.PROXY_NO_REWRITE_HEADERS and not contentRewritten: + newHeaders.append((name, value)) + else: + newHeaders.append((self.headerPrefix + name, value)) + + return (newHeaders, removedHeaderDict) + +if __name__ == "__main__": + import doctest + import os + import pprint + import url_rewriter + + urlrewriter = url_rewriter.ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/') + + headerrewriter = HeaderRewriter() + + def test_rewrite(headers, status = 
'200 OK'): + rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) + return vars(rewritten) + + doctest.testmod() + diff --git a/pywb/wbhtml.py b/pywb/html_rewriter.py similarity index 99% rename from pywb/wbhtml.py rename to pywb/html_rewriter.py index 93354754..6198c474 100644 --- a/pywb/wbhtml.py +++ b/pywb/html_rewriter.py @@ -5,8 +5,8 @@ import sys import re from HTMLParser import HTMLParser -from wburlrewriter import ArchivalUrlRewriter -from regexmatch import JSRewriter, CSSRewriter +from url_rewriter import ArchivalUrlRewriter +from regex_rewriters import JSRewriter, CSSRewriter #================================================================= # WBHtml --html parser for custom rewriting, also handlers for script and css diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 26c1e190..973d3e6f 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -10,37 +10,18 @@ class RemoteCDXServer: >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2') >>> pprint(x[0]) {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA', - 'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz', 'length': '1792', 'mimetype': 'text/html', - 'offset': '49482198', 'original': 'http://example.com:80/', - 'redirect': '-', - 'robotflags': '-', 'statuscode': '200', 'timestamp': '20020120142510', 'urlkey': 'com,example)/'} - >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'}) - >>> pprint(x[0]) - {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A', - 'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz', - 'length': '523', - 'mimetype': 'warc/revisit', - 'offset': '247256770', - 'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz', - 'orig.length': '529', - 'orig.offset': '769759', - 'original': 'http://www.example.com/', - 'redirect': '-', - 
'robotflags': '-', - 'statuscode': '-', - 'timestamp': '20131210052355', - 'urlkey': 'com,example)/'} - """ + """ - def __init__(self, serverUrl): + def __init__(self, serverUrl, cookie = None): self.serverUrl = serverUrl + self.authCookie = cookie def load(self, url, params = {}, parse_cdx = False, **kwvalues): #url is required, must be passed explicitly! @@ -51,6 +32,10 @@ class RemoteCDXServer: try: request = urllib2.Request(self.serverUrl, urlparams) + + if self.authCookie: + request.add_header('Cookie', self.authCookie) + response = urllib2.urlopen(request) except urllib2.HTTPError, e: if e.code == 403: @@ -91,6 +76,9 @@ class RemoteCDXServer: class CDXCaptureResult(dict): CDX_FORMATS = [ + # Public CDX Format + ["urlkey","timestamp","original","mimetype","statuscode","digest","length"], + # CDX 11 Format ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], diff --git a/pywb/query.py b/pywb/query.py index 4f5574b8..9f932964 100644 --- a/pywb/query.py +++ b/pywb/query.py @@ -4,8 +4,11 @@ import wbrequestresponse import wbexceptions class QueryHandler: - def __init__(self): - self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx') + def __init__(self, cdxserver = None): + if not cdxserver: + cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx') + + self.cdxserver = cdxserver def __call__(self, wbrequest, prev_wbresponse): wburl = wbrequest.wb_url diff --git a/pywb/regexmatch.py b/pywb/regex_rewriters.py similarity index 69% rename from pywb/regexmatch.py rename to pywb/regex_rewriters.py index c2b61bbc..1c949b60 100644 --- a/pywb/regexmatch.py +++ b/pywb/regex_rewriters.py @@ -2,12 +2,13 @@ import re import sys import itertools -from wburlrewriter import ArchivalUrlRewriter +from url_rewriter import ArchivalUrlRewriter +#================================================================= class RegexRewriter: """ # Test https->http converter 
(other tests below in subclasses) - >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_REGEX, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') + >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') 'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com' """ @@ -27,7 +28,7 @@ class RegexRewriter: def archivalRewrite(rewriter): return lambda x: rewriter.rewrite(x) - HTTPX_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+' + HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+' DEFAULT_OP = addPrefix @@ -44,6 +45,9 @@ class RegexRewriter: self.regex = re.compile(regexStr, re.M) self.rules = rules + def filter(self, m): + return True + def replaceAll(self, string): return self.regex.sub(lambda x: self.replace(x), string) @@ -60,6 +64,10 @@ class RegexRewriter: if not m.group(i): continue + # Optional filter to skip matches + if not self.filter(m): + return m.group(0) + # Custom func if not hasattr(op, '__call__'): op = RegexRewriter.DEFAULT_OP(op) @@ -74,6 +82,7 @@ class RegexRewriter: +#================================================================= class JSRewriter(RegexRewriter): """ >>> test_js('location = "http://example.com/abc.html"') @@ -100,11 +109,47 @@ class JSRewriter(RegexRewriter): def _createRules(self, httpPrefix): return [ - (RegexRewriter.HTTPX_MATCH_REGEX, httpPrefix, 0), + (RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0), ('location|domain', 'WB_wombat_', 0), ] +#================================================================= +class XMLRewriter(RegexRewriter): + """ + >>> test_xml('') + '' + + >>> test_xml('') + '' + + >>> test_xml(' http://example.comabchttp://example.com') + ' /web/20131010im_/http://example.comabchttp://example.com' + + >>> test_xml('
http://www.example.com/blah http://example.com
') + '
/web/20131010im_/http://www.example.com/blah /web/20131010im_/http://example.com
' + + """ + + def __init__(self, rewriter, extra = []): + rules = self._createRules(rewriter.getAbsUrl()) + + RegexRewriter.__init__(self, rules) + + # custom filter to reject 'xmlns' attr + def filter(self, m): + attr = m.group(1) + if attr and attr.startswith('xmlns'): + return False + + return True + + def _createRules(self, httpPrefix): + return [ + ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', httpPrefix, 2), + ] + +#================================================================= class CSSRewriter(RegexRewriter): r""" >>> test_css("background: url('/some/path.html')") @@ -172,6 +217,9 @@ if __name__ == "__main__": def test_js(string, extra = []): return JSRewriter(arcrw, extra).replaceAll(string) + def test_xml(string): + return XMLRewriter(arcrw).replaceAll(string) + def test_css(string): return CSSRewriter(arcrw).replaceAll(string) diff --git a/pywb/replay.py b/pywb/replay.py index e1a9af5a..9c88bedc 100644 --- a/pywb/replay.py +++ b/pywb/replay.py @@ -1,14 +1,18 @@ import StringIO from urllib2 import URLError +import chardet +import redis import indexreader -from wbrequestresponse import WbResponse +from wbrequestresponse import WbResponse, StatusAndHeaders from wbarchivalurl import ArchivalUrl import utils -from wburlrewriter import ArchivalUrlRewriter -import wbhtml -import regexmatch +from url_rewriter import ArchivalUrlRewriter +from header_rewriter import HeaderRewriter +import html_rewriter +import regex_rewriters + import wbexceptions #================================================================= @@ -111,19 +115,19 @@ class ReplayHandler(object): payloadRecord = self._load(cdx, True, failedFiles) # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit - if not headersRecord.httpHeaders: + if not headersRecord.status_headers.headers: headersRecord.stream.close() headersRecord = payloadRecord else: headersRecord.stream.close() - + isRevisit = True else: raise 
wbexceptions.CaptureException('Invalid CDX' + cdx) - return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream) + return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream) def resolveFull(self, filename): @@ -140,26 +144,12 @@ class ReplayHandler(object): #================================================================= class RewritingReplayHandler(ReplayHandler): - - REWRITE_TYPES = { - 'html': ['text/html', 'application/xhtml'], - 'css': ['text/css'], - 'js': ['text/javascript', 'application/javascript', 'application/x-javascript'], - 'xml': ['/xml', '+xml', '.xml', '.rss'], - } - - - PROXY_HEADERS = ('content-type', 'content-disposition') - - URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base') - - ENCODING_HEADERS = ('content-encoding', 'transfer-encoding') - - - def __init__(self, resolvers, archiveloader, headerPrefix = 'X-Archive-Orig-', headInsert = None): + def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None): ReplayHandler.__init__(self, resolvers, archiveloader) - self.headerPrefix = headerPrefix self.headInsert = headInsert + if not headerRewriter: + headerRewriter = HeaderRewriter() + self.headerRewriter = headerRewriter def _textContentType(self, contentType): @@ -183,88 +173,94 @@ class RewritingReplayHandler(ReplayHandler): if wbrequest.wb_url.mod == 'id_': return response - contentType = utils.get_header(response.headersList, 'Content-Type') - - textType = self._textContentType(contentType) if contentType else None - - (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, urlrewriter, textType is not None) - # binary type, just send through - if textType is None: - response.headersList = newHeaders + rewrittenHeaders = self.headerRewriter.rewrite(response.status_headers, urlrewriter) + + # non-text content type, just send through with rewritten headers + if rewrittenHeaders.textType is None: + 
response.status_headers = rewrittenHeaders.status_headers return response # Handle text rewriting - # TODO: better way to pass this + # TODO: better way to pass this? stream = response._stream # special case -- need to ungzip the body - if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))): - stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)) + if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')): + stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor()) - return self._rewriteContent(textType, urlrewriter, stream, newHeaders, response) + # TODO: is this right? + if rewrittenHeaders.charset: + encoding = rewrittenHeaders.charset + firstBuff = None + else: + (encoding, firstBuff) = self._detectCharset(stream) - # TODO: first non-streaming attempt, probably want to stream - def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'): - if textType == 'html': - out = StringIO.StringIO() - #out = SimpleWriter() - htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert) + # if ascii, set to noop encode operation + if encoding == 'ascii': + encoding = None + #encoding = 'utf-8' - try: - buff = stream.read() - while buff: + # Buffering response for html, streaming for others? 
+ if rewrittenHeaders.textType == 'html': + return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff) + else: + return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff) + + + def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None): + out = StringIO.StringIO() + htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert) + + try: + buff = firstBuff if firstBuff else stream.read() + while buff: + if encoding: buff = buff.decode(encoding) - htmlrewriter.feed(buff) - buff = stream.read() + htmlrewriter.feed(buff) + buff = stream.read() - htmlrewriter.close() + # Close rewriter if gracefully made it to end + htmlrewriter.close() - #except Exception as e: - # print e + finally: + content = out.getvalue() + if encoding: + content = content.encode(encoding) - finally: - content = out.getvalue().encode(encoding) value = [content] - newHeaders.append(('Content-Length', str(len(value[0])))) + contentLengthStr = str(len(content)) + status_headers.headers.append(('Content-Length', contentLengthStr)) out.close() - return WbResponse(status = origResponse.status, headersList = newHeaders, value = value) - - else: - if textType == 'css': - rewriter = regexmatch.CSSRewriter(urlrewriter) - elif textType == 'js': - rewriter = regexmatch.JSRewriter(urlrewriter) - - def doRewrite(buff): - return rewriter.replaceAll(buff) - - return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite) + return WbResponse(status_headers, value = value) + def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None): + if textType == 'css': + rewriter = regex_rewriters.CSSRewriter(urlrewriter) + elif textType == 'js': + rewriter = regex_rewriters.JSRewriter(urlrewriter) + elif textType == 'xml': + rewriter = regex_rewriters.XMLRewriter(urlrewriter) - def _rewriteHeaders(self, headers, 
urlrewriter, stripEncoding = False): - newHeaders = [] - removedHeaders = [] + def doRewrite(buff): + if encoding: + buff = buff.decode(encoding) + buff = rewriter.replaceAll(buff) + if encoding: + buff = buff.encode(encoding) - for (name, value) in headers: - lowername = name.lower() - if lowername in self.PROXY_HEADERS: - newHeaders.append((name, value)) - elif lowername in self.URL_REWRITE_HEADERS: - newHeaders.append((name, urlrewriter.rewrite(value))) - elif lowername in self.ENCODING_HEADERS: - if stripEncoding: - removedHeaders.append((name, value)) - else: - newHeaders.append((name, value)) - else: - newHeaders.append((self.headerPrefix + name, value)) + return buff - return (newHeaders, removedHeaders) + return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff) + def _detectCharset(self, stream): + buff = stream.read(8192) + result = chardet.detect(buff) + print "chardet result: " + str(result) + return (result['encoding'], buff) def _checkRedir(self, wbrequest, cdx): if cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): @@ -279,15 +275,15 @@ class RewritingReplayHandler(ReplayHandler): wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles) # Check for self redirect - if wbresponse.status.startswith('3'): - if self.isSelfRedirect(wbrequest, wbresponse.headersList): + if wbresponse.status_headers.statusline.startswith('3'): + if self.isSelfRedirect(wbrequest, wbresponse.status_headers): raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx)) return wbresponse - def isSelfRedirect(self, wbrequest, httpHeaders): + def isSelfRedirect(self, wbrequest, status_headers): requestUrl = wbrequest.wb_url.url.lower() - locationUrl = utils.get_header(httpHeaders, 'Location').lower() + locationUrl = status_headers.getHeader('Location').lower() #return requestUrl == locationUrl return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl)) @@ -301,4 +297,16 @@ def 
PrefixResolver(prefix, contains): return makeUrl - +#====================================== +class RedisResolver: + def __init__(self, redisUrl, keyPrefix = 'w:'): + self.redisUrl = redisUrl + self.keyPrefix = keyPrefix + self.redis = redis.StrictRedis.from_url(redisUrl) + + def __call__(self, filename): + try: + return self.redis.hget(self.keyPrefix + filename, 'path') + except Exception as e: + print e + return None diff --git a/pywb/wburlrewriter.py b/pywb/url_rewriter.py similarity index 100% rename from pywb/wburlrewriter.py rename to pywb/url_rewriter.py diff --git a/pywb/utils.py b/pywb/utils.py index ba427e55..a79e208c 100644 --- a/pywb/utils.py +++ b/pywb/utils.py @@ -1,6 +1,7 @@ import itertools import hmac import time +import zlib def peek_iter(iterable): try: @@ -11,21 +12,15 @@ def peek_iter(iterable): return itertools.chain([first], iterable) -def get_header(headersList, name): - nameLower = name.lower() - for value in headersList: - if (value[0].lower() == nameLower): - return value[1] +def split_prefix(key, prefixs): + for p in prefixs: + if key.startswith(p): + plen = len(p) + return (key[:plen], key[plen:]) - return None -def contains_header(headersList, seekHeader): - header = get_header(headersList, seekHeader[0]) - if not header: - return False - - # see if found header matches value! 
def create_decompressor(wbits = 16 + zlib.MAX_WBITS):
    """Return a new zlib decompression object.

    wbits -- format/window-size selector passed to zlib.decompressobj().
             The default, 16 + zlib.MAX_WBITS, makes zlib expect a gzip
             header and trailer. Pass zlib.MAX_WBITS for zlib-wrapped data
             or -zlib.MAX_WBITS for a raw deflate stream.
    """
    return zlib.decompressobj(wbits)
'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} >>> WbResponse.redir_response('http://example.com/otherfile') - {'status': '302 Redirect', 'body': [], 'headersList': [('Location', 'http://example.com/otherfile')]} - + {'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])} """ - def __init__(self, status, value = [], headersList = []): - self.status = status + def __init__(self, status_headers, value = []): + self.status_headers = status_headers self.body = value - self.headersList = headersList @staticmethod def text_stream(text, status = '200 OK'): - return WbResponse(status, value = text, headersList = [('Content-Type', 'text/plain')]) + return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = text) @staticmethod def text_response(text, status = '200 OK'): - return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')]) + return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = [text]) @staticmethod def redir_response(location, status = '302 Redirect'): - return WbResponse(status, headersList = [('Location', location)]) + return WbResponse(StatusAndHeaders(status, [('Location', location)])) @staticmethod - def stream_response(statusline, headers, stream, proc = None): + def stream_response(status_headers, stream, proc = None, firstBuff = None): def streamGen(): try: - buff = stream.read() + buff = firstBuff if firstBuff else stream.read() while buff: if proc: buff = proc(buff) @@ -120,25 +120,12 @@ class WbResponse: finally: stream.close() - response = WbResponse(statusline, headersList = headers, value = streamGen()) + response = WbResponse(status_headers, value = streamGen()) response._stream = stream return response - @staticmethod - def better_timestamp_response(wbrequest, newTimestamp): - wbrequest.wb_url.timestamp 
class StatusAndHeaders:
    """Status line, header list and protocol of a response.

    statusline -- status text, e.g. '200 OK'
    headers    -- list of (name, value) tuples
    protocol   -- protocol string, '' when not given
    """

    def __init__(self, statusline, headers, protocol = ''):
        self.statusline = statusline
        self.headers = headers
        self.protocol = protocol

    def getHeader(self, name):
        """Return the value of the first header whose name matches *name*
        case-insensitively, or None when there is no such header."""
        nameLower = name.lower()
        for value in self.headers:
            if (value[0].lower() == nameLower):
                return value[1]

        return None

    def __repr__(self):
        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))

    def __eq__(self, other):
        # Comparing with None or an unrelated object must yield "not equal"
        # rather than raise, so a missing attribute defers to the default
        # comparison instead of escaping as AttributeError.
        try:
            return (self.statusline == other.statusline and
                    self.headers == other.headers and
                    self.protocol == other.protocol)
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__, so mirror it explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result