Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-24 06:59:52 +01:00)

rename rewriters

header_rewriter added! Support for encoding detection; various fixes; XMLRewriter added.

Commit 2357f108a3, parent edbcaaf108.
pywb/archivalrouter.py

@@ -1,7 +1,7 @@
 import urlparse

 from wbrequestresponse import WbRequest, WbResponse
-from wburlrewriter import ArchivalUrlRewriter
+from url_rewriter import ArchivalUrlRewriter

 #=================================================================
 # ArchivalRequestRouter -- route WB requests in archival mode

@@ -122,7 +122,7 @@ if __name__ == "__main__":
         if not rep:
             return False

-        return rep.get_header('Location')
+        return rep.status_headers.getHeader('Location')


     doctest.testmod()
pywb/archiveloader.py

@@ -1,16 +1,15 @@
-import hanzo.warctools
+import itertools

-import re
 import utils
-import zlib
 import urllib2
 import StringIO
 import urlparse
 import collections
 import wbexceptions

+from wbrequestresponse import StatusAndHeaders

 #=================================================================
-class HttpStreamLoader:
+class HttpReader:
     def __init__(self, hmac = None, hmacDuration = 30):
         self.hmac = hmac
         self.hmacDuration = hmacDuration

@@ -33,7 +32,7 @@ class HttpStreamLoader:

 #=================================================================
 # Untested, but for completeness
-class FileStreamLoader:
+class FileReader:
     def load(self, url, offset, length):
         if url.startswith('file://'):
             url = url[len('file://'):]

@@ -45,27 +44,79 @@ class FileStreamLoader:


 #=================================================================
-WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, record, stream, statusline, httpHeaders')
+WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers')

 #=================================================================

 class ArchiveLoader:
+    """
+    >>> loadTestArchive('example.warc.gz', '333', '1043')
+    (('warc', 'response'),
+     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
+       ('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
+       ('WARC-Date', '2014-01-03T03:03:21Z'),
+       ('Content-Length', '1610'),
+       ('Content-Type', 'application/http; msgtype=response'),
+       ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
+       ('WARC-Target-URI', 'http://example.com?example=1'),
+       ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
+     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+       ('Cache-Control', 'max-age=604800'),
+       ('Content-Type', 'text/html'),
+       ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+       ('Etag', '"359670651"'),
+       ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
+       ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
+       ('Server', 'ECS (sjc/4FCE)'),
+       ('X-Cache', 'HIT'),
+       ('x-ec-custom-error', '1'),
+       ('Content-Length', '1270'),
+       ('Connection', 'close')]))
+
+    >>> loadTestArchive('example.warc.gz', '1864', '553')
+    (('warc', 'revisit'),
+     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
+       ('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
+       ('WARC-Date', '2014-01-03T03:03:41Z'),
+       ('Content-Length', '340'),
+       ('Content-Type', 'application/http; msgtype=response'),
+       ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
+       ('WARC-Target-URI', 'http://example.com?example=1'),
+       ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
+       ( 'WARC-Profile',
+         'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
+       ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
+       ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
+     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+       ('Cache-Control', 'max-age=604800'),
+       ('Content-Type', 'text/html'),
+       ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
+       ('Etag', '"359670651"'),
+       ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
+       ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
+       ('Server', 'ECS (sjc/4FCE)'),
+       ('X-Cache', 'HIT'),
+       ('x-ec-custom-error', '1'),
+       ('Content-Length', '1270'),
+       ('Connection', 'close')]))
+    """

     # Standard ARC headers
     ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]

     # Since loading a range request, can only determine gzip-ness based on file extension
     FORMAT_MAP = {
-        '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
-        '.arc.gz': (hanzo.warctools.ArcRecord, 'arc', True),
-        '.warc': (hanzo.warctools.WarcRecord, 'warc', False),
-        '.arc': (hanzo.warctools.ArcRecord, 'arc', False),
+        '.warc.gz': ('warc', True),
+        '.arc.gz': ('arc', True),
+        '.warc': ('warc', False),
+        '.arc': ('arc', False),
     }

-    HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ (\d+.*)$')

     @staticmethod
     def createDefaultLoaders():
-        http = HttpStreamLoader()
-        file = FileStreamLoader()
+        http = HttpReader()
+        file = FileReader()
         return {
             'http': http,
             'https': http,

@@ -78,6 +129,10 @@ class ArchiveLoader:
         self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
         self.chunkSize = chunkSize

+        self.arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
+        self.warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
+        self.httpParser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
+
     def load(self, url, offset, length):
         urlParts = urlparse.urlsplit(url)

@@ -86,22 +141,19 @@ class ArchiveLoader:
         except Exception:
             raise wbexceptions.UnknownLoaderProtocolException(url)

-        loaderCls = None
+        theFormat = None

-        for ext, (loaderCls, aFormat, gzip) in ArchiveLoader.FORMAT_MAP.iteritems():
+        for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
             if url.endswith(ext):
-                loaderCls = loaderCls
-                aFormat = aFormat
-                isGzip = gzip
+                theFormat = iformat
                 break

-        if loaderCls is None:
+        if theFormat is None:
             raise wbexceptions.UnknownArchiveFormatException(url)

-        if isGzip:
-            decomp = zlib.decompressobj(16+zlib.MAX_WBITS)
-        else:
-            decomp = None
+        (aFormat, isGzip) = theFormat
+        decomp = utils.create_decompressor() if isGzip else None

         try:
             length = int(length)

@@ -111,73 +163,87 @@ class ArchiveLoader:

         raw = loader.load(url, long(offset), length)

-        reader = LineReader(raw, length, self.chunkSize, decomp)
+        stream = LineReader(raw, length, self.chunkSize, decomp)

-        parser = loaderCls.make_parser()
-
-        if aFormat == 'arc':
-            parser.headers = ArchiveLoader.ARC_HEADERS
-
-        (parsed, errors, _) = parser.parse(reader, 0)
-
-        if errors:
-            reader.close()
-            raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)

         if aFormat == 'arc':
+            rec_headers = self.arcParser.parse(stream)
             recType = 'response'
-            empty = (utils.get_header(parsed.headers, 'length') == 0)
-        else:
-            recType = utils.get_header(parsed.headers, 'WARC-Type')
-            empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')
+            empty = (rec_headers.getHeader('length') == 0)
+
+        elif aFormat == 'warc':
+            rec_headers = self.warcParser.parse(stream)
+            recType = rec_headers.getHeader('WARC-Type')
+            empty = (rec_headers.getHeader('Content-Length') == '0')

         # special case: empty w/arc record (hopefully a revisit)
         if empty:
-            statusline = '204 No Content'
-            headers = []
+            status_headers = StatusAndHeaders('204 No Content', [])

         # special case: warc records that are not expected to have http headers
         # attempt to add 200 status and content-type
         elif recType == 'metadata' or recType == 'resource':
-            statusline = '200 OK'
-            headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
+            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.getHeader('Content-Type'))])

         # special case: http 0.9 response, no status or headers
-        #elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')):
-        #    statusline = '200 OK'
-        #    headers = []
+        #elif recType == 'response':
+        #    contentType = rec_headers.getHeader('Content-Type')
+        #    if contentType and (';version=0.9' in contentType):
+        #        status_headers = StatusAndHeaders('200 OK', [])

         # response record: parse HTTP status and headers!
         else:
-            (statusline, headers) = self.parseHttpHeaders(reader)
+            #(statusline, http_headers) = self.parseHttpHeaders(stream)
+            status_headers = self.httpParser.parse(stream)

-        return WBArchiveRecord((aFormat, recType), parsed, reader, statusline, headers)
+        return WBArchiveRecord((aFormat, recType), rec_headers, stream, status_headers)


-    def parseHttpHeaders(self, stream):
-        def nextHeaderLine(stream):
-            return stream.readline().rstrip()
-
-        line = nextHeaderLine(stream)
-        matched = self.HTTP_STATUS_REGEX.match(line)
-
-        if not matched:
-            raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)
-
-        #status = int(matched.group(2))
-        statusline = matched.group(1)
-        headers = []
-
-        line = nextHeaderLine(stream)
-
-        while line and line != '\r\n':
-            name, value = line.split(':', 1)
-            value = value.strip()
-            headers.append((name, value))
-            line = nextHeaderLine(stream)
-
-        return (statusline, headers)
+#=================================================================
+class StatusAndHeadersParser:
+    def __init__(self, statuslist):
+        self.statuslist = statuslist
+
+    def parse(self, stream):
+        statusline = stream.readline().rstrip()
+
+        protocolStatus = utils.split_prefix(statusline, self.statuslist)
+
+        if not protocolStatus:
+            raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)
+
+        headers = []
+
+        line = stream.readline().rstrip()
+
+        while line and line != '\r\n':
+            name, value = line.split(':', 1)
+            header = (name, value.strip())
+            headers.append(header)
+            line = stream.readline().rstrip()
+
+        return StatusAndHeaders(statusline = protocolStatus[1].strip(), headers = headers, protocol = protocolStatus[0])
+
+#=================================================================
+class ARCHeadersParser:
+    def __init__(self, headernames):
+        self.headernames = headernames
+
+    def parse(self, stream):
+        headerline = stream.readline().rstrip()
+
+        parts = headerline.split()
+
+        headernames = self.headernames
+
+        if len(parts) != len(headernames):
+            raise wbexceptions.InvalidArchiveRecordException('Wrong # of headers, expected arc headers {0}, Found {1}'.format(headernames, parts))
+
+        headers = []
+
+        for name, value in itertools.izip(headernames, parts):
+            headers.append((name, value))
+
+        return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0')

 #=================================================================
 class LineReader:

@@ -217,4 +283,19 @@ class LineReader:
         self.stream = None


+#=================================================================
+if __name__ == "__main__":
+    import doctest
+    import os
+    import pprint
+
+    testloader = ArchiveLoader()
+
+    def loadTestArchive(test_file, offset, length):
+        path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file
+
+        archive = testloader.load(path, offset, length)
+        pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
+
+    doctest.testmod()
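To make the new parsing path concrete, here is a minimal sketch of StatusAndHeadersParser on a raw header block. StringIO stands in for the LineReader stream that ArchiveLoader actually passes, and the import assumes archiveloader.py is importable as a module:

import StringIO
from archiveloader import StatusAndHeadersParser

# an HTTP header block, as it would appear inside a (decompressed) WARC response record
stream = StringIO.StringIO('HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nContent-Length: 5\r\n\r\nhello')

parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
status_headers = parser.parse(stream)

print status_headers.protocol                    # 'HTTP/1.1'
print status_headers.statusline                  # '200 OK'
print status_headers.getHeader('content-type')   # 'text/html' (lookup ignores case)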
pywb/header_rewriter.py (new file, 133 lines)

@@ -0,0 +1,133 @@
+from wbrequestresponse import StatusAndHeaders
+
+#=================================================================
+class RewrittenStatusAndHeaders:
+    def __init__(self, statusline, headers, removedHeaderDict, textType, charset):
+        self.status_headers = StatusAndHeaders(statusline, headers)
+        self.removedHeaderDict = removedHeaderDict
+        self.textType = textType
+        self.charset = charset
+
+    def containsRemovedHeader(self, name, value):
+        return self.removedHeaderDict.get(name) == value
+
+
+#=================================================================
+class HeaderRewriter:
+    """
+    # Text with charset
+    >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=utf-8')])
+    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+      ('X-Archive-Orig-Content-Length', '5'),
+      ('Content-Type', 'text/html;charset=utf-8')]), 'charset': 'utf-8', 'textType': 'html', 'removedHeaderDict': {}}
+
+    # Redirect
+    >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
+    {'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
+      ('Location', '/web/20131226101010/http://example.com/other.html')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
+
+    # gzip
+    >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
+    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
+      ('Content-Type', 'text/javascript')]), 'charset': None, 'textType': 'js', 'removedHeaderDict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}}
+
+    # Binary
+    >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
+    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
+      ('Content-Type', 'image/png'),
+      ('X-Archive-Orig-Cookie', 'blah'),
+      ('Content-Encoding', 'gzip'),
+      ('Transfer-Encoding', 'chunked')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
+    """
+
+    REWRITE_TYPES = {
+        'html': ['text/html', 'application/xhtml'],
+        'css': ['text/css'],
+        'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
+        'xml': ['/xml', '+xml', '.xml', '.rss'],
+    }
+
+    PROXY_HEADERS = ('content-type', 'content-disposition')
+
+    URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')
+
+    ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')
+
+    PROXY_NO_REWRITE_HEADERS = ('content-length')
+
+    def __init__(self, headerPrefix = 'X-Archive-Orig-'):
+        self.headerPrefix = headerPrefix
+
+    def rewrite(self, status_headers, urlrewriter):
+        contentType = status_headers.getHeader('Content-Type')
+        textType = None
+        charset = None
+        stripEncoding = False
+
+        if contentType:
+            textType = self._extractTextType(contentType)
+            if textType:
+                charset = self._extractCharSet(contentType)
+                stripEncoding = True
+
+        (newHeaders, removedHeaderDict) = self._rewriteHeaders(status_headers.headers, urlrewriter, stripEncoding)
+
+        return RewrittenStatusAndHeaders(status_headers.statusline, newHeaders, removedHeaderDict, textType, charset)
+
+    def _extractTextType(self, contentType):
+        for ctype, mimelist in self.REWRITE_TYPES.iteritems():
+            if any ((mime in contentType) for mime in mimelist):
+                return ctype
+
+        return None
+
+    def _extractCharSet(self, contentType):
+        CHARSET_TOKEN = 'charset='
+        idx = contentType.find(CHARSET_TOKEN)
+        if idx < 0:
+            return None
+
+        return contentType[idx + len(CHARSET_TOKEN):]
+
+    def _rewriteHeaders(self, headers, urlrewriter, contentRewritten = False):
+        newHeaders = []
+        removedHeaderDict = {}
+
+        for (name, value) in headers:
+            lowername = name.lower()
+            if lowername in self.PROXY_HEADERS:
+                newHeaders.append((name, value))
+            elif lowername in self.URL_REWRITE_HEADERS:
+                newHeaders.append((name, urlrewriter.rewrite(value)))
+            elif lowername in self.ENCODING_HEADERS:
+                if contentRewritten:
+                    removedHeaderDict[lowername] = value
+                else:
+                    newHeaders.append((name, value))
+            elif lowername in self.PROXY_NO_REWRITE_HEADERS and not contentRewritten:
+                newHeaders.append((name, value))
+            else:
+                newHeaders.append((self.headerPrefix + name, value))
+
+        return (newHeaders, removedHeaderDict)
+
+
+if __name__ == "__main__":
+    import doctest
+    import os
+    import pprint
+    import url_rewriter
+
+    urlrewriter = url_rewriter.ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+
+    headerrewriter = HeaderRewriter()
+
+    def test_rewrite(headers, status = '200 OK'):
+        rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
+        return vars(rewritten)
+
+    doctest.testmod()
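Beyond the doctests, the intended flow pairs HeaderRewriter with an ArchivalUrlRewriter and then consults the removed-header dict, which is how replay.py below decides whether a body still needs ungzipping. A small sketch with illustrative header values:

from header_rewriter import HeaderRewriter
from url_rewriter import ArchivalUrlRewriter
from wbrequestresponse import StatusAndHeaders

urlrewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/', '/web/')
headerrewriter = HeaderRewriter()

orig = StatusAndHeaders('200 OK', [('Content-Type', 'text/html; charset=iso-8859-1'),
                                   ('Content-Encoding', 'gzip')])

rewritten = headerrewriter.rewrite(orig, urlrewriter)

print rewritten.textType   # 'html' -- body will be rewritten
print rewritten.charset    # 'iso-8859-1' -- no chardet detection needed
# the encoding header was stripped because the rewritten body is re-encoded
print rewritten.containsRemovedHeader('content-encoding', 'gzip')   # True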
pywb/html_rewriter.py

@@ -5,8 +5,8 @@ import sys
 import re

 from HTMLParser import HTMLParser
-from wburlrewriter import ArchivalUrlRewriter
-from regexmatch import JSRewriter, CSSRewriter
+from url_rewriter import ArchivalUrlRewriter
+from regex_rewriters import JSRewriter, CSSRewriter

 #=================================================================
 # WBHtml -- html parser for custom rewriting, also handlers for script and css
pywb/indexreader.py

@@ -10,37 +10,18 @@ class RemoteCDXServer:
     >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
     >>> pprint(x[0])
     {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
-     'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
      'length': '1792',
      'mimetype': 'text/html',
-     'offset': '49482198',
      'original': 'http://example.com:80/',
-     'redirect': '-',
-     'robotflags': '-',
      'statuscode': '200',
      'timestamp': '20020120142510',
      'urlkey': 'com,example)/'}
-
-    >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
-    >>> pprint(x[0])
-    {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
-     'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
-     'length': '523',
-     'mimetype': 'warc/revisit',
-     'offset': '247256770',
-     'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
-     'orig.length': '529',
-     'orig.offset': '769759',
-     'original': 'http://www.example.com/',
-     'redirect': '-',
-     'robotflags': '-',
-     'statuscode': '-',
-     'timestamp': '20131210052355',
-     'urlkey': 'com,example)/'}
     """

-    def __init__(self, serverUrl):
+    def __init__(self, serverUrl, cookie = None):
         self.serverUrl = serverUrl
+        self.authCookie = cookie

     def load(self, url, params = {}, parse_cdx = False, **kwvalues):
         #url is required, must be passed explicitly!

@@ -51,6 +32,10 @@ class RemoteCDXServer:

         try:
             request = urllib2.Request(self.serverUrl, urlparams)
+
+            if self.authCookie:
+                request.add_header('Cookie', self.authCookie)
+
             response = urllib2.urlopen(request)
         except urllib2.HTTPError, e:
             if e.code == 403:

@@ -91,6 +76,9 @@ class RemoteCDXServer:

 class CDXCaptureResult(dict):
     CDX_FORMATS = [
+        # Public CDX Format
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
+
         # CDX 11 Format
         ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
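The new cookie parameter is only attached when present; a sketch of constructing an authenticated server (the cookie value here is a placeholder):

import indexreader

cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx',
                                        cookie = 'cdx-auth-token=PLACEHOLDER')

# load() now sends the Cookie header with each request
x = cdxserver.load('example.com', parse_cdx = True, limit = '2')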
pywb/query.py

@@ -4,8 +4,11 @@ import wbrequestresponse
 import wbexceptions

 class QueryHandler:
-    def __init__(self):
-        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
+    def __init__(self, cdxserver = None):
+        if not cdxserver:
+            cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
+
+        self.cdxserver = cdxserver

     def __call__(self, wbrequest, prev_wbresponse):
         wburl = wbrequest.wb_url
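With the cdxserver now injectable, callers can swap in a different endpoint; the localhost URL below is illustrative:

import indexreader
from query import QueryHandler

query = QueryHandler()   # defaults to the public web.archive.org CDX server

local = indexreader.RemoteCDXServer('http://localhost:8080/cdx')
query = QueryHandler(local)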
pywb/regex_rewriters.py

@@ -2,12 +2,13 @@ import re
 import sys
 import itertools

-from wburlrewriter import ArchivalUrlRewriter
+from url_rewriter import ArchivalUrlRewriter

+#=================================================================
 class RegexRewriter:
     """
     # Test https->http converter (other tests below in subclasses)
-    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_REGEX, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
+    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
     'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
     """

@@ -27,7 +28,7 @@ class RegexRewriter:
     def archivalRewrite(rewriter):
         return lambda x: rewriter.rewrite(x)

-    HTTPX_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
+    HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

     DEFAULT_OP = addPrefix

@@ -44,6 +45,9 @@ class RegexRewriter:
         self.regex = re.compile(regexStr, re.M)
         self.rules = rules

+    def filter(self, m):
+        return True
+
     def replaceAll(self, string):
         return self.regex.sub(lambda x: self.replace(x), string)

@@ -60,6 +64,10 @@ class RegexRewriter:
             if not m.group(i):
                 continue

+            # Optional filter to skip matches
+            if not self.filter(m):
+                return m.group(0)
+
             # Custom func
             if not hasattr(op, '__call__'):
                 op = RegexRewriter.DEFAULT_OP(op)

@@ -74,6 +82,7 @@ class RegexRewriter:



+#=================================================================
 class JSRewriter(RegexRewriter):
     """
     >>> test_js('location = "http://example.com/abc.html"')

@@ -100,11 +109,47 @@ class JSRewriter(RegexRewriter):

     def _createRules(self, httpPrefix):
         return [
-            (RegexRewriter.HTTPX_MATCH_REGEX, httpPrefix, 0),
+            (RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
             ('location|domain', 'WB_wombat_', 0),
         ]


+#=================================================================
+class XMLRewriter(RegexRewriter):
+    """
+    >>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
+    '<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'
+
+    >>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
+    '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'
+
+    >>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
+    '<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'
+
+    >>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
+    '<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
+    """
+
+    def __init__(self, rewriter, extra = []):
+        rules = self._createRules(rewriter.getAbsUrl())
+
+        RegexRewriter.__init__(self, rules)
+
+    # custom filter to reject 'xmlns' attr
+    def filter(self, m):
+        attr = m.group(1)
+        if attr and attr.startswith('xmlns'):
+            return False
+
+        return True
+
+    def _createRules(self, httpPrefix):
+        return [
+            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', httpPrefix, 2),
+        ]
+
+#=================================================================
 class CSSRewriter(RegexRewriter):
     r"""
     >>> test_css("background: url('/some/path.html')")

@@ -172,6 +217,9 @@ if __name__ == "__main__":
     def test_js(string, extra = []):
         return JSRewriter(arcrw, extra).replaceAll(string)

+    def test_xml(string):
+        return XMLRewriter(arcrw).replaceAll(string)
+
     def test_css(string):
         return CSSRewriter(arcrw).replaceAll(string)
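The new filter() hook is what lets XMLRewriter skip namespace declarations: its rule captures the attribute-name prefix as group 1 and the URL as group 2, and the filter rejects the match when group 1 starts with 'xmlns'. The hook works for any subclass; a sketch with an illustrative predicate:

from regex_rewriters import RegexRewriter

class SelectiveRewriter(RegexRewriter):
    def __init__(self):
        RegexRewriter.__init__(self, [(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)])

    # returning False leaves the match untouched
    def filter(self, m):
        return 'example.org' not in m.group(0)

print SelectiveRewriter().replaceAll('a = https://example.com; b = https://example.org')
# 'a = http://example.com; b = https://example.org'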
pywb/replay.py (184 lines changed)

@@ -1,14 +1,18 @@
 import StringIO
 from urllib2 import URLError
+import chardet
+import redis

 import indexreader
-from wbrequestresponse import WbResponse
+from wbrequestresponse import WbResponse, StatusAndHeaders
 from wbarchivalurl import ArchivalUrl
 import utils
-from wburlrewriter import ArchivalUrlRewriter

-import wbhtml
-import regexmatch
+from url_rewriter import ArchivalUrlRewriter
+from header_rewriter import HeaderRewriter
+import html_rewriter
+import regex_rewriters

 import wbexceptions

 #=================================================================

@@ -111,19 +115,19 @@ class ReplayHandler(object):
             payloadRecord = self._load(cdx, True, failedFiles)

             # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
-            if not headersRecord.httpHeaders:
+            if not headersRecord.status_headers.headers:
                 headersRecord.stream.close()
                 headersRecord = payloadRecord
             else:
                 headersRecord.stream.close()

             isRevisit = True

         else:
             raise wbexceptions.CaptureException('Invalid CDX' + cdx)

-        return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)
+        return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)


     def resolveFull(self, filename):

@@ -140,26 +144,12 @@ class ReplayHandler(object):
 #=================================================================
 class RewritingReplayHandler(ReplayHandler):

-    REWRITE_TYPES = {
-        'html': ['text/html', 'application/xhtml'],
-        'css': ['text/css'],
-        'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
-        'xml': ['/xml', '+xml', '.xml', '.rss'],
-    }
-
-    PROXY_HEADERS = ('content-type', 'content-disposition')
-
-    URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')
-
-    ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')
-
-    def __init__(self, resolvers, archiveloader, headerPrefix = 'X-Archive-Orig-', headInsert = None):
+    def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None):
         ReplayHandler.__init__(self, resolvers, archiveloader)
-        self.headerPrefix = headerPrefix
         self.headInsert = headInsert
+        if not headerRewriter:
+            headerRewriter = HeaderRewriter()
+        self.headerRewriter = headerRewriter


     def _textContentType(self, contentType):

@@ -183,88 +173,94 @@ class RewritingReplayHandler(ReplayHandler):
         if wbrequest.wb_url.mod == 'id_':
             return response

-        contentType = utils.get_header(response.headersList, 'Content-Type')
-
-        textType = self._textContentType(contentType) if contentType else None
-
-        (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, urlrewriter, textType is not None)
-
-        # binary type, just send through
-        if textType is None:
-            response.headersList = newHeaders
+        rewrittenHeaders = self.headerRewriter.rewrite(response.status_headers, urlrewriter)
+
+        # non-text content type, just send through with rewritten headers
+        if rewrittenHeaders.textType is None:
+            response.status_headers = rewrittenHeaders.status_headers
             return response

         # Handle text rewriting
-        # TODO: better way to pass this
+        # TODO: better way to pass this?
         stream = response._stream

         # special case -- need to ungzip the body
-        if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))):
-            stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS))
+        if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
+            stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())

-        return self._rewriteContent(textType, urlrewriter, stream, newHeaders, response)
+        # TODO: is this right?
+        if rewrittenHeaders.charset:
+            encoding = rewrittenHeaders.charset
+            firstBuff = None
+        else:
+            (encoding, firstBuff) = self._detectCharset(stream)

-    # TODO: first non-streaming attempt, probably want to stream
-    def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
-        if textType == 'html':
-            out = StringIO.StringIO()
-            #out = SimpleWriter()
-            htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
+        # if ascii, set to noop encode operation
+        if encoding == 'ascii':
+            encoding = None
+            #encoding = 'utf-8'

-            try:
-                buff = stream.read()
-                while buff:
-                    buff = buff.decode(encoding)
-                    htmlrewriter.feed(buff)
-                    buff = stream.read()
-
-                htmlrewriter.close()
-
-            #except Exception as e:
-            #    print e
-
-            finally:
-                content = out.getvalue().encode(encoding)
-                value = [content]
-                newHeaders.append(('Content-Length', str(len(value[0]))))
-                out.close()
-
-            return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
-
-        else:
-            if textType == 'css':
-                rewriter = regexmatch.CSSRewriter(urlrewriter)
-            elif textType == 'js':
-                rewriter = regexmatch.JSRewriter(urlrewriter)
-
-            def doRewrite(buff):
-                return rewriter.replaceAll(buff)
-
-            return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite)
+        # Buffering response for html, streaming for others?
+        if rewrittenHeaders.textType == 'html':
+            return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
+        else:
+            return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)


-    def _rewriteHeaders(self, headers, urlrewriter, stripEncoding = False):
-        newHeaders = []
-        removedHeaders = []
-
-        for (name, value) in headers:
-            lowername = name.lower()
-            if lowername in self.PROXY_HEADERS:
-                newHeaders.append((name, value))
-            elif lowername in self.URL_REWRITE_HEADERS:
-                newHeaders.append((name, urlrewriter.rewrite(value)))
-            elif lowername in self.ENCODING_HEADERS:
-                if stripEncoding:
-                    removedHeaders.append((name, value))
-                else:
-                    newHeaders.append((name, value))
-            else:
-                newHeaders.append((self.headerPrefix + name, value))
-
-        return (newHeaders, removedHeaders)
+    def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None):
+        out = StringIO.StringIO()
+        htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert)
+
+        try:
+            buff = firstBuff if firstBuff else stream.read()
+            while buff:
+                if encoding:
+                    buff = buff.decode(encoding)
+                htmlrewriter.feed(buff)
+                buff = stream.read()
+
+            # Close rewriter if gracefully made it to end
+            htmlrewriter.close()
+
+        finally:
+            content = out.getvalue()
+            if encoding:
+                content = content.encode(encoding)
+
+            value = [content]
+            contentLengthStr = str(len(content))
+            status_headers.headers.append(('Content-Length', contentLengthStr))
+            out.close()
+
+        return WbResponse(status_headers, value = value)
+
+
+    def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None):
+        if textType == 'css':
+            rewriter = regex_rewriters.CSSRewriter(urlrewriter)
+        elif textType == 'js':
+            rewriter = regex_rewriters.JSRewriter(urlrewriter)
+        elif textType == 'xml':
+            rewriter = regex_rewriters.XMLRewriter(urlrewriter)
+
+        def doRewrite(buff):
+            if encoding:
+                buff = buff.decode(encoding)
+            buff = rewriter.replaceAll(buff)
+            if encoding:
+                buff = buff.encode(encoding)
+
+            return buff
+
+        return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff)
+
+    def _detectCharset(self, stream):
+        buff = stream.read(8192)
+        result = chardet.detect(buff)
+        print "chardet result: " + str(result)
+        return (result['encoding'], buff)

     def _checkRedir(self, wbrequest, cdx):
         if cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):

@@ -279,15 +275,15 @@ class RewritingReplayHandler(ReplayHandler):
         wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)

         # Check for self redirect
-        if wbresponse.status.startswith('3'):
-            if self.isSelfRedirect(wbrequest, wbresponse.headersList):
+        if wbresponse.status_headers.statusline.startswith('3'):
+            if self.isSelfRedirect(wbrequest, wbresponse.status_headers):
                 raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

         return wbresponse

-    def isSelfRedirect(self, wbrequest, httpHeaders):
+    def isSelfRedirect(self, wbrequest, status_headers):
         requestUrl = wbrequest.wb_url.url.lower()
-        locationUrl = utils.get_header(httpHeaders, 'Location').lower()
+        locationUrl = status_headers.getHeader('Location').lower()
         #return requestUrl == locationUrl
         return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))

@@ -301,4 +297,16 @@ def PrefixResolver(prefix, contains):

     return makeUrl

+#======================================
+class RedisResolver:
+    def __init__(self, redisUrl, keyPrefix = 'w:'):
+        self.redisUrl = redisUrl
+        self.keyPrefix = keyPrefix
+        self.redis = redis.StrictRedis.from_url(redisUrl)
+
+    def __call__(self, filename):
+        try:
+            return self.redis.hget(self.keyPrefix + filename, 'path')
+        except Exception as e:
+            print e
+            return None
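The charset handling above works in two steps: an explicit charset from HeaderRewriter wins; otherwise _detectCharset() sniffs the first 8KB with chardet and returns that buffer as firstBuff so the sniffed bytes still reach the rewriter. A standalone sketch of the detection step (the file name is illustrative):

import chardet

buff = open('page.html', 'rb').read(8192)
result = chardet.detect(buff)
# chardet returns a dict such as {'encoding': 'ISO-8859-1', 'confidence': 0.72}
encoding = result['encoding']

if encoding and encoding != 'ascii':
    text = buff.decode(encoding)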
pywb/utils.py

@@ -1,6 +1,7 @@
 import itertools
 import hmac
 import time
+import zlib

 def peek_iter(iterable):
     try:

@@ -11,21 +12,15 @@ def peek_iter(iterable):
     return itertools.chain([first], iterable)


-def get_header(headersList, name):
-    nameLower = name.lower()
-    for value in headersList:
-        if (value[0].lower() == nameLower):
-            return value[1]
-
-    return None
-
-def contains_header(headersList, seekHeader):
-    header = get_header(headersList, seekHeader[0])
-    if not header:
-        return False
-
-    # see if found header matches value!
-    return (header == seekHeader[1])
+def split_prefix(key, prefixs):
+    for p in prefixs:
+        if key.startswith(p):
+            plen = len(p)
+            return (key[:plen], key[plen:])
+
+
+def create_decompressor():
+    return zlib.decompressobj(16 + zlib.MAX_WBITS)


 class HMACCookieMaker:
     def __init__(self, key, name):
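Both helpers are easiest to see by example: split_prefix() is what StatusAndHeadersParser uses to validate status lines, and create_decompressor() centralizes the gzip-aware zlib setup (a wbits of 16 + MAX_WBITS makes zlib expect a gzip wrapper). A sketch:

import StringIO, gzip
import utils

print utils.split_prefix('HTTP/1.1 200 OK', ['HTTP/1.0', 'HTTP/1.1'])
# ('HTTP/1.1', ' 200 OK')
print utils.split_prefix('FTP/1.0 200', ['HTTP/1.0', 'HTTP/1.1'])
# None -- no prefix matched, caller treats this as an invalid status line

# round-trip a gzip member through the shared decompressor
buf = StringIO.StringIO()
gz = gzip.GzipFile(fileobj = buf, mode = 'wb')
gz.write('hello')
gz.close()

decomp = utils.create_decompressor()
print decomp.decompress(buf.getvalue())   # 'hello'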
pywb/wbapp.py

@@ -2,7 +2,7 @@ from query import QueryHandler
 from replay import FullHandler
 import wbexceptions

-from wbrequestresponse import WbResponse
+from wbrequestresponse import WbResponse, StatusAndHeaders
 from archivalrouter import ArchivalRequestRouter

@@ -17,10 +17,11 @@ class WBHandler:

 ## ===========
-query = QueryHandler()

 import testwb

+query = QueryHandler(testwb.createCdxServer())

 headInsert = """

 <!-- WB Insert -->

@@ -54,7 +55,11 @@ def application(env, start_response):
         raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')

     except wbexceptions.InternalRedirect as ir:
-        response = WbResponse(status = ir.status, headersList = ir.httpHeaders)
+        response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
+
+    except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
+        print "[INFO]: " + str(e)
+        response = handleException(env, e)

     except Exception as e:
         last_exc = e
pywb/wbrequestresponse.py

@@ -1,5 +1,7 @@
 from wbarchivalurl import ArchivalUrl
 import utils
+
+import pprint
 #WB Request and Response

 class WbRequest:

@@ -80,38 +82,36 @@ class WbRequest:
 class WbResponse:
     """
     >>> WbResponse.text_response('Test')
-    {'status': '200 OK', 'body': ['Test'], 'headersList': [('Content-Type', 'text/plain')]}
+    {'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}

     >>> WbResponse.text_stream(['Test', 'Another'], '404')
-    {'status': '404', 'body': ['Test', 'Another'], 'headersList': [('Content-Type', 'text/plain')]}
+    {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}

     >>> WbResponse.redir_response('http://example.com/otherfile')
-    {'status': '302 Redirect', 'body': [], 'headersList': [('Location', 'http://example.com/otherfile')]}
+    {'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}

     """

-    def __init__(self, status, value = [], headersList = []):
-        self.status = status
+    def __init__(self, status_headers, value = []):
+        self.status_headers = status_headers
         self.body = value
-        self.headersList = headersList

     @staticmethod
     def text_stream(text, status = '200 OK'):
-        return WbResponse(status, value = text, headersList = [('Content-Type', 'text/plain')])
+        return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = text)

     @staticmethod
     def text_response(text, status = '200 OK'):
-        return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])
+        return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = [text])

     @staticmethod
     def redir_response(location, status = '302 Redirect'):
-        return WbResponse(status, headersList = [('Location', location)])
+        return WbResponse(StatusAndHeaders(status, [('Location', location)]))

     @staticmethod
-    def stream_response(statusline, headers, stream, proc = None):
+    def stream_response(status_headers, stream, proc = None, firstBuff = None):
         def streamGen():
             try:
-                buff = stream.read()
+                buff = firstBuff if firstBuff else stream.read()
                 while buff:
                     if proc:
                         buff = proc(buff)

@@ -120,25 +120,12 @@ class WbResponse:
             finally:
                 stream.close()

-        response = WbResponse(statusline, headersList = headers, value = streamGen())
+        response = WbResponse(status_headers, value = streamGen())
         response._stream = stream
         return response

-    @staticmethod
-    def better_timestamp_response(wbrequest, newTimestamp):
-        wbrequest.wb_url.timestamp = newTimestamp
-        newUrl = wbrequest.wb_prefix + str(wbrequest.wb_url)[1:]
-        return WbResponse.redir_response(newUrl)
-
-    def get_header(self, name):
-        return utils.get_header(self.headersList, name)
-
     def __call__(self, env, start_response):
-        #headersList = []
-        #for key, value in self.headers.iteritems():
-        #    headersList.append((key, value))
-
-        start_response(self.status, self.headersList)
+        start_response(self.status_headers.statusline, self.status_headers.headers)

         if env['REQUEST_METHOD'] == 'HEAD':
             if hasattr(self.body, 'close'):

@@ -155,6 +142,28 @@ class WbResponse:
         return str(vars(self))


+#=================================================================
+class StatusAndHeaders:
+    def __init__(self, statusline, headers, protocol = ''):
+        self.statusline = statusline
+        self.headers = headers
+        self.protocol = protocol
+
+    def getHeader(self, name):
+        nameLower = name.lower()
+        for value in self.headers:
+            if (value[0].lower() == nameLower):
+                return value[1]
+
+        return None
+
+    def __repr__(self):
+        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
+        #return pprint.pformat(self.__dict__)
+
+    def __eq__(self, other):
+        return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol
+
 if __name__ == "__main__":
     import doctest
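For reference, the behaviors the doctests above rely on, in one short sketch: getHeader() ignores header-name case, and __eq__ compares statusline, headers, and protocol so expected values can be written literally:

from wbrequestresponse import StatusAndHeaders, WbResponse

sh = StatusAndHeaders('200 OK', [('Content-Type', 'text/html')])

print sh.getHeader('content-type')   # 'text/html'
print sh.getHeader('X-Missing')      # None

print sh == StatusAndHeaders('200 OK', [('Content-Type', 'text/html')])   # True

# WbResponse now wraps a single StatusAndHeaders instead of (status, headersList)
resp = WbResponse(sh, value = ['<html></html>'])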