Mirror of https://github.com/webrecorder/pywb.git
Synced 2025-03-15 00:03:28 +01:00

rename rewriters
header_rewriter added! Support for encoding detection, various fixes, XMLRewriter added.
This commit is contained in:
parent edbcaaf108
commit 2357f108a3
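For orientation before the diff itself: a minimal sketch of the new header-rewriting flow this commit introduces, pieced together from the doctests in pywb/header_rewriter.py below. The module paths and constructor arguments mirror the test setup in the diff; treat the exact values as illustrative.

```python
# Sketch of the new HeaderRewriter API added in this commit (Python 2,
# matching the codebase). Mirrors test_rewrite() in pywb/header_rewriter.py.
from wbrequestresponse import StatusAndHeaders
from url_rewriter import ArchivalUrlRewriter
from header_rewriter import HeaderRewriter

urlrewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
headerrewriter = HeaderRewriter()

status_headers = StatusAndHeaders('302 Redirect',
                                  [('Connection', 'close'),
                                   ('Location', '/other.html')])

rewritten = headerrewriter.rewrite(status_headers, urlrewriter)

# Location is rewritten into the archival namespace; Connection is kept
# under the X-Archive-Orig- prefix; textType/charset drive body rewriting.
print rewritten.status_headers
print rewritten.textType, rewritten.charset
```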
@@ -1,7 +1,7 @@
import urlparse

from wbrequestresponse import WbRequest, WbResponse
from wburlrewriter import ArchivalUrlRewriter
from url_rewriter import ArchivalUrlRewriter

#=================================================================
# ArchivalRequestRouter -- route WB requests in archival mode

@@ -122,7 +122,7 @@ if __name__ == "__main__":
    if not rep:
        return False

    return rep.get_header('Location')
    return rep.status_headers.getHeader('Location')

doctest.testmod()

@@ -1,16 +1,15 @@
import hanzo.warctools

import re
import itertools
import utils
import zlib
import urllib2
import StringIO
import urlparse
import collections
import wbexceptions

from wbrequestresponse import StatusAndHeaders

#=================================================================
class HttpStreamLoader:
class HttpReader:
    def __init__(self, hmac = None, hmacDuration = 30):
        self.hmac = hmac
        self.hmacDuration = hmacDuration

@@ -33,7 +32,7 @@ class HttpStreamLoader:

#=================================================================
# Untested, but for completeness
class FileStreamLoader:
class FileReader:
    def load(self, url, offset, length):
        if url.startswith('file://'):
            url = url[len('file://'):]

@@ -45,27 +44,79 @@ class FileStreamLoader:

#=================================================================
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, record, stream, statusline, httpHeaders')
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers')

#=================================================================
class ArchiveLoader:
    """
    >>> loadTestArchive('example.warc.gz', '333', '1043')
    (('warc', 'response'),
     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
      ('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
      ('WARC-Date', '2014-01-03T03:03:21Z'),
      ('Content-Length', '1610'),
      ('Content-Type', 'application/http; msgtype=response'),
      ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
      ('WARC-Target-URI', 'http://example.com?example=1'),
      ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
      ('Cache-Control', 'max-age=604800'),
      ('Content-Type', 'text/html'),
      ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
      ('Etag', '"359670651"'),
      ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
      ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
      ('Server', 'ECS (sjc/4FCE)'),
      ('X-Cache', 'HIT'),
      ('x-ec-custom-error', '1'),
      ('Content-Length', '1270'),
      ('Connection', 'close')]))

    >>> loadTestArchive('example.warc.gz', '1864', '553')
    (('warc', 'revisit'),
     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
      ('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
      ('WARC-Date', '2014-01-03T03:03:41Z'),
      ('Content-Length', '340'),
      ('Content-Type', 'application/http; msgtype=response'),
      ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
      ('WARC-Target-URI', 'http://example.com?example=1'),
      ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
      ('WARC-Profile', 'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
      ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
      ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
      ('Cache-Control', 'max-age=604800'),
      ('Content-Type', 'text/html'),
      ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
      ('Etag', '"359670651"'),
      ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
      ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
      ('Server', 'ECS (sjc/4FCE)'),
      ('X-Cache', 'HIT'),
      ('x-ec-custom-error', '1'),
      ('Content-Length', '1270'),
      ('Connection', 'close')]))
    """

    # Standard ARC headers
    ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]

    # Since loading a range request, can only determine gzip-ness based on file extension
    FORMAT_MAP = {
        '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
        '.arc.gz': (hanzo.warctools.ArcRecord, 'arc', True),
        '.warc': (hanzo.warctools.WarcRecord, 'warc', False),
        '.arc': (hanzo.warctools.ArcRecord, 'arc', False),
        '.warc.gz': ('warc', True),
        '.arc.gz': ('arc', True),
        '.warc': ('warc', False),
        '.arc': ('arc', False),
    }
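Since a range request exposes only the file name, the loader infers both the record format and gzip-ness from the extension. A small sketch of the new lookup, simplified from ArchiveLoader.load() below (the detect_format helper is mine, for illustration):

```python
# Simplified from the extension loop in ArchiveLoader.load() below.
FORMAT_MAP = {
    '.warc.gz': ('warc', True),
    '.arc.gz':  ('arc',  True),
    '.warc':    ('warc', False),
    '.arc':     ('arc',  False),
}

def detect_format(url):
    # Only the extension is visible when issuing an HTTP range request,
    # so gzip-ness has to be inferred from it rather than from content.
    for ext, iformat in FORMAT_MAP.iteritems():
        if url.endswith(ext):
            return iformat
    return None

print detect_format('/some/path/example.warc.gz')   # ('warc', True)
```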

    HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ (\d+.*)$')

    @staticmethod
    def createDefaultLoaders():
        http = HttpStreamLoader()
        file = FileStreamLoader()
        http = HttpReader()
        file = FileReader()
        return {
            'http': http,
            'https': http,

@@ -78,6 +129,10 @@ class ArchiveLoader:
        self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
        self.chunkSize = chunkSize

        self.arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
        self.warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
        self.httpParser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])

    def load(self, url, offset, length):
        urlParts = urlparse.urlsplit(url)

@@ -86,22 +141,19 @@ class ArchiveLoader:
        except Exception:
            raise wbexceptions.UnknownLoaderProtocolException(url)

        loaderCls = None
        theFormat = None

        for ext, (loaderCls, aFormat, gzip) in ArchiveLoader.FORMAT_MAP.iteritems():
        for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
            if url.endswith(ext):
                loaderCls = loaderCls
                aFormat = aFormat
                isGzip = gzip
                theFormat = iformat
                break

        if loaderCls is None:
        if theFormat is None:
            raise wbexceptions.UnknownArchiveFormatException(url)

        if isGzip:
            decomp = zlib.decompressobj(16+zlib.MAX_WBITS)
        else:
            decomp = None
        (aFormat, isGzip) = theFormat

        decomp = utils.create_decompressor() if isGzip else None

        try:
            length = int(length)

@@ -111,73 +163,87 @@ class ArchiveLoader:

        raw = loader.load(url, long(offset), length)

        reader = LineReader(raw, length, self.chunkSize, decomp)

        parser = loaderCls.make_parser()

        if aFormat == 'arc':
            parser.headers = ArchiveLoader.ARC_HEADERS

        (parsed, errors, _) = parser.parse(reader, 0)

        if errors:
            reader.close()
            raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)

        stream = LineReader(raw, length, self.chunkSize, decomp)

        if aFormat == 'arc':
            rec_headers = self.arcParser.parse(stream)
            recType = 'response'
            empty = (utils.get_header(parsed.headers, 'length') == 0)
        else:
            recType = utils.get_header(parsed.headers, 'WARC-Type')
            empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')
            empty = (rec_headers.getHeader('length') == 0)

        elif aFormat == 'warc':
            rec_headers = self.warcParser.parse(stream)
            recType = rec_headers.getHeader('WARC-Type')
            empty = (rec_headers.getHeader('Content-Length') == '0')

        # special case: empty w/arc record (hopefully a revisit)
        if empty:
            statusline = '204 No Content'
            headers = []
            status_headers = StatusAndHeaders('204 No Content', [])

        # special case: warc records that are not expected to have http headers
        # attempt to add 200 status and content-type
        elif recType == 'metadata' or recType == 'resource':
            statusline = '200 OK'
            headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.getHeader('Content-Type'))])

        # special case: http 0.9 response, no status or headers
        #elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')):
        #    statusline = '200 OK'
        #    headers = []
        #elif recType == 'response':
        #    contentType = rec_headers.getHeader('Content-Type')
        #    if contentType and (';version=0.9' in contentType):
        #        status_headers = StatusAndHeaders('200 OK', [])

        # response record: parse HTTP status and headers!
        else:
            (statusline, headers) = self.parseHttpHeaders(reader)
            #(statusline, http_headers) = self.parseHttpHeaders(stream)
            status_headers = self.httpParser.parse(stream)

        return WBArchiveRecord((aFormat, recType), parsed, reader, statusline, headers)
        return WBArchiveRecord((aFormat, recType), rec_headers, stream, status_headers)

    def parseHttpHeaders(self, stream):
        def nextHeaderLine(stream):
            return stream.readline().rstrip()

#=================================================================
class StatusAndHeadersParser:
    def __init__(self, statuslist):
        self.statuslist = statuslist

        line = nextHeaderLine(stream)
        matched = self.HTTP_STATUS_REGEX.match(line)

    def parse(self, stream):
        statusline = stream.readline().rstrip()

        if not matched:
            raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)

        protocolStatus = utils.split_prefix(statusline, self.statuslist)

        if not protocolStatus:
            raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)

        #status = int(matched.group(2))
        statusline = matched.group(1)
        headers = []

        line = nextHeaderLine(stream)

        line = stream.readline().rstrip()
        while line and line != '\r\n':
            name, value = line.split(':', 1)
            value = value.strip()
            headers.append((name, value))
            line = nextHeaderLine(stream)
            header = (name, value.strip())
            headers.append(header)
            line = stream.readline().rstrip()

        return (statusline, headers)
        return StatusAndHeaders(statusline = protocolStatus[1].strip(), headers = headers, protocol = protocolStatus[0])

#=================================================================
class ARCHeadersParser:
    def __init__(self, headernames):
        self.headernames = headernames

    def parse(self, stream):
        headerline = stream.readline().rstrip()

        parts = headerline.split()

        headernames = self.headernames

        if len(parts) != len(headernames):
            raise wbexceptions.InvalidArchiveRecordException('Wrong # of headers, expected arc headers {0}, Found {1}'.format(headernames, parts))

        headers = []

        for name, value in itertools.izip(headernames, parts):
            headers.append((name, value))

        return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0')

#=================================================================
class LineReader:

@@ -217,4 +283,19 @@ class LineReader:
        self.stream = None

#=================================================================
if __name__ == "__main__":
    import doctest
    import os
    import pprint

    testloader = ArchiveLoader()

    def loadTestArchive(test_file, offset, length):
        path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file

        archive = testloader.load(path, offset, length)
        pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))

    doctest.testmod()
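To see the two new parsers side by side outside the WARC plumbing, a hedged sketch feeding them canned record data; StringIO stands in for the LineReader stream, and the class interfaces are exactly as defined in the diff above:

```python
import StringIO

# WARC record headers, parsed by the new StatusAndHeadersParser.
warc_head = 'WARC/1.0\r\nWARC-Type: response\r\nContent-Length: 1610\r\n\r\n'
warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
rec_headers = warcParser.parse(StringIO.StringIO(warc_head))
print rec_headers.protocol                  # 'WARC/1.0'
print rec_headers.getHeader('WARC-Type')    # 'response'

# ARC v1 header line: space-separated fields in ARC_HEADERS order.
arc_line = 'http://example.com/ 93.184.216.119 20140103030321 text/html 1610\n'
arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
print arcParser.parse(StringIO.StringIO(arc_line)).getHeader('length')   # '1610'
```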

pywb/header_rewriter.py (new file, 133 lines)
@@ -0,0 +1,133 @@
from wbrequestresponse import StatusAndHeaders

#=================================================================
class RewrittenStatusAndHeaders:
    def __init__(self, statusline, headers, removedHeaderDict, textType, charset):
        self.status_headers = StatusAndHeaders(statusline, headers)
        self.removedHeaderDict = removedHeaderDict
        self.textType = textType
        self.charset = charset

    def containsRemovedHeader(self, name, value):
        return self.removedHeaderDict.get(name) == value

#=================================================================
class HeaderRewriter:
    """
    # Text with charset
    >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=utf-8')])
    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
      ('X-Archive-Orig-Content-Length', '5'),
      ('Content-Type', 'text/html;charset=utf-8')]), 'charset': 'utf-8', 'textType': 'html', 'removedHeaderDict': {}}

    # Redirect
    >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
    {'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
      ('Location', '/web/20131226101010/http://example.com/other.html')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}

    # gzip
    >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
      ('Content-Type', 'text/javascript')]), 'charset': None, 'textType': 'js', 'removedHeaderDict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}}

    # Binary
    >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
      ('Content-Type', 'image/png'),
      ('X-Archive-Orig-Cookie', 'blah'),
      ('Content-Encoding', 'gzip'),
      ('Transfer-Encoding', 'chunked')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
    """

    REWRITE_TYPES = {
        'html': ['text/html', 'application/xhtml'],
        'css': ['text/css'],
        'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
        'xml': ['/xml', '+xml', '.xml', '.rss'],
    }

    PROXY_HEADERS = ('content-type', 'content-disposition')

    URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')

    ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')

    PROXY_NO_REWRITE_HEADERS = ('content-length', )

    def __init__(self, headerPrefix = 'X-Archive-Orig-'):
        self.headerPrefix = headerPrefix

    def rewrite(self, status_headers, urlrewriter):
        contentType = status_headers.getHeader('Content-Type')
        textType = None
        charset = None
        stripEncoding = False

        if contentType:
            textType = self._extractTextType(contentType)
            if textType:
                charset = self._extractCharSet(contentType)
                stripEncoding = True

        (newHeaders, removedHeaderDict) = self._rewriteHeaders(status_headers.headers, urlrewriter, stripEncoding)

        return RewrittenStatusAndHeaders(status_headers.statusline, newHeaders, removedHeaderDict, textType, charset)

    def _extractTextType(self, contentType):
        for ctype, mimelist in self.REWRITE_TYPES.iteritems():
            if any((mime in contentType) for mime in mimelist):
                return ctype

        return None

    def _extractCharSet(self, contentType):
        CHARSET_TOKEN = 'charset='
        idx = contentType.find(CHARSET_TOKEN)
        if idx < 0:
            return None

        return contentType[idx + len(CHARSET_TOKEN):]

    def _rewriteHeaders(self, headers, urlrewriter, contentRewritten = False):
        newHeaders = []
        removedHeaderDict = {}

        for (name, value) in headers:
            lowername = name.lower()
            if lowername in self.PROXY_HEADERS:
                newHeaders.append((name, value))
            elif lowername in self.URL_REWRITE_HEADERS:
                newHeaders.append((name, urlrewriter.rewrite(value)))
            elif lowername in self.ENCODING_HEADERS:
                if contentRewritten:
                    removedHeaderDict[lowername] = value
                else:
                    newHeaders.append((name, value))
            elif lowername in self.PROXY_NO_REWRITE_HEADERS and not contentRewritten:
                newHeaders.append((name, value))
            else:
                newHeaders.append((self.headerPrefix + name, value))

        return (newHeaders, removedHeaderDict)

if __name__ == "__main__":
    import doctest
    import os
    import pprint
    import url_rewriter

    urlrewriter = url_rewriter.ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

    headerrewriter = HeaderRewriter()

    def test_rewrite(headers, status = '200 OK'):
        rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
        return vars(rewritten)

    doctest.testmod()
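The content-type sniffing helpers are easy to check in isolation; a short sketch (method names as defined above, values mirroring the first doctest):

```python
hr = HeaderRewriter()

# Maps a Content-Type onto one of the REWRITE_TYPES buckets.
print hr._extractTextType('text/html;charset=utf-8')    # 'html'
print hr._extractTextType('image/png')                  # None -> passthrough

# Pulls the charset parameter, if any, out of the Content-Type value.
print hr._extractCharSet('text/html;charset=utf-8')     # 'utf-8'
print hr._extractCharSet('text/html')                   # None
```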

@@ -5,8 +5,8 @@ import sys
import re

from HTMLParser import HTMLParser
from wburlrewriter import ArchivalUrlRewriter
from regexmatch import JSRewriter, CSSRewriter
from url_rewriter import ArchivalUrlRewriter
from regex_rewriters import JSRewriter, CSSRewriter

#=================================================================
# WBHtml -- html parser for custom rewriting, also handlers for script and css

@@ -10,37 +10,18 @@ class RemoteCDXServer:
    >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
    >>> pprint(x[0])
    {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
     'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
     'length': '1792',
     'mimetype': 'text/html',
     'offset': '49482198',
     'original': 'http://example.com:80/',
     'redirect': '-',
     'robotflags': '-',
     'statuscode': '200',
     'timestamp': '20020120142510',
     'urlkey': 'com,example)/'}

    >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
    >>> pprint(x[0])
    {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
     'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
     'length': '523',
     'mimetype': 'warc/revisit',
     'offset': '247256770',
     'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
     'orig.length': '529',
     'orig.offset': '769759',
     'original': 'http://www.example.com/',
     'redirect': '-',
     'robotflags': '-',
     'statuscode': '-',
     'timestamp': '20131210052355',
     'urlkey': 'com,example)/'}
    """
    """

    def __init__(self, serverUrl):
    def __init__(self, serverUrl, cookie = None):
        self.serverUrl = serverUrl
        self.authCookie = cookie

    def load(self, url, params = {}, parse_cdx = False, **kwvalues):
        #url is required, must be passed explicitly!

@@ -51,6 +32,10 @@ class RemoteCDXServer:
        try:
            request = urllib2.Request(self.serverUrl, urlparams)

            if self.authCookie:
                request.add_header('Cookie', self.authCookie)

            response = urllib2.urlopen(request)
        except urllib2.HTTPError, e:
            if e.code == 403:

@@ -91,6 +76,9 @@ class RemoteCDXServer:

class CDXCaptureResult(dict):
    CDX_FORMATS = [
        # Public CDX Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],

        # CDX 11 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

@@ -4,8 +4,11 @@ import wbrequestresponse
import wbexceptions

class QueryHandler:
    def __init__(self):
        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
    def __init__(self, cdxserver = None):
        if not cdxserver:
            cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

        self.cdxserver = cdxserver

    def __call__(self, wbrequest, prev_wbresponse):
        wburl = wbrequest.wb_url
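QueryHandler now takes an injectable CDX server, and RemoteCDXServer grew an optional auth cookie. A hedged construction sketch (the cookie value is a placeholder):

```python
import indexreader
from query import QueryHandler

# Default behavior: falls back to the public CDX server.
q = QueryHandler()

# Injected: same server, but every request sends an auth cookie
# (placeholder value) via the new 'cookie' parameter.
cdx = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx',
                                  cookie = 'cdx-auth=PLACEHOLDER')
q = QueryHandler(cdx)
```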

@@ -2,12 +2,13 @@ import re
import sys
import itertools

from wburlrewriter import ArchivalUrlRewriter
from url_rewriter import ArchivalUrlRewriter

#=================================================================
class RegexRewriter:
    """
    # Test https->http converter (other tests below in subclasses)
    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_REGEX, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
    'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
    """

@@ -27,7 +28,7 @@ class RegexRewriter:
    def archivalRewrite(rewriter):
        return lambda x: rewriter.rewrite(x)

    HTTPX_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
    HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

    DEFAULT_OP = addPrefix

@@ -44,6 +45,9 @@ class RegexRewriter:
        self.regex = re.compile(regexStr, re.M)
        self.rules = rules

    def filter(self, m):
        return True

    def replaceAll(self, string):
        return self.regex.sub(lambda x: self.replace(x), string)

@@ -60,6 +64,10 @@ class RegexRewriter:
            if not m.group(i):
                continue

            # Optional filter to skip matches
            if not self.filter(m):
                return m.group(0)

            # Custom func
            if not hasattr(op, '__call__'):
                op = RegexRewriter.DEFAULT_OP(op)

@@ -74,6 +82,7 @@ class RegexRewriter:

#=================================================================
class JSRewriter(RegexRewriter):
    """
    >>> test_js('location = "http://example.com/abc.html"')

@@ -100,11 +109,47 @@ class JSRewriter(RegexRewriter):

    def _createRules(self, httpPrefix):
        return [
            (RegexRewriter.HTTPX_MATCH_REGEX, httpPrefix, 0),
            (RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
            ('location|domain', 'WB_wombat_', 0),
        ]

#=================================================================
class XMLRewriter(RegexRewriter):
    """
    >>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
    '<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
    '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
    '<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'

    >>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
    '<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
    """

    def __init__(self, rewriter, extra = []):
        rules = self._createRules(rewriter.getAbsUrl())

        RegexRewriter.__init__(self, rules)

    # custom filter to reject 'xmlns' attr
    def filter(self, m):
        attr = m.group(1)
        if attr and attr.startswith('xmlns'):
            return False

        return True

    def _createRules(self, httpPrefix):
        return [
            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', httpPrefix, 2),
        ]

#=================================================================
class CSSRewriter(RegexRewriter):
    r"""
    >>> test_css("background: url('/some/path.html')")

@@ -172,6 +217,9 @@ if __name__ == "__main__":
    def test_js(string, extra = []):
        return JSRewriter(arcrw, extra).replaceAll(string)

    def test_xml(string):
        return XMLRewriter(arcrw).replaceAll(string)

    def test_css(string):
        return CSSRewriter(arcrw).replaceAll(string)
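The new XMLRewriter is the piece the commit message calls out: it rewrites URLs appearing in XML text and attribute values, while its filter() skips xmlns namespace declarations. A hedged usage sketch; the rewriter construction here assumes the same '/web/20131010im_/' prefix the doctests above produce:

```python
from url_rewriter import ArchivalUrlRewriter
import regex_rewriters

# Assumed setup matching the doctest prefix above; exact arguments may differ.
arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/file.xml', '/web/')
xmlrw = regex_rewriters.XMLRewriter(arcrw)

# attr value is rewritten; the xmlns declaration is left alone by filter().
print xmlrw.replaceAll('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
```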

pywb/replay.py (184 lines)
@@ -1,14 +1,18 @@
import StringIO
from urllib2 import URLError
import chardet
import redis

import indexreader
from wbrequestresponse import WbResponse
from wbrequestresponse import WbResponse, StatusAndHeaders
from wbarchivalurl import ArchivalUrl
import utils
from wburlrewriter import ArchivalUrlRewriter

import wbhtml
import regexmatch
from url_rewriter import ArchivalUrlRewriter
from header_rewriter import HeaderRewriter
import html_rewriter
import regex_rewriters

import wbexceptions

#=================================================================

@@ -111,19 +115,19 @@ class ReplayHandler(object):
            payloadRecord = self._load(cdx, True, failedFiles)

            # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
            if not headersRecord.httpHeaders:
            if not headersRecord.status_headers.headers:
                headersRecord.stream.close()
                headersRecord = payloadRecord
            else:
                headersRecord.stream.close()

            isRevisit = True

        else:
            raise wbexceptions.CaptureException('Invalid CDX' + cdx)

        return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)
        return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)

    def resolveFull(self, filename):

@@ -140,26 +144,12 @@ class ReplayHandler(object):
#=================================================================
class RewritingReplayHandler(ReplayHandler):

    REWRITE_TYPES = {
        'html': ['text/html', 'application/xhtml'],
        'css': ['text/css'],
        'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
        'xml': ['/xml', '+xml', '.xml', '.rss'],
    }

    PROXY_HEADERS = ('content-type', 'content-disposition')

    URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')

    ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')

    def __init__(self, resolvers, archiveloader, headerPrefix = 'X-Archive-Orig-', headInsert = None):
    def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None):
        ReplayHandler.__init__(self, resolvers, archiveloader)
        self.headerPrefix = headerPrefix
        self.headInsert = headInsert
        if not headerRewriter:
            headerRewriter = HeaderRewriter()
        self.headerRewriter = headerRewriter

    def _textContentType(self, contentType):

@@ -183,88 +173,94 @@ class RewritingReplayHandler(ReplayHandler):
        if wbrequest.wb_url.mod == 'id_':
            return response

        contentType = utils.get_header(response.headersList, 'Content-Type')

        textType = self._textContentType(contentType) if contentType else None

        (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, urlrewriter, textType is not None)

        # binary type, just send through
        if textType is None:
            response.headersList = newHeaders
        rewrittenHeaders = self.headerRewriter.rewrite(response.status_headers, urlrewriter)

        # non-text content type, just send through with rewritten headers
        if rewrittenHeaders.textType is None:
            response.status_headers = rewrittenHeaders.status_headers
            return response

        # Handle text rewriting
        # TODO: better way to pass this
        # TODO: better way to pass this?
        stream = response._stream

        # special case -- need to ungzip the body
        if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))):
            stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS))
        if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
            stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())

        return self._rewriteContent(textType, urlrewriter, stream, newHeaders, response)
        # TODO: is this right?
        if rewrittenHeaders.charset:
            encoding = rewrittenHeaders.charset
            firstBuff = None
        else:
            (encoding, firstBuff) = self._detectCharset(stream)

    # TODO: first non-streaming attempt, probably want to stream
    def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
        if textType == 'html':
            out = StringIO.StringIO()
            #out = SimpleWriter()
            htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)

        # if ascii, set to noop encode operation
        if encoding == 'ascii':
            encoding = None
            #encoding = 'utf-8'

        try:
            buff = stream.read()
            while buff:

        # Buffering response for html, streaming for others?
        if rewrittenHeaders.textType == 'html':
            return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
        else:
            return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)

    def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None):
        out = StringIO.StringIO()
        htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert)

        try:
            buff = firstBuff if firstBuff else stream.read()
            while buff:
                if encoding:
                    buff = buff.decode(encoding)
                htmlrewriter.feed(buff)
                buff = stream.read()

            htmlrewriter.close()
            # Close rewriter if gracefully made it to end
            htmlrewriter.close()

        #except Exception as e:
        #    print e
        finally:
            content = out.getvalue()
            if encoding:
                content = content.encode(encoding)

        finally:
            content = out.getvalue().encode(encoding)
            value = [content]
            newHeaders.append(('Content-Length', str(len(value[0]))))
            contentLengthStr = str(len(content))
            status_headers.headers.append(('Content-Length', contentLengthStr))
            out.close()

        return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)

        else:
            if textType == 'css':
                rewriter = regexmatch.CSSRewriter(urlrewriter)
            elif textType == 'js':
                rewriter = regexmatch.JSRewriter(urlrewriter)

            def doRewrite(buff):
                return rewriter.replaceAll(buff)

            return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite)
        return WbResponse(status_headers, value = value)

    def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None):
        if textType == 'css':
            rewriter = regex_rewriters.CSSRewriter(urlrewriter)
        elif textType == 'js':
            rewriter = regex_rewriters.JSRewriter(urlrewriter)
        elif textType == 'xml':
            rewriter = regex_rewriters.XMLRewriter(urlrewriter)

    def _rewriteHeaders(self, headers, urlrewriter, stripEncoding = False):
        newHeaders = []
        removedHeaders = []

        def doRewrite(buff):
            if encoding:
                buff = buff.decode(encoding)
            buff = rewriter.replaceAll(buff)
            if encoding:
                buff = buff.encode(encoding)

        for (name, value) in headers:
            lowername = name.lower()
            if lowername in self.PROXY_HEADERS:
                newHeaders.append((name, value))
            elif lowername in self.URL_REWRITE_HEADERS:
                newHeaders.append((name, urlrewriter.rewrite(value)))
            elif lowername in self.ENCODING_HEADERS:
                if stripEncoding:
                    removedHeaders.append((name, value))
                else:
                    newHeaders.append((name, value))
            else:
                newHeaders.append((self.headerPrefix + name, value))
            return buff

        return (newHeaders, removedHeaders)
        return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff)

    def _detectCharset(self, stream):
        buff = stream.read(8192)
        result = chardet.detect(buff)
        print "chardet result: " + str(result)
        return (result['encoding'], buff)
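_detectCharset leans on the chardet library: it sniffs the first 8 KB and hands back both the guessed encoding and the buffer, so no bytes are lost before rewriting starts. A quick standalone illustration (output values are representative, not guaranteed):

```python
import chardet

buff = '<html><body>caf\xc3\xa9 r\xc3\xa9sum\xc3\xa9</body></html>'
result = chardet.detect(buff)
# chardet returns a dict like {'encoding': 'utf-8', 'confidence': 0.87};
# the exact confidence (and the guess, on tiny inputs) can vary.
print result['encoding'], result['confidence']
```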

    def _checkRedir(self, wbrequest, cdx):
        if cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):

@@ -279,15 +275,15 @@ class RewritingReplayHandler(ReplayHandler):
        wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)

        # Check for self redirect
        if wbresponse.status.startswith('3'):
            if self.isSelfRedirect(wbrequest, wbresponse.headersList):
        if wbresponse.status_headers.statusline.startswith('3'):
            if self.isSelfRedirect(wbrequest, wbresponse.status_headers):
                raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

        return wbresponse

    def isSelfRedirect(self, wbrequest, httpHeaders):
    def isSelfRedirect(self, wbrequest, status_headers):
        requestUrl = wbrequest.wb_url.url.lower()
        locationUrl = utils.get_header(httpHeaders, 'Location').lower()
        locationUrl = status_headers.getHeader('Location').lower()
        #return requestUrl == locationUrl
        return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))

@@ -301,4 +297,16 @@ def PrefixResolver(prefix, contains):

    return makeUrl

#======================================
class RedisResolver:
    def __init__(self, redisUrl, keyPrefix = 'w:'):
        self.redisUrl = redisUrl
        self.keyPrefix = keyPrefix
        self.redis = redis.StrictRedis.from_url(redisUrl)

    def __call__(self, filename):
        try:
            return self.redis.hget(self.keyPrefix + filename, 'path')
        except Exception as e:
            print e
            return None

@@ -1,6 +1,7 @@
import itertools
import hmac
import time
import zlib

def peek_iter(iterable):
    try:

@@ -11,21 +12,15 @@ def peek_iter(iterable):
    return itertools.chain([first], iterable)

def get_header(headersList, name):
    nameLower = name.lower()
    for value in headersList:
        if (value[0].lower() == nameLower):
            return value[1]

def split_prefix(key, prefixs):
    for p in prefixs:
        if key.startswith(p):
            plen = len(p)
            return (key[:plen], key[plen:])

    return None
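split_prefix replaces the old regex-based status matching: it returns the matched prefix and the remainder as a pair, or None. For example:

```python
# How StatusAndHeadersParser uses it: match a protocol prefix, keep the rest.
print split_prefix('HTTP/1.1 200 OK', ['HTTP/1.0', 'HTTP/1.1'])
# ('HTTP/1.1', ' 200 OK')

print split_prefix('ICY 200 OK', ['HTTP/1.0', 'HTTP/1.1'])
# None -> the caller raises InvalidArchiveRecordException
```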

def contains_header(headersList, seekHeader):
    header = get_header(headersList, seekHeader[0])
    if not header:
        return False

    # see if found header matches value!
    return (header == seekHeader[1])

def create_decompressor():
    return zlib.decompressobj(16 + zlib.MAX_WBITS)
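The 16 + zlib.MAX_WBITS value tells zlib to expect a gzip wrapper (header and CRC trailer) rather than a raw zlib stream, which is what gzipped WARC/ARC members are. A self-contained check of the helper:

```python
import gzip
import zlib
import StringIO

# Build a small gzip member in memory, then inflate it with the helper.
buf = StringIO.StringIO()
gz = gzip.GzipFile(fileobj = buf, mode = 'wb')
gz.write('hello warc record')
gz.close()

decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)   # same as create_decompressor()
print decomp.decompress(buf.getvalue())            # 'hello warc record'
```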

class HMACCookieMaker:
    def __init__(self, key, name):

@@ -2,7 +2,7 @@ from query import QueryHandler
from replay import FullHandler
import wbexceptions

from wbrequestresponse import WbResponse
from wbrequestresponse import WbResponse, StatusAndHeaders
from archivalrouter import ArchivalRequestRouter

@@ -17,10 +17,11 @@ class WBHandler:

## ===========
query = QueryHandler()

import testwb

query = QueryHandler(testwb.createCdxServer())

headInsert = """

<!-- WB Insert -->

@@ -54,7 +55,11 @@ def application(env, start_response):
            raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')

    except wbexceptions.InternalRedirect as ir:
        response = WbResponse(status = ir.status, headersList = ir.httpHeaders)
        response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))

    except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
        print "[INFO]: " + str(e)
        response = handleException(env, e)

    except Exception as e:
        last_exc = e

@@ -1,5 +1,7 @@
from wbarchivalurl import ArchivalUrl
import utils

import pprint

#WB Request and Response

class WbRequest:

@@ -80,38 +82,36 @@ class WbRequest:
class WbResponse:
    """
    >>> WbResponse.text_response('Test')
    {'status': '200 OK', 'body': ['Test'], 'headersList': [('Content-Type', 'text/plain')]}
    {'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}

    >>> WbResponse.text_stream(['Test', 'Another'], '404')
    {'status': '404', 'body': ['Test', 'Another'], 'headersList': [('Content-Type', 'text/plain')]}
    {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}

    >>> WbResponse.redir_response('http://example.com/otherfile')
    {'status': '302 Redirect', 'body': [], 'headersList': [('Location', 'http://example.com/otherfile')]}
    {'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
    """

    def __init__(self, status, value = [], headersList = []):
        self.status = status
    def __init__(self, status_headers, value = []):
        self.status_headers = status_headers
        self.body = value
        self.headersList = headersList

    @staticmethod
    def text_stream(text, status = '200 OK'):
        return WbResponse(status, value = text, headersList = [('Content-Type', 'text/plain')])
        return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = text)

    @staticmethod
    def text_response(text, status = '200 OK'):
        return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])
        return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = [text])

    @staticmethod
    def redir_response(location, status = '302 Redirect'):
        return WbResponse(status, headersList = [('Location', location)])
        return WbResponse(StatusAndHeaders(status, [('Location', location)]))

    @staticmethod
    def stream_response(statusline, headers, stream, proc = None):
    def stream_response(status_headers, stream, proc = None, firstBuff = None):
        def streamGen():
            try:
                buff = stream.read()
                buff = firstBuff if firstBuff else stream.read()
                while buff:
                    if proc:
                        buff = proc(buff)

@@ -120,25 +120,12 @@ class WbResponse:
            finally:
                stream.close()

        response = WbResponse(statusline, headersList = headers, value = streamGen())
        response = WbResponse(status_headers, value = streamGen())
        response._stream = stream
        return response

    @staticmethod
    def better_timestamp_response(wbrequest, newTimestamp):
        wbrequest.wb_url.timestamp = newTimestamp
        newUrl = wbrequest.wb_prefix + str(wbrequest.wb_url)[1:]
        return WbResponse.redir_response(newUrl)

    def get_header(self, name):
        return utils.get_header(self.headersList, name)

    def __call__(self, env, start_response):
        #headersList = []
        #for key, value in self.headers.iteritems():
        #    headersList.append((key, value))

        start_response(self.status, self.headersList)
        start_response(self.status_headers.statusline, self.status_headers.headers)

        if env['REQUEST_METHOD'] == 'HEAD':
            if hasattr(self.body, 'close'):

@@ -155,6 +142,28 @@ class WbResponse:
        return str(vars(self))

#=================================================================
class StatusAndHeaders:
    def __init__(self, statusline, headers, protocol = ''):
        self.statusline = statusline
        self.headers = headers
        self.protocol = protocol

    def getHeader(self, name):
        nameLower = name.lower()
        for value in self.headers:
            if (value[0].lower() == nameLower):
                return value[1]

        return None

    def __repr__(self):
        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
        #return pprint.pformat(self.__dict__)

    def __eq__(self, other):
        return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol
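StatusAndHeaders is the small value object the whole commit pivots on: a status line, an ordered header list, and an optional protocol, with case-insensitive lookup. A usage sketch matching the class as defined above:

```python
sah = StatusAndHeaders('200 OK',
                       [('Content-Type', 'text/html'),
                        ('Content-Length', '1270')],
                       protocol = 'HTTP/1.1')

print sah.getHeader('content-type')    # 'text/html' -- lookup ignores case
print sah.getHeader('X-Missing')       # None
print sah                              # uses the __repr__ defined above
```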

if __name__ == "__main__":
    import doctest