1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rename rewriters

header_rewriter added!
support for encoding detection
various fixes
xmlrewriter
This commit is contained in:
Ilya Kreymer 2014-01-03 13:03:03 -08:00
parent edbcaaf108
commit 2357f108a3
12 changed files with 499 additions and 229 deletions

View File

@ -1,7 +1,7 @@
import urlparse
from wbrequestresponse import WbRequest, WbResponse
from wburlrewriter import ArchivalUrlRewriter
from url_rewriter import ArchivalUrlRewriter
#=================================================================
# ArchivalRequestRouter -- route WB requests in archival mode
@ -122,7 +122,7 @@ if __name__ == "__main__":
if not rep:
return False
return rep.get_header('Location')
return rep.status_headers.getHeader('Location')
doctest.testmod()

View File

@ -1,16 +1,15 @@
import hanzo.warctools
import re
import itertools
import utils
import zlib
import urllib2
import StringIO
import urlparse
import collections
import wbexceptions
from wbrequestresponse import StatusAndHeaders
#=================================================================
class HttpStreamLoader:
class HttpReader:
def __init__(self, hmac = None, hmacDuration = 30):
self.hmac = hmac
self.hmacDuration = hmacDuration
@ -33,7 +32,7 @@ class HttpStreamLoader:
#=================================================================
# Untested, but for completeness
class FileStreamLoader:
class FileReader:
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
@ -45,27 +44,79 @@ class FileStreamLoader:
#=================================================================
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, record, stream, statusline, httpHeaders')
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers')
#=================================================================
class ArchiveLoader:
"""
>>> loadTestArchive('example.warc.gz', '333', '1043')
(('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
('WARC-Date', '2014-01-03T03:03:21Z'),
('Content-Length', '1610'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
>>> loadTestArchive('example.warc.gz', '1864', '553')
(('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
('WARC-Date', '2014-01-03T03:03:41Z'),
('Content-Length', '340'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
( 'WARC-Profile',
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
"""
# Standard ARC headers
ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]
# Since loading a range request, can only determine gzip-ness based on file extension
FORMAT_MAP = {
'.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
'.arc.gz': (hanzo.warctools.ArcRecord, 'arc', True),
'.warc': (hanzo.warctools.WarcRecord, 'warc', False),
'.arc': (hanzo.warctools.ArcRecord, 'arc', False),
'.warc.gz': ('warc', True),
'.arc.gz': ('arc', True),
'.warc': ('warc', False),
'.arc': ('arc', False),
}
HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ (\d+.*)$')
@staticmethod
def createDefaultLoaders():
http = HttpStreamLoader()
file = FileStreamLoader()
http = HttpReader()
file = FileReader()
return {
'http': http,
'https': http,
@ -78,6 +129,10 @@ class ArchiveLoader:
self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
self.chunkSize = chunkSize
self.arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
self.warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
self.httpParser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
def load(self, url, offset, length):
urlParts = urlparse.urlsplit(url)
@ -86,22 +141,19 @@ class ArchiveLoader:
except Exception:
raise wbexceptions.UnknownLoaderProtocolException(url)
loaderCls = None
theFormat = None
for ext, (loaderCls, aFormat, gzip) in ArchiveLoader.FORMAT_MAP.iteritems():
for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
if url.endswith(ext):
loaderCls = loaderCls
aFormat = aFormat
isGzip = gzip
theFormat = iformat
break
if loaderCls is None:
if theFormat is None:
raise wbexceptions.UnknownArchiveFormatException(url)
if isGzip:
decomp = zlib.decompressobj(16+zlib.MAX_WBITS)
else:
decomp = None
(aFormat, isGzip) = theFormat
decomp = utils.create_decompressor() if isGzip else None
try:
length = int(length)
@ -111,73 +163,87 @@ class ArchiveLoader:
raw = loader.load(url, long(offset), length)
reader = LineReader(raw, length, self.chunkSize, decomp)
parser = loaderCls.make_parser()
if aFormat == 'arc':
parser.headers = ArchiveLoader.ARC_HEADERS
(parsed, errors, _) = parser.parse(reader, 0)
if errors:
reader.close()
raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)
stream = LineReader(raw, length, self.chunkSize, decomp)
if aFormat == 'arc':
rec_headers = self.arcParser.parse(stream)
recType = 'response'
empty = (utils.get_header(parsed.headers, 'length') == 0)
else:
recType = utils.get_header(parsed.headers, 'WARC-Type')
empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')
empty = (rec_headers.getHeader('length') == 0)
elif aFormat == 'warc':
rec_headers = self.warcParser.parse(stream)
recType = rec_headers.getHeader('WARC-Type')
empty = (rec_headers.getHeader('Content-Length') == '0')
# special case: empty w/arc record (hopefully a revisit)
if empty:
statusline = '204 No Content'
headers = []
status_headers = StatusAndHeaders('204 No Content', [])
# special case: warc records that are not expected to have http headers
# attempt to add 200 status and content-type
elif recType == 'metadata' or recType == 'resource':
statusline = '200 OK'
headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.getHeader('Content-Type'))])
# special case: http 0.9 response, no status or headers
#elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')):
# statusline = '200 OK'
# headers = []
#elif recType == 'response':
# contentType = rec_headers.getHeader('Content-Type')
# if contentType and (';version=0.9' in contentType):
# status_headers = StatusAndHeaders('200 OK', [])
# response record: parse HTTP status and headers!
else:
(statusline, headers) = self.parseHttpHeaders(reader)
#(statusline, http_headers) = self.parseHttpHeaders(stream)
status_headers = self.httpParser.parse(stream)
return WBArchiveRecord((aFormat, recType), parsed, reader, statusline, headers)
return WBArchiveRecord((aFormat, recType), rec_headers, stream, status_headers)
def parseHttpHeaders(self, stream):
def nextHeaderLine(stream):
return stream.readline().rstrip()
#=================================================================
class StatusAndHeadersParser:
def __init__(self, statuslist):
self.statuslist = statuslist
line = nextHeaderLine(stream)
matched = self.HTTP_STATUS_REGEX.match(line)
def parse(self, stream):
statusline = stream.readline().rstrip()
if not matched:
raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)
protocolStatus = utils.split_prefix(statusline, self.statuslist)
if not protocolStatus:
raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)
#status = int(matched.group(2))
statusline = matched.group(1)
headers = []
line = nextHeaderLine(stream)
line = stream.readline().rstrip()
while line and line != '\r\n':
name, value = line.split(':', 1)
value = value.strip()
headers.append((name, value))
line = nextHeaderLine(stream)
header = (name, value.strip())
headers.append(header)
line = stream.readline().rstrip()
return (statusline, headers)
return StatusAndHeaders(statusline = protocolStatus[1].strip(), headers = headers, protocol = protocolStatus[0])
#=================================================================
class ARCHeadersParser:
    """Parse the single space-delimited ARC record header line into a
    StatusAndHeaders object, pairing each field with its expected name."""

    def __init__(self, headernames):
        # expected field names, in on-disk order (e.g. ArchiveLoader.ARC_HEADERS)
        self.headernames = headernames

    def parse(self, stream):
        """Read one ARC header line from stream and return StatusAndHeaders.

        Raises wbexceptions.InvalidArchiveRecordException when the number of
        space-separated fields does not match the expected header names.
        """
        headerline = stream.readline().rstrip()

        parts = headerline.split()

        headernames = self.headernames

        if len(parts) != len(headernames):
            # BUG FIX: corrected typo 'heaeders' -> 'headers' in the error message
            raise wbexceptions.InvalidArchiveRecordException('Wrong # of headers, expected arc headers {0}, Found {1}'.format(headernames, parts))

        headers = []

        # izip: python2 lazy pairing of names with the parsed field values
        for name, value in itertools.izip(headernames, parts):
            headers.append((name, value))

        # ARC records carry no status line; use a synthetic 'ARC/1.0' protocol
        return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0')
#=================================================================
class LineReader:
@ -217,4 +283,19 @@ class LineReader:
self.stream = None
#=================================================================
if __name__ == "__main__":
    import doctest
    import os
    import pprint

    # module-level loader shared by the doctest helper below
    testloader = ArchiveLoader()

    # doctest helper: load one record from the bundled test archive
    # (../test/ relative to this module) and pretty-print its type and headers
    def loadTestArchive(test_file, offset, length):
        path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file

        archive = testloader.load(path, offset, length)

        pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))

    doctest.testmod()

133
pywb/header_rewriter.py Normal file
View File

@ -0,0 +1,133 @@
from wbrequestresponse import StatusAndHeaders
#=================================================================
class RewrittenStatusAndHeaders:
    """Result of rewriting an HTTP status line and header list for replay.

    Bundles the rewritten StatusAndHeaders together with the headers that
    were stripped during rewriting, plus the detected text type and charset.
    """

    def __init__(self, statusline, headers, removedHeaderDict, textType, charset):
        # detection results for downstream content rewriting
        self.charset = charset
        self.textType = textType
        # lowercase header name -> original value, for headers removed by rewriting
        self.removedHeaderDict = removedHeaderDict
        self.status_headers = StatusAndHeaders(statusline, headers)

    def containsRemovedHeader(self, name, value):
        """Return True iff header (name, value) was stripped during rewriting."""
        return value == self.removedHeaderDict.get(name)
#=================================================================
class HeaderRewriter:
    """Rewrite HTTP response headers for archival replay.

    Headers are passed through, url-rewritten, stripped, or prefixed with
    headerPrefix depending on their name; text content types additionally
    get their type and charset extracted for body rewriting.

    # Text with charset
    >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=utf-8')])
    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
      ('X-Archive-Orig-Content-Length', '5'),
      ('Content-Type', 'text/html;charset=utf-8')]), 'charset': 'utf-8', 'textType': 'html', 'removedHeaderDict': {}}

    # Redirect
    >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
    {'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
      ('Location', '/web/20131226101010/http://example.com/other.html')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}

    # gzip
    >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
      ('Content-Type', 'text/javascript')]), 'charset': None, 'textType': 'js', 'removedHeaderDict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}}

    # Binary
    >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
      ('Content-Type', 'image/png'),
      ('X-Archive-Orig-Cookie', 'blah'),
      ('Content-Encoding', 'gzip'),
      ('Transfer-Encoding', 'chunked')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
    """

    # content-type fragments eligible for body rewriting, keyed by short text type
    REWRITE_TYPES = {
        'html': ['text/html', 'application/xhtml'],
        'css': ['text/css'],
        'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
        'xml': ['/xml', '+xml', '.xml', '.rss'],
    }

    # headers always passed through unchanged
    PROXY_HEADERS = ('content-type', 'content-disposition')

    # headers whose value is a url that must be rewritten for replay
    URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')

    # headers that must be dropped when the body itself is rewritten
    ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')

    # BUG FIX: this was a bare string ('content-length'); with a string,
    # `lowername in self.PROXY_NO_REWRITE_HEADERS` matched ANY header name
    # that is a substring of 'content-length' (e.g. 'length'). It must be
    # a one-element tuple.
    PROXY_NO_REWRITE_HEADERS = ('content-length',)

    def __init__(self, headerPrefix = 'X-Archive-Orig-'):
        # prefix applied to headers that are preserved but must not take effect
        self.headerPrefix = headerPrefix

    def rewrite(self, status_headers, urlrewriter):
        """Rewrite status_headers, returning a RewrittenStatusAndHeaders.

        urlrewriter is used to rewrite url-valued headers (e.g. Location).
        """
        contentType = status_headers.getHeader('Content-Type')
        textType = None
        charset = None
        stripEncoding = False

        if contentType:
            textType = self._extractTextType(contentType)
            if textType:
                charset = self._extractCharSet(contentType)
                # the body will be rewritten, so stored encodings no longer apply
                stripEncoding = True

        (newHeaders, removedHeaderDict) = self._rewriteHeaders(status_headers.headers, urlrewriter, stripEncoding)

        return RewrittenStatusAndHeaders(status_headers.statusline, newHeaders, removedHeaderDict, textType, charset)

    def _extractTextType(self, contentType):
        """Return the short text type ('html', 'css', ...) or None if binary."""
        for ctype, mimelist in self.REWRITE_TYPES.iteritems():
            if any ((mime in contentType) for mime in mimelist):
                return ctype

        return None

    def _extractCharSet(self, contentType):
        """Return the charset parameter from a Content-Type value, or None.

        NOTE(review): returns everything after 'charset=', including any
        trailing parameters -- confirm whether values are always last.
        """
        CHARSET_TOKEN = 'charset='
        idx = contentType.find(CHARSET_TOKEN)
        if idx < 0:
            return None

        return contentType[idx + len(CHARSET_TOKEN):]

    def _rewriteHeaders(self, headers, urlrewriter, contentRewritten = False):
        """Classify and transform each (name, value) header.

        Returns (newHeaders, removedHeaderDict) where removedHeaderDict maps
        lowercase names of stripped encoding headers to their values.
        """
        newHeaders = []
        removedHeaderDict = {}

        for (name, value) in headers:
            lowername = name.lower()
            if lowername in self.PROXY_HEADERS:
                newHeaders.append((name, value))
            elif lowername in self.URL_REWRITE_HEADERS:
                newHeaders.append((name, urlrewriter.rewrite(value)))
            elif lowername in self.ENCODING_HEADERS:
                if contentRewritten:
                    removedHeaderDict[lowername] = value
                else:
                    newHeaders.append((name, value))
            elif lowername in self.PROXY_NO_REWRITE_HEADERS and not contentRewritten:
                # content-length stays valid only when the body is untouched
                newHeaders.append((name, value))
            else:
                # preserve but neutralize everything else
                newHeaders.append((self.headerPrefix + name, value))

        return (newHeaders, removedHeaderDict)
if __name__ == "__main__":
    import doctest
    import os
    import pprint
    import url_rewriter

    # sample rewriter targeting a 2013-12-26 capture under '/web/';
    # used by the class doctests via test_rewrite below
    urlrewriter = url_rewriter.ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

    headerrewriter = HeaderRewriter()

    # doctest helper: rewrite the given headers and expose the result's
    # attributes as a plain dict for comparison
    def test_rewrite(headers, status = '200 OK'):
        rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
        return vars(rewritten)

    doctest.testmod()

View File

@ -5,8 +5,8 @@ import sys
import re
from HTMLParser import HTMLParser
from wburlrewriter import ArchivalUrlRewriter
from regexmatch import JSRewriter, CSSRewriter
from url_rewriter import ArchivalUrlRewriter
from regex_rewriters import JSRewriter, CSSRewriter
#=================================================================
# WBHtml --html parser for custom rewriting, also handlers for script and css

View File

@ -10,37 +10,18 @@ class RemoteCDXServer:
>>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
>>> pprint(x[0])
{'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
'length': '1792',
'mimetype': 'text/html',
'offset': '49482198',
'original': 'http://example.com:80/',
'redirect': '-',
'robotflags': '-',
'statuscode': '200',
'timestamp': '20020120142510',
'urlkey': 'com,example)/'}
>>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
>>> pprint(x[0])
{'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
'length': '523',
'mimetype': 'warc/revisit',
'offset': '247256770',
'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
'orig.length': '529',
'orig.offset': '769759',
'original': 'http://www.example.com/',
'redirect': '-',
'robotflags': '-',
'statuscode': '-',
'timestamp': '20131210052355',
'urlkey': 'com,example)/'}
"""
"""
def __init__(self, serverUrl):
def __init__(self, serverUrl, cookie = None):
self.serverUrl = serverUrl
self.authCookie = cookie
def load(self, url, params = {}, parse_cdx = False, **kwvalues):
#url is required, must be passed explicitly!
@ -51,6 +32,10 @@ class RemoteCDXServer:
try:
request = urllib2.Request(self.serverUrl, urlparams)
if self.authCookie:
request.add_header('Cookie', self.authCookie)
response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
if e.code == 403:
@ -91,6 +76,9 @@ class RemoteCDXServer:
class CDXCaptureResult(dict):
CDX_FORMATS = [
# Public CDX Format
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
# CDX 11 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

View File

@ -4,8 +4,11 @@ import wbrequestresponse
import wbexceptions
class QueryHandler:
def __init__(self):
self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
    def __init__(self, cdxserver = None):
        """Create a QueryHandler; a cdxserver may be injected (e.g. for tests).

        Falls back to the public Internet Archive CDX server when omitted.
        """
        if not cdxserver:
            cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
        self.cdxserver = cdxserver
def __call__(self, wbrequest, prev_wbresponse):
wburl = wbrequest.wb_url

View File

@ -2,12 +2,13 @@ import re
import sys
import itertools
from wburlrewriter import ArchivalUrlRewriter
from url_rewriter import ArchivalUrlRewriter
#=================================================================
class RegexRewriter:
"""
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_REGEX, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
"""
@ -27,7 +28,7 @@ class RegexRewriter:
def archivalRewrite(rewriter):
return lambda x: rewriter.rewrite(x)
HTTPX_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = addPrefix
@ -44,6 +45,9 @@ class RegexRewriter:
self.regex = re.compile(regexStr, re.M)
self.rules = rules
def filter(self, m):
return True
def replaceAll(self, string):
return self.regex.sub(lambda x: self.replace(x), string)
@ -60,6 +64,10 @@ class RegexRewriter:
if not m.group(i):
continue
# Optional filter to skip matches
if not self.filter(m):
return m.group(0)
# Custom func
if not hasattr(op, '__call__'):
op = RegexRewriter.DEFAULT_OP(op)
@ -74,6 +82,7 @@ class RegexRewriter:
#=================================================================
class JSRewriter(RegexRewriter):
"""
>>> test_js('location = "http://example.com/abc.html"')
@ -100,11 +109,47 @@ class JSRewriter(RegexRewriter):
def _createRules(self, httpPrefix):
return [
(RegexRewriter.HTTPX_MATCH_REGEX, httpPrefix, 0),
(RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
('location|domain', 'WB_wombat_', 0),
]
#=================================================================
class XMLRewriter(RegexRewriter):
    """Rewrite absolute http(s) urls in xml text and attribute values,
    skipping xml namespace declarations (xmlns... attributes).

    >>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
    '<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
    '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
    '<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'

    >>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
    '<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
    """

    def __init__(self, rewriter, extra = None):
        # BUG FIX: default was a shared mutable list (extra = []); use None.
        # NOTE(review): 'extra' is accepted but currently unused -- kept for
        # signature parity with the other rewriters; confirm before removing.
        rules = self._createRules(rewriter.getAbsUrl())

        RegexRewriter.__init__(self, rules)

    # custom filter: reject a match when its captured attribute name is an
    # xmlns declaration, so namespace uris are left untouched
    def filter(self, m):
        attr = m.group(1)
        if attr and attr.startswith('xmlns'):
            return False

        return True

    def _createRules(self, httpPrefix):
        # group 2 is the url to rewrite; optional group 1 captures a preceding
        # attribute name ('attr=') for filter() to inspect
        return [
            (r'([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', httpPrefix, 2),
        ]
#=================================================================
class CSSRewriter(RegexRewriter):
r"""
>>> test_css("background: url('/some/path.html')")
@ -172,6 +217,9 @@ if __name__ == "__main__":
def test_js(string, extra = []):
return JSRewriter(arcrw, extra).replaceAll(string)
def test_xml(string):
return XMLRewriter(arcrw).replaceAll(string)
def test_css(string):
return CSSRewriter(arcrw).replaceAll(string)

View File

@ -1,14 +1,18 @@
import StringIO
from urllib2 import URLError
import chardet
import redis
import indexreader
from wbrequestresponse import WbResponse
from wbrequestresponse import WbResponse, StatusAndHeaders
from wbarchivalurl import ArchivalUrl
import utils
from wburlrewriter import ArchivalUrlRewriter
import wbhtml
import regexmatch
from url_rewriter import ArchivalUrlRewriter
from header_rewriter import HeaderRewriter
import html_rewriter
import regex_rewriters
import wbexceptions
#=================================================================
@ -111,19 +115,19 @@ class ReplayHandler(object):
payloadRecord = self._load(cdx, True, failedFiles)
# Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
if not headersRecord.httpHeaders:
if not headersRecord.status_headers.headers:
headersRecord.stream.close()
headersRecord = payloadRecord
else:
headersRecord.stream.close()
isRevisit = True
else:
raise wbexceptions.CaptureException('Invalid CDX' + cdx)
return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)
return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)
def resolveFull(self, filename):
@ -140,26 +144,12 @@ class ReplayHandler(object):
#=================================================================
class RewritingReplayHandler(ReplayHandler):
REWRITE_TYPES = {
'html': ['text/html', 'application/xhtml'],
'css': ['text/css'],
'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
'xml': ['/xml', '+xml', '.xml', '.rss'],
}
PROXY_HEADERS = ('content-type', 'content-disposition')
URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')
ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')
def __init__(self, resolvers, archiveloader, headerPrefix = 'X-Archive-Orig-', headInsert = None):
def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None):
ReplayHandler.__init__(self, resolvers, archiveloader)
self.headerPrefix = headerPrefix
self.headInsert = headInsert
if not headerRewriter:
headerRewriter = HeaderRewriter()
self.headerRewriter = headerRewriter
def _textContentType(self, contentType):
@ -183,88 +173,94 @@ class RewritingReplayHandler(ReplayHandler):
if wbrequest.wb_url.mod == 'id_':
return response
contentType = utils.get_header(response.headersList, 'Content-Type')
textType = self._textContentType(contentType) if contentType else None
(newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, urlrewriter, textType is not None)
# binary type, just send through
if textType is None:
response.headersList = newHeaders
rewrittenHeaders = self.headerRewriter.rewrite(response.status_headers, urlrewriter)
# non-text content type, just send through with rewritten headers
if rewrittenHeaders.textType is None:
response.status_headers = rewrittenHeaders.status_headers
return response
# Handle text rewriting
# TODO: better way to pass this
# TODO: better way to pass this?
stream = response._stream
# special case -- need to ungzip the body
if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))):
stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS))
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())
return self._rewriteContent(textType, urlrewriter, stream, newHeaders, response)
# TODO: is this right?
if rewrittenHeaders.charset:
encoding = rewrittenHeaders.charset
firstBuff = None
else:
(encoding, firstBuff) = self._detectCharset(stream)
# TODO: first non-streaming attempt, probably want to stream
def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
if textType == 'html':
out = StringIO.StringIO()
#out = SimpleWriter()
htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
# if ascii, set to noop encode operation
if encoding == 'ascii':
encoding = None
#encoding = 'utf-8'
try:
buff = stream.read()
while buff:
# Buffering response for html, streaming for others?
if rewrittenHeaders.textType == 'html':
return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
else:
return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None):
out = StringIO.StringIO()
htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert)
try:
buff = firstBuff if firstBuff else stream.read()
while buff:
if encoding:
buff = buff.decode(encoding)
htmlrewriter.feed(buff)
buff = stream.read()
htmlrewriter.feed(buff)
buff = stream.read()
htmlrewriter.close()
# Close rewriter if gracefully made it to end
htmlrewriter.close()
#except Exception as e:
# print e
finally:
content = out.getvalue()
if encoding:
content = content.encode(encoding)
finally:
content = out.getvalue().encode(encoding)
value = [content]
newHeaders.append(('Content-Length', str(len(value[0]))))
contentLengthStr = str(len(content))
status_headers.headers.append(('Content-Length', contentLengthStr))
out.close()
return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
else:
if textType == 'css':
rewriter = regexmatch.CSSRewriter(urlrewriter)
elif textType == 'js':
rewriter = regexmatch.JSRewriter(urlrewriter)
def doRewrite(buff):
return rewriter.replaceAll(buff)
return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite)
return WbResponse(status_headers, value = value)
def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None):
if textType == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter)
elif textType == 'js':
rewriter = regex_rewriters.JSRewriter(urlrewriter)
elif textType == 'xml':
rewriter = regex_rewriters.XMLRewriter(urlrewriter)
def _rewriteHeaders(self, headers, urlrewriter, stripEncoding = False):
newHeaders = []
removedHeaders = []
def doRewrite(buff):
if encoding:
buff = buff.decode(encoding)
buff = rewriter.replaceAll(buff)
if encoding:
buff = buff.encode(encoding)
for (name, value) in headers:
lowername = name.lower()
if lowername in self.PROXY_HEADERS:
newHeaders.append((name, value))
elif lowername in self.URL_REWRITE_HEADERS:
newHeaders.append((name, urlrewriter.rewrite(value)))
elif lowername in self.ENCODING_HEADERS:
if stripEncoding:
removedHeaders.append((name, value))
else:
newHeaders.append((name, value))
else:
newHeaders.append((self.headerPrefix + name, value))
return buff
return (newHeaders, removedHeaders)
return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff)
    def _detectCharset(self, stream):
        """Guess the body's character encoding from an initial 8KB sample.

        Returns (encoding, sampled_bytes) so the consumed sample can be
        replayed as the first buffer; chardet may report encoding as None.
        """
        buff = stream.read(8192)
        result = chardet.detect(buff)
        # debug trace of the chardet result (py2 print statement)
        print "chardet result: " + str(result)
        return (result['encoding'], buff)
def _checkRedir(self, wbrequest, cdx):
if cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
@ -279,15 +275,15 @@ class RewritingReplayHandler(ReplayHandler):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)
# Check for self redirect
if wbresponse.status.startswith('3'):
if self.isSelfRedirect(wbrequest, wbresponse.headersList):
if wbresponse.status_headers.statusline.startswith('3'):
if self.isSelfRedirect(wbrequest, wbresponse.status_headers):
raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))
return wbresponse
def isSelfRedirect(self, wbrequest, httpHeaders):
def isSelfRedirect(self, wbrequest, status_headers):
requestUrl = wbrequest.wb_url.url.lower()
locationUrl = utils.get_header(httpHeaders, 'Location').lower()
locationUrl = status_headers.getHeader('Location').lower()
#return requestUrl == locationUrl
return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
@ -301,4 +297,16 @@ def PrefixResolver(prefix, contains):
return makeUrl
#======================================
class RedisResolver:
    """Resolve an archive filename to its full path via a redis hash lookup.

    Looks up the hash stored at '<keyPrefix><filename>' and returns its
    'path' field, or None when unavailable.
    """
    def __init__(self, redisUrl, keyPrefix = 'w:'):
        # redisUrl: redis:// connection url; keyPrefix namespaces the keys
        self.redisUrl = redisUrl
        self.keyPrefix = keyPrefix
        self.redis = redis.StrictRedis.from_url(redisUrl)

    def __call__(self, filename):
        try:
            return self.redis.hget(self.keyPrefix + filename, 'path')
        except Exception as e:
            # deliberate best-effort: log and return None so replay can try
            # other resolvers rather than failing (py2 print statement)
            print e
            return None

View File

@ -1,6 +1,7 @@
import itertools
import hmac
import time
import zlib
def peek_iter(iterable):
try:
@ -11,21 +12,15 @@ def peek_iter(iterable):
return itertools.chain([first], iterable)
def get_header(headersList, name):
nameLower = name.lower()
for value in headersList:
if (value[0].lower() == nameLower):
return value[1]
def split_prefix(key, prefixs):
    """Split key on the first matching prefix.

    Returns (prefix, remainder) for the first entry of prefixs that key
    starts with, or None when no prefix matches.
    """
    match = next((candidate for candidate in prefixs if key.startswith(candidate)), None)
    if match is None:
        return None

    cut = len(match)
    return (key[:cut], key[cut:])
def contains_header(headersList, seekHeader):
header = get_header(headersList, seekHeader[0])
if not header:
return False
# see if found header matches value!
return (header == seekHeader[1])
def create_decompressor():
    """Return a zlib decompressor configured to accept gzip-wrapped data.

    wbits = MAX_WBITS | 16 tells zlib to expect a gzip header/trailer
    rather than a raw zlib stream.
    """
    return zlib.decompressobj(zlib.MAX_WBITS | 16)
class HMACCookieMaker:
def __init__(self, key, name):

View File

@ -2,7 +2,7 @@ from query import QueryHandler
from replay import FullHandler
import wbexceptions
from wbrequestresponse import WbResponse
from wbrequestresponse import WbResponse, StatusAndHeaders
from archivalrouter import ArchivalRequestRouter
@ -17,10 +17,11 @@ class WBHandler:
## ===========
query = QueryHandler()
import testwb
query = QueryHandler(testwb.createCdxServer())
headInsert = """
<!-- WB Insert -->
@ -54,7 +55,11 @@ def application(env, start_response):
raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')
except wbexceptions.InternalRedirect as ir:
response = WbResponse(status = ir.status, headersList = ir.httpHeaders)
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
print "[INFO]: " + str(e)
response = handleException(env, e)
except Exception as e:
last_exc = e

View File

@ -1,5 +1,7 @@
from wbarchivalurl import ArchivalUrl
import utils
import pprint
#WB Request and Response
class WbRequest:
@ -80,38 +82,36 @@ class WbRequest:
class WbResponse:
"""
>>> WbResponse.text_response('Test')
{'status': '200 OK', 'body': ['Test'], 'headersList': [('Content-Type', 'text/plain')]}
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'status': '404', 'body': ['Test', 'Another'], 'headersList': [('Content-Type', 'text/plain')]}
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
{'status': '302 Redirect', 'body': [], 'headersList': [('Location', 'http://example.com/otherfile')]}
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
"""
def __init__(self, status, value = [], headersList = []):
self.status = status
def __init__(self, status_headers, value = []):
self.status_headers = status_headers
self.body = value
self.headersList = headersList
@staticmethod
def text_stream(text, status = '200 OK'):
return WbResponse(status, value = text, headersList = [('Content-Type', 'text/plain')])
return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = text)
@staticmethod
def text_response(text, status = '200 OK'):
return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])
return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = [text])
@staticmethod
def redir_response(location, status = '302 Redirect'):
return WbResponse(status, headersList = [('Location', location)])
return WbResponse(StatusAndHeaders(status, [('Location', location)]))
@staticmethod
def stream_response(statusline, headers, stream, proc = None):
def stream_response(status_headers, stream, proc = None, firstBuff = None):
def streamGen():
try:
buff = stream.read()
buff = firstBuff if firstBuff else stream.read()
while buff:
if proc:
buff = proc(buff)
@ -120,25 +120,12 @@ class WbResponse:
finally:
stream.close()
response = WbResponse(statusline, headersList = headers, value = streamGen())
response = WbResponse(status_headers, value = streamGen())
response._stream = stream
return response
@staticmethod
def better_timestamp_response(wbrequest, newTimestamp):
wbrequest.wb_url.timestamp = newTimestamp
newUrl = wbrequest.wb_prefix + str(wbrequest.wb_url)[1:]
return WbResponse.redir_response(newUrl)
def get_header(self, name):
return utils.get_header(self.headersList, name)
def __call__(self, env, start_response):
#headersList = []
#for key, value in self.headers.iteritems():
# headersList.append((key, value))
start_response(self.status, self.headersList)
start_response(self.status_headers.statusline, self.status_headers.headers)
if env['REQUEST_METHOD'] == 'HEAD':
if hasattr(self.body, 'close'):
@ -155,6 +142,28 @@ class WbResponse:
return str(vars(self))
#=================================================================
class StatusAndHeaders:
    """Container for a protocol + status line and a list of (name, value)
    header tuples, as parsed from an http response or warc record."""
    def __init__(self, statusline, headers, protocol = ''):
        self.statusline = statusline
        self.headers = headers
        self.protocol = protocol

    def getHeader(self, name):
        """Return the value of the first header matching ``name``
        (case-insensitive), or None if absent."""
        nameLower = name.lower()
        for value in self.headers:
            if (value[0].lower() == nameLower):
                return value[1]
        return None

    def __repr__(self):
        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
        #return pprint.pformat(self.__dict__)

    def __eq__(self, other):
        return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol

    def __ne__(self, other):
        # python 2 does not derive != from __eq__; without this, != would
        # fall back to identity comparison and misbehave
        return not self.__eq__(other)
if __name__ == "__main__":
import doctest