mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rename rewriters

header_rewriter added!
support for encoding detection
various fixes
xmlrewriter
Ilya Kreymer 2014-01-03 13:03:03 -08:00
parent edbcaaf108
commit 2357f108a3
12 changed files with 499 additions and 229 deletions
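
This commit renames the rewriter modules (wburlrewriter becomes url_rewriter, wbhtml becomes html_rewriter, regexmatch becomes regex_rewriters) and adds the new header_rewriter module. As a rough sketch of how the renamed pieces now fit together, assembled from the test harnesses in the diffs below (the timestamp and '/web/' prefix are arbitrary example values):

    # Sketch only (Python 2): wiring of the rewriter modules after this commit.
    from url_rewriter import ArchivalUrlRewriter
    from header_rewriter import HeaderRewriter
    from wbrequestresponse import StatusAndHeaders
    import regex_rewriters

    urlrewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

    # Rewrite the recorded headers; the result also carries the detected
    # text type and charset, which drive body rewriting in replay.py
    headerrewriter = HeaderRewriter()
    rewritten = headerrewriter.rewrite(StatusAndHeaders('200 OK', [('Content-Type', 'text/css')]), urlrewriter)

    # Pick a body rewriter for the detected text type ('css' here)
    if rewritten.textType == 'css':
        print regex_rewriters.CSSRewriter(urlrewriter).replaceAll("background: url('/some/path.html')")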

@@ -1,7 +1,7 @@
 import urlparse

 from wbrequestresponse import WbRequest, WbResponse
-from wburlrewriter import ArchivalUrlRewriter
+from url_rewriter import ArchivalUrlRewriter

 #=================================================================
 # ArchivalRequestRouter -- route WB requests in archival mode
@@ -122,7 +122,7 @@ if __name__ == "__main__":
         if not rep:
             return False

-        return rep.get_header('Location')
+        return rep.status_headers.getHeader('Location')

     doctest.testmod()

@@ -1,16 +1,15 @@
-import hanzo.warctools
-import re
+import itertools
 import utils
-import zlib
 import urllib2
 import StringIO
 import urlparse
 import collections
 import wbexceptions

+from wbrequestresponse import StatusAndHeaders
+
 #=================================================================
-class HttpStreamLoader:
+class HttpReader:
     def __init__(self, hmac = None, hmacDuration = 30):
         self.hmac = hmac
         self.hmacDuration = hmacDuration
@@ -33,7 +32,7 @@ class HttpStreamLoader:

 #=================================================================
 # Untested, but for completeness
-class FileStreamLoader:
+class FileReader:
     def load(self, url, offset, length):
         if url.startswith('file://'):
             url = url[len('file://'):]
@@ -45,27 +44,79 @@ class FileStreamLoader:

 #=================================================================
-WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, record, stream, statusline, httpHeaders')
+WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers')

 #=================================================================
 class ArchiveLoader:
+    """
+    >>> loadTestArchive('example.warc.gz', '333', '1043')
+    (('warc', 'response'),
+     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
+       ('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
+       ('WARC-Date', '2014-01-03T03:03:21Z'),
+       ('Content-Length', '1610'),
+       ('Content-Type', 'application/http; msgtype=response'),
+       ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
+       ('WARC-Target-URI', 'http://example.com?example=1'),
+       ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
+     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+       ('Cache-Control', 'max-age=604800'),
+       ('Content-Type', 'text/html'),
+       ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+       ('Etag', '"359670651"'),
+       ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
+       ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
+       ('Server', 'ECS (sjc/4FCE)'),
+       ('X-Cache', 'HIT'),
+       ('x-ec-custom-error', '1'),
+       ('Content-Length', '1270'),
+       ('Connection', 'close')]))
+
+    >>> loadTestArchive('example.warc.gz', '1864', '553')
+    (('warc', 'revisit'),
+     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
+       ('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
+       ('WARC-Date', '2014-01-03T03:03:41Z'),
+       ('Content-Length', '340'),
+       ('Content-Type', 'application/http; msgtype=response'),
+       ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
+       ('WARC-Target-URI', 'http://example.com?example=1'),
+       ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
+       ( 'WARC-Profile',
+         'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
+       ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
+       ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
+     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+       ('Cache-Control', 'max-age=604800'),
+       ('Content-Type', 'text/html'),
+       ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
+       ('Etag', '"359670651"'),
+       ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
+       ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
+       ('Server', 'ECS (sjc/4FCE)'),
+       ('X-Cache', 'HIT'),
+       ('x-ec-custom-error', '1'),
+       ('Content-Length', '1270'),
+       ('Connection', 'close')]))
+    """
+
     # Standard ARC headers
     ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]

     # Since loading a range request, can only determine gzip-ness based on file extension
     FORMAT_MAP = {
-        '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
-        '.arc.gz': (hanzo.warctools.ArcRecord, 'arc', True),
-        '.warc': (hanzo.warctools.WarcRecord, 'warc', False),
-        '.arc': (hanzo.warctools.ArcRecord, 'arc', False),
+        '.warc.gz': ('warc', True),
+        '.arc.gz': ('arc', True),
+        '.warc': ('warc', False),
+        '.arc': ('arc', False),
     }

-    HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ (\d+.*)$')
-
     @staticmethod
     def createDefaultLoaders():
-        http = HttpStreamLoader()
-        file = FileStreamLoader()
+        http = HttpReader()
+        file = FileReader()
         return {
             'http': http,
             'https': http,
@@ -78,6 +129,10 @@ class ArchiveLoader:
         self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
         self.chunkSize = chunkSize

+        self.arcParser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
+        self.warcParser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
+        self.httpParser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
+
     def load(self, url, offset, length):
         urlParts = urlparse.urlsplit(url)
@@ -86,22 +141,19 @@
         except Exception:
             raise wbexceptions.UnknownLoaderProtocolException(url)

-        loaderCls = None
+        theFormat = None

-        for ext, (loaderCls, aFormat, gzip) in ArchiveLoader.FORMAT_MAP.iteritems():
+        for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
             if url.endswith(ext):
-                loaderCls = loaderCls
-                aFormat = aFormat
-                isGzip = gzip
+                theFormat = iformat
                 break

-        if loaderCls is None:
+        if theFormat is None:
             raise wbexceptions.UnknownArchiveFormatException(url)

-        if isGzip:
-            decomp = zlib.decompressobj(16+zlib.MAX_WBITS)
-        else:
-            decomp = None
+        (aFormat, isGzip) = theFormat
+
+        decomp = utils.create_decompressor() if isGzip else None

         try:
             length = int(length)
@@ -111,73 +163,87 @@
         raw = loader.load(url, long(offset), length)

-        reader = LineReader(raw, length, self.chunkSize, decomp)
+        stream = LineReader(raw, length, self.chunkSize, decomp)

-        parser = loaderCls.make_parser()
-
-        if aFormat == 'arc':
-            parser.headers = ArchiveLoader.ARC_HEADERS
-
-        (parsed, errors, _) = parser.parse(reader, 0)
-
-        if errors:
-            reader.close()
-            raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)
-
         if aFormat == 'arc':
+            rec_headers = self.arcParser.parse(stream)
             recType = 'response'
-            empty = (utils.get_header(parsed.headers, 'length') == 0)
-        else:
-            recType = utils.get_header(parsed.headers, 'WARC-Type')
-            empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')
+            empty = (rec_headers.getHeader('length') == 0)
+
+        elif aFormat == 'warc':
+            rec_headers = self.warcParser.parse(stream)
+            recType = rec_headers.getHeader('WARC-Type')
+            empty = (rec_headers.getHeader('Content-Length') == '0')

         # special case: empty w/arc record (hopefully a revisit)
         if empty:
-            statusline = '204 No Content'
-            headers = []
+            status_headers = StatusAndHeaders('204 No Content', [])

         # special case: warc records that are not expected to have http headers
         # attempt to add 200 status and content-type
         elif recType == 'metadata' or recType == 'resource':
-            statusline = '200 OK'
-            headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
+            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.getHeader('Content-Type'))])

         # special case: http 0.9 response, no status or headers
-        #elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')):
-        #    statusline = '200 OK'
-        #    headers = []
+        #elif recType == 'response':
+        #    contentType = rec_headers.getHeader('Content-Type')
+        #    if contentType and (';version=0.9' in contentType):
+        #        status_headers = StatusAndHeaders('200 OK', [])

         # response record: parse HTTP status and headers!
         else:
-            (statusline, headers) = self.parseHttpHeaders(reader)
+            #(statusline, http_headers) = self.parseHttpHeaders(stream)
+            status_headers = self.httpParser.parse(stream)

-        return WBArchiveRecord((aFormat, recType), parsed, reader, statusline, headers)
+        return WBArchiveRecord((aFormat, recType), rec_headers, stream, status_headers)

-    def parseHttpHeaders(self, stream):
-        def nextHeaderLine(stream):
-            return stream.readline().rstrip()
-
-        line = nextHeaderLine(stream)
-        matched = self.HTTP_STATUS_REGEX.match(line)
-        if not matched:
-            raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)
-
-        #status = int(matched.group(2))
-        statusline = matched.group(1)
-
-        headers = []
-        line = nextHeaderLine(stream)
-
-        while line and line != '\r\n':
-            name, value = line.split(':', 1)
-            value = value.strip()
-            headers.append((name, value))
-            line = nextHeaderLine(stream)
-
-        return (statusline, headers)
+
+#=================================================================
+class StatusAndHeadersParser:
+    def __init__(self, statuslist):
+        self.statuslist = statuslist
+
+    def parse(self, stream):
+        statusline = stream.readline().rstrip()
+        protocolStatus = utils.split_prefix(statusline, self.statuslist)
+
+        if not protocolStatus:
+            raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)
+
+        headers = []
+        line = stream.readline().rstrip()
+
+        while line and line != '\r\n':
+            name, value = line.split(':', 1)
+            header = (name, value.strip())
+            headers.append(header)
+            line = stream.readline().rstrip()
+
+        return StatusAndHeaders(statusline = protocolStatus[1].strip(), headers = headers, protocol = protocolStatus[0])
+
+
+#=================================================================
+class ARCHeadersParser:
+    def __init__(self, headernames):
+        self.headernames = headernames
+
+    def parse(self, stream):
+        headerline = stream.readline().rstrip()
+        parts = headerline.split()
+        headernames = self.headernames
+
+        if len(parts) != len(headernames):
+            raise wbexceptions.InvalidArchiveRecordException('Wrong # of headers, expected arc headers {0}, Found {1}'.format(headernames, parts))
+
+        headers = []
+
+        for name, value in itertools.izip(headernames, parts):
+            headers.append((name, value))
+
+        return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0')

 #=================================================================
 class LineReader:
@@ -217,4 +283,19 @@ class LineReader:
         self.stream = None

+
+#=================================================================
+if __name__ == "__main__":
+    import doctest
+    import os
+    import pprint
+
+    testloader = ArchiveLoader()
+
+    def loadTestArchive(test_file, offset, length):
+        path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file
+        archive = testloader.load(path, offset, length)
+        pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
+
+    doctest.testmod()
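
For reference, the test hook above is also how the reworked loader is used directly; a minimal sketch under the same assumptions (a test/example.warc.gz whose record offsets match the doctest values):

    # Sketch: load one record by (path, offset, length) and inspect the parsed
    # WARC and HTTP headers; the offsets are the doctest's example values.
    loader = ArchiveLoader()
    archive = loader.load('test/example.warc.gz', '333', '1043')

    print archive.type                                       # ('warc', 'response')
    print archive.rec_headers.getHeader('WARC-Target-URI')   # http://example.com?example=1
    print archive.status_headers.getHeader('Content-Type')   # text/html
    payload = archive.stream.read()                          # remainder of the stream is the body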

pywb/header_rewriter.py (new file, 133 lines)

@@ -0,0 +1,133 @@
+from wbrequestresponse import StatusAndHeaders
+
+#=================================================================
+class RewrittenStatusAndHeaders:
+    def __init__(self, statusline, headers, removedHeaderDict, textType, charset):
+        self.status_headers = StatusAndHeaders(statusline, headers)
+        self.removedHeaderDict = removedHeaderDict
+        self.textType = textType
+        self.charset = charset
+
+    def containsRemovedHeader(self, name, value):
+        return self.removedHeaderDict.get(name) == value
+
+#=================================================================
+class HeaderRewriter:
+    """
+    # Text with charset
+    >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=utf-8')])
+    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+      ('X-Archive-Orig-Content-Length', '5'),
+      ('Content-Type', 'text/html;charset=utf-8')]), 'charset': 'utf-8', 'textType': 'html', 'removedHeaderDict': {}}
+
+    # Redirect
+    >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
+    {'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
+      ('Location', '/web/20131226101010/http://example.com/other.html')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
+
+    # gzip
+    >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
+    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
+      ('Content-Type', 'text/javascript')]), 'charset': None, 'textType': 'js', 'removedHeaderDict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}}
+
+    # Binary
+    >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
+    {'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
+      ('Content-Type', 'image/png'),
+      ('X-Archive-Orig-Cookie', 'blah'),
+      ('Content-Encoding', 'gzip'),
+      ('Transfer-Encoding', 'chunked')]), 'charset': None, 'textType': None, 'removedHeaderDict': {}}
+    """
+
+    REWRITE_TYPES = {
+        'html': ['text/html', 'application/xhtml'],
+        'css': ['text/css'],
+        'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
+        'xml': ['/xml', '+xml', '.xml', '.rss'],
+    }
+
+    PROXY_HEADERS = ('content-type', 'content-disposition')
+
+    URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')
+
+    ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')
+
+    PROXY_NO_REWRITE_HEADERS = ('content-length')
+
+    def __init__(self, headerPrefix = 'X-Archive-Orig-'):
+        self.headerPrefix = headerPrefix
+
+    def rewrite(self, status_headers, urlrewriter):
+        contentType = status_headers.getHeader('Content-Type')
+        textType = None
+        charset = None
+        stripEncoding = False
+
+        if contentType:
+            textType = self._extractTextType(contentType)
+            if textType:
+                charset = self._extractCharSet(contentType)
+                stripEncoding = True
+
+        (newHeaders, removedHeaderDict) = self._rewriteHeaders(status_headers.headers, urlrewriter, stripEncoding)
+
+        return RewrittenStatusAndHeaders(status_headers.statusline, newHeaders, removedHeaderDict, textType, charset)
+
+    def _extractTextType(self, contentType):
+        for ctype, mimelist in self.REWRITE_TYPES.iteritems():
+            if any ((mime in contentType) for mime in mimelist):
+                return ctype
+
+        return None
+
+    def _extractCharSet(self, contentType):
+        CHARSET_TOKEN = 'charset='
+        idx = contentType.find(CHARSET_TOKEN)
+        if idx < 0:
+            return None
+
+        return contentType[idx + len(CHARSET_TOKEN):]
+
+    def _rewriteHeaders(self, headers, urlrewriter, contentRewritten = False):
+        newHeaders = []
+        removedHeaderDict = {}
+
+        for (name, value) in headers:
+            lowername = name.lower()
+            if lowername in self.PROXY_HEADERS:
+                newHeaders.append((name, value))
+            elif lowername in self.URL_REWRITE_HEADERS:
+                newHeaders.append((name, urlrewriter.rewrite(value)))
+            elif lowername in self.ENCODING_HEADERS:
+                if contentRewritten:
+                    removedHeaderDict[lowername] = value
+                else:
+                    newHeaders.append((name, value))
+            elif lowername in self.PROXY_NO_REWRITE_HEADERS and not contentRewritten:
+                newHeaders.append((name, value))
+            else:
+                newHeaders.append((self.headerPrefix + name, value))
+
+        return (newHeaders, removedHeaderDict)
+
+if __name__ == "__main__":
+    import doctest
+    import os
+    import pprint
+    import url_rewriter
+
+    urlrewriter = url_rewriter.ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+
+    headerrewriter = HeaderRewriter()
+
+    def test_rewrite(headers, status = '200 OK'):
+        rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
+        return vars(rewritten)
+
+    doctest.testmod()
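
Downstream in replay.py, the result's containsRemovedHeader('content-encoding', 'gzip') decides whether the body must be ungzipped before text rewriting; note that removed headers are keyed by lowercased name. A quick check in the style of the doctests above, reusing the harness's urlrewriter and headerrewriter:

    >>> rw = headerrewriter.rewrite(StatusAndHeaders('200 OK', [('Content-Type', 'text/html'), ('Content-Encoding', 'gzip')]), urlrewriter)
    >>> rw.containsRemovedHeader('content-encoding', 'gzip')
    True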

@@ -5,8 +5,8 @@ import sys
 import re
 from HTMLParser import HTMLParser

-from wburlrewriter import ArchivalUrlRewriter
-from regexmatch import JSRewriter, CSSRewriter
+from url_rewriter import ArchivalUrlRewriter
+from regex_rewriters import JSRewriter, CSSRewriter

 #=================================================================
 # WBHtml --html parser for custom rewriting, also handlers for script and css

@@ -10,37 +10,18 @@ class RemoteCDXServer:
     >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
     >>> pprint(x[0])
     {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
-     'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
      'length': '1792',
      'mimetype': 'text/html',
-     'offset': '49482198',
      'original': 'http://example.com:80/',
-     'redirect': '-',
-     'robotflags': '-',
      'statuscode': '200',
      'timestamp': '20020120142510',
      'urlkey': 'com,example)/'}
-
-    >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
-    >>> pprint(x[0])
-    {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
-     'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
-     'length': '523',
-     'mimetype': 'warc/revisit',
-     'offset': '247256770',
-     'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
-     'orig.length': '529',
-     'orig.offset': '769759',
-     'original': 'http://www.example.com/',
-     'redirect': '-',
-     'robotflags': '-',
-     'statuscode': '-',
-     'timestamp': '20131210052355',
-     'urlkey': 'com,example)/'}
     """

-    def __init__(self, serverUrl):
+    def __init__(self, serverUrl, cookie = None):
         self.serverUrl = serverUrl
+        self.authCookie = cookie

     def load(self, url, params = {}, parse_cdx = False, **kwvalues):
         #url is required, must be passed explicitly!
@@ -51,6 +32,10 @@ class RemoteCDXServer:
         try:
             request = urllib2.Request(self.serverUrl, urlparams)
+
+            if self.authCookie:
+                request.add_header('Cookie', self.authCookie)
+
             response = urllib2.urlopen(request)
         except urllib2.HTTPError, e:
             if e.code == 403:
@@ -91,6 +76,9 @@
 class CDXCaptureResult(dict):
     CDX_FORMATS = [
+        # Public CDX Format
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
+
         # CDX 11 Format
         ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

@@ -4,8 +4,11 @@ import wbrequestresponse
 import wbexceptions

 class QueryHandler:
-    def __init__(self):
-        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
+    def __init__(self, cdxserver = None):
+        if not cdxserver:
+            cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
+
+        self.cdxserver = cdxserver

     def __call__(self, wbrequest, prev_wbresponse):
         wburl = wbrequest.wb_url

@@ -2,12 +2,13 @@ import re
 import sys
 import itertools

-from wburlrewriter import ArchivalUrlRewriter
+from url_rewriter import ArchivalUrlRewriter

+#=================================================================
 class RegexRewriter:
     """
     # Test https->http converter (other tests below in subclasses)
-    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_REGEX, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
+    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
     'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
     """
@@ -27,7 +28,7 @@ class RegexRewriter:
     def archivalRewrite(rewriter):
         return lambda x: rewriter.rewrite(x)

-    HTTPX_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
+    HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

     DEFAULT_OP = addPrefix
@@ -44,6 +45,9 @@ class RegexRewriter:
         self.regex = re.compile(regexStr, re.M)
         self.rules = rules

+    def filter(self, m):
+        return True
+
     def replaceAll(self, string):
         return self.regex.sub(lambda x: self.replace(x), string)
@@ -60,6 +64,10 @@ class RegexRewriter:
             if not m.group(i):
                 continue

+            # Optional filter to skip matches
+            if not self.filter(m):
+                return m.group(0)
+
             # Custom func
             if not hasattr(op, '__call__'):
                 op = RegexRewriter.DEFAULT_OP(op)
@@ -74,6 +82,7 @@ class RegexRewriter:

+#=================================================================
 class JSRewriter(RegexRewriter):
     """
     >>> test_js('location = "http://example.com/abc.html"')
@@ -100,11 +109,47 @@ class JSRewriter(RegexRewriter):
     def _createRules(self, httpPrefix):
         return [
-            (RegexRewriter.HTTPX_MATCH_REGEX, httpPrefix, 0),
+            (RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
             ('location|domain', 'WB_wombat_', 0),
         ]

+
+#=================================================================
+class XMLRewriter(RegexRewriter):
+    """
+    >>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
+    '<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'
+
+    >>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
+    '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'
+
+    >>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
+    '<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'
+
+    >>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
+    '<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
+    """
+
+    def __init__(self, rewriter, extra = []):
+        rules = self._createRules(rewriter.getAbsUrl())
+
+        RegexRewriter.__init__(self, rules)
+
+    # custom filter to reject 'xmlns' attr
+    def filter(self, m):
+        attr = m.group(1)
+        if attr and attr.startswith('xmlns'):
+            return False
+
+        return True
+
+    def _createRules(self, httpPrefix):
+        return [
+            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', httpPrefix, 2),
+        ]
+
+#=================================================================
 class CSSRewriter(RegexRewriter):
     r"""
     >>> test_css("background: url('/some/path.html')")
@@ -172,6 +217,9 @@ if __name__ == "__main__":
     def test_js(string, extra = []):
         return JSRewriter(arcrw, extra).replaceAll(string)

+    def test_xml(string):
+        return XMLRewriter(arcrw).replaceAll(string)
+
     def test_css(string):
         return CSSRewriter(arcrw).replaceAll(string)
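
The new filter() hook gives subclasses a veto over individual matches: replace() returns m.group(0) untouched whenever filter(m) is false, which is how XMLRewriter leaves xmlns attributes alone. A hypothetical subclass using the same hook (illustrative only, not part of this commit):

    # Hypothetical: rewrite http(s) URLs except those in a data-original attribute.
    class SelectiveRewriter(RegexRewriter):
        def __init__(self, rewriter):
            rules = [('(data-original[\s=]+)?["\']?(' + RegexRewriter.HTTPX_MATCH_STR + ')',
                      RegexRewriter.archivalRewrite(rewriter), 2)]
            RegexRewriter.__init__(self, rules)

        def filter(self, m):
            # returning False makes replace() emit the match unchanged
            return not (m.group(1) and m.group(1).startswith('data-original'))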

@@ -1,14 +1,18 @@
 import StringIO
 from urllib2 import URLError
+import chardet
+import redis

 import indexreader
-from wbrequestresponse import WbResponse
+from wbrequestresponse import WbResponse, StatusAndHeaders
 from wbarchivalurl import ArchivalUrl
 import utils
-from wburlrewriter import ArchivalUrlRewriter
-import wbhtml
-import regexmatch
+
+from url_rewriter import ArchivalUrlRewriter
+from header_rewriter import HeaderRewriter
+import html_rewriter
+import regex_rewriters
+
 import wbexceptions

 #=================================================================
@@ -111,19 +115,19 @@ class ReplayHandler(object):
             payloadRecord = self._load(cdx, True, failedFiles)

             # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
-            if not headersRecord.httpHeaders:
+            if not headersRecord.status_headers.headers:
                 headersRecord.stream.close()
                 headersRecord = payloadRecord
             else:
                 headersRecord.stream.close()

             isRevisit = True

         else:
             raise wbexceptions.CaptureException('Invalid CDX' + cdx)

-        return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)
+        return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)

     def resolveFull(self, filename):
@@ -140,26 +144,12 @@ class ReplayHandler(object):

 #=================================================================
 class RewritingReplayHandler(ReplayHandler):
-
-    REWRITE_TYPES = {
-        'html': ['text/html', 'application/xhtml'],
-        'css': ['text/css'],
-        'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
-        'xml': ['/xml', '+xml', '.xml', '.rss'],
-    }
-
-    PROXY_HEADERS = ('content-type', 'content-disposition')
-    URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')
-    ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')
-
-    def __init__(self, resolvers, archiveloader, headerPrefix = 'X-Archive-Orig-', headInsert = None):
+    def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None):
         ReplayHandler.__init__(self, resolvers, archiveloader)
-        self.headerPrefix = headerPrefix
         self.headInsert = headInsert
+        if not headerRewriter:
+            headerRewriter = HeaderRewriter()
+        self.headerRewriter = headerRewriter

     def _textContentType(self, contentType):
@@ -183,88 +173,94 @@ class RewritingReplayHandler(ReplayHandler):
         if wbrequest.wb_url.mod == 'id_':
             return response

-        contentType = utils.get_header(response.headersList, 'Content-Type')
-        textType = self._textContentType(contentType) if contentType else None
-
-        (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, urlrewriter, textType is not None)
-
-        # binary type, just send through
-        if textType is None:
-            response.headersList = newHeaders
+        rewrittenHeaders = self.headerRewriter.rewrite(response.status_headers, urlrewriter)
+
+        # non-text content type, just send through with rewritten headers
+        if rewrittenHeaders.textType is None:
+            response.status_headers = rewrittenHeaders.status_headers
             return response

         # Handle text rewriting
-        # TODO: better way to pass this
+        # TODO: better way to pass this?
         stream = response._stream

         # special case -- need to ungzip the body
-        if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))):
-            stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS))
+        if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
+            stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())

-        return self._rewriteContent(textType, urlrewriter, stream, newHeaders, response)
-
-    # TODO: first non-streaming attempt, probably want to stream
-    def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
-        if textType == 'html':
-            out = StringIO.StringIO()
-            #out = SimpleWriter()
-            htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
-
-            try:
-                buff = stream.read()
-                while buff:
-                    buff = buff.decode(encoding)
-                    htmlrewriter.feed(buff)
-                    buff = stream.read()
-
-                htmlrewriter.close()
-
-            #except Exception as e:
-            #    print e
-
-            finally:
-                content = out.getvalue().encode(encoding)
-                value = [content]
-                newHeaders.append(('Content-Length', str(len(value[0]))))
-                out.close()
-
-            return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
-
-        else:
-            if textType == 'css':
-                rewriter = regexmatch.CSSRewriter(urlrewriter)
-            elif textType == 'js':
-                rewriter = regexmatch.JSRewriter(urlrewriter)
-
-            def doRewrite(buff):
-                return rewriter.replaceAll(buff)
-
-            return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite)
-
-    def _rewriteHeaders(self, headers, urlrewriter, stripEncoding = False):
-        newHeaders = []
-        removedHeaders = []
-
-        for (name, value) in headers:
-            lowername = name.lower()
-            if lowername in self.PROXY_HEADERS:
-                newHeaders.append((name, value))
-            elif lowername in self.URL_REWRITE_HEADERS:
-                newHeaders.append((name, urlrewriter.rewrite(value)))
-            elif lowername in self.ENCODING_HEADERS:
-                if stripEncoding:
-                    removedHeaders.append((name, value))
-                else:
-                    newHeaders.append((name, value))
-            else:
-                newHeaders.append((self.headerPrefix + name, value))
-
-        return (newHeaders, removedHeaders)
+        # TODO: is this right?
+        if rewrittenHeaders.charset:
+            encoding = rewrittenHeaders.charset
+            firstBuff = None
+        else:
+            (encoding, firstBuff) = self._detectCharset(stream)
+
+        # if ascii, set to noop encode operation
+        if encoding == 'ascii':
+            encoding = None
+            #encoding = 'utf-8'
+
+        # Buffering response for html, streaming for others?
+        if rewrittenHeaders.textType == 'html':
+            return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
+        else:
+            return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
+
+    def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None):
+        out = StringIO.StringIO()
+        htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert)
+
+        try:
+            buff = firstBuff if firstBuff else stream.read()
+            while buff:
+                if encoding:
+                    buff = buff.decode(encoding)
+                htmlrewriter.feed(buff)
+                buff = stream.read()
+
+            # Close rewriter if gracefully made it to end
+            htmlrewriter.close()
+
+        finally:
+            content = out.getvalue()
+            if encoding:
+                content = content.encode(encoding)
+
+            value = [content]
+            contentLengthStr = str(len(content))
+            status_headers.headers.append(('Content-Length', contentLengthStr))
+            out.close()
+
+        return WbResponse(status_headers, value = value)
+
+    def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None):
+        if textType == 'css':
+            rewriter = regex_rewriters.CSSRewriter(urlrewriter)
+        elif textType == 'js':
+            rewriter = regex_rewriters.JSRewriter(urlrewriter)
+        elif textType == 'xml':
+            rewriter = regex_rewriters.XMLRewriter(urlrewriter)
+
+        def doRewrite(buff):
+            if encoding:
+                buff = buff.decode(encoding)
+            buff = rewriter.replaceAll(buff)
+            if encoding:
+                buff = buff.encode(encoding)
+
+            return buff
+
+        return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff)
+
+    def _detectCharset(self, stream):
+        buff = stream.read(8192)
+        result = chardet.detect(buff)
+        print "chardet result: " + str(result)
+        return (result['encoding'], buff)

     def _checkRedir(self, wbrequest, cdx):
         if cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
@@ -279,15 +275,15 @@ class RewritingReplayHandler(ReplayHandler):
         wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)

         # Check for self redirect
-        if wbresponse.status.startswith('3'):
-            if self.isSelfRedirect(wbrequest, wbresponse.headersList):
+        if wbresponse.status_headers.statusline.startswith('3'):
+            if self.isSelfRedirect(wbrequest, wbresponse.status_headers):
                 raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

         return wbresponse

-    def isSelfRedirect(self, wbrequest, httpHeaders):
+    def isSelfRedirect(self, wbrequest, status_headers):
         requestUrl = wbrequest.wb_url.url.lower()
-        locationUrl = utils.get_header(httpHeaders, 'Location').lower()
+        locationUrl = status_headers.getHeader('Location').lower()
         #return requestUrl == locationUrl
         return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
@@ -301,4 +297,16 @@ def PrefixResolver(prefix, contains):

     return makeUrl

+#======================================
+class RedisResolver:
+    def __init__(self, redisUrl, keyPrefix = 'w:'):
+        self.redisUrl = redisUrl
+        self.keyPrefix = keyPrefix
+        self.redis = redis.StrictRedis.from_url(redisUrl)
+
+    def __call__(self, filename):
+        try:
+            return self.redis.hget(self.keyPrefix + filename, 'path')
+        except Exception as e:
+            print e
+            return None
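
The encoding-detection support works in two steps: if the recorded Content-Type names a charset, it is used directly; otherwise _detectCharset sniffs the first 8KB of the body with chardet and hands those bytes back as firstBuff so the rewriters can replay them. A condensed standalone sketch of that flow (stream is any object with a read() method):

    import chardet

    def detect_encoding(stream):
        # Sniff up to 8KB, as _detectCharset does; the consumed bytes are
        # returned as well, since they must be replayed as firstBuff.
        buff = stream.read(8192)
        encoding = chardet.detect(buff)['encoding']

        # ascii content needs no decode/encode round-trip, so treat it as a no-op
        if encoding == 'ascii':
            encoding = None

        return (encoding, buff)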

@@ -1,6 +1,7 @@
 import itertools
 import hmac
 import time
+import zlib

 def peek_iter(iterable):
     try:
@@ -11,21 +12,15 @@ def peek_iter(iterable):

     return itertools.chain([first], iterable)

-def get_header(headersList, name):
-    nameLower = name.lower()
-    for value in headersList:
-        if (value[0].lower() == nameLower):
-            return value[1]
-
-    return None
-
-def contains_header(headersList, seekHeader):
-    header = get_header(headersList, seekHeader[0])
-    if not header:
-        return False
-
-    # see if found header matches value!
-    return (header == seekHeader[1])
+def split_prefix(key, prefixs):
+    for p in prefixs:
+        if key.startswith(p):
+            plen = len(p)
+            return (key[:plen], key[plen:])
+
+def create_decompressor():
+    return zlib.decompressobj(16 + zlib.MAX_WBITS)

 class HMACCookieMaker:
     def __init__(self, key, name):
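
Note that split_prefix returns the matched prefix and the remainder as a pair (or None when nothing matches), which is how StatusAndHeadersParser splits a status line into protocol and status:

    >>> split_prefix('HTTP/1.1 200 OK', ['HTTP/1.0', 'HTTP/1.1'])
    ('HTTP/1.1', ' 200 OK')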

@@ -2,7 +2,7 @@ from query import QueryHandler
 from replay import FullHandler
 import wbexceptions

-from wbrequestresponse import WbResponse
+from wbrequestresponse import WbResponse, StatusAndHeaders
 from archivalrouter import ArchivalRequestRouter
@@ -17,10 +17,11 @@
 ## ===========

-query = QueryHandler()
-
 import testwb

+query = QueryHandler(testwb.createCdxServer())
+
 headInsert = """
 <!-- WB Insert -->
@@ -54,7 +55,11 @@ def application(env, start_response):
             raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')

     except wbexceptions.InternalRedirect as ir:
-        response = WbResponse(status = ir.status, headersList = ir.httpHeaders)
+        response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
+
+    except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
+        print "[INFO]: " + str(e)
+        response = handleException(env, e)

     except Exception as e:
         last_exc = e

@@ -1,5 +1,7 @@
 from wbarchivalurl import ArchivalUrl
 import utils
+
+import pprint

 #WB Request and Response
 class WbRequest:
@@ -80,38 +82,36 @@ class WbRequest:
 class WbResponse:
     """
     >>> WbResponse.text_response('Test')
-    {'status': '200 OK', 'body': ['Test'], 'headersList': [('Content-Type', 'text/plain')]}
+    {'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}

     >>> WbResponse.text_stream(['Test', 'Another'], '404')
-    {'status': '404', 'body': ['Test', 'Another'], 'headersList': [('Content-Type', 'text/plain')]}
+    {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}

     >>> WbResponse.redir_response('http://example.com/otherfile')
-    {'status': '302 Redirect', 'body': [], 'headersList': [('Location', 'http://example.com/otherfile')]}
+    {'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
     """

-    def __init__(self, status, value = [], headersList = []):
-        self.status = status
+    def __init__(self, status_headers, value = []):
+        self.status_headers = status_headers
         self.body = value
-        self.headersList = headersList

     @staticmethod
     def text_stream(text, status = '200 OK'):
-        return WbResponse(status, value = text, headersList = [('Content-Type', 'text/plain')])
+        return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = text)

     @staticmethod
     def text_response(text, status = '200 OK'):
-        return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])
+        return WbResponse(StatusAndHeaders(status, [('Content-Type', 'text/plain')]), value = [text])

     @staticmethod
     def redir_response(location, status = '302 Redirect'):
-        return WbResponse(status, headersList = [('Location', location)])
+        return WbResponse(StatusAndHeaders(status, [('Location', location)]))

     @staticmethod
-    def stream_response(statusline, headers, stream, proc = None):
+    def stream_response(status_headers, stream, proc = None, firstBuff = None):
         def streamGen():
             try:
-                buff = stream.read()
+                buff = firstBuff if firstBuff else stream.read()
                 while buff:
                     if proc:
                         buff = proc(buff)
@@ -120,25 +120,12 @@ class WbResponse:
             finally:
                 stream.close()

-        response = WbResponse(statusline, headersList = headers, value = streamGen())
+        response = WbResponse(status_headers, value = streamGen())
         response._stream = stream
         return response

-    @staticmethod
-    def better_timestamp_response(wbrequest, newTimestamp):
-        wbrequest.wb_url.timestamp = newTimestamp
-        newUrl = wbrequest.wb_prefix + str(wbrequest.wb_url)[1:]
-        return WbResponse.redir_response(newUrl)
-
-    def get_header(self, name):
-        return utils.get_header(self.headersList, name)
-
     def __call__(self, env, start_response):
-        #headersList = []
-        #for key, value in self.headers.iteritems():
-        #    headersList.append((key, value))
-        start_response(self.status, self.headersList)
+        start_response(self.status_headers.statusline, self.status_headers.headers)

         if env['REQUEST_METHOD'] == 'HEAD':
             if hasattr(self.body, 'close'):
@@ -155,6 +142,28 @@ class WbResponse:
         return str(vars(self))

+
+#=================================================================
+class StatusAndHeaders:
+    def __init__(self, statusline, headers, protocol = ''):
+        self.statusline = statusline
+        self.headers = headers
+        self.protocol = protocol
+
+    def getHeader(self, name):
+        nameLower = name.lower()
+        for value in self.headers:
+            if (value[0].lower() == nameLower):
+                return value[1]
+
+        return None
+
+    def __repr__(self):
+        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
+        #return pprint.pformat(self.__dict__)
+
+    def __eq__(self, other):
+        return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol
+
 if __name__ == "__main__":
     import doctest
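
Note that StatusAndHeaders.getHeader matches header names case-insensitively, which is what lets the loader and replay code ask for 'Content-Type' regardless of how the header was recorded:

    >>> StatusAndHeaders('200 OK', [('content-type', 'text/html')]).getHeader('Content-Type')
    'text/html'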