
pywb 0.2!

move to distinct packages: pywb.utils, pywb.cdx, pywb.warc, pywb.util, pywb.rewrite!
each package will have its own README and tests
shared sample_data and install
This commit is contained in:
Ilya Kreymer 2014-02-17 02:34:39 -08:00
parent 2528ee0a7c
commit 5345459298
61 changed files with 2951 additions and 2185 deletions
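
For example (an illustrative sketch using module paths that appear in this diff), imports that previously referenced top-level modules now use the new package layout:

from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.cdx.cdxserver import CDXServer
from pywb.utils.binsearch import iter_exact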


@ -1,2 +0,0 @@
#Allow importing


@ -1,3 +1,4 @@
#Allow importing
import os
def get_test_dir():
return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'


@ -3,8 +3,8 @@ import re
import wbexceptions
from wbrequestresponse import WbRequest, WbResponse
from url_rewriter import UrlRewriter
from wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
#=================================================================
# ArchivalRouter -- route WB requests in archival mode
@ -45,20 +45,6 @@ class ArchivalRouter:
# of request uri (excluding first '/')
#=================================================================
class Route:
"""
# route with relative path
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
# route with absolute path, running at script /my_pywb
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
# not matching route -- skipped
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
"""
# match up to next / or ? or end
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
@ -127,57 +113,6 @@ class Route:
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
#=================================================================
class ReferRedirect:
"""
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']
>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']
>>> test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
>>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# Custom collection
>>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/other.html'
# With timestamp included
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# With timestamp included
>>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
# Wrong Host
>>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False
# Right Host
>>> test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
'http://example.com:8080/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME + timestamp
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME, bad match
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False
"""
def __init__(self, match_prefixs):
if isinstance(match_prefixs, list):
self.match_prefixs = match_prefixs
@ -240,31 +175,3 @@ class ReferRedirect:
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
return WbResponse.redir_response(final_url)
import utils
if __name__ == "__main__" or utils.enable_doctests():
import handlers
def test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
if http_host:
env['HTTP_HOST'] = http_host
routes = [Route(coll, handlers.BaseHandler())]
redir = ReferRedirect(match_host)
#req = WbRequest.from_uri(request_uri, env)
rep = redir(env, routes)
if not rep:
return False
return rep.status_headers.get_header('Location')
import doctest
doctest.testmod()


@ -1,461 +0,0 @@
import itertools
import utils
import urllib2
import StringIO
import urlparse
import collections
import wbexceptions
from wbrequestresponse import StatusAndHeaders
#=================================================================
# load a reader from http
#=================================================================
class HttpLoader:
"""
Load content over http with range request and optional signature
"""
def __init__(self, hmac = None, hmac_duration = 30):
self.hmac = hmac
self.hmac_duration = hmac_duration
def load(self, url, offset, length):
if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
range_header = 'bytes={0}-'.format(offset)
headers = {}
headers['Range'] = range_header
if self.hmac:
headers['Cookie'] = self.hmac(self.hmac_duration)
request = urllib2.Request(url, headers = headers)
return urllib2.urlopen(request)
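# Illustrative usage (a sketch, not part of the original file; assumes a
# server that honors HTTP Range requests and a hypothetical archive URL):
#
# loader = HttpLoader()
# stream = loader.load('http://example.com/archive.warc.gz', 0, 1024)
# data = stream.read()  # reads from the requested 1024-byte range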
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader:
"""
Load content from local file-system
# Ensure attempt to read more than 100 bytes, only reads 100 bytes
>>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read(400))
100
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
else:
return afile
#=================================================================
# A reader which will not read past the specified limit
#=================================================================
class LimitReader:
"""
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'
>>> test_multiple_reads(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
"""
def __init__(self, stream, limit):
self.stream = stream
self.limit = limit
if not self.limit:
self.limit = 1
def read(self, length = None):
length = min(length, self.limit) if length else self.limit
buff = self.stream.read(length)
self.limit -= len(buff)
return buff
def readline(self, length = None):
length = min(length, self.limit) if length else self.limit
buff = self.stream.readline(length)
self.limit -= len(buff)
return buff
def close(self):
self.stream.close()
#=================================================================
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers')
#=================================================================
class ArchiveLoader:
"""
>>> load_test_archive('example.warc.gz', '333', '1043')
(('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
('WARC-Date', '2014-01-03T03:03:21Z'),
('Content-Length', '1610'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
>>> load_test_archive('example.warc.gz', '1864', '553')
(('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
('WARC-Date', '2014-01-03T03:03:41Z'),
('Content-Length', '340'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
( 'WARC-Profile',
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
"""
# Standard ARC headers
ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]
# Since we are loading via a range request, gzip-ness can only be determined by file extension
FORMAT_MAP = {
'.warc.gz': ('warc', True),
'.arc.gz': ('arc', True),
'.warc': ('warc', False),
'.arc': ('arc', False),
}
@staticmethod
def create_default_loaders(hmac = None):
http = HttpLoader(hmac)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loaders = {}, hmac = None, chunk_size = 8192):
self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders(hmac)
self.chunk_size = chunk_size
self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
self.warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme)
if not loader:
raise wbexceptions.UnknownLoaderProtocolException(url)
the_format = None
for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
if url.endswith(ext):
the_format = iformat
break
if the_format is None:
raise wbexceptions.UnknownArchiveFormatException(url)
(a_format, is_gzip) = the_format
decomp = utils.create_decompressor() if is_gzip else None
try:
length = int(length)
except:
length = -1
raw = loader.load(url, long(offset), length)
stream = LineReader(raw, length, self.chunk_size, decomp)
if a_format == 'arc':
rec_headers = self.arc_parser.parse(stream)
rec_type = 'response'
empty = (rec_headers.get_header('length') == 0)
elif a_format == 'warc':
rec_headers = self.warc_parser.parse(stream)
rec_type = rec_headers.get_header('WARC-Type')
empty = (rec_headers.get_header('Content-Length') == '0')
# special case: empty w/arc record (hopefully a revisit)
if empty:
status_headers = StatusAndHeaders('204 No Content', [])
# special case: warc records that are not expected to have http headers
# attempt to add 200 status and content-type
elif rec_type == 'metadata' or rec_type == 'resource':
status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.get_header('Content-Type'))])
# special case: http 0.9 response, no status or headers
#elif rec_type == 'response':
# content_type = rec_headers.get_header('Content-Type')
# if content_type and (';version=0.9' in content_type):
# status_headers = StatusAndHeaders('200 OK', [])
# response record: parse HTTP status and headers!
else:
#(statusline, http_headers) = self.parse_http_headers(stream)
status_headers = self.http_parser.parse(stream)
return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers)
#=================================================================
class StatusAndHeadersParser:
def __init__(self, statuslist):
self.statuslist = statuslist
def parse(self, stream):
statusline = stream.readline().rstrip()
protocol_status = utils.split_prefix(statusline, self.statuslist)
if not protocol_status:
raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)
headers = []
line = stream.readline().rstrip()
while line and line != '\r\n':
name, value = line.split(':', 1)
header = (name, value.strip())
headers.append(header)
line = stream.readline().rstrip()
return StatusAndHeaders(statusline = protocol_status[1].strip(), headers = headers, protocol = protocol_status[0])
#=================================================================
class ARCHeadersParser:
def __init__(self, headernames):
self.headernames = headernames
def parse(self, stream):
headerline = stream.readline().rstrip()
parts = headerline.split()
headernames = self.headernames
if len(parts) != len(headernames):
raise wbexceptions.InvalidArchiveRecordException('Wrong # of headers, expected arc headers {0}, found {1}'.format(headernames, parts))
headers = []
for name, value in itertools.izip(headernames, parts):
headers.append((name, value))
return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0')
#=================================================================
class LineReader:
def __init__(self, stream, max_len = 0, chunk_size = 1024, decomp = None):
self.stream = stream
self.chunk_size = chunk_size
self.decomp = decomp
self.buff = None
self.num_read = 0
self.max_len = max_len
def _fillbuff(self, chunk_size = None):
if not chunk_size:
chunk_size = self.chunk_size
if not self.buff or self.buff.pos >= self.buff.len:
to_read = min(self.max_len - self.num_read, self.chunk_size) if (self.max_len > 0) else self.chunk_size
data = self.stream.read(to_read)
self._process_read(data)
def _process_read(self, data):
if self.decomp and data:
try:
data = self.decomp.decompress(data)
except Exception:
# if first read attempt, assume non-gzipped stream
if self.num_read == 0:
self.decomp = False
# otherwise (partly decompressed), something is wrong
else:
raise
self.num_read += len(data)
self.buff = StringIO.StringIO(data)
def read(self, length = None):
self._fillbuff()
return self.buff.read(length)
def readline(self, length = None):
self._fillbuff()
return self.buff.readline(length)
def close(self):
if self.stream:
self.stream.close()
self.stream = None
class ChunkedDataException(Exception):
pass
class ChunkedLineReader(LineReader):
r"""
Properly formatted chunked data:
>>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n")); c.read()+c.read()
'1234'
Non-chunked data:
>>> ChunkedLineReader(StringIO.StringIO("xyz123!@#")).read()
'xyz123!@#'
Starts like chunked data, but isn't:
>>> c=ChunkedLineReader(StringIO.StringIO("1\r\nxyz123!@#")); c.read()+c.read()
'1\r\nx123!@#'
Chunked data cut off part way through:
>>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));c.read()+c.read()
'123412'
"""
all_chunks_read = False
not_chunked = False
raise_chunked_data_exceptions = False # if False, we'll use best-guess fallback for parse errors
def _fillbuff(self, chunk_size = None):
if self.not_chunked:
return LineReader._fillbuff(self, chunk_size)
if self.all_chunks_read:
return
if not self.buff or self.buff.pos >= self.buff.len:
length_header = self.stream.readline(64)
data = ''
try:
# decode length header
try:
chunk_size = int(length_header.strip().split(';')[0], 16)
except ValueError:
raise ChunkedDataException("Couldn't decode length header '%s'" % length_header)
if chunk_size:
# read chunk
while len(data) < chunk_size:
new_data = self.stream.read(chunk_size - len(data))
# if we unexpectedly run out of data, either raise an exception or just stop reading, assuming file was cut off
if not new_data:
if self.raise_chunked_data_exceptions:
raise ChunkedDataException("Ran out of data before end of chunk")
else:
chunk_size = len(data)
self.all_chunks_read = True
data += new_data
# if we successfully read a block without running out, it should end in \r\n
if not self.all_chunks_read:
crlf = self.stream.read(2)
if crlf != '\r\n':
raise ChunkedDataException("Chunk terminator not found.")
if self.decomp:
data = self.decomp.decompress(data)
else:
# chunk_size 0 indicates end of file
self.all_chunks_read = True
data = ''
self._process_read(data)
except ChunkedDataException:
if self.raise_chunked_data_exceptions:
raise
# Can't parse the data as chunked.
# It's possible that non-chunked data is sent with a Transfer-Encoding: chunked header.
# Treat this as non-chunked from here on
self._process_read(length_header + data)
self.not_chunked = True
#=================================================================
import utils
if __name__ == "__main__" or utils.enable_doctests():
import os
import pprint
testloader = ArchiveLoader()
def load_test_archive(test_file, offset, length):
path = utils.test_data_dir() + 'warcs/' + test_file
archive = testloader.load(path, offset, length)
pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
def test_multiple_reads(reader, inc_reads):
result = None
for x in inc_reads:
result = reader.read(x)
return result
import doctest
doctest.testmod()


@ -1,123 +0,0 @@
from collections import deque
import os
import itertools
#=================================================================
# Binary Search over a text file
#=================================================================
class FileReader:
"""
A very simple file-like object wrapper that knows its size
the getsize() method returns the file size
"""
def __init__(self, filename):
self.fh = open(filename, 'rb')
self.filename = filename
self.size = os.path.getsize(filename)
def getsize(self):
return self.size
def readline(self):
return self.fh.readline()
def seek(self, offset):
return self.fh.seek(offset)
def close(self):
return self.fh.close()
#=================================================================
def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
"""
Find offset of the full line which matches a given 'key' using binary search
If key is not found, the offset is of the line after the key
File is subdivided into block_size (default 8192) sized blocks
Optional compare_func may be specified
"""
min = 0
max = reader.getsize() / block_size
while (max - min > 1):
mid = min + ((max - min) / 2)
reader.seek(mid * block_size)
if mid > 0:
reader.readline() # skip partial line
line = reader.readline()
if compare_func(key, line) > 0:
min = mid
else:
max = mid
return (min * block_size)
def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192):
"""
Perform a binsearch for a specified key down to block_size (8192) sized blocks,
followed by linear search within the block to find first matching line.
When performing linear search, keep track of up to N previous lines before
first matching line.
"""
min = binsearch_offset(reader, key, compare_func, block_size)
reader.seek(min)
if min > 0:
reader.readline() # skip partial line
if prev_size > 1:
prev_deque = deque(maxlen = prev_size)
line = None
while True:
line = reader.readline()
if not line:
break
if compare_func(line, key) >= 0:
break
if prev_size == 1:
prev = line
elif prev_size > 1:
prev_deque.append(line)
def gen_iter(line):
if prev_size == 1:
yield prev.rstrip()
elif prev_size > 1:
for i in prev_deque:
yield i.rstrip()
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(line)
# Iterate over prefix matches
def iter_prefix(reader, key):
"""
Creates an iterator which iterates over prefix matches for a key in a sorted text file
A line matches as long as it starts with key
"""
return itertools.takewhile(lambda line: line.startswith(key), search(reader, key))
def iter_exact(reader, key, token=' '):
"""
Create an iterator which iterates over exact matches for a key in a sorted text file
Key is terminated by a token (default ' ')
"""
return iter_prefix(reader, key + token)
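# Illustrative usage (a sketch, not part of the original file; assumes a
# sorted, space-delimited index file at a hypothetical path):
#
# reader = FileReader('/path/to/sorted.cdx')
# for line in iter_exact(reader, 'org,iana)/'):
#     print line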

pywb/cdx/README.md Normal file (+36)

@ -0,0 +1,36 @@
## PyWb CDX v0.2
[![Build Status](https://travis-ci.org/ikreymer/pywb_cdx.png?branch=master)](https://travis-ci.org/ikreymer/pywb_cdx)
This package contains the CDX processing components of the pywb wayback tool suite.
The CDX Server loads, filters and transforms cdx from multiple sources in response
to a given query.
### Installation and Tests
`pip install -r requirements.txt` -- to install
`python run-tests.py` -- to run all tests
### Sample App
A very simple reference WSGI app is included.
Run: `python -m pywb_cdx.wsgi_cdxserver` to start the app, keyboard interrupt to stop.
The default [config.yaml](pywb_cdx/config.yaml) points to the sample data directory
and uses port 8090
### CDX Server API Reference
The goal is to provide compatibility with this feature set and more:
https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
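
A minimal query against a locally running instance might look like this (an illustrative sketch; the `url=` param is required, and `limit` is optional):

```
import urllib2
print urllib2.urlopen('http://localhost:8090/?url=example.com&limit=1').read()
```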
TODO


@ -1,25 +1,31 @@
from collections import OrderedDict
import itertools
#=================================================================
class CDXObject(OrderedDict):
CDX_FORMATS = [
# Public CDX Format
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "length"],
# CDX 11 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "robotflags", "length", "offset", "filename"],
# CDX 9 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "offset", "filename"],
# CDX 11 Format + 3 revisit resolve fields
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
"orig.length","orig.offset","orig.filename"],
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "robotflags", "length", "offset", "filename",
"orig.length", "orig.offset", "orig.filename"],
# CDX 9 Format + 3 revisit resolve fields
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
"orig.length","orig.offset","orig.filename"]
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "offset", "filename",
"orig.length", "orig.offset", "orig.filename"]
]
def __init__(self, cdxline):
@ -53,5 +59,3 @@ class CDXObject(OrderedDict):
li = itertools.imap(lambda (n, val): val, self.items())
return ' '.join(li)
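# Illustrative usage (a sketch, not part of the original file; the sample
# line follows the 7-field public CDX format above):
#
# line = ('com,example)/ 20140127171200 http://example.com '
#         'text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A 1046')
# cdx = CDXObject(line)
# print cdx['timestamp']  # '20140127171200'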


@ -1,8 +1,6 @@
from cdxobject import CDXObject
from pywb.utils.timeutils import timestamp_to_sec
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
import timeutils
import bisect
import itertools
import re
@ -11,7 +9,6 @@ from heapq import merge
from collections import deque
#=================================================================
def cdx_text_out(cdx, fields):
if not fields:
@ -26,30 +23,31 @@ def cdx_load(sources, params):
cdx_iter = make_cdx_iter(cdx_iter)
resolve_revisits = params.get('resolve_revisits', False)
if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter)
if not params.get('proxy_all'):
resolve_revisits = params.get('resolve_revisits', False)
if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter)
filters = params.get('filter', None)
if filters:
cdx_iter = cdx_filter(cdx_iter, filters)
filters = params.get('filter', None)
if filters:
cdx_iter = cdx_filter(cdx_iter, filters)
collapse_time = params.get('collapse_time', None)
if collapse_time:
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
collapse_time = params.get('collapse_time', None)
if collapse_time:
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
limit = int(params.get('limit', 1000000))
limit = int(params.get('limit', 1000000))
reverse = params.get('reverse', False)
if reverse:
cdx_iter = cdx_reverse(cdx_iter, limit)
reverse = params.get('reverse', False)
if reverse:
cdx_iter = cdx_reverse(cdx_iter, limit)
closest_to = params.get('closest_to', None)
if closest_to:
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
closest_to = params.get('closest', None)
if closest_to:
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
if limit:
cdx_iter = cdx_limit(cdx_iter, limit)
if limit:
cdx_iter = cdx_limit(cdx_iter, limit)
# output raw cdx objects
if params.get('output') == 'raw':
@ -73,6 +71,7 @@ def load_cdx_streams(sources, params):
merged_stream = merge(*(source_iters))
return merged_stream
#=================================================================
# convert text cdx stream to CDXObject
def make_cdx_iter(text_iter):
@ -98,7 +97,7 @@ def cdx_reverse(cdx_iter, limit):
return [last] if last else []
reverse_cdxs = deque(maxlen = limit)
reverse_cdxs = deque(maxlen=limit)
for cdx in cdx_iter:
reverse_cdxs.appendleft(cdx)
@ -142,14 +141,13 @@ def cdx_filter(cdx_iter, filter_strings):
filters = map(Filter, filter_strings)
for cdx in cdx_iter:
if all (x(cdx) for x in filters):
if all(x(cdx) for x in filters):
yield cdx
#=================================================================
# collapse by timestamp and status code
def cdx_collapse_time_status(cdx_iter, timelen = 10):
def cdx_collapse_time_status(cdx_iter, timelen=10):
timelen = int(timelen)
last_token = None
@ -163,16 +161,15 @@ def cdx_collapse_time_status(cdx_iter, timelen = 10):
yield cdx
#=================================================================
# sort CDXCaptureResult by closest to timestamp
def cdx_sort_closest(closest, cdx_iter, limit = 10):
def cdx_sort_closest(closest, cdx_iter, limit=10):
closest_cdx = []
closest_sec = timeutils.timestamp_to_sec(closest)
closest_sec = timestamp_to_sec(closest)
for cdx in cdx_iter:
sec = timeutils.timestamp_to_sec(cdx['timestamp'])
sec = timestamp_to_sec(cdx['timestamp'])
key = abs(closest_sec - sec)
# create tuple to sort by key
@ -186,22 +183,22 @@ def cdx_sort_closest(closest, cdx_iter, limit = 10):
if len(closest_cdx) > limit:
closest_cdx.pop()
return itertools.imap(lambda x: x[1], closest_cdx)
#=================================================================
# resolve revisits
# Fields to append from cdx original to revisit
ORIG_TUPLE = ['length', 'offset', 'filename']
def cdx_resolve_revisits(cdx_iter):
originals = {}
for cdx in cdx_iter:
is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
(cdx['filename'] == '-'))
digest = cdx['digest']
@ -210,7 +207,6 @@ def cdx_resolve_revisits(cdx_iter):
if not original_cdx and not is_revisit:
originals[digest] = cdx
if original_cdx and is_revisit:
fill_orig = lambda field: original_cdx[field]
# Transfer mimetype and statuscode
@ -224,5 +220,3 @@ def cdx_resolve_revisits(cdx_iter):
cdx['orig.' + field] = fill_orig(field)
yield cdx
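# Illustrative use of the pipeline above (a sketch, not part of the original
# file; assumes a local .cdx file wrapped in a CDXFile source; 'key' is
# normally set by CDXServer via surt canonicalization):
#
# params = {'key': 'org,iana)/', 'filter': ['statuscode:200'],
#           'closest': '20140126200826', 'limit': 10, 'output': 'text'}
# for line in cdx_load([CDXFile('/path/to/iana.cdx')], params):
#     sys.stdout.write(line)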


@ -1,5 +1,4 @@
import surt
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
from cdxops import cdx_load
import itertools
@ -7,39 +6,21 @@ import logging
import os
import urlparse
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
#=================================================================
class CDXFile:
def __init__(self, filename):
self.filename = filename
def load_cdx(self, params):
source = FileReader(self.filename)
match_type = params.get('match_type')
if match_type == 'prefix':
iter_func = iter_prefix
else:
iter_func = iter_exact
key = params.get('key')
return iter_func(source, key)
def __str__(self):
return 'CDX File - ' + self.filename
#=================================================================
class CDXException(Exception):
def __init__(self, msg, url = None):
Exception.__init__(self, msg)
self.url = url
def status(self):
return '400 Bad Request'
#=================================================================
class AccessException(CDXException):
def status(self):
return '403 Forbidden'
#=================================================================
class CDXServer:
"""
@ -47,33 +28,51 @@ class CDXServer:
responds to queries and dispatches to the cdx ops for processing
"""
def __init__(self, sources, surt_ordered = True):
@staticmethod
def create_from_config(config):
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
return CDXServer(paths, surt_ordered)
def __init__(self, sources, surt_ordered=True):
self.sources = []
self.surt_ordered = surt_ordered
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if not isinstance(sources, list):
sources = [sources]
for src in sources:
if os.path.isdir(src):
for file in os.listdir(src):
self.add_cdx_loader(src + file)
else:
self.add_cdx_loader(src)
if isinstance(src, CDXSource):
self.add_cdx_source(src)
elif isinstance(src, str):
if os.path.isdir(src):
for file in os.listdir(src):
self.add_cdx_source(src + file)
else:
self.add_cdx_source(src)
if len(self.sources) == 0:
logging.exception('No CDX Sources Found!')
logging.exception('No CDX Sources Found from: ' + str(sources))
def add_cdx_loader(self, filename):
source = self.create_cdx_loader(filename)
if not source:
return
def add_cdx_source(self, source):
if not isinstance(source, CDXSource):
source = self.create_cdx_source(source)
if not source:
return
logging.debug('Adding CDX Source: ' + str(source))
self.sources.append(source)
@staticmethod
def create_cdx_loader(filename):
def create_cdx_source(filename):
if filename.startswith('http://') or filename.startswith('https://'):
return RemoteCDXSource(filename)
if filename.endswith('.cdx'):
return CDXFile(filename)
return None
#TODO: support zipnum
#elif filename.endswith('.summary')
@ -81,27 +80,52 @@ class CDXServer:
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
def load_cdx(self, **params):
# canonicalize to surt (canonicalization is part of surt conversion)
# if key not set, assume 'url' is set and needs canonicalization
if not params.get('key'):
params['key'] = self._canonicalize(params)
self._convert_old_style(params)
return cdx_load(self.sources, params)
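# Illustrative usage (a sketch; mirrors the doctests elsewhere in this
# commit, with a hypothetical index path):
#
# server = CDXServer.create_from_config({'index_paths': './sample_data/'})
# for line in server.load_cdx(url='example.com', limit=1, output='text'):
#     sys.stdout.write(line)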
def _canonicalize(self, params):
"""
Canonicalize url and convert to surt
If not in surt-ordered mode, convert back to url form,
as surt conversion is currently part of canonicalization
"""
try:
url = params['url']
except KeyError:
raise CDXException('The url= param must be specified to query the cdx server')
msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg)
try:
key = surt.surt(url)
except Exception as e:
raise CDXException('Invalid url: ', url)
raise CDXException('Invalid Url: ' + url)
# if not surt, unsurt the surt to get canonicalized non-surt url
if not self.surt_ordered:
key = unsurt(key)
params['key'] = key
return key
return cdx_load(self.sources, params)
def _convert_old_style(self, params):
"""
Convert old-style CDX Server param semantics
"""
collapse_time = params.get('collapseTime')
if collapse_time:
params['collapse_time'] = collapse_time
resolve_revisits = params.get('resolveRevisits')
if resolve_revisits:
params['resolve_revisits'] = resolve_revisits
if params.get('sort') == 'reverse':
params['reverse'] = True
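# Illustrative effect (assumed input, shown as a comment only):
# {'collapseTime': '10', 'resolveRevisits': True, 'sort': 'reverse'}
# gains the new-style keys after _convert_old_style():
# {'collapseTime': '10', 'collapse_time': '10',
#  'resolveRevisits': True, 'resolve_revisits': True,
#  'sort': 'reverse', 'reverse': True}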
def load_cdx_from_request(self, env):
#url = wbrequest.wb_url.url
@ -113,7 +137,8 @@ class CDXServer:
params['output'] = 'text'
# parse_qs produces arrays for single values
# cdxreader expects singleton params for all except filters, so convert here
# cdx processing expects singleton params for all params,
# except filters, so convert here
# use first value of the list
for name, val in params.iteritems():
if name != 'filter':
@ -122,13 +147,10 @@ class CDXServer:
cdx_lines = self.load_cdx(**params)
return cdx_lines
def __str__(self):
return 'load cdx indexes from ' + str(self.sources)
#=================================================================
def unsurt(surt):
"""
@ -141,7 +163,8 @@ def unsurt(surt):
'com,example)'
# Long surt
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
index.html?a=b?c=)/')
'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
"""
@ -158,3 +181,6 @@ def unsurt(surt):
return surt
if __name__ == "__main__":
import doctest
doctest.testmod()

pywb/cdx/cdxsource.py Normal file (+92)

@ -0,0 +1,92 @@
from pywb.utils.binsearch import iter_exact, iter_prefix
from pywb.utils.loaders import SeekableTextFileReader
import urllib
import urllib2
#=================================================================
class CDXSource(object):
"""
Represents any cdx index source
"""
def load_cdx(self, params):
raise NotImplementedError('Implement in subclass')
#=================================================================
class CDXFile(CDXSource):
"""
Represents a local plain-text .cdx file
"""
def __init__(self, filename):
self.filename = filename
def load_cdx(self, params):
source = SeekableTextFileReader(self.filename)
match_type = params.get('match_type')
if match_type == 'prefix':
iter_func = iter_prefix
else:
iter_func = iter_exact
key = params.get('key')
return iter_func(source, key)
def __str__(self):
return 'CDX File - ' + self.filename
#=================================================================
class RemoteCDXSource(CDXSource):
"""
Represents a remote cdx server, to which requests will be proxied.
Only url and match type params are proxied at this time;
the response stream is passed through all other filters locally.
"""
def __init__(self, filename, cookie=None, proxy_all=True):
self.remote_url = filename
self.cookie = cookie
self.proxy_all = proxy_all
def load_cdx(self, proxy_params):
if self.proxy_all:
params = proxy_params
params['proxy_all'] = True
else:
# Only send url and matchType params to remote
params = {}
params['url'] = proxy_params['url']
match_type = proxy_params.get('match_type')
if match_type:
params['matchType'] = match_type
urlparams = urllib.urlencode(params, True)
try:
request = urllib2.Request(self.remote_url, urlparams)
if self.cookie:
request.add_header('Cookie', self.cookie)
response = urllib2.urlopen(request)
except urllib2.HTTPError as e:
if e.code == 403:
exc_msg = e.read()
msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg
else 'Excluded')
raise AccessException(msg)
else:
raise
return iter(response)
def __str__(self):
return 'Remote CDX Server: ' + self.remote_url
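# Illustrative usage (a sketch, not part of the original file; requires
# network access to the public endpoint used in tests elsewhere in this
# commit):
#
# source = RemoteCDXSource('http://web.archive.org/cdx/search/cdx')
# for line in source.load_cdx({'url': 'example.com', 'limit': '2'}):
#     print line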

pywb/cdx/config.yaml Normal file (+3)

@ -0,0 +1,3 @@
#CDX Server WSGI App Config
index_paths: ./sample_data/
port: 8090


@ -0,0 +1,163 @@
#=================================================================
"""
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
# Filter cdx
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('redirect', '-'),
('robotflags', '-'),
('length', '1046'),
('offset', '334'),
('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
"""
#=================================================================
from pywb.cdx.cdxserver import CDXServer
import os
import sys
import pprint
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_cdx_dir = get_test_dir() + 'cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
kwparams['url'] = url
kwparams['output'] = 'text'
server = CDXServer(sources)
results = server.load_cdx(**kwparams)
for x in results:
sys.stdout.write(x)
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -0,0 +1,72 @@
from cdxserver import CDXServer
import logging
import os
import yaml
import pkgutil
#=================================================================
TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
CONFIG_FILE = 'config.yaml'
DEFAULT_PORT = 8080
if __package__:
config = pkgutil.get_data(__package__, CONFIG_FILE)
config = yaml.load(config)
else:
config = None
#=================================================================
def main():
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG)
cdx_config = config.get('index_paths') if config else None
if not cdx_config:
cdx_config = [TEST_CDX_DIR]
cdxserver = CDXServer(cdx_config)
def application(env, start_response):
try:
response = cdxserver.load_cdx_from_request(env)
start_response('200 OK', [('Content-Type', 'text/plain')])
response = list(response)
except Exception as exc:
import traceback
err_details = traceback.format_exc(exc)
start_response('400 Error', [('Content-Type', 'text/plain')])
response = [str(exc)]
print err_details
return response
return application
if __name__ == "__main__":
from wsgiref.simple_server import make_server
app = main()
port = DEFAULT_PORT
if config:
port = config.get('port', DEFAULT_PORT)
httpd = make_server('', port, app)
logging.debug('Starting CDX Server on port ' + str(port))
try:
httpd.serve_forever()
except KeyboardInterrupt:
pass
logging.debug('Stopping CDX Server')
else:
application = main()


@ -1,42 +0,0 @@
from cdxserver import CDXServer
import logging
import os
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../sample_archive/cdx/'
#=================================================================
def main(config = None):
logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)
if not config:
config = [test_cdx_dir]
cdxserver = CDXServer(config)
def application(env, start_response):
try:
response = cdxserver.load_cdx_from_request(env)
start_response('200 OK', [('Content-Type', 'text/plain')])
response = list(response)
except Exception as exc:
import traceback
err_details = traceback.format_exc(exc)
start_response('400 Error', [('Content-Type', 'text/plain')])
response = [str(exc)]
print err_details
return response
return application
if __name__ == "__main__":
pass
else:
application = main()


@ -1,59 +1,34 @@
import archiveloader
import views
import handlers
import indexreader
import replay_views
import replay_resolvers
import logging
import hmac
import time
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
#=================================================================
# Config Loading
#=================================================================
def load_template_file(file, desc = None, view_class = views.J2TemplateView):
if file:
logging.info('Adding {0}: {1}'.format(desc if desc else name, file))
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
file = view_class(file)
return file
#=================================================================
# Cookie Signing
#=================================================================
def create_wb_handler(cdx_server, config):
class HMACCookieMaker:
def __init__(self, key, name):
self.key = key
self.name = name
record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
paths = config.get('archive_paths')
def __call__(self, duration, extra_id = ''):
expire = str(long(time.time() + duration))
resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
if extra_id:
msg = extra_id + '-' + expire
else:
msg = expire
replayer = replay_views.ReplayView(
content_loader = resolving_loader,
hmacdigest = hmac.new(self.key, msg)
hexdigest = hmacdigest.hexdigest()
if extra_id:
cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
else:
cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
return cookie
#=================================================================
def create_wb_handler(cdx_source, config):
replayer = replay_views.RewritingReplayView(
resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')),
content_rewriter = RewriteContent(),
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
@ -66,7 +41,7 @@ def create_wb_handler(cdx_source, config):
wb_handler = handlers.WBHandler(
cdx_source,
cdx_server,
replayer,


@ -1,13 +1,12 @@
import views
import utils
import urlparse
from wbrequestresponse import WbResponse
from wburl import WbUrl
from wbexceptions import WbException, NotFoundException
import pkgutil
import mimetypes
import time
from pywb.rewrite.wburl import WbUrl
from wbrequestresponse import WbResponse
from wbexceptions import WbException, NotFoundException
from views import TextCapturesView
class BaseHandler:
@ -22,23 +21,22 @@ class BaseHandler:
# Standard WB Handler
#=================================================================
class WBHandler(BaseHandler):
def __init__(self, cdx_reader, replay, html_view = None, search_view = None):
self.cdx_reader = cdx_reader
def __init__(self, index_reader, replay, html_view = None, search_view = None):
self.index_reader = index_reader
self.replay = replay
self.text_view = views.TextCapturesView()
self.text_view = TextCapturesView()
self.html_view = html_view
self.search_view = search_view
def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
cdx_lines = self.index_reader.load_for_request(wbrequest)
# new special modifier to always show cdx index
if wbrequest.wb_url.mod == 'cdx_':
@ -48,8 +46,8 @@ class WBHandler(BaseHandler):
query_view = self.html_view if self.html_view else self.text_view
return query_view.render_response(wbrequest, cdx_lines)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines, self.cdx_reader)
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines)
def render_search_page(self, wbrequest):
@ -60,18 +58,18 @@ class WBHandler(BaseHandler):
def __str__(self):
return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay)
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
#=================================================================
# CDX-Server Handler -- pass all params to cdx server
#=================================================================
class CDXHandler(BaseHandler):
def __init__(self, cdx_server, view = None):
self.cdx_server = cdx_server
self.view = view if view else views.TextCapturesView()
def __init__(self, index_reader, view = None):
self.index_reader = index_reader
self.view = view if view else TextCapturesView()
def __call__(self, wbrequest):
cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env)
cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env)
return self.view.render_response(wbrequest, cdx_lines)
@ -81,7 +79,7 @@ class CDXHandler(BaseHandler):
return None
def __str__(self):
return 'CDX Server: ' + str(self.cdx_server)
return 'Index Reader: ' + str(self.index_reader)
#=================================================================
@ -136,4 +134,19 @@ class DebugEchoHandler(BaseHandler):
return WbResponse.text_response(str(wbrequest))
#=================================================================
class PerfTimer:
def __init__(self, perfdict, name):
self.perfdict = perfdict
self.name = name
def __enter__(self):
self.start = time.clock()
return self
def __exit__(self, *args):
self.end = time.clock()
if self.perfdict is not None:
self.perfdict[self.name] = str(self.end - self.start)
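# Illustrative usage (a sketch; mirrors the PerfTimer calls in WBHandler
# above, with a hypothetical timed block):
#
# perf = {}
# with PerfTimer(perf, 'query'):
#     pass  # ... timed work here ...
# print perf.get('query')  # elapsed time as a string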


@ -1,17 +1,22 @@
import urllib
import urllib2
import wbexceptions
import wbrequestresponse
from collections import OrderedDict
from cdxserver.cdxserver import CDXServer, CDXException
from cdxserver.cdxobject import CDXObject
from itertools import chain
from pprint import pprint
import logging
from pywb.cdx.cdxserver import CDXServer, CDXException
from pywb.cdx.cdxobject import CDXObject
#=================================================================
class IndexReader:
def load_for_request(self, wbrequest, parsed_cdx = True):
class IndexReader(object):
def __init__(self, config):
if isinstance(config, str):
self.cdx_server = CDXServer(config)
else:
self.cdx_server = CDXServer.create_from_config(config)
def load_for_request(self, wbrequest):
wburl = wbrequest.wb_url
# init standard params
@ -24,147 +29,27 @@ class IndexReader:
if wbrequest.custom_params:
params.update(wbrequest.custom_params)
#params['url'] = wburl.url
output = 'raw' if parsed_cdx else 'text'
params['url'] = wburl.url
try:
cdxlines = self.load_cdx(url = wburl.url, output = output, **params)
cdxlines = self.load_cdx(output='raw', **params)
except CDXException:
raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url)
cdxlines = utils.peek_iter(cdxlines)
cdxlines = self.peek_iter(cdxlines)
if cdxlines is None:
raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
cdxlines = self.filter_cdx(wbrequest, cdxlines)
return cdxlines
def filter_cdx(self, wbrequest, cdxlines):
# Subclasses may wrap cdxlines iterator in a filter
return cdxlines
def load_cdx(self, **params):
return self.cdx_server.load_cdx(**params)
def load_cdx(self, url, params = {}, parsed_cdx = True):
raise NotImplementedError('Override in subclasses')
@staticmethod
def make_best_cdx_source(paths, config):
# may be a string or list
surt_ordered = config.get('surt_ordered', True)
# support mixed cdx streams and remote servers?
# for now, list implies local sources
if isinstance(paths, list):
if len(paths) > 1:
return EmbeddedCDXServer(paths, surt_ordered)
else:
# treat as non-list
paths = paths[0]
# a single uri
uri = paths
# Check for remote cdx server
if (uri.startswith('http://') or uri.startswith('https://')) and not uri.endswith('.cdx'):
cookie = config.get('cookie', None)
return RemoteCDXServer(uri, cookie = cookie)
else:
return EmbeddedCDXServer([uri], surt_ordered)
#=================================================================
class EmbeddedCDXServer(CDXServer, IndexReader):
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported')
return {
wburl.QUERY:
{'collapse_time': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
wburl.URL_QUERY:
{},
# raise Exception('Not Yet Implemented')
# {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
# 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
# },
wburl.REPLAY:
{'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest_to': wburl.timestamp, 'resolve_revisits': True},
wburl.LATEST_REPLAY:
{'reverse': True, 'filter': 'statuscode:[23]..', 'limit': '1', 'resolve_revisits': True}
}[wburl.type]
def __str__(self):
return 'load cdx indexes from ' + str(self.sources)
#=================================================================
class RemoteCDXServer(IndexReader):
"""
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
>>> pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
"""
def __init__(self, server_url, cookie = None):
self.server_url = server_url
self.auth_cookie = cookie
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
#url is required, must be passed explicitly!
params['url'] = url
params.update(**kwvalues)
urlparams = urllib.urlencode(params, True)
try:
request = urllib2.Request(self.server_url, urlparams)
if self.auth_cookie:
request.add_header('Cookie', self.auth_cookie)
response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
if e.code == 403:
exc_msg = e.read()
msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
raise wbexceptions.AccessException(msg)
else:
raise
if parsed_cdx:
return (CDXObject(cdx) for cdx in response)
else:
return iter(response)
# Note: these params are designed to make pywb compatible with the original Java wayback-cdx-server API:
# https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
# Soon, this will be switched over to support the native pywb cdx server
# BUG: Setting replayClosest to a high number for now, as the cdx server sometimes returns wrong results
# with lower values if there are too many captures. Ideally, it should be around 10-20
# replayClosest is the max number of cdx lines, and thus the max number of retry attempts that WB will make
def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'):
return {
wburl.QUERY:
{'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
@ -184,18 +69,20 @@ class RemoteCDXServer(IndexReader):
}[wburl.type]
@staticmethod
def peek_iter(iterable):
try:
first = next(iterable)
except StopIteration:
return None
def __str__(self):
return 'server cdx from ' + self.server_url
return chain([first], iterable)
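A small sketch of the peek_iter contract used in load_for_request above: it returns None for an empty iterator, and otherwise an equivalent iterator with the first item re-chained (class placement assumed from the surrounding diff):
assert IndexReader.peek_iter(iter([])) is None
assert list(IndexReader.peek_iter(iter(['a', 'b']))) == ['a', 'b']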
#=================================================================
class RemoteCDXServer(IndexReader):
def __init__(self, remote_url, cookie=None):
self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True)
self.cdx_server = CDXServer(self.remote)
# Testing
import utils
if __name__ == "__main__" or utils.enable_doctests():
from pprint import pprint
test_dir = utils.test_data_dir() + 'cdx/'
import doctest
doctest.testmod()
#def load_cdx(self, **params):
#return remote.load_cdx(**params)
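As a sketch of how these readers are constructed (the local cdx path below is hypothetical):
# a string config creates a CDXServer directly from a local cdx path
local = IndexReader('/path/to/index.cdx')
# a remote cdx server, with all queries proxied to it
remote = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')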

View File

@ -1,11 +1,12 @@
import handlers
import indexreader
import archivalrouter
import config_utils
import proxy
import os
import yaml
import config_utils
import logging
import proxy
#=================================================================
DEFAULTS = {
@ -49,24 +50,20 @@ def pywb_config_manual(passed_config = {}):
collections = config.get('collections')
for name, value in collections.iteritems():
route_config = config
if isinstance(value, dict):
# if a dict, extend with base properties
index_paths = value['index_paths']
route_config = DictChain(value, config)
if isinstance(value, str):
route_config = config
cdx_server = indexreader.IndexReader(value)
else:
index_paths = str(value)
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config)
route_config = DictChain(value, config)
cdx_server = indexreader.IndexReader(route_config)
wb_handler = config_utils.create_wb_handler(
cdx_source = cdx_source,
cdx_server = cdx_server,
config = route_config,
)
logging.info('Adding Collection: ' + name)
logging.debug('Adding Collection: ' + name)
route_class = route_config.get('route_class', archivalrouter.Route)
@ -74,7 +71,7 @@ def pywb_config_manual(passed_config = {}):
# cdx query handler
if route_config.get('enable_cdx_api', False):
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_server)))
if config.get('debug_echo_env', False):
@ -125,11 +122,3 @@ def pywb_config(config_file = None):
return pywb_config_manual(config)
import utils
if __name__ == "__main__" or utils.enable_doctests():
# Just test for execution for now
#pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml')
pywb_config_manual()

View File

@ -1,269 +0,0 @@
import re
import sys
import itertools
from url_rewriter import UrlRewriter
#=================================================================
class RegexRewriter:
"""
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
"""
@staticmethod
def comment_out(string):
return '/*' + string + '*/'
@staticmethod
def remove_https(string):
return string.replace("https", "http")
@staticmethod
def add_prefix(prefix):
return lambda string: prefix + string
@staticmethod
def archival_rewrite(rewriter):
return lambda x: rewriter.rewrite(x)
@staticmethod
def replacer(string):
return lambda x: string
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix
def __init__(self, rules):
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list
regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])
# ensure it's not middle of a word, wrap in non-capture group
regex_str = '(?<!\w)(?:' + regex_str + ')'
self.regex = re.compile(regex_str, re.M)
self.rules = rules
def filter(self, m):
return True
def rewrite(self, string):
return self.regex.sub(lambda x: self.replace(x), string)
def close(self):
return ''
def replace(self, m):
i = 0
for _, op, count in self.rules:
i += 1
full_m = i
while count > 0:
i += 1
count -= 1
if not m.group(i):
continue
# Optional filter to skip matches
if not self.filter(m):
return m.group(0)
# Custom func
if not hasattr(op, '__call__'):
op = RegexRewriter.DEFAULT_OP(op)
result = op(m.group(i))
# if extracting partial match
if i != full_m:
result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
return result
#=================================================================
class JSRewriter(RegexRewriter):
"""
>>> test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
>>> test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
>>> test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
>>> test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"
>>> test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
>>> test_js(r'location = /http:\/\/example.com/abc.html/')
'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/'
>>> test_js('"/location" == some_location_val; locations = location;')
'"/location" == some_location_val; locations = WB_wombat_location;'
>>> test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
>>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
"""
JS_HTTPX = r'(?<="|\')(?:https?:)?\\?/\\?/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, extra = []):
rules = self._create_rules(rewriter.get_abs_url())
rules.extend(extra)
RegexRewriter.__init__(self, rules)
def _create_rules(self, http_prefix):
return [
(self.JS_HTTPX, http_prefix, 0),
(r'(?<!/)\blocation\b', 'WB_wombat_', 0),
(r'(?<=document\.)domain', 'WB_wombat_', 0),
]
#=================================================================
class XMLRewriter(RegexRewriter):
"""
>>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'
>>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'
>>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'
>>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
"""
def __init__(self, rewriter, extra = []):
rules = self._create_rules(rewriter.get_abs_url())
RegexRewriter.__init__(self, rules)
# custom filter to reject 'xmlns' attr
def filter(self, m):
attr = m.group(1)
if attr and attr.startswith('xmlns'):
return False
return True
def _create_rules(self, http_prefix):
return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
]
#=================================================================
class CSSRewriter(RegexRewriter):
r"""
>>> test_css("background: url('/some/path.html')")
"background: url('/web/20131010im_/http://example.com/some/path.html')"
>>> test_css("background: url('../path.html')")
"background: url('/web/20131010im_/http://example.com/path.html')"
>>> test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010im_/http://domain.com/path.html")'
>>> test_css("background: url(file.jpeg)")
'background: url(/web/20131010im_/http://example.com/file.jpeg)'
>>> test_css("background: url('')")
"background: url('')"
>>> test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'
>>> test_css("@import url ('path.css')")
"@import url ('/web/20131010im_/http://example.com/path.css')"
>>> test_css("@import url('path.css')")
"@import url('/web/20131010im_/http://example.com/path.css')"
>>> test_css("@import ( 'path.css')")
"@import ( '/web/20131010im_/http://example.com/path.css')"
>>> test_css("@import \"path.css\"")
'@import "/web/20131010im_/http://example.com/path.css"'
>>> test_css("@import ('../path.css\"")
'@import (\'/web/20131010im_/http://example.com/path.css"'
>>> test_css("@import ('../url.css\"")
'@import (\'/web/20131010im_/http://example.com/url.css"'
>>> test_css("@import (\"url.css\")")
'@import ("/web/20131010im_/http://example.com/url.css")'
>>> test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'
"""
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
def __init__(self, rewriter):
rules = self._create_rules(rewriter)
RegexRewriter.__init__(self, rules)
def _create_rules(self, rewriter):
return [
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
]
import utils
if __name__ == "__main__" or utils.enable_doctests():
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
def test_js(string, extra = []):
return JSRewriter(arcrw, extra).rewrite(string)
def test_xml(string):
return XMLRewriter(arcrw).rewrite(string)
def test_css(string):
return CSSRewriter(arcrw).rewrite(string)
import doctest
doctest.testmod()

View File

@ -1,30 +1,30 @@
import StringIO
from urllib2 import URLError
import chardet
import copy
import itertools
import archiveloader
from wbrequestresponse import WbResponse, StatusAndHeaders
import utils
from url_rewriter import UrlRewriter
from header_rewriter import HeaderRewriter
import html_rewriter
import regex_rewriters
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.bufferedreaders import ChunkedDataReader
from wbrequestresponse import WbResponse
import wbexceptions
#=================================================================
class ReplayView:
def __init__(self, resolvers, loader = None, reporter = None):
self.resolvers = resolvers
self.loader = loader if loader else archiveloader.ArchiveLoader()
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
redir_to_exact = True, buffer_response = False, reporter = None):
self.content_loader = content_loader
self.content_rewriter = content_rewriter
self.head_insert_view = head_insert_view
self.redir_to_exact = redir_to_exact
# buffer or stream rewritten response
self.buffer_response = buffer_response
self._reporter = reporter
def __call__(self, wbrequest, cdx_lines, cdx_reader):
def __call__(self, wbrequest, cdx_lines):
last_e = None
first = True
@ -40,9 +40,22 @@ class ReplayView:
self._redirect_if_needed(wbrequest, cdx)
first = False
(cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
(status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files)
response = self.make_response(wbrequest, cdx, status_headers, stream)
# check and reject self-redirect
self._reject_self_redirect(wbrequest, cdx, status_headers)
# check if redir is needed
self._redirect_if_needed(wbrequest, cdx)
response = None
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
else:
(status_headers, stream) = self.sanitize_content(status_headers, stream)
response_iter = self.stream_to_iter(stream)
response = WbResponse(status_headers, response_iter)
# notify reporter callback, if any
if self._reporter:
@ -62,288 +75,57 @@ class ReplayView:
else:
raise wbexceptions.UnresolvedArchiveFileException()
# callback to issue a redirect to another request
# subclasses may provide custom logic
def _redirect_if_needed(self, wbrequest, cdx):
pass
def _load(self, cdx, revisit, failed_files):
if revisit:
(filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
else:
(filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])
#optimization: if same file already failed this request, don't try again
if failed_files and filename in failed_files:
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
any_found = False
last_exc = None
for resolver in self.resolvers:
possible_paths = resolver(filename)
if possible_paths:
for path in possible_paths:
any_found = True
try:
return self.loader.load(path, offset, length)
except Exception as ue:
last_exc = ue
print last_exc
pass
# Unsuccessful if reached here
if failed_files:
failed_files.append(filename)
if not any_found:
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
else:
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
def resolve_headers_and_payload(self, cdx, wbrequest, cdx_reader, failed_files):
has_curr = (cdx['filename'] != '-')
has_orig = (cdx.get('orig.filename','-') != '-')
# load headers record from cdx['filename'] unless it is '-' (rare)
headers_record = self._load(cdx, False, failed_files) if has_curr else None
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx['mimetype'] == 'warc/revisit' and headers_record:
payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files)
# single lookup cases
# case 2: non-revisit
elif (has_curr and not has_orig):
payload_record = headers_record
# case 3: identical url revisit, load payload from orig.filename
elif (has_orig):
payload_record = self._load(cdx, True, failed_files)
# special case: set header to payload if old-style revisit with missing header
if not headers_record:
headers_record = payload_record
elif headers_record != payload_record:
# close remainder of stream as this record only used for (already parsed) headers
headers_record.stream.close()
# special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit
if not headers_record.status_headers.headers:
headers_record = payload_record
if not headers_record or not payload_record:
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
#response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream))
#response._stream = payload_record.stream
return (cdx, headers_record.status_headers, payload_record.stream)
# done here! just return response
# subclasses make override to do additional processing
def make_response(self, wbrequest, cdx, status_headers, stream):
return self.create_stream_response(status_headers, stream)
# create response from headers and wrapping stream in generator
def create_stream_response(self, status_headers, stream):
return WbResponse(status_headers, self.create_stream_gen(stream))
# Handle the case where a duplicate of a capture with same digest exists at a different url
# Must query the index at that url filtering by matching digest
# Raise exception if no matches found
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files):
ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI')
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')):
raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))
ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date')
if not ref_target_date:
ref_target_date = cdx['timestamp']
else:
ref_target_date = utils.iso_date_to_timestamp(ref_target_date)
# clone WbRequest
orig_wbreq = copy.copy(wbrequest)
orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url)
orig_wbreq.wb_url.url = ref_target_uri
orig_wbreq.wb_url.timestamp = ref_target_date
# Must also match digest
orig_wbreq.query_filter.append('digest:' + cdx['digest'])
orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
for cdx in orig_cdx_lines:
try:
#cdx = cdx_reader.CDXCaptureResult(cdx)
#print cdx
payload_record = self._load(cdx, False, failed_files)
return payload_record
except wbexceptions.CaptureException as e:
pass
raise wbexceptions.CaptureException('Original for revisit could not be loaded')
def resolve_full(self, filename):
# Attempt to resolve cdx file to full path
full_url = None
for resolver in self.resolvers:
full_url = resolver(filename)
if full_url:
return full_url
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
# Create a generator reading from a stream, with optional rewriting and final read call
@staticmethod
def create_stream_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
def stream_to_iter(stream):
try:
buff = first_buff if first_buff else stream.read()
buff = stream.read()
while buff:
if rewrite_func:
buff = rewrite_func(buff)
yield buff
buff = stream.read()
# For adding a tail/handling final buffer
if final_read_func:
buff = final_read_func()
if buff:
yield buff
finally:
stream.close()
def sanitize_content(self, status_headers, stream):
# remove transfer encoding chunked and wrap in a dechunking stream
if (status_headers.remove_header('transfer-encoding')):
stream = ChunkedDataReader(stream)
def __str__(self):
return 'find archive files from ' + str(self.resolvers)
#=================================================================
class RewritingReplayView(ReplayView):
def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None):
ReplayView.__init__(self, resolvers, loader, reporter)
self.head_insert_view = head_insert_view
self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
self.redir_to_exact = redir_to_exact
# buffer or stream rewritten response
self.buffer_response = buffer_response
def _text_content_type(self, content_type):
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
if any ((mime in content_type) for mime in mimelist):
return ctype
return None
def make_response(self, wbrequest, cdx, status_headers, stream):
# check and reject self-redirect
self._reject_self_redirect(wbrequest, cdx, status_headers)
# check if redir is needed
self._redirect_if_needed(wbrequest, cdx)
return (status_headers, stream)
def rewrite_content(self, wbrequest, cdx, status_headers, stream):
urlrewriter = wbrequest.urlrewriter
rewritten_headers = self.header_rewriter.rewrite(status_headers, urlrewriter)
(rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream)
# de_chunking in case chunk encoding is broken
# TODO: investigate further
de_chunk = False
# handle transfer-encoding: chunked
if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
stream = archiveloader.ChunkedLineReader(stream)
de_chunk = True
# transparent, though still may need to dechunk
if wbrequest.wb_url.mod == 'id_':
if de_chunk:
status_headers.remove_header('transfer-encoding')
return self.create_stream_response(status_headers, stream)
# non-text content type, just send through with rewritten headers
# but may need to dechunk
# no rewriting needed!
if rewritten_headers.text_type is None:
status_headers = rewritten_headers.status_headers
response_iter = self.stream_to_iter(stream)
return WbResponse(rewritten_headers.status_headers, response_iter)
return self.create_stream_response(status_headers, stream)
# Handle text rewriting
# special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
# TODO: is this right?
if rewritten_headers.charset:
encoding = rewritten_headers.charset
first_buff = None
# do head insert
if self.head_insert_view:
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx)
else:
(encoding, first_buff) = self._detect_charset(stream)
head_insert_str = None
# if chardet thinks it's ascii, use utf-8
if encoding == 'ascii':
#encoding = None
encoding = 'utf-8'
# Buffering response for html, streaming for others?
#if rewritten_headers.text_type == 'html':
# return self._rewrite_html(encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
#else:
# return self._rewrite_other(rewritten_headers.text_type, encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
text_type = rewritten_headers.text_type
status_headers = rewritten_headers.status_headers
if text_type == 'html':
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None
rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str)
elif text_type == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter)
elif text_type == 'js':
rewriter = regex_rewriters.JSRewriter(urlrewriter)
elif text_type == 'xml':
rewriter = regex_rewriters.XMLRewriter(urlrewriter)
else:
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
# Create generator for response
response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)
(status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str)
if self.buffer_response:
return self._create_buffer_response(status_headers, response_gen)
else:
return WbResponse(status_headers, value = response_gen)
if wbrequest.wb_url.mod == 'id_':
status_headers.remove_header('content-length')
return self.buffered_response(status_headers, response_gen)
return WbResponse(status_headers, response_gen)
# Buffer rewrite generator and return a response from a string
def _create_buffer_response(self, status_headers, generator):
# Buffer rewrite iterator and return a response from a string
def buffered_response(self, status_headers, iterator):
out = StringIO.StringIO()
try:
for buff in generator:
for buff in iterator:
out.write(buff)
finally:
@ -355,53 +137,9 @@ class RewritingReplayView(ReplayView):
return WbResponse(status_headers, value = [content])
# Create rewrite response from record (no Content-Length), may even be chunked by front-end
def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None):
def do_rewrite(buff):
if encoding:
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
if encoding:
buff = buff.encode(encoding)
return buff
def do_finish():
return rewriter.close()
return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)
def _decode_buff(self, buff, stream, encoding):
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
return buff
def _detect_charset(self, stream):
buff = stream.read(8192)
result = chardet.detect(buff)
print "chardet result: " + str(result)
return (result['encoding'], buff)
def _redirect_if_needed(self, wbrequest, cdx):
is_proxy = wbrequest.is_proxy
if self.redir_to_exact and not is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
raise wbexceptions.InternalRedirect(new_url)

47
pywb/rewrite/README.md Normal file
View File

@ -0,0 +1,47 @@
## PyWb Rewrite v0.2
[![Build Status](https://travis-ci.org/ikreymer/pywb_rewrite.png?branch=master)](https://travis-ci.org/ikreymer/pywb_rewrite)
This package includes the content rewriting component of the pywb wayback tool suite.
This package applies standard content rewriting, in the form of url rewriting, to
HTTP headers, html, css, js and xml content.
Additional domain-specific rewriting is planned, especially for JS, to allow for proper
replay of difficult pages.
### Command-Line Rewriter
To enable easier testing of rewriting, this package includes a command-line rewriter
which fetches a live url and applies the registered rewriting rules to it.
After installing with:
`pip install -r requirements.txt`
Run:
`python ./pywb_rewrite/rewrite_live.py http://example.com`
To specify custom timestamp and prefix:
```
python ./pywb_rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html
```
This will print to stdout the content of `http://example.com` with all urls rewritten relative to
`/mycoll/20141026000102/http://mysite.example.com/path.html`.
Headers are also rewritten. For further details, consult the `get_rewritten` function in
[pywb_rewrite/rewrite_live.py](pywb_rewrite/rewrite_live.py)
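The same rewriting can also be invoked programmatically; a minimal sketch (assuming the `pywb.rewrite` package layout used by the tests in this commit):

```
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_live import get_rewritten

urlrewriter = UrlRewriter('20131226101010/http://example.com/', '/mycoll/')
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
print buff
```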
### Tests
Rewriting doctests as well as live rewriting tests (subject to change) are provided.
To run full test suite: `python run-tests.py`

View File

@ -1,4 +1,4 @@
from wbrequestresponse import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders
#=================================================================
class RewrittenStatusAndHeaders:
@ -14,37 +14,6 @@ class RewrittenStatusAndHeaders:
#=================================================================
class HeaderRewriter:
"""
# Text with charset
>>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}
# Redirect
>>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}
# gzip
>>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}
# Binary
>>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('X-Archive-Orig-Cookie', 'blah'),
('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}
Note: Transfer-Encoding is now always removed; the output previously ended with:
('Content-Encoding', 'gzip'),
('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
"""
REWRITE_TYPES = {
'html': ['text/html', 'application/xhtml'],
'css': ['text/css'],
@ -122,20 +91,3 @@ class HeaderRewriter:
return (new_headers, removed_header_dict)
import utils
if __name__ == "__main__" or utils.enable_doctests():
import os
import pprint
import url_rewriter
urlrewriter = url_rewriter.UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
headerrewriter = HeaderRewriter()
def test_rewrite(headers, status = '200 OK'):
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
return vars(rewritten)
import doctest
doctest.testmod()

View File

@ -12,75 +12,8 @@ from regex_rewriters import JSRewriter, CSSRewriter
# HTMLRewriter -- html parser for custom rewriting, also handlers for script and css
#=================================================================
class HTMLRewriter(HTMLParser):
r"""
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
>>> parse('<input "selected"><img src></div>')
<input "selected"=""><img src=""></div>
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
# HTML Entities
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
<a href="">&rsaquo; &nbsp; &#62;</div>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
<HTML><a href="#abc">Text</a></html>
# Unicode
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content="">
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
# Unterminated script tag auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div>SomeTest</div>
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
"""
HTML-Parsing Rewriter
"""
REWRITE_TAGS = {
@ -307,16 +240,4 @@ class HTMLRewriter(HTMLParser):
self.out.write(']>')
import utils
if __name__ == "__main__" or utils.enable_doctests():
url_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, head_insert = None):
parser = HTMLRewriter(url_rewriter, head_insert = head_insert)
print parser.rewrite(data) + parser.close()
import doctest
doctest.testmod()

View File

@ -0,0 +1,156 @@
import re
import sys
import itertools
from url_rewriter import UrlRewriter
#=================================================================
class RegexRewriter(object):
@staticmethod
def comment_out(string):
return '/*' + string + '*/'
@staticmethod
def remove_https(string):
return string.replace("https", "http")
@staticmethod
def add_prefix(prefix):
return lambda string: prefix + string
@staticmethod
def archival_rewrite(rewriter):
return lambda x: rewriter.rewrite(x)
@staticmethod
def replacer(string):
return lambda x: string
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix
def __init__(self, rules):
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list
regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])
# ensure it's not middle of a word, wrap in non-capture group
regex_str = '(?<!\w)(?:' + regex_str + ')'
self.regex = re.compile(regex_str, re.M)
self.rules = rules
def filter(self, m):
return True
def rewrite(self, string):
return self.regex.sub(lambda x: self.replace(x), string)
def close(self):
return ''
def replace(self, m):
i = 0
for _, op, count in self.rules:
i += 1
full_m = i
while count > 0:
i += 1
count -= 1
if not m.group(i):
continue
# Optional filter to skip matches
if not self.filter(m):
return m.group(0)
# Custom func
if not hasattr(op, '__call__'):
op = RegexRewriter.DEFAULT_OP(op)
result = op(m.group(i))
# if extracting partial match
if i != full_m:
result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
return result
#=================================================================
class JSLinkRewriter(RegexRewriter):
"""
JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string
"""
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, rules = []):
rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
super(JSLinkRewriter, self).__init__(rules)
#=================================================================
class JSLocationAndLinkRewriter(JSLinkRewriter):
"""
JS Rewriter which also rewrites location and domain to the
specified prefix (default: 'WB_wombat_')
"""
def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'):
rules = rules + [
(r'(?<!/)\blocation\b', prefix, 0),
(r'(?<=document\.)domain', prefix, 0),
]
super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules)
#=================================================================
# Set 'default' JSRewriter
JSRewriter = JSLocationAndLinkRewriter
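For example (mirroring the rewrite tests later in this commit):
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
print JSRewriter(arcrw).rewrite('location = "http://example.com/abc.html"')
# WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"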
#=================================================================
class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra = []):
rules = self._create_rules(rewriter.get_abs_url())
RegexRewriter.__init__(self, rules)
# custom filter to reject 'xmlns' attr
def filter(self, m):
attr = m.group(1)
if attr and attr.startswith('xmlns'):
return False
return True
def _create_rules(self, http_prefix):
return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
]
#=================================================================
class CSSRewriter(RegexRewriter):
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
def __init__(self, rewriter):
rules = self._create_rules(rewriter)
RegexRewriter.__init__(self, rules)
def _create_rules(self, rewriter):
return [
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
]

View File

@ -0,0 +1,151 @@
import chardet
from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriter
from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader
class RewriteContent:
DEFAULT_CONTENT_REWRITERS = {
'header': HeaderRewriter,
'js': JSRewriter,
'css': CSSRewriter,
'xml': XMLRewriter,
'html': HTMLRewriter
}
def __init__(self, rewriters = {}):
self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())
def rewrite_headers(self, urlrewriter, status_headers, stream):
rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)
# note: since chunking may be broken, the approach taken here is to *always* attempt
# to dechunk if transfer-encoding: chunked is present
#
# an alternative may be to serve chunked unless content rewriting is needed
# todo: possibly revisit this approach
if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
stream = ChunkedDataReader(stream)
return (rewritten_headers, stream)
def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None):
# see if we've already rewritten headers
if isinstance(headers, RewrittenStatusAndHeaders):
rewritten_headers = headers
elif isinstance(headers, StatusAndHeaders):
# otherwise, need to determine if rewriting is even necessary
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter, headers, stream)
# no rewriting needed here
if rewritten_headers.text_type is None:
gen = self.stream_to_gen(stream)
# status_headers is not assigned until below; return the rewritten headers directly
return (rewritten_headers.status_headers, gen)
status_headers = rewritten_headers.status_headers
# Handle text content rewriting
# =========================================================================
# special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, 'gzip')
if rewritten_headers.charset:
encoding = rewritten_headers.charset
first_buff = None
else:
(encoding, first_buff) = self._detect_charset(stream)
# if chardet thinks it's ascii, use utf-8
if encoding == 'ascii':
encoding = 'utf-8'
text_type = rewritten_headers.text_type
rewriter_class = self.rewriters.get(text_type)
if not rewriter_class:
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
if text_type == 'html':
rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str)
else:
rewriter = rewriter_class(urlrewriter)
# Create rewriting generator
gen = self._rewriting_stream_gen(rewriter, encoding, stream, first_buff)
return (status_headers, gen)
# Create rewrite stream, may even be chunked by front-end
def _rewriting_stream_gen(self, rewriter, encoding, stream, first_buff = None):
def do_rewrite(buff):
if encoding:
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
if encoding:
buff = buff.encode(encoding)
return buff
def do_finish():
return rewriter.close()
return self.stream_to_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)
def _decode_buff(self, buff, stream, encoding):
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
return buff
def _detect_charset(self, stream):
buff = stream.read(8192)
result = chardet.detect(buff)
print "chardet result: " + str(result)
return (result['encoding'], buff)
# Create a generator reading from a stream, with optional rewriting and final read call
@staticmethod
def stream_to_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
try:
buff = first_buff if first_buff else stream.read()
while buff:
if rewrite_func:
buff = rewrite_func(buff)
yield buff
buff = stream.read()
# For adding a tail/handling final buffer
if final_read_func:
buff = final_read_func()
if buff:
yield buff
finally:
stream.close()
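A small usage sketch of the generator above, with an in-memory stream standing in for a record payload:
import StringIO

stream = StringIO.StringIO('abc')
for buff in RewriteContent.stream_to_gen(stream, rewrite_func=lambda b: b.upper()):
    print buff   # ABC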

View File

@ -0,0 +1,68 @@
import urllib2
import os
import sys
import datetime
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent
"""
Fetch a url from the live web and apply rewriting rules
"""
#=================================================================
def get_status_and_stream(url):
resp = urllib2.urlopen(url)
headers = []
for name, value in resp.info().dict.iteritems():
headers.append((name, value))
status_headers = StatusAndHeaders('200 OK', headers)
stream = resp
return (status_headers, stream)
#=================================================================
def get_rewritten(url, urlrewriter):
(status_headers, stream) = get_status_and_stream(url)
status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)
buff = ''
for x in gen:
buff += x
return (status_headers, buff)
#=================================================================
def main():
if len(sys.argv) < 2:
print 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'.format(sys.argv[0])
exit(1)
else:
url = sys.argv[1]
if len(sys.argv) >= 3:
wburl_str = sys.argv[2]
if wburl_str.startswith('/'):
wburl_str = wburl_str[1:]
prefix, wburl_str = wburl_str.split('/', 1)
prefix = '/' + prefix + '/'
else:
wburl_str = datetime_to_timestamp(datetime.datetime.now()) + '/http://example.com/path/sample.html'
prefix = '/pywb_rewrite/'
urlrewriter = UrlRewriter(wburl_str, prefix)
status_headers, buff = get_rewritten(url, urlrewriter)
sys.stdout.write(buff)
#=================================================================
if __name__ == "__main__":
main()

View File

@ -0,0 +1,266 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
r"""
#=================================================================
# HTML Rewriting
#=================================================================
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
>>> parse('<input "selected"><img src></div>')
<input "selected"=""><img src=""></div>
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
# HTML Entities
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
<a href="">&rsaquo; &nbsp; &#62;</div>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
<HTML><a href="#abc">Text</a></html>
# Unicode
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content="">
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
# Unterminated script tag auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div>SomeTest</div>
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
#=================================================================
# Custom Regex
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
#=================================================================
# JS Rewriting
#=================================================================
>>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\/\\/example.com/abc.html"'
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
>>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"
>>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
# not rewritten -- to be handled on client side
>>> _test_js(r'location = "/abc.html"')
'WB_wombat_location = "/abc.html"'
>>> _test_js(r'location = /http:\/\/example.com/abc.html/')
'WB_wombat_location = /http:\\/\\/example.com/abc.html/'
>>> _test_js('"/location" == some_location_val; locations = location;')
'"/location" == some_location_val; locations = WB_wombat_location;'
>>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
#=================================================================
# XML Rewriting
#=================================================================
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
#=================================================================
# CSS Rewriting
#=================================================================
>>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010im_/http://example.com/some/path.html')"
>>> _test_css("background: url('../path.html')")
"background: url('/web/20131010im_/http://example.com/path.html')"
>>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010im_/http://domain.com/path.html")'
>>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010im_/http://example.com/file.jpeg)'
>>> _test_css("background: url('')")
"background: url('')"
>>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'
>>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010im_/http://example.com/path.css')"
>>> _test_css("@import url('path.css')")
"@import url('/web/20131010im_/http://example.com/path.css')"
>>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010im_/http://example.com/path.css')"
>>> _test_css("@import \"path.css\"")
'@import "/web/20131010im_/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010im_/http://example.com/path.css"'
>>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010im_/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010im_/http://example.com/url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'
#=================================================================
# HTTP Headers Rewriting
#=================================================================
# Text with charset
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}
# Redirect
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}
# gzip
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}
# Binary
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('X-Archive-Orig-Cookie', 'blah'),
('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}
# Note: Transfer-Encoding is now always removed. The previous expected
# output ended with:
#   ('Content-Encoding', 'gzip'),
#   ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
"""
#=================================================================
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, head_insert = None):
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
print parser.rewrite(data) + parser.close()
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
def _test_js(string, extra = []):
return JSRewriter(arcrw, extra).rewrite(string)
def _test_xml(string):
return XMLRewriter(arcrw).rewrite(string)
def _test_css(string):
return CSSRewriter(arcrw).rewrite(string)
headerrewriter = HeaderRewriter()
def _test_headers(headers, status = '200 OK'):
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
return vars(rewritten)
if __name__ == "__main__":
import doctest
doctest.testmod()
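For reference outside the doctests, the rewriters can also be driven directly; a minimal sketch reusing the arcrw rewriter defined above (the expected output is taken from the CSS doctests):

css_rw = CSSRewriter(arcrw)
print css_rw.rewrite("background: url(file.jpeg)")
# -> background: url(/web/20131010im_/http://example.com/file.jpeg)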

View File

@ -0,0 +1,32 @@
from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.url_rewriter import UrlRewriter
# This module has some rewriting tests against the 'live web'
# As such, the content may change and the test may break
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
def test_example_1():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
# verify header rewriting
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
def test_example_2():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
# verify header rewriting
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
def test_example_3():
status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff

View File

@ -6,43 +6,43 @@ from wburl import WbUrl
class UrlRewriter:
"""
>>> test_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
>>> test_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
>>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
>>> test_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
>>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/other.html'
>>> test_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
>>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/path/other.html'
>>> test_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
>>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
'/coll/20131112im_/http://example.com/other.html'
>>> test_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
>>> do_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
>>> test_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
>>> do_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
>>> test_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
>>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http://some-other-site.com'
>>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
'/2020/http://example.com/other.html'
>>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '')
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '')
'2020/http://example.com/other.html'
>>> test_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
>>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
'/web/20131010010203/http://example.com/file.html'
>>> test_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'#anchor'
>>> test_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'mailto:example@example.com'
>>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
@ -62,7 +62,6 @@ class UrlRewriter:
def __init__(self, wburl, prefix):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.archivalurl_class = self.wburl.__class__
#if self.prefix.endswith('/'):
# self.prefix = self.prefix[:-1]
@ -74,7 +73,7 @@ class UrlRewriter:
wburl = self.wburl
isAbs = any (url.startswith(x) for x in self.PROTOCOLS)
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
# Optimized rewriter for
# -rel urls that don't start with / and don't contain ../ and no special mod
@ -117,12 +116,11 @@ class UrlRewriter:
return url
import utils
if __name__ == "__main__" or utils.enable_doctests():
def test_rewrite(rel_url, base_url, prefix, mod = None):
rewriter = UrlRewriter(base_url, prefix)
return rewriter.rewrite(rel_url, mod)
def do_rewrite(rel_url, base_url, prefix, mod = None):
rewriter = UrlRewriter(base_url, prefix)
return rewriter.rewrite(rel_url, mod)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -3,9 +3,38 @@
import re
import rfc3987
import wbexceptions
# WbUrl : wb archival url representation for WB
"""
WbUrl represents the standard wayback archival url format.
A regular url is a subset of the WbUrl (latest replay).
The WbUrl expresses the common interface for interacting
with the wayback machine.
The WbUrl may represent one of the following forms:
query form: [/modifier]/[timestamp][-end_timestamp]*/<url>
modifier, timestamp and end_timestamp are optional
*/example.com
20101112030201*/http://example.com
2009-2015*/http://example.com
/cdx/*/http://example.com
url query form: used to indicate query across urls
same as query form but with a final *
*/example.com*
20101112030201*/http://example.com*
replay form:
20101112030201/http://example.com
20101112030201im_/http://example.com
latest_replay: (no timestamp)
http://example.com
"""
class WbUrl:
"""
@ -38,6 +67,13 @@ class WbUrl:
>>> repr(WbUrl('*/http://example.com/abc?def=a*'))
"('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')"
>>> repr(WbUrl('2010*/http://example.com/abc?def=a'))
"('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')"
# timestamp range query
>>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a'))
"('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')"
>>> repr(WbUrl('json/*/http://example.com/abc?def=a'))
"('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')"
@ -59,16 +95,16 @@ class WbUrl:
# ======================
>>> x = WbUrl('/#$%#/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: http://#$%#/
Exception: Bad Request Url: http://#$%#/
>>> x = WbUrl('/http://example.com:abc/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: http://example.com:abc/
Exception: Bad Request Url: http://example.com:abc/
"""
# Regexs
# ======================
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)\*/?(.*)$')
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
@ -85,13 +121,14 @@ class WbUrl:
self.type = None
self.url = ''
self.timestamp = ''
self.end_timestamp = ''
self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]):
raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
raise Exception('Invalid WbUrl: ', url)
if len(self.url) == 0:
raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
raise Exception('Invalid WbUrl: ', url)
# protocol agnostic url -> http://
#if self.url.startswith('//'):
@ -105,7 +142,7 @@ class WbUrl:
matcher = rfc3987.match(self.url.upper(), 'IRI')
if not matcher:
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
raise Exception('Bad Request Url: ' + self.url)
# Match query regex
# ======================
@ -118,7 +155,8 @@ class WbUrl:
self.mod = res[0]
self.timestamp = res[1]
self.url = res[2]
self.end_timestamp = res[2]
self.url = res[3]
if self.url.endswith('*'):
self.type = self.URL_QUERY
self.url = self.url[:-1]
@ -151,6 +189,7 @@ class WbUrl:
atype = overrides['type'] if 'type' in overrides else self.type
mod = overrides['mod'] if 'mod' in overrides else self.mod
timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp
end_timestamp = overrides['end_timestamp'] if 'end_timestamp' in overrides else self.end_timestamp
url = overrides['url'] if 'url' in overrides else self.url
if atype == self.QUERY or atype == self.URL_QUERY:
@ -159,6 +198,8 @@ class WbUrl:
tsmod += mod + "/"
if timestamp:
tsmod += timestamp
if end_timestamp:
tsmod += '-' + end_timestamp
tsmod += "*/" + url
if atype == self.URL_QUERY:
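A sketch of the new timestamp-range support added in this hunk, assuming the overrides-based serializer shown above is exposed as to_str(**overrides) (the method name itself is not visible in this diff):

wburl = WbUrl('2009-2015*/http://example.com/')
print wburl.to_str(end_timestamp='2016')
# expected: 2009-2016*/http://example.com/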

View File

@ -1,122 +0,0 @@
import itertools
import time
import zlib
import time
import datetime
import calendar
import re
def peek_iter(iterable):
try:
first = next(iterable)
except StopIteration:
return None
return itertools.chain([first], iterable)
def split_prefix(key, prefixs):
for p in prefixs:
if key.startswith(p):
plen = len(p)
return (key[:plen], key[plen:])
def create_decompressor():
return zlib.decompressobj(16 + zlib.MAX_WBITS)
#=================================================================
# Adapted from example at
class PerfTimer:
def __init__(self, perfdict, name):
self.perfdict = perfdict
self.name = name
def __enter__(self):
self.start = time.clock()
return self
def __exit__(self, *args):
self.end = time.clock()
if self.perfdict is not None:
self.perfdict[self.name] = str(self.end - self.start)
#=================================================================
# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
"""
Return the requested path, optionally including the query string
# Simple test:
>>> rel_request_uri({'PATH_INFO': '/web/example.com'})
'/web/example.com'
# Test all unencoded special chars and double-quote
# (double-quote must be encoded but not single quote)
>>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
"/web/example.com/0~!+$&'()*+,;=:%22"
"""
from urllib import quote
url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
if include_query and environ.get('QUERY_STRING'):
url += '?' + environ['QUERY_STRING']
return url
#=================================================================
def unsurt(surt):
"""
# Simple surt
>>> unsurt('com,example)/')
'example.com)/'
# Broken surt
>>> unsurt('com,example)')
'com,example)'
# Long surt
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
"""
try:
index = surt.index(')/')
parts = surt[0:index].split(',')
parts.reverse()
host = '.'.join(parts)
host += surt[index:]
return host
except ValueError:
# May not be a valid surt
return surt
#=================================================================
# Support for bulk doctest testing via nose or py.test
# nosetests --with-doctest
# py.test --doctest_modules
import sys
is_in_testtool = any(sys.argv[0].endswith(tool) for tool in ['py.test', 'nosetests'])
def enable_doctests():
return is_in_testtool
def test_data_dir():
import os
return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
#=================================================================
if __name__ == "__main__" or enable_doctests():
import doctest
doctest.testmod()

16
pywb/utils/README.md Normal file
View File

@ -0,0 +1,16 @@
## PyWb Utils v0.2 ##
[![Build Status](https://travis-ci.org/ikreymer/pywb_utils.png?branch=master)](https://travis-ci.org/ikreymer/pywb_utils)
This is a standalone package containing a variety of utils used by the pywb wayback tool suite.
`python run-tests.py` will run all tests
#### Modules
[binsearch.py](pywb_utils/binsearch.py) -- Binary search implementation over sorted text files
[loaders.py](pywb_utils/loaders.py) -- Loading abstractions for http and the local file system, as well as buffered and seekable file readers
[bufferedreaders.py](pywb_utils/bufferedreaders.py) -- Buffered readers, with optional gzip decompression and http chunk decoding
[statusandheaders.py](pywb_utils/statusandheaders.py) -- Representation and parsing of HTTP-style status lines and headers
[timeutils.py](pywb_utils/timeutils.py) -- Utility functions for converting between standard datetime formats and the 14-digit timestamp

0
pywb/utils/__init__.py Normal file
View File

110
pywb/utils/binsearch.py Normal file
View File

@ -0,0 +1,110 @@
"""
Utility functions for performing binary search over a sorted text file
"""
from collections import deque
import itertools
#=================================================================
def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
"""
Find offset of the line which matches a given 'key' using binary search
    If the key is not found, the offset is that of the line after the key
File is subdivided into block_size (default 8192) sized blocks
Optional compare_func may be specified
"""
min_ = 0
max_ = reader.getsize() / block_size
while max_ - min_ > 1:
mid = min_ + ((max_ - min_) / 2)
reader.seek(mid * block_size)
if mid > 0:
reader.readline() # skip partial line
line = reader.readline()
if compare_func(key, line) > 0:
min_ = mid
else:
max_ = mid
return min_ * block_size
#=================================================================
def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
"""
Perform a binary search for a specified key to within a 'block_size'
(default 8192) sized block followed by linear search
    within the block to find the first matching line.
    When performing linear search, keep track of up to N previous lines
    before the first matching line.
"""
min_ = binsearch_offset(reader, key, compare_func, block_size)
reader.seek(min_)
if min_ > 0:
reader.readline() # skip partial line
if prev_size > 1:
        prev_deque = deque(maxlen=prev_size)
line = None
while True:
line = reader.readline()
if not line:
break
if compare_func(line, key) >= 0:
break
if prev_size == 1:
prev = line
elif prev_size > 1:
prev_deque.append(line)
def gen_iter(line):
"""
Create iterator over any previous lines to
current matched line
"""
if prev_size == 1:
yield prev.rstrip()
elif prev_size > 1:
for i in prev_deque:
yield i.rstrip()
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(line)
#=================================================================
def iter_prefix(reader, key):
"""
Creates an iterator which iterates over lines that start with prefix
'key' in a sorted text file.
"""
return itertools.takewhile(
lambda line: line.startswith(key),
search(reader, key))
#=================================================================
def iter_exact(reader, key, token=' '):
"""
Create an iterator which iterates over lines where the first field matches
    the 'key'; equivalent to a prefix search for key + token.
    Default field terminator/separator is ' '
"""
return iter_prefix(reader, key + token)
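A minimal usage sketch, assuming a sorted cdx file on disk; the path is illustrative, and SeekableTextFileReader comes from pywb.utils.loaders (added later in this commit):

from pywb.utils.loaders import SeekableTextFileReader

reader = SeekableTextFileReader('sample_archive/cdx/iana.cdx')
for line in iter_prefix(reader, 'org,iana)/domains/root'):
    print line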

View File

@ -0,0 +1,204 @@
import StringIO
import zlib
#=================================================================
def gzip_decompressor():
"""
    Decompressor which can decompress a gzip stream
"""
return zlib.decompressobj(16 + zlib.MAX_WBITS)
#=================================================================
class BufferedReader(object):
"""
A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to
block_size (1024 default)
If an optional decompress type is specified,
data is fed through the decompressor when read from the buffer.
Currently supported decompression: gzip
If decompression fails on first try, data is assumed to be decompressed
and no exception is thrown. If a failure occurs after data has been
partially decompressed, the exception is propagated.
"""
DECOMPRESSORS = {'gzip': gzip_decompressor}
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
self.stream = stream
self.block_size = block_size
if decomp_type:
try:
self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
except KeyError:
raise Exception('Decompression type not supported: ' +
decomp_type)
else:
self.decompressor = None
self.buff = None
self.num_read = 0
self.max_len = max_len
def _fillbuff(self, block_size=None):
if not block_size:
block_size = self.block_size
if not self.buff or self.buff.pos >= self.buff.len:
            if self.max_len > 0:
                to_read = min(self.max_len - self.num_read, block_size)
            else:
                to_read = block_size
data = self.stream.read(to_read)
self._process_read(data)
def _process_read(self, data):
data = self._decompress(data)
self.num_read += len(data)
self.buff = StringIO.StringIO(data)
def _decompress(self, data):
if self.decompressor and data:
try:
data = self.decompressor.decompress(data)
except Exception:
# if first read attempt, assume non-gzipped stream
if self.num_read == 0:
self.decompressor = None
# otherwise (partly decompressed), something is wrong
else:
raise
return data
def read(self, length=None):
self._fillbuff()
return self.buff.read(length)
def readline(self, length=None):
self._fillbuff()
return self.buff.readline(length)
def close(self):
if self.stream:
self.stream.close()
self.stream = None
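A sketch of the decompression fallback described above, using in-memory streams; both reads succeed, the second by falling back to the raw data after the first decompress attempt fails:

comp = zlib.compressobj(6, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
gzipped = comp.compress('a line\n') + comp.flush()

print BufferedReader(StringIO.StringIO(gzipped), decomp_type='gzip').readline()
# -> a line
print BufferedReader(StringIO.StringIO('a line\n'), decomp_type='gzip').readline()
# -> a line (not gzip, so the reader falls back to the raw stream)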
#=================================================================
class ChunkedDataException(Exception):
pass
#=================================================================
class ChunkedDataReader(BufferedReader):
r"""
A ChunkedDataReader is a BufferedReader which also supports de-chunking
of the data if it happens to be http 'chunk-encoded'.
If at any point the chunked header is not available, the stream is
assumed to not be chunked and no more dechunking occurs.
Properly formatted chunked data:
>>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n"));
>>> c.read() + c.read()
'1234'
Non-chunked data:
>>> ChunkedDataReader(StringIO.StringIO("xyz123!@#")).read()
'xyz123!@#'
Starts like chunked data, but isn't:
>>> c = ChunkedDataReader(StringIO.StringIO("1\r\nxyz123!@#"));
>>> c.read() + c.read()
'1\r\nx123!@#'
Chunked data cut off part way through:
>>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));
>>> c.read() + c.read()
'123412'
"""
all_chunks_read = False
not_chunked = False
# if False, we'll use best-guess fallback for parse errors
raise_chunked_data_exceptions = False
def _fillbuff(self, block_size=None):
if self.not_chunked:
return BufferedReader._fillbuff(self, block_size)
if self.all_chunks_read:
return
if not self.buff or self.buff.pos >= self.buff.len:
length_header = self.stream.readline(64)
self._data = ''
try:
self._try_decode(length_header)
except ChunkedDataException:
if self.raise_chunked_data_exceptions:
raise
# Can't parse the data as chunked.
# It's possible that non-chunked data is served
# with a Transfer-Encoding: chunked.
# Treat this as non-chunk encoded from here on.
self._process_read(length_header + self._data)
self.not_chunked = True
def _try_decode(self, length_header):
# decode length header
try:
chunk_size = int(length_header.strip().split(';')[0], 16)
except ValueError:
raise ChunkedDataException("Couldn't decode length header " +
length_header)
if not chunk_size:
# chunk_size 0 indicates end of file
self.all_chunks_read = True
#self._process_read('')
return
data_len = len(self._data)
# read chunk
while data_len < chunk_size:
new_data = self.stream.read(chunk_size - data_len)
# if we unexpectedly run out of data,
# either raise an exception or just stop reading,
# assuming file was cut off
if not new_data:
if self.raise_chunked_data_exceptions:
msg = 'Ran out of data before end of chunk'
raise ChunkedDataException(msg)
else:
chunk_size = data_len
self.all_chunks_read = True
self._data += new_data
data_len = len(self._data)
# if we successfully read a block without running out,
# it should end in \r\n
if not self.all_chunks_read:
clrf = self.stream.read(2)
if clrf != '\r\n':
raise ChunkedDataException("Chunk terminator not found.")
# hand to base class for further processing
self._process_read(self._data)
if __name__ == "__main__":
import doctest
doctest.testmod()

152
pywb/utils/loaders.py Normal file
View File

@ -0,0 +1,152 @@
"""
This module provides loaders for the local file system and over http,
supporting both local and remote access
"""
import os
import hmac
import urllib2
import time
#=================================================================
# load a reader from http
#=================================================================
class HttpLoader(object):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker
def load(self, url, offset, length):
if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
range_header = 'bytes={0}-'.format(offset)
headers = {}
headers['Range'] = range_header
if self.cookie_maker:
headers['Cookie'] = self.cookie_maker.make()
request = urllib2.Request(url, headers=headers)
return urllib2.urlopen(request)
#=================================================================
# Signed Cookie-Maker
#=================================================================
class HMACCookieMaker(object):
"""
Utility class to produce signed HMAC digest cookies
to be used with each http request
"""
def __init__(self, key, name, duration=10):
self.key = key
self.name = name
# duration in seconds
self.duration = duration
def make(self, extra_id=''):
expire = str(long(time.time() + self.duration))
if extra_id:
msg = extra_id + '-' + expire
else:
msg = expire
hmacdigest = hmac.new(self.key, msg)
hexdigest = hmacdigest.hexdigest()
if extra_id:
cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id,
expire, hexdigest)
else:
cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
return cookie
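HttpLoader and HMACCookieMaker combine as follows; the key and cookie name here are illustrative, while the byte range matches the loader tests later in this commit:

cookie_maker = HMACCookieMaker('secret-key', 'wb-auth', duration=30)
loader = HttpLoader(cookie_maker)
reader = loader.load('http://example.com', 41, 14)   # requests bytes 41-54
print reader.read()
# -> Example Domain (per the test suite)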
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
"""
Load a file-like reader from the local file system
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
        if length > 0:
            return LimitReader(afile, length)

        # no length specified: return the whole file
        return afile
#=================================================================
# Limit Reader
#=================================================================
class LimitReader(object):
"""
A reader which will not read more than specified limit
"""
def __init__(self, stream, limit):
self.stream = stream
self.limit = limit
if not self.limit:
self.limit = 1
def read(self, length=None):
length = min(length, self.limit) if length else self.limit
buff = self.stream.read(length)
self.limit -= len(buff)
return buff
def readline(self, length=None):
length = min(length, self.limit) if length else self.limit
buff = self.stream.readline(length)
self.limit -= len(buff)
return buff
def close(self):
self.stream.close()
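FileLoader and LimitReader combine as follows; the path is illustrative, and the capped read mirrors the loader tests later in this commit:

loader = FileLoader()
reader = loader.load('file:///tmp/sample.cdx', 0, 100)
data = reader.read(400)   # LimitReader caps this read at 100 bytes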
#=================================================================
# Local text file with known size -- used for binsearch
#=================================================================
class SeekableTextFileReader(object):
"""
    A very simple file-like object wrapper that knows its total size,
via getsize()
Supports seek() operation.
Assumed to be a text file. Used for binsearch.
"""
def __init__(self, filename):
self.fh = open(filename, 'rb')
self.filename = filename
self.size = os.path.getsize(filename)
def getsize(self):
return self.size
def read(self):
return self.fh.read()
def readline(self):
return self.fh.readline()
def seek(self, offset):
return self.fh.seek(offset)
def close(self):
return self.fh.close()

View File

@ -0,0 +1,107 @@
"""
Representation and parsing of HTTP-style status + headers
"""
import pprint
#=================================================================
class StatusAndHeaders(object):
"""
Representation of parsed http-style status line and headers
    The status line is the first line of the request/response.
    Headers is a list of (name, value) tuples.
    An optional protocol, which appears on the first line, may also be specified.
"""
def __init__(self, statusline, headers, protocol=''):
self.statusline = statusline
self.headers = headers
self.protocol = protocol
def get_header(self, name):
"""
        return the value of the header matching 'name'
        (case-insensitive), if found
"""
name_lower = name.lower()
for value in self.headers:
if value[0].lower() == name_lower:
return value[1]
def remove_header(self, name):
"""
remove header (case-insensitive)
return True if header removed, False otherwise
"""
name_lower = name.lower()
for index in xrange(len(self.headers) - 1, -1, -1):
if self.headers[index][0].lower() == name_lower:
del self.headers[index]
return True
return False
def __repr__(self):
headers_str = pprint.pformat(self.headers, indent=2)
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(self.protocol, self.statusline, headers_str)
def __eq__(self, other):
return (self.statusline == other.statusline and
self.headers == other.headers and
self.protocol == other.protocol)
#=================================================================
class StatusAndHeadersParser(object):
"""
    Parser which consumes a stream supporting readline() to read
status and headers and return a StatusAndHeaders object
"""
def __init__(self, statuslist):
self.statuslist = statuslist
def parse(self, stream):
"""
parse stream for status line and headers
return a StatusAndHeaders object
"""
statusline = stream.readline().rstrip()
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline
raise StatusAndHeadersParserException(msg)
headers = []
line = stream.readline().rstrip()
while line and line != '\r\n':
name, value = line.split(':', 1)
header = (name, value.strip())
headers.append(header)
line = stream.readline().rstrip()
return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers,
protocol=protocol_status[0])
@staticmethod
def split_prefix(key, prefixs):
"""
split key string into prefix and remainder
for first matching prefix from a list
"""
for prefix in prefixs:
if key.startswith(prefix):
plen = len(prefix)
return (key[:plen], key[plen:])
#=================================================================
class StatusAndHeadersParserException(Exception):
"""
status + headers parsing exception
"""
pass
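A parsing sketch over an in-memory stream (Python 2 StringIO):

import StringIO

raw = 'HTTP/1.0 200 OK\r\nContent-Type: text/html\r\n\r\n'
parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
status_headers = parser.parse(StringIO.StringIO(raw))
print status_headers.get_header('Content-Type')
# -> text/html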

View File

@ -0,0 +1,52 @@
#=================================================================
"""
# binsearch tests
# Prefix Search
>>> print_binsearch_results('org,iana)/domains/root', iter_prefix)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> print_binsearch_results('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
>>> print_binsearch_results('org,iana)/', iter_exact)
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
>>> print_binsearch_results('org,iana)/domains/root/db', iter_exact)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
# Exact Search
>>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact)
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
"""
#=================================================================
import os
from pywb.utils.binsearch import iter_prefix, iter_exact
from pywb.utils.loaders import SeekableTextFileReader
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'
def print_binsearch_results(key, iter_func):
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
for line in iter_func(cdx, key):
print line
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -0,0 +1,69 @@
#=================================================================
"""
# LimitReader Tests
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
# FileLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400))
100
# SeekableTextFileReader Test
>>> sr = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
>>> sr.getsize()
30399
>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
# BufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'
# BufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
"""
#=================================================================
import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'
def read_multiple(reader, inc_reads):
result = None
for x in inc_reads:
result = reader.read(x)
return result
def seek_read_full(seekable_reader, offset):
seekable_reader.seek(offset)
seekable_reader.readline() #skip
return seekable_reader.readline()
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,20 +1,25 @@
"""
utility functions for converting between
datetime, iso date and 14-digit timestamp
"""
import re
import time
import datetime
import calendar
from itertools import imap
#=================================================================
# str <-> datetime conversion
#=================================================================
DATE_TIMESPLIT = re.compile('[^\d]')
DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959'
def iso_date_to_datetime(string):
"""
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
@ -28,16 +33,18 @@ def iso_date_to_datetime(string):
if nums[-1] == '':
nums = nums[:-1]
dt = datetime.datetime(*map(int, nums))
return dt
the_datetime = datetime.datetime(*imap(int, nums))
return the_datetime
def datetime_to_timestamp(dt):
def datetime_to_timestamp(the_datetime):
"""
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
'20131226101112'
"""
return dt.strftime(TIMESTAMP_14)
return the_datetime.strftime(TIMESTAMP_14)
def iso_date_to_timestamp(string):
"""
@ -52,7 +59,7 @@ def iso_date_to_timestamp(string):
# default pad is end of range for compatibility
def pad_timestamp(string, pad_str = PAD_STAMP_END):
def pad_timestamp(string, pad_str=PAD_STAMP_END):
"""
>>> pad_timestamp('20')
'20991231235959'
@ -76,10 +83,12 @@ def pad_timestamp(string, pad_str = PAD_STAMP_END):
def timestamp_to_datetime(string):
"""
>>> timestamp_to_datetime('20131226095010')
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
>>> timestamp_to_datetime('2014')
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
"""
    # Default pad to end of range for compatibility
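A round-trip sketch combining the helpers above:

the_datetime = iso_date_to_datetime('2013-12-26T10:11:12Z')
print datetime_to_timestamp(the_datetime)
# -> 20131226101112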

View File

@ -1,4 +1,4 @@
import cdxserver.timeutils as timeutils
import pywb.utils.timeutils as timeutils
import wbrequestresponse
import wbexceptions

22
pywb/warc/README.md Normal file
View File

@ -0,0 +1,22 @@
## PyWb Warc v0.2
[![Build Status](https://travis-ci.org/ikreymer/pywb_warc.png?branch=master)](https://travis-ci.org/ikreymer/pywb_warc)
This is the WARC/ARC record loading component of pywb wayback tool suite.
This package provides the following facilities:
* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers
* Resolve 'revisit' records from the provided index to find the full record with headers and payload content
* Load WARC and ARC records either locally or via http using http 1.1 range requests
### Tests
This package includes a test suite covering the different WARC and ARC loading formats.
To run: `python run-tests.py`

0
pywb/warc/__init__.py Normal file
View File

View File

@ -1,13 +1,27 @@
import redis
import binsearch.binsearch
from pywb.utils.binsearch import iter_exact
from pywb.utils.loaders import SeekableTextFileReader
import urlparse
import os
import logging
#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
"""
The purpose of this module is to 'resolve' a warc/arc filename,
often found in a CDX file, to a full loadable url.
Supported resolvers are: url prefix, path index lookup and redis
make_best_resolver() attempts to guess the resolver method for a given uri
"""
#=================================================================
# PrefixResolver - convert cdx file entry to url with prefix
# if url contains specified string
#=================================================================
class PrefixResolver:
def __init__(self, prefix, contains):
self.prefix = prefix
@ -18,14 +32,15 @@ class PrefixResolver:
def __repr__(self):
if self.contains:
return "PrefixResolver('{0}', contains = '{1}')".format(self.prefix, self.contains)
return ("PrefixResolver('{0}', contains = '{1}')"
.format(self.prefix, self.contains))
else:
return "PrefixResolver('{0}')".format(self.prefix)
#======================================
#=================================================================
class RedisResolver:
def __init__(self, redis_url, key_prefix = None):
def __init__(self, redis_url, key_prefix=None):
self.redis_url = redis_url
self.key_prefix = key_prefix if key_prefix else 'w:'
self.redis = redis.StrictRedis.from_url(redis_url)
@ -42,14 +57,14 @@ class RedisResolver:
return "RedisResolver('{0}')".format(self.redis_url)
#======================================
#=================================================================
class PathIndexResolver:
def __init__(self, pathindex_file):
self.pathindex_file = pathindex_file
self.reader = binsearch.binsearch.FileReader(pathindex_file)
self.reader = SeekableTextFileReader(pathindex_file)
def __call__(self, filename):
result = binsearch.binsearch.iter_exact(self.reader, filename, '\t')
result = iter_exact(self.reader, filename, '\t')
def gen_list(result):
for pathline in result:
@ -63,6 +78,7 @@ class PathIndexResolver:
return "PathIndexResolver('{0}')".format(self.pathindex_file)
#=================================================================
#TODO: more options (remote files, contains param, etc..)
# find best resolver given the path
def make_best_resolver(param):
@ -80,11 +96,14 @@ def make_best_resolver(param):
RedisResolver('redis://myhost.example.com:1234/1')
# a file
>>> class_name(make_best_resolver('file://' + os.path.realpath(__file__)))
>>> r = make_best_resolver('file://' + os.path.realpath(__file__))
>>> r.__class__.__name__
'PathIndexResolver'
# a dir
>>> class_name(make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__))))
>>> path = os.path.realpath(__file__)
>>> r = make_best_resolver('file://' + os.path.dirname(path))
>>> r.__class__.__name__
'PrefixResolver'
"""
@ -99,27 +118,29 @@ def make_best_resolver(param):
url_parts = urlparse.urlsplit(path)
if url_parts.scheme == 'redis':
logging.info('Adding Redis Index: ' + path)
logging.debug('Adding Redis Index: ' + path)
return RedisResolver(path, arg)
if url_parts.scheme == 'file':
path = url_parts.path
if os.path.isfile(path):
logging.info('Adding Path Index: ' + path)
logging.debug('Adding Path Index: ' + path)
return PathIndexResolver(path)
# non-file paths always treated as prefix for now
else:
logging.info('Adding Archive Path Source: ' + path)
logging.debug('Adding Archive Path Source: ' + path)
return PrefixResolver(path, arg)
#=================================================================
def make_best_resolvers(paths):
"""
>>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1'])
[PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')]
>>> r = make_best_resolvers(['http://example.com/warcs/',\
'redis://example.com:1234/1'])
>>> map(lambda x: x.__class__.__name__, r)
['PrefixResolver', 'RedisResolver']
"""
if hasattr(paths, '__iter__'):
return map(make_best_resolver, paths)
@ -127,13 +148,7 @@ def make_best_resolvers(paths):
return [make_best_resolver(paths)]
import utils
#=================================================================
if __name__ == "__main__" or utils.enable_doctests():
def class_name(obj):
return obj.__class__.__name__
if __name__ == "__main__":
import doctest
doctest.testmod()
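A sketch of resolution itself; per the description above, PrefixResolver prepends its prefix to the filename (the warc name is illustrative):

resolver = make_best_resolver('http://myhost.example.com/warcs/')
print resolver('iana.warc.gz')
# expected: ['http://myhost.example.com/warcs/iana.warc.gz']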

161
pywb/warc/recordloader.py Normal file
View File

@ -0,0 +1,161 @@
import itertools
import urlparse
import collections
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader
#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
'type, rec_headers, ' +
'stream, status_headers')
#=================================================================
class ArchiveLoadFailed(Exception):
def __init__(self, reason, filename=''):
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
#self.filename = filename
#self.reason = reason
def status(self):
return '503 Service Unavailable'
#=================================================================
class ArcWarcRecordLoader:
# Standard ARC headers
ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"]
    # Since we are loading a range request,
    # gzip-ness can only be determined by the file extension
# (BufferedReader will however default to non-gzip if
# decompression fails)
FORMAT_MAP = {
'.warc.gz': ('warc', True),
'.arc.gz': ('arc', True),
'.warc': ('warc', False),
'.arc': ('arc', False),
}
@staticmethod
def create_default_loaders(cookie_maker=None):
http = HttpLoader(cookie_maker)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
self.loaders = loaders
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
self.chunk_size = chunk_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
self.warc_parser = StatusAndHeadersParser(warc_types)
self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme)
if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url)
the_format = None
for ext, iformat in self.FORMAT_MAP.iteritems():
if url.endswith(ext):
the_format = iformat
break
if the_format is None:
raise ArchiveLoadFailed('Unknown file format', url)
(a_format, is_gzip) = the_format
#decomp = utils.create_decompressor() if is_gzip else None
decomp_type = 'gzip' if is_gzip else None
try:
length = int(length)
except:
length = -1
raw = loader.load(url, long(offset), length)
stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
if a_format == 'arc':
rec_headers = self.arc_parser.parse(stream)
rec_type = 'response'
empty = (rec_headers.get_header('length') == 0)
elif a_format == 'warc':
rec_headers = self.warc_parser.parse(stream)
rec_type = rec_headers.get_header('WARC-Type')
empty = (rec_headers.get_header('Content-Length') == '0')
# special case: empty w/arc record (hopefully a revisit)
if empty:
status_headers = StatusAndHeaders('204 No Content', [])
# special case: warc records that are not expected to have http headers
# attempt to add 200 status and content-type
elif rec_type == 'metadata' or rec_type == 'resource':
content_type = [('Content-Type',
rec_headers.get_header('Content-Type'))]
status_headers = StatusAndHeaders('200 OK', content_type)
# special case: http 0.9 response, no status or headers
#elif rec_type == 'response':
# content_type = rec_headers.get_header('Content-Type')
# if content_type and (';version=0.9' in content_type):
# status_headers = StatusAndHeaders('200 OK', [])
# response record: parse HTTP status and headers!
else:
#(statusline, http_headers) = self.parse_http_headers(stream)
status_headers = self.http_parser.parse(stream)
return ArcWarcRecord((a_format, rec_type),
rec_headers, stream, status_headers)
#=================================================================
class ARCHeadersParser:
def __init__(self, headernames):
self.headernames = headernames
def parse(self, stream):
headerline = stream.readline().rstrip()
parts = headerline.split()
headernames = self.headernames
if len(parts) != len(headernames):
msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
raise ArchiveLoadFailed(msg.format(headernames, parts))
headers = []
for name, value in itertools.izip(headernames, parts):
headers.append((name, value))
return StatusAndHeaders(statusline='',
headers=headers,
protocol='ARC/1.0')
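A loading sketch tying the pieces together; the path is illustrative, while the offset and length mirror the WARC tests that follow:

loader = ArcWarcRecordLoader()
record = loader.load('sample_archive/warcs/example.warc.gz', '333', '1043')
print record.type
# -> ('warc', 'response')
print record.status_headers.statusline
# -> 200 OK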

View File

@ -0,0 +1,176 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pathresolvers import make_best_resolvers
#=================================================================
class ResolvingLoader:
def __init__(self, paths, record_loader=ArcWarcRecordLoader(),
cdx_server=None):
self.path_resolvers = make_best_resolvers(paths)
self.record_loader = record_loader
self.cdx_server = cdx_server
def resolve_headers_and_payload(self, cdx, failed_files):
"""
Resolve headers and payload for a given capture
In the simple case, headers and payload are in the same record.
In the case of revisit records, the payload and headers may be in
different records.
        If the original has already been found, look it up using the
        orig.* fields in the cdx dict.
Otherwise, call _load_different_url_payload() to get cdx index
from a different url to find the original record.
"""
has_curr = (cdx['filename'] != '-')
has_orig = (cdx.get('orig.filename', '-') != '-')
# load headers record from cdx['filename'] unless it is '-' (rare)
headers_record = None
if has_curr:
headers_record = self._resolve_path_load(cdx, False, failed_files)
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx['mimetype'] == 'warc/revisit' and headers_record:
payload_record = self._load_different_url_payload(cdx,
headers_record,
failed_files)
# single lookup cases
# case 2: non-revisit
elif (has_curr and not has_orig):
payload_record = headers_record
# case 3: identical url revisit, load payload from orig.filename
elif (has_orig):
payload_record = self._resolve_path_load(cdx, True, failed_files)
# special case: set header to payload if old-style revisit
# with missing header
if not headers_record:
headers_record = payload_record
elif headers_record != payload_record:
# close remainder of stream as this record only used for
# (already parsed) headers
headers_record.stream.close()
# special case: check if headers record is actually empty
# (eg empty revisit), then use headers from revisit
if not headers_record.status_headers.headers:
headers_record = payload_record
if not headers_record or not payload_record:
raise ArchiveLoadFailed('Could not load ' + str(cdx))
return (headers_record.status_headers, payload_record.stream)
def _resolve_path_load(self, cdx, is_original, failed_files):
"""
Load specific record based on filename, offset and length
fields in the cdx.
If original=True, use the orig.* fields for the cdx
Resolve the filename to full path using specified path resolvers
If failed_files list provided, keep track of failed resolve attempts
"""
if is_original:
(filename, offset, length) = (cdx['orig.filename'],
cdx['orig.offset'],
cdx['orig.length'])
else:
(filename, offset, length) = (cdx['filename'],
cdx['offset'],
cdx['length'])
# optimization: if same file already failed this request,
# don't try again
if failed_files and filename in failed_files:
raise ArchiveLoadFailed('Skipping Already Failed', filename)
any_found = False
last_exc = None
for resolver in self.path_resolvers:
possible_paths = resolver(filename)
if possible_paths:
for path in possible_paths:
any_found = True
try:
return self.record_loader.load(path, offset, length)
except Exception as ue:
last_exc = ue
# Unsuccessful if reached here
if failed_files:
failed_files.append(filename)
if last_exc:
msg = str(last_exc.__class__.__name__)
else:
msg = 'Archive File Not Found'
raise ArchiveLoadFailed(msg, filename)
def _load_different_url_payload(self, cdx, headers_record, failed_files):
"""
Handle the case where a duplicate of a capture with same digest
exists at a different url.
If a cdx_server is provided, a query is made for matching
url, timestamp and digest.
Raise exception if no matches found.
"""
ref_target_uri = (headers_record.rec_headers.
get_header('WARC-Refers-To-Target-URI'))
target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')
# Check for unresolved revisit error,
# if refers to target uri not present or same as the current url
if not ref_target_uri or (ref_target_uri == target_uri):
raise ArchiveLoadFailed('Missing Revisit Original')
ref_target_date = (headers_record.rec_headers.
get_header('WARC-Refers-To-Date'))
if not ref_target_date:
ref_target_date = cdx['timestamp']
else:
ref_target_date = iso_date_to_timestamp(ref_target_date)
        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
                                                cdx['digest'])

        for cdx in orig_cdx_lines:
            try:
                # load the matching capture via the path resolvers
                payload_record = self._resolve_path_load(cdx, False,
                                                         failed_files)
                return payload_record
            except ArchiveLoadFailed:
                pass
raise ArchiveLoadFailed('Original for revisit could not be loaded')
    def load_cdx_for_dupe(self, url, timestamp, digest):
"""
If a cdx_server is available, return response from server,
otherwise empty list
"""
if not self.cdx_server:
return []
params = {'url': url,
                  'closest': timestamp,
'filter': 'digest:' + digest,
'output': 'raw'}
return self.cdx_server.load_cdx(params)
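A usage sketch, reusing a cdx line from the tests that follow (the warc directory is illustrative):

from pywb.cdx.cdxobject import CDXObject

loader = ResolvingLoader('sample_archive/warcs/')
line = ('com,example)/ 20140216050221 http://example.com/ text/html 200 '
        'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
headers, stream = loader.resolve_headers_and_payload(CDXObject(line), [])
print headers.statusline
# -> 200 OK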

View File

@ -0,0 +1,199 @@
"""
Test loading different types of records from a variety of formats
# Load response record from WARC
>>> load_test_archive('example.warc.gz', '333', '1043')
(('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
('WARC-Date', '2014-01-03T03:03:21Z'),
('Content-Length', '1610'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
# Load revisit record from WARC
>>> load_test_archive('example.warc.gz', '1864', '553')
(('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
('WARC-Date', '2014-01-03T03:03:41Z'),
('Content-Length', '340'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
( 'WARC-Profile',
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
# Test of record loading based on cdx line
# Print parsed http headers + 2 lines of content
# ==============================================================================
# Test loading from ARC based on cdx line
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>
# Test loading from WARC based on cdx line
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>
# Test cdx w/ revisit
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>
# Test loading warc created by wget 1.14
>>> load_from_cdx_test('com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 01:29:08 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 01:29:08 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FB4)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>
# Error Handling
# Invalid WARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
# Invalid ARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
# Error Expected with revisit -- invalid offset on original
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
"""
import os
import sys
import pprint
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.pathresolvers import make_best_resolvers
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.cdx.cdxobject import CDXObject
from pywb import get_test_dir
#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_warc_dir = get_test_dir() + 'warcs/'
def load_test_archive(test_file, offset, length):
    path = test_warc_dir + test_file
    testloader = ArcWarcRecordLoader()
    # load the record at the given offset/length and dump its type, headers and status
    archive = testloader.load(path, offset, length)
    pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
def load_from_cdx_test(cdx):
    resolve_loader = ResolvingLoader(test_warc_dir)
    cdx = CDXObject(cdx)
    # resolve the cdx line (following revisits, if present) to headers + payload stream
    (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
    print headers
    sys.stdout.write(stream.readline())
    sys.stdout.write(stream.readline())
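All three invalid-offset cases above surface as ArchiveLoadFailed, so index-driven lookups can be guarded with a single except clause. A minimal sketch reusing only the helpers and imports already defined in this module:

def try_load(cdx_line):
    # ArchiveLoadFailed wraps parse errors such as
    # 'example.warc.gz:StatusAndHeadersParserException'
    try:
        load_from_cdx_test(cdx_line)
    except ArchiveLoadFailed as e:
        print 'load failed:', e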
View File
@ -1,8 +1,7 @@
import utils
import wbexceptions
from wbrequestresponse import WbResponse, StatusAndHeaders
from cdxserver.cdxserver import CDXException
from pywb.cdx.cdxserver import CDXException
import os
import importlib
@ -10,13 +9,37 @@ import logging
#=================================================================
# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
"""
Return the requested path, optionally including the query string
# Simple test:
>>> rel_request_uri({'PATH_INFO': '/web/example.com'})
'/web/example.com'
# Test all unencoded special chars and double-quote
# (double-quote must be encoded but not single quote)
>>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
"/web/example.com/0~!+$&'()*+,;=:%22"
"""
from urllib import quote
url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
if include_query and environ.get('QUERY_STRING'):
url += '?' + environ['QUERY_STRING']
return url
#=================================================================
def create_wb_app(wb_router):
# Top-level wsgi application
def application(env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = utils.rel_request_uri(env)
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
env['REL_REQUEST_URI'] = env['REQUEST_URI']
@ -95,7 +118,7 @@ def main():
raise
#=================================================================
if __name__ == "__main__" or utils.enable_doctests():
if __name__ == "__main__":
pass
else:
application = main()
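As a quick illustration of the newly inlined helper, a sketch of what rel_request_uri returns for a plain WSGI environ (standard WSGI keys only, nothing assumed beyond the function above):

env = {'PATH_INFO': '/web/example.com/path;v=1', 'QUERY_STRING': 'a=b'}
print rel_request_uri(env)                   # '/web/example.com/path;v=1?a=b'
print rel_request_uri(env, include_query=0)  # '/web/example.com/path;v=1'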
View File
@ -1,7 +1,6 @@
from wburl import WbUrl
from url_rewriter import UrlRewriter
import utils
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
import pprint
#WB Request and Response
@ -182,35 +181,6 @@ class WbResponse:
def __repr__(self):
return str(vars(self))
#=================================================================
class StatusAndHeaders:
def __init__(self, statusline, headers, protocol = ''):
self.statusline = statusline
self.headers = headers
self.protocol = protocol
def get_header(self, name):
name_lower = name.lower()
for value in self.headers:
if (value[0].lower() == name_lower):
return value[1]
def remove_header(self, name):
name_lower = name.lower()
for x in xrange(len(self.headers) - 1, -1, -1):
if self.headers[x][0].lower() == name_lower:
del self.headers[x]
break
def __repr__(self):
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
#return pprint.pformat(self.__dict__)
def __eq__(self, other):
return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol
if __name__ == "__main__":
import doctest
doctest.testmod()
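With the class deleted here in favor of pywb.utils.statusandheaders (see the import change above), a minimal sketch of the same interface at its new location, assuming the moved class keeps the constructor and methods shown in the removed code:

from pywb.utils.statusandheaders import StatusAndHeaders

# Assumption: the relocated class keeps the old interface.
sh = StatusAndHeaders('200 OK',
                      [('Content-Type', 'text/html'), ('X-Cache', 'HIT')],
                      protocol='HTTP/1.1')
print sh.get_header('content-type')  # lookup is case-insensitive: 'text/html'
sh.remove_header('X-CACHE')          # removes the matching header, scanning from the end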
run-tests.py Normal file
View File
@ -0,0 +1,3 @@
import pytest
result = pytest.main('-v --doctest-modules tests/ pywb/')
exit(result)
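The same run expressed with an argument list, which sidesteps pytest's string splitting; a sketch, and the behavior should be identical:

import pytest
# Equivalent to the string form above; a list stays unambiguous if paths gain spaces.
result = pytest.main(['-v', '--doctest-modules', 'tests/', 'pywb/'])
exit(result)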
Binary file not shown.
Binary file not shown.
View File
@ -0,0 +1,69 @@
filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
1 0 LiveWeb Capture
URL IP-address Archive-date Content-type Archive-length
http://example.com/ 93.184.216.119 20140216050221 text/html 1591
HTTP/1.1 200 OK
Accept-Ranges: bytes
Cache-Control: max-age=604800
Content-Type: text/html
Date: Sun, 16 Feb 2014 05:02:20 GMT
Etag: "359670651"
Expires: Sun, 23 Feb 2014 05:02:20 GMT
Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
Server: ECS (sjc/4FCE)
X-Cache: HIT
x-ec-custom-error: 1
Content-Length: 1270
<!doctype html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 50px;
background-color: #fff;
border-radius: 1em;
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
body {
background-color: #fff;
}
div {
width: auto;
margin: 0 auto;
border-radius: 0;
padding: 1em;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is established to be used for illustrative examples in documents. You may use this
domain in examples without prior coordination or asking for permission.</p>
<p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
Binary file not shown.
View File
@ -5,18 +5,18 @@ import setuptools
import glob
setuptools.setup(name='pywb',
version='0.1',
version='0.2',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ilya@archive.org',
long_description=open('README.md').read(),
license='GPL',
packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
package_data={'pywb': ['ui/*', 'static/*']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest'],
tests_require=['WebTest', 'pytest'],
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
# tests_require=['WebTest', 'pytest'],
zip_safe=False)
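A quick post-install sanity check that the split packages declared above actually import; only the names listed in packages= are assumed:

# Verify the pywb 0.2 package split is importable after 'python setup.py install'.
import pywb
import pywb.utils
import pywb.cdx
import pywb.warc
import pywb.rewrite
print 'pywb 0.2 packages OK'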
View File
@ -0,0 +1,88 @@
"""
Test Route
# route with relative path
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
# route with absolute path, running at script /my_pywb
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
# not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
# Referer Redirect Test
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']
>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']
>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
>>> _test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# Custom collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/other.html'
# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
# Wrong Host
>>> _test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False
# Right Host
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
'http://example.com:8080/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME + timestamp
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME, bad match
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False
"""
from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
if http_host:
env['HTTP_HOST'] = http_host
routes = [Route(coll, BaseHandler())]
redir = ReferRedirect(match_host)
#req = WbRequest.from_uri(request_uri, env)
rep = redir(env, routes)
if not rep:
return False
return rep.status_headers.get_header('Location')
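For completeness, the same redirect resolution without the helper, wiring ReferRedirect and a Route together directly (all names come from the module above):

env = {'REL_REQUEST_URI': '/other.html',
       'HTTP_REFERER': 'http://localhost:8080/coll/20131010/http://example.com/path/page.html',
       'SCRIPT_NAME': ''}
redir = ReferRedirect('http://localhost:8080/')
rep = redir(env, [Route('coll', BaseHandler())])
print rep.status_headers.get_header('Location')
# -> http://localhost:8080/coll/20131010/http://example.com/other.html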
View File
@ -1,43 +0,0 @@
import os
from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
def binsearch_cdx_test(key, iter_func):
"""
# Prefix Search
>>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> binsearch_cdx_test('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
>>> binsearch_cdx_test('org,iana)/', iter_exact)
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
>>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
# Exact Search
>>> binsearch_cdx_test('org,iaana)/', iter_exact)
>>> binsearch_cdx_test('org,ibna)/', iter_exact)
>>> binsearch_cdx_test('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
"""
cdx = FileReader(test_cdx_dir + 'iana.cdx')
for line in iter_func(cdx, key):
print line
if __name__ == "__main__":
import doctest
doctest.testmod()
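These binary-search doctests are deleted here; for reference, a sketch of the same exact-match lookup against the new layout. The pywb.utils.binsearch location and the unchanged iter_exact/FileReader names are assumptions, since this diff only shows the removal:

# Assumption: the binsearch helpers moved under pywb.utils with the same names.
from pywb.utils.binsearch import iter_exact, FileReader

cdx = FileReader('sample_archive/cdx/iana.cdx')
for line in iter_exact(cdx, 'org,iana)/time-zones'):
    print line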
View File
@ -1,149 +0,0 @@
from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader
from ..pywb.cdxserver.cdxserver import CDXServer
import os
import sys
import pprint
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
"""
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
# Filter cdx
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625
>>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706
>>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('redirect', '-'),
('robotflags', '-'),
('length', '1046'),
('offset', '334'),
('filename', 'dupes.warc.gz')]
"""
kwparams['url'] = url
kwparams['output'] = 'text'
server = CDXServer(sources)
results = server.load_cdx(**kwparams)
for x in results:
sys.stdout.write(x)
if __name__ == "__main__":
import doctest
doctest.testmod()
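The deleted doctests above drove CDXServer from the old cdxserver package; a sketch of the equivalent call against pywb.cdx.cdxserver. The CDXException import change earlier in this commit confirms the module path, but CDXServer itself living there is an assumption:

import sys

# Assumption: CDXServer is exported from the relocated pywb.cdx.cdxserver module.
from pywb.cdx.cdxserver import CDXServer

server = CDXServer(['sample_archive/cdx/iana.cdx'])
for line in server.load_cdx(url='http://iana.org/', output='text', limit=1):
    sys.stdout.write(line)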
View File
@ -1,7 +1,7 @@
import webtest
from ..pywb.pywb_init import pywb_config
from ..pywb.wbapp import create_wb_app
from ..pywb.cdxserver.cdxobject import CDXObject
from pywb.pywb_init import pywb_config
from pywb.wbapp import create_wb_app
from pywb.cdx.cdxobject import CDXObject
class TestWb:
TEST_CONFIG = 'test_config.yaml'