mirror of https://github.com/webrecorder/pywb.git
synced 2025-03-14 15:53:28 +01:00
pywb 0.2!
move to distinct packages: pywb.utils, pywb.cdx, pywb.warc, pywb.util, pywb.rewrite! Each package will have its own README and tests, with shared sample_data and install.
This commit is contained in:
parent
2528ee0a7c
commit
5345459298
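The headline change is packaging: formerly flat, top-level modules now live under package-qualified paths. A minimal before/after sketch of the import rewrites this commit applies throughout the tree (both forms are taken verbatim from the hunks below):

    # before (pywb 0.1): flat top-level imports
    from url_rewriter import UrlRewriter
    from wburl import WbUrl

    # after (pywb 0.2): package-qualified imports
    from pywb.rewrite.url_rewriter import UrlRewriter
    from pywb.rewrite.wburl import WbUrl
    from pywb.utils.timeutils import timestamp_to_sec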
@@ -1,2 +0,0 @@
-#Allow importing
-
@@ -1,3 +1,4 @@
-#Allow importing
+import os
+
 def get_test_dir():
     return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
@@ -3,8 +3,8 @@ import re
 import wbexceptions
 
 from wbrequestresponse import WbRequest, WbResponse
-from url_rewriter import UrlRewriter
-from wburl import WbUrl
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.wburl import WbUrl
 
 #=================================================================
 # ArchivalRouter -- route WB requests in archival mode
@@ -45,20 +45,6 @@ class ArchivalRouter:
     # of request uri (excluding first '/')
 #=================================================================
 class Route:
-    """
-    # route with relative path
-    >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
-    {'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
-
-    # route with absolute path, running at script /my_pywb
-    >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
-    {'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
-
-
-    # not matching route -- skipped
-    >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
-    """
-
     # match upto next / or ? or end
     SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
 
@@ -127,57 +113,6 @@ class Route:
 # ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
 #=================================================================
 class ReferRedirect:
-
-    """
-    >>> ReferRedirect('http://localhost:8080/').match_prefixs
-    ['http://localhost:8080/']
-
-    >>> ReferRedirect(['http://example:9090/']).match_prefixs
-    ['http://example:9090/']
-
-    >>> test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
-    'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
-
-    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
-    'http://localhost:8080/coll/20131010/http://example.com/other.html'
-
-    >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
-    'http://localhost:8080/coll/20131010/http://example.com/other.html'
-
-    # Custom collection
-    >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
-    'http://localhost:8080/complex/123/20131010/http://example.com/other.html'
-
-    # With timestamp included
-    >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
-    'http://localhost:8080/coll/20131010/http://example.com/other.html'
-
-    # With timestamp included
-    >>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
-    'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
-
-    # Wrong Host
-    >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
-    False
-
-    # Right Host
-    >>> test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
-    'http://example.com:8080/coll/20131010/http://example.com/other.html'
-
-    # With custom SCRIPT_NAME
-    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
-    'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
-
-    # With custom SCRIPT_NAME + timestamp
-    >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
-    'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
-
-    # With custom SCRIPT_NAME, bad match
-    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
-    False
-
-    """
-
     def __init__(self, match_prefixs):
         if isinstance(match_prefixs, list):
             self.match_prefixs = match_prefixs
@@ -240,31 +175,3 @@ class ReferRedirect:
         final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
 
         return WbResponse.redir_response(final_url)
-
-
-import utils
-if __name__ == "__main__" or utils.enable_doctests():
-
-    import handlers
-
-    def test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
-        env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
-
-        if http_host:
-            env['HTTP_HOST'] = http_host
-
-        routes = [Route(coll, handlers.BaseHandler())]
-
-        redir = ReferRedirect(match_host)
-        #req = WbRequest.from_uri(request_uri, env)
-        rep = redir(env, routes)
-        if not rep:
-            return False
-
-        return rep.status_headers.get_header('Location')
-
-
-    import doctest
-    doctest.testmod()
@@ -1,461 +0,0 @@
-import itertools
-import utils
-import urllib2
-import StringIO
-import urlparse
-import collections
-import wbexceptions
-
-from wbrequestresponse import StatusAndHeaders
-
-#=================================================================
-# load a reader from http
-#=================================================================
-class HttpLoader:
-    """
-    Load content over http with range request and optional signature
-    """
-    def __init__(self, hmac = None, hmac_duration = 30):
-        self.hmac = hmac
-        self.hmac_duration = hmac_duration
-
-    def load(self, url, offset, length):
-        if length > 0:
-            range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
-        else:
-            range_header = 'bytes={0}-'.format(offset)
-
-        headers = {}
-        headers['Range'] = range_header
-
-        if self.hmac:
-            headers['Cookie'] = self.hmac(self.hmac_duration)
-
-        request = urllib2.Request(url, headers = headers)
-        return urllib2.urlopen(request)
-
-
-#=================================================================
-# load a reader from local filesystem
-#=================================================================
-class FileLoader:
-    """
-    Load content from local file-system
-
-    # Ensure attempt to read more than 100 bytes, only reads 100 bytes
-    >>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read('400'))
-    100
-
-    """
-
-    def load(self, url, offset, length):
-        if url.startswith('file://'):
-            url = url[len('file://'):]
-
-        afile = open(url, 'rb')
-        afile.seek(offset)
-
-        if length > 0:
-            return LimitReader(afile, length)
-        else:
-            return afile
-
-#=================================================================
-# A reader which will not read past the specified limit
-#=================================================================
-class LimitReader:
-    """
-    >>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
-    'abcdefghji'
-
-    >>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
-    'abcdefgh'
-
-    >>> test_multiple_reads(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
-    'efghji'
-
-    """
-
-    def __init__(self, stream, limit):
-        self.stream = stream
-        self.limit = limit
-
-        if not self.limit:
-            self.limit = 1
-
-
-    def read(self, length = None):
-        length = min(length, self.limit) if length else self.limit
-        buff = self.stream.read(length)
-        self.limit -= len(buff)
-        return buff
-
-
-    def readline(self, length = None):
-        length = min(length, self.limit) if length else self.limit
-        buff = self.stream.readline(length)
-        self.limit -= len(buff)
-        return buff
-
-    def close(self):
-        self.stream.close()
-
-
-#=================================================================
-WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers')
-
-#=================================================================
-
-class ArchiveLoader:
-    """
-    >>> load_test_archive('example.warc.gz', '333', '1043')
-    (('warc', 'response'),
-     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
-      ('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
-      ('WARC-Date', '2014-01-03T03:03:21Z'),
-      ('Content-Length', '1610'),
-      ('Content-Type', 'application/http; msgtype=response'),
-      ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
-      ('WARC-Target-URI', 'http://example.com?example=1'),
-      ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
-     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
-      ('Cache-Control', 'max-age=604800'),
-      ('Content-Type', 'text/html'),
-      ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
-      ('Etag', '"359670651"'),
-      ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
-      ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
-      ('Server', 'ECS (sjc/4FCE)'),
-      ('X-Cache', 'HIT'),
-      ('x-ec-custom-error', '1'),
-      ('Content-Length', '1270'),
-      ('Connection', 'close')]))
-
-
-    >>> load_test_archive('example.warc.gz', '1864', '553')
-    (('warc', 'revisit'),
-     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
-      ('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
-      ('WARC-Date', '2014-01-03T03:03:41Z'),
-      ('Content-Length', '340'),
-      ('Content-Type', 'application/http; msgtype=response'),
-      ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
-      ('WARC-Target-URI', 'http://example.com?example=1'),
-      ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
-      ( 'WARC-Profile',
-        'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
-      ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
-      ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
-     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
-      ('Cache-Control', 'max-age=604800'),
-      ('Content-Type', 'text/html'),
-      ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
-      ('Etag', '"359670651"'),
-      ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
-      ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
-      ('Server', 'ECS (sjc/4FCE)'),
-      ('X-Cache', 'HIT'),
-      ('x-ec-custom-error', '1'),
-      ('Content-Length', '1270'),
-      ('Connection', 'close')]))
-    """
-
-    # Standard ARC headers
-    ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]
-
-    # Since loading a range request, can only determine gzip-ness based on file extension
-    FORMAT_MAP = {
-        '.warc.gz': ('warc', True),
-        '.arc.gz': ('arc', True),
-        '.warc': ('warc', False),
-        '.arc': ('arc', False),
-    }
-
-    @staticmethod
-    def create_default_loaders(hmac = None):
-        http = HttpLoader(hmac)
-        file = FileLoader()
-        return {
-            'http': http,
-            'https': http,
-            'file': file,
-            '': file
-        }
-
-
-    def __init__(self, loaders = {}, hmac = None, chunk_size = 8192):
-        self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders(hmac)
-        self.chunk_size = chunk_size
-
-        self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
-        self.warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
-        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
-
-    def load(self, url, offset, length):
-        url_parts = urlparse.urlsplit(url)
-
-        loader = self.loaders.get(url_parts.scheme)
-        if not loader:
-            raise wbexceptions.UnknownLoaderProtocolException(url)
-
-        the_format = None
-
-        for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
-            if url.endswith(ext):
-                the_format = iformat
-                break
-
-        if the_format is None:
-            raise wbexceptions.UnknownArchiveFormatException(url)
-
-        (a_format, is_gzip) = the_format
-
-        decomp = utils.create_decompressor() if is_gzip else None
-
-        try:
-            length = int(length)
-        except:
-            length = -1
-
-
-        raw = loader.load(url, long(offset), length)
-
-        stream = LineReader(raw, length, self.chunk_size, decomp)
-
-        if a_format == 'arc':
-            rec_headers = self.arc_parser.parse(stream)
-            rec_type = 'response'
-            empty = (rec_headers.get_header('length') == 0)
-
-        elif a_format == 'warc':
-            rec_headers = self.warc_parser.parse(stream)
-            rec_type = rec_headers.get_header('WARC-Type')
-            empty = (rec_headers.get_header('Content-Length') == '0')
-
-        # special case: empty w/arc record (hopefully a revisit)
-        if empty:
-            status_headers = StatusAndHeaders('204 No Content', [])
-
-        # special case: warc records that are not expected to have http headers
-        # attempt to add 200 status and content-type
-        elif rec_type == 'metadata' or rec_type == 'resource':
-            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.get_header('Content-Type'))])
-
-        # special case: http 0.9 response, no status or headers
-        #elif rec_type == 'response':
-        #    content_type = rec_headers.get_header('Content-Type')
-        #    if content_type and (';version=0.9' in content_type):
-        #        status_headers = StatusAndHeaders('200 OK', [])
-
-        # response record: parse HTTP status and headers!
-        else:
-            #(statusline, http_headers) = self.parse_http_headers(stream)
-            status_headers = self.http_parser.parse(stream)
-
-        return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers)
-
-
-#=================================================================
-class StatusAndHeadersParser:
-    def __init__(self, statuslist):
-        self.statuslist = statuslist
-
-    def parse(self, stream):
-        statusline = stream.readline().rstrip()
-
-        protocol_status = utils.split_prefix(statusline, self.statuslist)
-
-        if not protocol_status:
-            raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)
-
-        headers = []
-
-        line = stream.readline().rstrip()
-        while line and line != '\r\n':
-            name, value = line.split(':', 1)
-            header = (name, value.strip())
-            headers.append(header)
-            line = stream.readline().rstrip()
-
-        return StatusAndHeaders(statusline = protocol_status[1].strip(), headers = headers, protocol = protocol_status[0])
-
-#=================================================================
-class ARCHeadersParser:
-    def __init__(self, headernames):
-        self.headernames = headernames
-
-
-    def parse(self, stream):
-        headerline = stream.readline().rstrip()
-
-        parts = headerline.split()
-
-        headernames = self.headernames
-
-        if len(parts) != len(headernames):
-            raise wbexceptions.InvalidArchiveRecordException('Wrong # of heaeders, expected arc headers {0}, Found {1}'.format(headernames, parts))
-
-        headers = []
-
-        for name, value in itertools.izip(headernames, parts):
-            headers.append((name, value))
-
-        return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0')
-
-#=================================================================
-class LineReader:
-    def __init__(self, stream, max_len = 0, chunk_size = 1024, decomp = None):
-        self.stream = stream
-        self.chunk_size = chunk_size
-        self.decomp = decomp
-        self.buff = None
-        self.num_read = 0
-        self.max_len = max_len
-
-    def _fillbuff(self, chunk_size = None):
-        if not chunk_size:
-            chunk_size = self.chunk_size
-
-        if not self.buff or self.buff.pos >= self.buff.len:
-            to_read = min(self.max_len - self.num_read, self.chunk_size) if (self.max_len > 0) else self.chunk_size
-            data = self.stream.read(to_read)
-            self._process_read(data)
-
-    def _process_read(self, data):
-        if self.decomp and data:
-            try:
-                data = self.decomp.decompress(data)
-            except Exception:
-                # if first read attempt, assume non-gzipped stream
-                if self.num_read == 0:
-                    self.decomp = False
-                # otherwise (partly decompressed), something is wrong
-                else:
-                    raise
-
-        self.num_read += len(data)
-        self.buff = StringIO.StringIO(data)
-
-
-    def read(self, length = None):
-        self._fillbuff()
-        return self.buff.read(length)
-
-    def readline(self, length = None):
-        self._fillbuff()
-        return self.buff.readline(length)
-
-    def close(self):
-        if self.stream:
-            self.stream.close()
-            self.stream = None
-
-
-class ChunkedDataException(Exception):
-    pass
-
-
-class ChunkedLineReader(LineReader):
-    r"""
-    Properly formatted chunked data:
-    >>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n")); c.read()+c.read()
-    '1234'
-
-    Non-chunked data:
-    >>> ChunkedLineReader(StringIO.StringIO("xyz123!@#")).read()
-    'xyz123!@#'
-
-    Starts like chunked data, but isn't:
-    >>> c=ChunkedLineReader(StringIO.StringIO("1\r\nxyz123!@#")); c.read()+c.read()
-    '1\r\nx123!@#'
-
-    Chunked data cut off part way through:
-    >>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));c.read()+c.read()
-    '123412'
-    """
-
-    all_chunks_read = False
-    not_chunked = False
-    raise_chunked_data_exceptions = False # if False, we'll use best-guess fallback for parse errors
-
-    def _fillbuff(self, chunk_size = None):
-        if self.not_chunked:
-            return LineReader._fillbuff(self, chunk_size)
-
-        if self.all_chunks_read:
-            return
-
-        if not self.buff or self.buff.pos >= self.buff.len:
-            length_header = self.stream.readline(64)
-            data = ''
-
-            try:
-                # decode length header
-                try:
-                    chunk_size = int(length_header.strip().split(';')[0], 16)
-                except ValueError:
-                    raise ChunkedDataException("Couldn't decode length header '%s'" % length_header)
-
-                if chunk_size:
-                    # read chunk
-                    while len(data) < chunk_size:
-                        new_data = self.stream.read(chunk_size - len(data))
-
-                        # if we unexpectedly run out of data, either raise an exception or just stop reading, assuming file was cut off
-                        if not new_data:
-                            if self.raise_chunked_data_exceptions:
-                                raise ChunkedDataException("Ran out of data before end of chunk")
-                            else:
-                                chunk_size = len(data)
-                                self.all_chunks_read = True
-
-                        data += new_data
-
-                    # if we successfully read a block without running out, it should end in \r\n
-                    if not self.all_chunks_read:
-                        clrf = self.stream.read(2)
-                        if clrf != '\r\n':
-                            raise ChunkedDataException("Chunk terminator not found.")
-
-                    if self.decomp:
-                        data = self.decomp.decompress(data)
-                else:
-                    # chunk_size 0 indicates end of file
-                    self.all_chunks_read = True
-                    data = ''
-
-                self._process_read(data)
-            except ChunkedDataException:
-                if self.raise_chunked_data_exceptions:
-                    raise
-                # Can't parse the data as chunked.
-                # It's possible that non-chunked data is set with a Transfer-Encoding: chunked
-                # Treat this as non-chunk encoded from here on
-                self._process_read(length_header + data)
-                self.not_chunked = True
-
-
-#=================================================================
-import utils
-if __name__ == "__main__" or utils.enable_doctests():
-    import os
-    import pprint
-
-    testloader = ArchiveLoader()
-
-    def load_test_archive(test_file, offset, length):
-        path = utils.test_data_dir() + 'warcs/' + test_file
-
-        archive = testloader.load(path, offset, length)
-        pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
-
-    def test_multiple_reads(reader, inc_reads):
-        result = None
-        for x in inc_reads:
-            result = reader.read(x)
-        return result
-
-    import doctest
-    doctest.testmod()
@@ -1,123 +0,0 @@
-from collections import deque
-import os
-import itertools
-
-#=================================================================
-# Binary Search over a text file
-#=================================================================
-class FileReader:
-    """
-    A very simple file-like object wrapper that knows it's size
-    getsize() method returns the filesize
-    """
-    def __init__(self, filename):
-        self.fh = open(filename, 'rb')
-        self.filename = filename
-        self.size = os.path.getsize(filename)
-
-    def getsize(self):
-        return self.size
-
-    def readline(self):
-        return self.fh.readline()
-
-    def seek(self, offset):
-        return self.fh.seek(offset)
-
-    def close(self):
-        return self.fh.close()
-
-
-#=================================================================
-def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
-    """
-    Find offset of the full line which matches a given 'key' using binary search
-    If key is not found, the offset is of the line after the key
-
-    File is subdivided into block_size (default 8192) sized blocks
-    Optional compare_func may be specified
-    """
-    min = 0
-    max = reader.getsize() / block_size
-
-    while (max - min > 1):
-        mid = min + ((max - min) / 2)
-        reader.seek(mid * block_size)
-
-        if mid > 0:
-            reader.readline() # skip partial line
-
-        line = reader.readline()
-
-        if compare_func(key, line) > 0:
-            min = mid
-        else:
-            max = mid
-
-    return (min * block_size)
-
-
-def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192):
-    """
-    Perform a binsearch for a specified key down to block_size (8192) sized blocks,
-    followed by linear search within the block to find first matching line.
-
-    When performing linear search, keep track of up to N previous lines before
-    first matching line.
-    """
-    min = binsearch_offset(reader, key, compare_func, block_size)
-
-    reader.seek(min)
-
-    if min > 0:
-        reader.readline() # skip partial line
-
-    if prev_size > 1:
-        prev_deque = deque(maxlen = prev_size)
-
-    line = None
-
-    while True:
-        line = reader.readline()
-        if not line:
-            break
-        if compare_func(line, key) >= 0:
-            break
-
-        if prev_size == 1:
-            prev = line
-        elif prev_size > 1:
-            prev_deque.append(line)
-
-    def gen_iter(line):
-        if prev_size == 1:
-            yield prev.rstrip()
-        elif prev_size > 1:
-            for i in prev_deque:
-                yield i.rstrip()
-
-        while line:
-            yield line.rstrip()
-            line = reader.readline()
-
-    return gen_iter(line)
-
-
-# Iterate over prefix matches
-def iter_prefix(reader, key):
-    """
-    Creates an iterator which iterates over prefix matches for a key in a sorted text file
-    A line matches as long as it starts with key
-    """
-
-    return itertools.takewhile(lambda line: line.startswith(key), search(reader, key))
-
-
-def iter_exact(reader, key, token=' '):
-    """
-    Create an iterator which iterates over exact matches for a key in a sorted text file
-    Key is terminated by a token (default ' ')
-    """
-
-    return iter_prefix(reader, key + token)
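The binary-search module deleted above is relocated rather than removed: the new cdxsource.py later in this commit consumes it as pywb.utils.binsearch, with FileReader superseded by SeekableTextFileReader from pywb.utils.loaders. A minimal usage sketch, assuming the relocated functions keep the signatures of the deleted module (the sample path is hypothetical):

    from pywb.utils.binsearch import iter_exact
    from pywb.utils.loaders import SeekableTextFileReader

    # iterate cdx lines whose urlkey matches exactly (key terminated by ' ')
    source = SeekableTextFileReader('./sample_data/iana.cdx')
    for line in iter_exact(source, 'org,iana)/'):
        print line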
36 pywb/cdx/README.md Normal file

@@ -0,0 +1,36 @@
+## PyWb CDX v0.2
+
+[](https://travis-ci.org/ikreymer/pywb_cdx)
+
+This package contains the CDX processing suite of the pywb wayback tool suite.
+
+The CDX Server loads, filters and transforms cdx from multiple sources in response
+to a given query.
+
+### Installation and Tests
+
+`pip install -r requirements` -- to install
+
+`python run-tests.py` -- to run all tests
+
+
+### Sample App
+
+A very simple reference WSGI app is included.
+
+Run: `python -m pywb_cdx.wsgi_cdxserver` to start the app, keyboard interrupt to stop.
+
+The default [config.yaml](pywb_cdx/config.yaml) points to the sample data directory
+and uses port 8080
+
+### CDX Server API Reference
+
+Goal is to provide compatibility with this feature set and more:
+https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
+
+TODO
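The API reference itself is still a TODO above, but the test file at the bottom of this commit exercises wayback-cdx-server style parameters (url, limit, filter, closest, fields). A hedged sketch of querying the sample app once started, assuming it accepts those params as query-string arguments on the port named in the README:

    import urllib2

    # hypothetical query against the local sample app
    url = 'http://localhost:8080/?url=example.com&limit=3&output=text'
    for cdx_line in urllib2.urlopen(url):
        print cdx_line.rstrip()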
@@ -1,25 +1,31 @@
 from collections import OrderedDict
 import itertools
 
 
 #=================================================================
 class CDXObject(OrderedDict):
     CDX_FORMATS = [
         # Public CDX Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "length"],
 
         # CDX 11 Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "robotflags", "length", "offset", "filename"],
 
         # CDX 9 Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "offset", "filename"],
 
         # CDX 11 Format + 3 revisit resolve fields
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
-         "orig.length","orig.offset","orig.filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "robotflags", "length", "offset", "filename",
+         "orig.length", "orig.offset", "orig.filename"],
 
         # CDX 9 Format + 3 revisit resolve fields
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
-         "orig.length","orig.offset","orig.filename"]
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "offset", "filename",
+         "orig.length", "orig.offset", "orig.filename"]
     ]
 
     def __init__(self, cdxline):
@@ -53,5 +59,3 @@ class CDXObject(OrderedDict):
 
         li = itertools.imap(lambda (n, val): val, self.items())
         return ' '.join(li)
-
-
@@ -1,8 +1,6 @@
 from cdxobject import CDXObject
+from pywb.utils.timeutils import timestamp_to_sec
 
-from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
-
-import timeutils
 import bisect
 import itertools
 import re
@@ -11,7 +9,6 @@ from heapq import merge
 from collections import deque
 
 
-
 #=================================================================
 def cdx_text_out(cdx, fields):
     if not fields:
@@ -26,30 +23,31 @@ def cdx_load(sources, params):
 
     cdx_iter = make_cdx_iter(cdx_iter)
 
-    resolve_revisits = params.get('resolve_revisits', False)
-    if resolve_revisits:
-        cdx_iter = cdx_resolve_revisits(cdx_iter)
+    if not params.get('proxy_all'):
+        resolve_revisits = params.get('resolve_revisits', False)
+        if resolve_revisits:
+            cdx_iter = cdx_resolve_revisits(cdx_iter)
 
     filters = params.get('filter', None)
     if filters:
         cdx_iter = cdx_filter(cdx_iter, filters)
 
     collapse_time = params.get('collapse_time', None)
     if collapse_time:
         cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
 
     limit = int(params.get('limit', 1000000))
 
     reverse = params.get('reverse', False)
     if reverse:
         cdx_iter = cdx_reverse(cdx_iter, limit)
 
-    closest_to = params.get('closest_to', None)
+    closest_to = params.get('closest', None)
     if closest_to:
         cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
 
     if limit:
         cdx_iter = cdx_limit(cdx_iter, limit)
 
     # output raw cdx objects
     if params.get('output') == 'raw':
@@ -73,6 +71,7 @@ def load_cdx_streams(sources, params):
     merged_stream = merge(*(source_iters))
     return merged_stream
 
+
 #=================================================================
 # convert text cdx stream to CDXObject
 def make_cdx_iter(text_iter):
@@ -98,7 +97,7 @@ def cdx_reverse(cdx_iter, limit):
 
         return [last] if last else []
 
-    reverse_cdxs = deque(maxlen = limit)
+    reverse_cdxs = deque(maxlen=limit)
 
     for cdx in cdx_iter:
         reverse_cdxs.appendleft(cdx)
@@ -142,14 +141,13 @@ def cdx_filter(cdx_iter, filter_strings):
     filters = map(Filter, filter_strings)
 
     for cdx in cdx_iter:
-        if all (x(cdx) for x in filters):
+        if all(x(cdx) for x in filters):
             yield cdx
 
 
-
 #=================================================================
 # collapse by timestamp and status code
-def cdx_collapse_time_status(cdx_iter, timelen = 10):
+def cdx_collapse_time_status(cdx_iter, timelen=10):
     timelen = int(timelen)
 
     last_token = None
@@ -163,16 +161,15 @@ def cdx_collapse_time_status(cdx_iter, timelen=10):
         yield cdx
 
 
-
 #=================================================================
 # sort CDXCaptureResult by closest to timestamp
-def cdx_sort_closest(closest, cdx_iter, limit = 10):
+def cdx_sort_closest(closest, cdx_iter, limit=10):
     closest_cdx = []
 
-    closest_sec = timeutils.timestamp_to_sec(closest)
+    closest_sec = timestamp_to_sec(closest)
 
     for cdx in cdx_iter:
-        sec = timeutils.timestamp_to_sec(cdx['timestamp'])
+        sec = timestamp_to_sec(cdx['timestamp'])
         key = abs(closest_sec - sec)
 
         # create tuple to sort by key
@@ -186,22 +183,22 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
         if len(closest_cdx) > limit:
             closest_cdx.pop()
 
 
     return itertools.imap(lambda x: x[1], closest_cdx)
 
 
-
 #=================================================================
 # resolve revisits
 
 # Fields to append from cdx original to revisit
 ORIG_TUPLE = ['length', 'offset', 'filename']
 
 
 def cdx_resolve_revisits(cdx_iter):
     originals = {}
 
     for cdx in cdx_iter:
-        is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
+        is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
+                      (cdx['filename'] == '-'))
 
         digest = cdx['digest']
 
@@ -210,7 +207,6 @@ def cdx_resolve_revisits(cdx_iter):
         if not original_cdx and not is_revisit:
             originals[digest] = cdx
 
-
         if original_cdx and is_revisit:
             fill_orig = lambda field: original_cdx[field]
             # Transfer mimetype and statuscode
@@ -224,5 +220,3 @@ def cdx_resolve_revisits(cdx_iter):
             cdx['orig.' + field] = fill_orig(field)
 
         yield cdx
-
-
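cdx_load above chains the ops into one lazy generator pipeline driven entirely by the params dict; each op wraps the previous iterator and nothing is read until the caller iterates. A minimal sketch of driving it directly, assuming the modules land at pywb.cdx.* and a pre-computed surt key is supplied (the sample path is hypothetical):

    from pywb.cdx.cdxsource import CDXFile
    from pywb.cdx.cdxops import cdx_load

    sources = [CDXFile('./sample_data/iana.cdx')]
    params = {'key': 'org,iana)/', 'limit': 3, 'reverse': True, 'output': 'text'}

    for line in cdx_load(sources, params):
        print line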
@@ -1,5 +1,4 @@
 import surt
-from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
 from cdxops import cdx_load
 
 import itertools
@@ -7,39 +6,21 @@ import logging
 import os
 import urlparse
 
+from cdxsource import CDXSource, CDXFile, RemoteCDXSource
+
-#=================================================================
-class CDXFile:
-    def __init__(self, filename):
-        self.filename = filename
-
-    def load_cdx(self, params):
-        source = FileReader(self.filename)
-
-        match_type = params.get('match_type')
-
-        if match_type == 'prefix':
-            iter_func = iter_prefix
-        else:
-            iter_func = iter_exact
-
-        key = params.get('key')
-
-        return iter_func(source, key)
-
-    def __str__(self):
-        return 'CDX File - ' + self.filename
-
 #=================================================================
 class CDXException(Exception):
-    def __init__(self, msg, url = None):
-        Exception.__init__(self, msg)
-        self.url = url
-
     def status(self):
         return '400 Bad Request'
 
 
+#=================================================================
+class AccessException(CDXException):
+    def status(self):
+        return '403 Bad Request'
+
+
 #=================================================================
 class CDXServer:
     """
@@ -47,33 +28,51 @@ class CDXServer:
     responds to queries and dispatches to the cdx ops for processing
     """
 
-    def __init__(self, sources, surt_ordered = True):
+    @staticmethod
+    def create_from_config(config):
+        paths = config.get('index_paths')
+        surt_ordered = config.get('surt_ordered', True)
+        return CDXServer(paths, surt_ordered)
+
+    def __init__(self, sources, surt_ordered=True):
         self.sources = []
         self.surt_ordered = surt_ordered
 
         logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
 
+        if not isinstance(sources, list):
+            sources = [sources]
+
         for src in sources:
-            if os.path.isdir(src):
-                for file in os.listdir(src):
-                    self.add_cdx_loader(src + file)
-            else:
-                self.add_cdx_loader(src)
+            if isinstance(src, CDXSource):
+                self.add_cdx_source(src)
+            elif isinstance(src, str):
+                if os.path.isdir(src):
+                    for file in os.listdir(src):
+                        self.add_cdx_source(src + file)
+                else:
+                    self.add_cdx_source(src)
 
         if len(self.sources) == 0:
-            logging.exception('No CDX Sources Found!')
+            logging.exception('No CDX Sources Found from: ' + str(sources))
 
-    def add_cdx_loader(self, filename):
-        source = self.create_cdx_loader(filename)
-        if not source:
-            return
+    def add_cdx_source(self, source):
+        if not isinstance(source, CDXSource):
+            source = self.create_cdx_source(source)
+            if not source:
+                return
 
         logging.debug('Adding CDX Source: ' + str(source))
         self.sources.append(source)
 
     @staticmethod
-    def create_cdx_loader(filename):
+    def create_cdx_source(filename):
+        if filename.startswith('http://') or filename.startswith('https://'):
+            return RemoteCDXSource(filename)
+
         if filename.endswith('.cdx'):
            return CDXFile(filename)
 
        return None
        #TODO: support zipnum
        #elif filename.endswith('.summary')
@@ -81,27 +80,52 @@ class CDXServer:
        #elif filename.startswith('redis://')
        #   return RedisCDXSource(filename)
 
     def load_cdx(self, **params):
-        # canonicalize to surt (canonicalization is part of surt conversion)
+        # if key not set, assume 'url' is set and needs canonicalization
+        if not params.get('key'):
+            params['key'] = self._canonicalize(params)
+
+        self._convert_old_style(params)
+
+        return cdx_load(self.sources, params)
+
+    def _canonicalize(self, params):
+        """
+        Canonicalize url and convert to surt
+        If no surt-mode, convert back to url form
+        as surt conversion is currently part of canonicalization
+        """
         try:
             url = params['url']
         except KeyError:
-            raise CDXException('The url= param must be specified to query the cdx server')
+            msg = 'A url= param must be specified to query the cdx server'
+            raise CDXException(msg)
 
         try:
             key = surt.surt(url)
         except Exception as e:
-            raise CDXException('Invalid url: ', url)
+            raise CDXException('Invalid Url: ' + url)
 
         # if not surt, unsurt the surt to get canonicalized non-surt url
         if not self.surt_ordered:
             key = unsurt(key)
 
-        params['key'] = key
+        return key
 
-        return cdx_load(self.sources, params)
+    def _convert_old_style(self, params):
+        """
+        Convert old-style CDX Server param semantics
+        """
+        collapse_time = params.get('collapseTime')
+        if collapse_time:
+            params['collapse_time'] = collapse_time
+
+        resolve_revisits = params.get('resolveRevisits')
+        if resolve_revisits:
+            params['resolve_revisits'] = resolve_revisits
+
+        if params.get('sort') == 'reverse':
+            params['reverse'] = True
 
     def load_cdx_from_request(self, env):
         #url = wbrequest.wb_url.url
@@ -113,7 +137,8 @@ class CDXServer:
         params['output'] = 'text'
 
         # parse_qs produces arrays for single values
-        # cdxreader expects singleton params for all except filters, so convert here
+        # cdx processing expects singleton params for all params,
+        # except filters, so convert here
         # use first value of the list
         for name, val in params.iteritems():
             if name != 'filter':
@@ -122,13 +147,10 @@ class CDXServer:
         cdx_lines = self.load_cdx(**params)
         return cdx_lines
 
-
-
     def __str__(self):
         return 'load cdx indexes from ' + str(self.sources)
 
 
-
 #=================================================================
 def unsurt(surt):
     """
@@ -141,7 +163,8 @@ def unsurt(surt):
     'com,example)'
 
     # Long surt
-    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
+    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
+index.html?a=b?c=)/')
     'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
     """
 
@@ -158,3 +181,6 @@ def unsurt(surt):
         return surt
 
 
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
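The new create_from_config hook is what lets a WSGI wrapper boot the server straight from the config.yaml added below. A minimal sketch, assuming the config is parsed with PyYAML (which is not itself part of this diff):

    import yaml

    config = yaml.load(open('pywb/cdx/config.yaml'))
    server = CDXServer.create_from_config(config)  # reads index_paths, surt_ordered

    for line in server.load_cdx(url='http://example.com/', limit=1, output='text'):
        print line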
92 pywb/cdx/cdxsource.py Normal file

@@ -0,0 +1,92 @@
+from pywb.utils.binsearch import iter_exact, iter_prefix
+from pywb.utils.loaders import SeekableTextFileReader
+
+import urllib
+import urllib2
+
+
+#=================================================================
+class CDXSource(object):
+    """
+    Represents any cdx index source
+    """
+    def load_cdx(self, params):
+        raise NotImplementedError('Implement in subclass')
+
+
+#=================================================================
+class CDXFile(CDXSource):
+    """
+    Represents a local plain-text .cdx file
+    """
+    def __init__(self, filename):
+        self.filename = filename
+
+    def load_cdx(self, params):
+        source = SeekableTextFileReader(self.filename)
+
+        match_type = params.get('match_type')
+
+        if match_type == 'prefix':
+            iter_func = iter_prefix
+        else:
+            iter_func = iter_exact
+
+        key = params.get('key')
+
+        return iter_func(source, key)
+
+    def __str__(self):
+        return 'CDX File - ' + self.filename
+
+
+#=================================================================
+class RemoteCDXSource(CDXSource):
+    """
+    Represents a remote cdx server, to which requests will be proxied.
+
+    Only url and match type params are proxied at this time,
+    the stream is passed through all other filters locally.
+    """
+    def __init__(self, filename, cookie=None, proxy_all=True):
+        self.remote_url = filename
+        self.cookie = cookie
+        self.proxy_all = proxy_all
+
+    def load_cdx(self, proxy_params):
+        if self.proxy_all:
+            params = proxy_params
+            params['proxy_all'] = True
+        else:
+            # Only send url and matchType params to remote
+            params = {}
+            params['url'] = proxy_params['url']
+            match_type = proxy_params.get('match_type')
+
+            if match_type:
+                proxy_params['matchType'] = match_type
+
+        urlparams = urllib.urlencode(params, True)
+
+        try:
+            request = urllib2.Request(self.remote_url, urlparams)
+
+            if self.cookie:
+                request.add_header('Cookie', self.cookie)
+
+            response = urllib2.urlopen(request)
+
+        except urllib2.HTTPError as e:
+            if e.code == 403:
+                exc_msg = e.read()
+                msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg
+                       else 'Excluded')
+
+                raise AccessException(msg)
+            else:
+                raise
+
+        return iter(response)
+
+    def __str__(self):
+        return 'Remote CDX Server: ' + self.remote_url
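Since CDXServer now accepts anything implementing CDXSource, local files and remote cdx servers can be mixed in one sources list. A sketch under the assumption that the server class above lives at pywb.cdx.cdxserver (its filename is not shown in this diff) and that the remote endpoint is a reachable cdx server:

    from pywb.cdx.cdxsource import CDXFile, RemoteCDXSource
    from pywb.cdx.cdxserver import CDXServer  # assumed module path

    # one local .cdx file plus one remote cdx server, merged at query time
    sources = [CDXFile('./sample_data/iana.cdx'),
               RemoteCDXSource('http://web.archive.org/cdx/search/cdx')]

    server = CDXServer(sources)
    for line in server.load_cdx(url='http://example.com/', output='text'):
        print line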
3 pywb/cdx/config.yaml Normal file

@@ -0,0 +1,3 @@
+#CDX Server WSGI App Config
+index_paths: ./sample_data/
+port: 8090
163
pywb/cdx/test/cdxserver_test.py
Normal file
163
pywb/cdx/test/cdxserver_test.py
Normal file
@ -0,0 +1,163 @@
#=================================================================
"""
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz


# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz


# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz

>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz

# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)


# Filter cdx
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz


>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz


# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz

# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz


# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625

>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -


>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -

# equal dist prefer earlier
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz

>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706

>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654


# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz

>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -


# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
 ('timestamp', '20140127171200'),
 ('original', 'http://example.com'),
 ('mimetype', 'text/html'),
 ('statuscode', '200'),
 ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
 ('redirect', '-'),
 ('robotflags', '-'),
 ('length', '1046'),
 ('offset', '334'),
 ('filename', 'dupes.warc.gz')]

# NOTE: external dependency -- need self-contained test
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
 ('timestamp', '20020120142510'),
 ('original', 'http://example.com:80/'),
 ('mimetype', 'text/html'),
 ('statuscode', '200'),
 ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
 ('length', '1792')]

"""

#=================================================================
from pywb.cdx.cdxserver import CDXServer
import os
import sys
import pprint

from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_cdx_dir = get_test_dir() + 'cdx/'

def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    kwparams['url'] = url
    kwparams['output'] = 'text'

    server = CDXServer(sources)
    results = server.load_cdx(**kwparams)

    for x in results:
        sys.stdout.write(x)


if __name__ == "__main__":
    import doctest
    doctest.testmod()
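Outside of the doctest harness, the same helper logic can be exercised directly; a minimal sketch, with the sample cdx path assumed to follow the shared sample_archive layout that get_test_dir() points at:

import sys
from pywb.cdx.cdxserver import CDXServer

# sketch: what cdx_ops_test does, without the doctest harness
server = CDXServer(['./sample_archive/cdx/iana.cdx'])
for line in server.load_cdx(url='http://iana.org/', output='text', limit=2):
    sys.stdout.write(line)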
72  pywb/cdx/wsgi_cdxserver.py  Normal file
@ -0,0 +1,72 @@
from cdxserver import CDXServer
import logging
import os
import yaml
import pkgutil

#=================================================================
TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'

CONFIG_FILE = 'config.yaml'

DEFAULT_PORT = 8080

if __package__:
    config = pkgutil.get_data(__package__, CONFIG_FILE)
    config = yaml.load(config)
else:
    config = None


#=================================================================
def main():
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)

    cdx_config = config.get('index_paths') if config else None

    if not cdx_config:
        cdx_config = [TEST_CDX_DIR]

    cdxserver = CDXServer(cdx_config)

    def application(env, start_response):
        try:
            response = cdxserver.load_cdx_from_request(env)
            start_response('200 OK', [('Content-Type', 'text/plain')])

            response = list(response)

        except Exception as exc:
            import traceback
            err_details = traceback.format_exc(exc)
            start_response('400 Error', [('Content-Type', 'text/plain')])
            response = [str(exc)]
            print err_details

        return response

    return application


if __name__ == "__main__":
    from wsgiref.simple_server import make_server

    app = main()

    port = DEFAULT_PORT
    if config:
        port = config.get('port', DEFAULT_PORT)

    httpd = make_server('', port, app)

    logging.debug('Starting CDX Server on port ' + str(port))

    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass

    logging.debug('Stopping CDX Server')
else:
    application = main()
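The module exposes application for WSGI containers and a wsgiref runner for development. A smoke-test sketch that drives the app in-process after importing the module (setup_testing_defaults is standard wsgiref; the QUERY_STRING handling assumes load_cdx_from_request reads it from the environ):

from wsgiref.util import setup_testing_defaults

environ = {}
setup_testing_defaults(environ)
environ['QUERY_STRING'] = 'url=example.com&limit=1'

def start_response(status, headers):
    print status

# 'application' is the module-level WSGI callable created above
for chunk in application(environ, start_response):
    print chunk,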
@ -1,42 +0,0 @@
from cdxserver import CDXServer
import logging
import os


test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../sample_archive/cdx/'


#=================================================================
def main(config = None):
    logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)

    if not config:
        config = [test_cdx_dir]

    cdxserver = CDXServer(config)

    def application(env, start_response):
        try:
            response = cdxserver.load_cdx_from_request(env)
            start_response('200 OK', [('Content-Type', 'text/plain')])

            response = list(response)

        except Exception as exc:
            import traceback
            err_details = traceback.format_exc(exc)
            start_response('400 Error', [('Content-Type', 'text/plain')])
            response = [str(exc)]
            print err_details

        return response

    return application


if __name__ == "__main__":
    pass
else:
    application = main()
@ -1,59 +1,34 @@
-import archiveloader
 import views
 import handlers
-import indexreader
 import replay_views
-import replay_resolvers
 import logging
-import hmac
-import time
+from pywb.warc.recordloader import ArcWarcRecordLoader
+from pywb.warc.resolvingloader import ResolvingLoader
+from pywb.rewrite.rewrite_content import RewriteContent

 #=================================================================
 # Config Loading
 #=================================================================
 def load_template_file(file, desc = None, view_class = views.J2TemplateView):
     if file:
-        logging.info('Adding {0}: {1}'.format(desc if desc else name, file))
+        logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
         file = view_class(file)

     return file

 #=================================================================
-# Cookie Signing
-#=================================================================
-
-class HMACCookieMaker:
-    def __init__(self, key, name):
-        self.key = key
-        self.name = name
-
-    def __call__(self, duration, extra_id = ''):
-        expire = str(long(time.time() + duration))
-
-        if extra_id:
-            msg = extra_id + '-' + expire
-        else:
-            msg = expire
-
-        hmacdigest = hmac.new(self.key, msg)
-        hexdigest = hmacdigest.hexdigest()
-
-        if extra_id:
-            cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
-        else:
-            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
-
-        return cookie
-
-
-#=================================================================
-def create_wb_handler(cdx_source, config):
-
-    replayer = replay_views.RewritingReplayView(
-
-        resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
-
-        loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')),
+def create_wb_handler(cdx_server, config):
+    record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
+    paths = config.get('archive_paths')
+
+    resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
+
+    replayer = replay_views.ReplayView(
+        content_loader = resolving_loader,
+        content_rewriter = RewriteContent(),

         head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),

@ -66,7 +41,7 @@ def create_wb_handler(cdx_source, config):

     wb_handler = handlers.WBHandler(
-        cdx_source,
+        cdx_server,

         replayer,
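For reference, the new wiring can be reproduced by hand; a sketch, with both sample paths below being assumptions, and IndexReader accepting a path string per the indexreader changes later in this diff:

import indexreader
from config_utils import create_wb_handler

cdx_server = indexreader.IndexReader('./sample_archive/cdx/')
config = {'archive_paths': './sample_archive/warcs/',
          'cookie_maker': None,
          'head_insert_html': None}

# sketch: same wiring as pywb_config_manual performs per collection
wb_handler = create_wb_handler(cdx_server, config)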
@ -1,13 +1,12 @@
-import views
-import utils
 import urlparse

-from wbrequestresponse import WbResponse
-from wburl import WbUrl
-from wbexceptions import WbException, NotFoundException
-
 import pkgutil
 import mimetypes
+import time
+
+from pywb.rewrite.wburl import WbUrl
+from wbrequestresponse import WbResponse
+from wbexceptions import WbException, NotFoundException
+from views import TextCapturesView


 class BaseHandler:
@ -22,23 +21,22 @@ class BaseHandler:
 # Standard WB Handler
 #=================================================================
 class WBHandler(BaseHandler):
-    def __init__(self, cdx_reader, replay, html_view = None, search_view = None):
-        self.cdx_reader = cdx_reader
+    def __init__(self, index_reader, replay, html_view = None, search_view = None):
+        self.index_reader = index_reader
         self.replay = replay

-        self.text_view = views.TextCapturesView()
+        self.text_view = TextCapturesView()

         self.html_view = html_view
         self.search_view = search_view


     def __call__(self, wbrequest):

         if wbrequest.wb_url_str == '/':
             return self.render_search_page(wbrequest)

-        with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
-            cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
+        with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
+            cdx_lines = self.index_reader.load_for_request(wbrequest)

         # new special modifier to always show cdx index
         if wbrequest.wb_url.mod == 'cdx_':
@ -48,8 +46,8 @@ class WBHandler(BaseHandler):
             query_view = self.html_view if self.html_view else self.text_view
             return query_view.render_response(wbrequest, cdx_lines)

-        with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
-            return self.replay(wbrequest, cdx_lines, self.cdx_reader)
+        with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
+            return self.replay(wbrequest, cdx_lines)


     def render_search_page(self, wbrequest):
@ -60,18 +58,18 @@ class WBHandler(BaseHandler):


     def __str__(self):
-        return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay)
+        return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)

 #=================================================================
 # CDX-Server Handler -- pass all params to cdx server
 #=================================================================
 class CDXHandler(BaseHandler):
-    def __init__(self, cdx_server, view = None):
-        self.cdx_server = cdx_server
-        self.view = view if view else views.TextCapturesView()
+    def __init__(self, index_reader, view = None):
+        self.index_reader = index_reader
+        self.view = view if view else TextCapturesView()

     def __call__(self, wbrequest):
-        cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env)
+        cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env)

         return self.view.render_response(wbrequest, cdx_lines)

@ -81,7 +79,7 @@ class CDXHandler(BaseHandler):
         return None

     def __str__(self):
-        return 'CDX Server: ' + str(self.cdx_server)
+        return 'Index Reader: ' + str(self.index_reader)


 #=================================================================
@ -136,4 +134,19 @@ class DebugEchoHandler(BaseHandler):
         return WbResponse.text_response(str(wbrequest))


+#=================================================================
+class PerfTimer:
+    def __init__(self, perfdict, name):
+        self.perfdict = perfdict
+        self.name = name
+
+    def __enter__(self):
+        self.start = time.clock()
+        return self
+
+    def __exit__(self, *args):
+        self.end = time.clock()
+        if self.perfdict is not None:
+            self.perfdict[self.name] = str(self.end - self.start)
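The new local PerfTimer replaces utils.PerfTimer; a usage sketch matching how WBHandler.__call__ uses it above (do_work is a hypothetical stand-in for the timed section):

perf = {}
with PerfTimer(perf, 'query') as t:
    do_work()   # hypothetical work to be timed

# perf now maps the name to the elapsed time as a string, e.g. {'query': '0.002...'}
print perf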
@ -1,17 +1,22 @@
 import urllib
 import urllib2
 import wbexceptions
-import wbrequestresponse
-from collections import OrderedDict

-from cdxserver.cdxserver import CDXServer, CDXException
-from cdxserver.cdxobject import CDXObject
+from itertools import chain
+from pprint import pprint

-import logging
+from pywb.cdx.cdxserver import CDXServer, CDXException
+from pywb.cdx.cdxobject import CDXObject

 #=================================================================
-class IndexReader:
-    def load_for_request(self, wbrequest, parsed_cdx = True):
+class IndexReader(object):
+    def __init__(self, config):
+        if isinstance(config, str):
+            self.cdx_server = CDXServer(config)
+        else:
+            self.cdx_server = CDXServer.create_from_config(config)
+
+    def load_for_request(self, wbrequest):
         wburl = wbrequest.wb_url

         # init standard params
@ -24,147 +29,27 @@ class IndexReader:
         if wbrequest.custom_params:
             params.update(wbrequest.custom_params)

-        #params['url'] = wburl.url
-        output = 'raw' if parsed_cdx else 'text'
+        params['url'] = wburl.url

         try:
-            cdxlines = self.load_cdx(url = wburl.url, output = output, **params)
+            cdxlines = self.load_cdx(output='raw', **params)
         except CDXException:
             raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url)

-        cdxlines = utils.peek_iter(cdxlines)
+        cdxlines = self.peek_iter(cdxlines)

         if cdxlines is None:
             raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)

-        cdxlines = self.filter_cdx(wbrequest, cdxlines)
-
         return cdxlines

-    def filter_cdx(self, wbrequest, cdxlines):
-        # Subclasses may wrap cdxlines iterator in a filter
-        return cdxlines
-
-    def load_cdx(self, url, params = {}, parsed_cdx = True):
-        raise NotImplementedError('Override in subclasses')
-
-    @staticmethod
-    def make_best_cdx_source(paths, config):
-        # may be a string or list
-        surt_ordered = config.get('surt_ordered', True)
-
-        # support mixed cdx streams and remote servers?
-        # for now, list implies local sources
-        if isinstance(paths, list):
-            if len(paths) > 1:
-                return EmbeddedCDXServer(paths, surt_ordered)
-            else:
-                # treat as non-list
-                paths = paths[0]
-
-        # a single uri
-        uri = paths
-
-        # Check for remote cdx server
-        if (uri.startswith('http://') or uri.startswith('https://')) and not uri.endswith('.cdx'):
-            cookie = config.get('cookie', None)
-            return RemoteCDXServer(uri, cookie = cookie)
-        else:
-            return EmbeddedCDXServer([uri], surt_ordered)
-
-
-#=================================================================
-class EmbeddedCDXServer(CDXServer, IndexReader):
+    def load_cdx(self, **params):
+        return self.cdx_server.load_cdx(**params)
+
     def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
         if wburl.type == wburl.URL_QUERY:
             raise NotImplementedError('Url Query Not Yet Supported')

         return {
-            wburl.QUERY:
-                {'collapse_time': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
-
-            wburl.URL_QUERY:
-                {},
-#               raise Exception('Not Yet Implemented')
-#               {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
-#                'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
-#               },
-
-            wburl.REPLAY:
-                {'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest_to': wburl.timestamp, 'resolve_revisits': True},
-
-            wburl.LATEST_REPLAY:
-                {'reverse': True, 'filter': 'statuscode:[23]..', 'limit': '1', 'resolve_revisits': True}
-
-        }[wburl.type]
-
-    def __str__(self):
-        return 'load cdx indexes from ' + str(self.sources)
-
-
-#=================================================================
-class RemoteCDXServer(IndexReader):
-    """
-    >>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
-    >>> pprint(x.next().items())
-    [('urlkey', 'com,example)/'),
-     ('timestamp', '20020120142510'),
-     ('original', 'http://example.com:80/'),
-     ('mimetype', 'text/html'),
-     ('statuscode', '200'),
-     ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
-     ('length', '1792')]
-    """
-
-    def __init__(self, server_url, cookie = None):
-        self.server_url = server_url
-        self.auth_cookie = cookie
-
-    def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
-        #url is required, must be passed explicitly!
-        params['url'] = url
-        params.update(**kwvalues)
-
-        urlparams = urllib.urlencode(params, True)
-
-        try:
-            request = urllib2.Request(self.server_url, urlparams)
-
-            if self.auth_cookie:
-                request.add_header('Cookie', self.auth_cookie)
-
-            response = urllib2.urlopen(request)
-        except urllib2.HTTPError, e:
-            if e.code == 403:
-                exc_msg = e.read()
-                msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
-                raise wbexceptions.AccessException(msg)
-            else:
-                raise
-
-        if parsed_cdx:
-            return (CDXObject(cdx) for cdx in response)
-        else:
-            return iter(response)
-
-
-# Note: this params are designed to make pywb compatible with the original Java wayback-cdx-server API:
-# https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
-# Soon, this will be switched over to support the native pywb cdx server
-
-# BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result
-# with lower values if there are too many captures. Ideally, should be around 10-20
-# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
-
-    def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'):
-        return {
             wburl.QUERY:
                 {'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},

@ -184,18 +69,20 @@ class RemoteCDXServer(IndexReader):
         }[wburl.type]

+    @staticmethod
+    def peek_iter(iterable):
+        try:
+            first = next(iterable)
+        except StopIteration:
+            return None

-    def __str__(self):
-        return 'server cdx from ' + self.server_url
+        return chain([first], iterable)


+#=================================================================
+class RemoteCDXServer(IndexReader):
+    def __init__(self, remote_url, cookie=None):
+        self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True)
+        self.cdx_server = CDXServer(self.remote)

-# Testing
-import utils
-if __name__ == "__main__" or utils.enable_doctests():
-    from pprint import pprint
-
-    test_dir = utils.test_data_dir() + 'cdx/'
-
-    import doctest
-    doctest.testmod()
+    #def load_cdx(self, **params):
+        #return remote.load_cdx(**params)
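peek_iter is how load_for_request now distinguishes "no captures" from a lazy iterator without consuming it; a sketch:

# sketch: peek_iter returns None for an exhausted iterator, otherwise an
# equivalent iterator with the first item re-attached via itertools.chain
assert IndexReader.peek_iter(iter([])) is None

it = IndexReader.peek_iter(iter(['a', 'b']))
print list(it)   # ['a', 'b']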
@ -1,11 +1,12 @@
 import handlers
 import indexreader
 import archivalrouter
+import config_utils
+import proxy

 import os
 import yaml
-import config_utils
 import logging
-import proxy

 #=================================================================
 DEFAULTS = {
@ -49,24 +50,20 @@ def pywb_config_manual(passed_config = {}):
     collections = config.get('collections')

     for name, value in collections.iteritems():
-        route_config = config
-
-        if isinstance(value, dict):
-            # if a dict, extend with base properies
-            index_paths = value['index_paths']
-            route_config = DictChain(value, config)
+        if isinstance(value, str):
+            route_config = config
+            cdx_server = indexreader.IndexReader(value)
         else:
-            index_paths = str(value)
-
-        cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config)
+            route_config = DictChain(value, config)
+            cdx_server = indexreader.IndexReader(route_config)

         wb_handler = config_utils.create_wb_handler(
-            cdx_source = cdx_source,
+            cdx_server = cdx_server,
             config = route_config,
         )

-        logging.info('Adding Collection: ' + name)
+        logging.debug('Adding Collection: ' + name)

         route_class = route_config.get('route_class', archivalrouter.Route)

@ -74,7 +71,7 @@ def pywb_config_manual(passed_config = {}):

     # cdx query handler
     if route_config.get('enable_cdx_api', False):
-        routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
+        routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_server)))


     if config.get('debug_echo_env', False):
@ -125,11 +122,3 @@ def pywb_config(config_file = None):

     return pywb_config_manual(config)
-
-
-import utils
-if __name__ == "__main__" or utils.enable_doctests():
-    # Just test for execution for now
-    #pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml')
-    pywb_config_manual()
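The two collection value forms handled by the loop above can be summarized with this sketch (the paths are assumptions):

config = {
    'collections': {
        # str value: route_config stays the base config
        'web': './sample_archive/cdx/',

        # dict value: extended with the base config via DictChain
        'other': {
            'index_paths': './sample_archive/cdx/',
            'enable_cdx_api': True,
        },
    },
}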
@ -1,269 +0,0 @@
import re
import sys
import itertools

from url_rewriter import UrlRewriter

#=================================================================
class RegexRewriter:
    """
    # Test https->http converter (other tests below in subclasses)
    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
    'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
    """

    @staticmethod
    def comment_out(string):
        return '/*' + string + '*/'

    @staticmethod
    def remove_https(string):
        return string.replace("https", "http")

    @staticmethod
    def add_prefix(prefix):
        return lambda string: prefix + string

    @staticmethod
    def archival_rewrite(rewriter):
        return lambda x: rewriter.rewrite(x)

    @staticmethod
    def replacer(string):
        return lambda x: string

    HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'

    DEFAULT_OP = add_prefix

    def __init__(self, rules):
        #rules = self.create_rules(http_prefix)

        # Build regexstr, concatenating regex list
        regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])

        # ensure it's not middle of a word, wrap in non-capture group
        regex_str = '(?<!\w)(?:' + regex_str + ')'

        self.regex = re.compile(regex_str, re.M)
        self.rules = rules

    def filter(self, m):
        return True

    def rewrite(self, string):
        return self.regex.sub(lambda x: self.replace(x), string)

    def close(self):
        return ''

    def replace(self, m):
        i = 0
        for _, op, count in self.rules:
            i += 1

            full_m = i
            while count > 0:
                i += 1
                count -= 1

            if not m.group(i):
                continue

            # Optional filter to skip matches
            if not self.filter(m):
                return m.group(0)

            # Custom func
            if not hasattr(op, '__call__'):
                op = RegexRewriter.DEFAULT_OP(op)

            result = op(m.group(i))

            # if extracting partial match
            if i != full_m:
                result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]

            return result


#=================================================================
class JSRewriter(RegexRewriter):
    """
    >>> test_js('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_js(r'location = "http:\/\/example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'

    >>> test_js(r'location = "http:\\/\\/example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'

    >>> test_js(r"location = 'http://example.com/abc.html/'")
    "WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"

    >>> test_js(r'location = http://example.com/abc.html/')
    'WB_wombat_location = http://example.com/abc.html/'

    >>> test_js(r'location = /http:\/\/example.com/abc.html/')
    'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/'

    >>> test_js('"/location" == some_location_val; locations = location;')
    '"/location" == some_location_val; locations = WB_wombat_location;'

    >>> test_js('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

    >>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
    'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

    # custom rules added
    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'

    # scheme-agnostic
    >>> test_js('cool_Location = "//example.com/abc.html" //comment')
    'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
    """

    JS_HTTPX = r'(?<="|\')(?:https?:)?\\?/\\?/[A-Za-z0-9:_@.-]+'

    def __init__(self, rewriter, extra = []):
        rules = self._create_rules(rewriter.get_abs_url())
        rules.extend(extra)

        RegexRewriter.__init__(self, rules)

    def _create_rules(self, http_prefix):
        return [
            (self.JS_HTTPX, http_prefix, 0),
            (r'(?<!/)\blocation\b', 'WB_wombat_', 0),
            (r'(?<=document\.)domain', 'WB_wombat_', 0),
        ]


#=================================================================
class XMLRewriter(RegexRewriter):
    """
    >>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
    '<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
    '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
    '<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'

    >>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
    '<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
    """

    def __init__(self, rewriter, extra = []):
        rules = self._create_rules(rewriter.get_abs_url())

        RegexRewriter.__init__(self, rules)

    # custom filter to reject 'xmlns' attr
    def filter(self, m):
        attr = m.group(1)
        if attr and attr.startswith('xmlns'):
            return False

        return True

    def _create_rules(self, http_prefix):
        return [
            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
        ]

#=================================================================
class CSSRewriter(RegexRewriter):
    r"""
    >>> test_css("background: url('/some/path.html')")
    "background: url('/web/20131010im_/http://example.com/some/path.html')"

    >>> test_css("background: url('../path.html')")
    "background: url('/web/20131010im_/http://example.com/path.html')"

    >>> test_css("background: url(\"http://domain.com/path.html\")")
    'background: url("/web/20131010im_/http://domain.com/path.html")'

    >>> test_css("background: url(file.jpeg)")
    'background: url(/web/20131010im_/http://example.com/file.jpeg)'

    >>> test_css("background: url('')")
    "background: url('')"

    >>> test_css("background: url (\"weirdpath\')")
    'background: url ("/web/20131010im_/http://example.com/weirdpath\')'

    >>> test_css("@import url ('path.css')")
    "@import url ('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import url('path.css')")
    "@import url('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import ( 'path.css')")
    "@import ( '/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import \"path.css\"")
    '@import "/web/20131010im_/http://example.com/path.css"'

    >>> test_css("@import ('../path.css\"")
    '@import (\'/web/20131010im_/http://example.com/path.css"'

    >>> test_css("@import ('../url.css\"")
    '@import (\'/web/20131010im_/http://example.com/url.css"'

    >>> test_css("@import (\"url.css\")")
    '@import ("/web/20131010im_/http://example.com/url.css")'

    >>> test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
    '@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'
    """

    CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"

    def __init__(self, rewriter):
        rules = self._create_rules(rewriter)

        RegexRewriter.__init__(self, rules)

    def _create_rules(self, rewriter):
        return [
            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
        ]

import utils
if __name__ == "__main__" or utils.enable_doctests():
    arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')

    def test_js(string, extra = []):
        return JSRewriter(arcrw, extra).rewrite(string)

    def test_xml(string):
        return XMLRewriter(arcrw).rewrite(string)

    def test_css(string):
        return CSSRewriter(arcrw).rewrite(string)

    import doctest
    doctest.testmod()
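Though this module is removed here (the rewriters move to pywb.rewrite), the rule format it defines is worth noting: each rule is a (regex, op, group_count) tuple, and a string op is turned into a prefix function via DEFAULT_OP. A minimal sketch:

# sketch: prefix every absolute http(s) url with an archival prefix
rw = RegexRewriter([(r'https?://[A-Za-z0-9.-]+', '/web/', 0)])
print rw.rewrite('see http://example.com for details')
# see /web/http://example.com for details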
@ -1,30 +1,30 @@
 import StringIO
-from urllib2 import URLError
-import chardet
-import copy
-import itertools

-import archiveloader
-from wbrequestresponse import WbResponse, StatusAndHeaders
-import utils
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.utils.bufferedreaders import ChunkedDataReader
+from wbrequestresponse import WbResponse

-from url_rewriter import UrlRewriter
-from header_rewriter import HeaderRewriter
-import html_rewriter
-import regex_rewriters

 import wbexceptions


 #=================================================================
 class ReplayView:
-    def __init__(self, resolvers, loader = None, reporter = None):
-        self.resolvers = resolvers
-        self.loader = loader if loader else archiveloader.ArchiveLoader()
+    def __init__(self, content_loader, content_rewriter, head_insert_view = None,
+                 redir_to_exact = True, buffer_response = False, reporter = None):
+
+        self.content_loader = content_loader
+        self.content_rewriter = content_rewriter
+
+        self.head_insert_view = head_insert_view
+
+        self.redir_to_exact = redir_to_exact
+        # buffer or stream rewritten response
+        self.buffer_response = buffer_response

         self._reporter = reporter


-    def __call__(self, wbrequest, cdx_lines, cdx_reader):
+    def __call__(self, wbrequest, cdx_lines):
         last_e = None
         first = True

@ -40,9 +40,22 @@ class ReplayView:
                 self._redirect_if_needed(wbrequest, cdx)
                 first = False

-                (cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
+                (status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files)

-                response = self.make_response(wbrequest, cdx, status_headers, stream)
+                # check and reject self-redirect
+                self._reject_self_redirect(wbrequest, cdx, status_headers)
+
+                # check if redir is needed
+                self._redirect_if_needed(wbrequest, cdx)
+
+                response = None
+
+                if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
+                    response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
+                else:
+                    (status_headers, stream) = self.sanitize_content(status_headers, stream)
+                    response_iter = self.stream_to_iter(stream)
+                    response = WbResponse(status_headers, response_iter)

                 # notify reporter callback, if any
                 if self._reporter:
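The else branch above is the new unrewritten ('id_') path, built from two small helpers defined in the next hunk. A sketch of stream_to_iter in isolation, with StringIO standing in for an archive record stream:

import StringIO

# sketch: stream_to_iter drains and then closes any file-like object
for buff in ReplayView.stream_to_iter(StringIO.StringIO('abc')):
    print buff,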
@ -62,288 +75,57 @@ class ReplayView:
|
|||||||
else:
|
else:
|
||||||
raise wbexceptions.UnresolvedArchiveFileException()
|
raise wbexceptions.UnresolvedArchiveFileException()
|
||||||
|
|
||||||
|
|
||||||
# callback to issue a redirect to another request
|
|
||||||
# subclasses may provide custom logic
|
|
||||||
def _redirect_if_needed(self, wbrequest, cdx):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def _load(self, cdx, revisit, failed_files):
|
|
||||||
if revisit:
|
|
||||||
(filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
|
|
||||||
else:
|
|
||||||
(filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])
|
|
||||||
|
|
||||||
#optimization: if same file already failed this request, don't try again
|
|
||||||
if failed_files and filename in failed_files:
|
|
||||||
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
|
|
||||||
|
|
||||||
any_found = False
|
|
||||||
last_exc = None
|
|
||||||
for resolver in self.resolvers:
|
|
||||||
possible_paths = resolver(filename)
|
|
||||||
|
|
||||||
if possible_paths:
|
|
||||||
for path in possible_paths:
|
|
||||||
any_found = True
|
|
||||||
try:
|
|
||||||
return self.loader.load(path, offset, length)
|
|
||||||
|
|
||||||
except Exception as ue:
|
|
||||||
last_exc = ue
|
|
||||||
print last_exc
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Unsuccessful if reached here
|
|
||||||
if failed_files:
|
|
||||||
failed_files.append(filename)
|
|
||||||
|
|
||||||
if not any_found:
|
|
||||||
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
|
|
||||||
else:
|
|
||||||
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_headers_and_payload(self, cdx, wbrequest, cdx_reader, failed_files):
|
|
||||||
has_curr = (cdx['filename'] != '-')
|
|
||||||
has_orig = (cdx.get('orig.filename','-') != '-')
|
|
||||||
|
|
||||||
# load headers record from cdx['filename'] unless it is '-' (rare)
|
|
||||||
headers_record = self._load(cdx, False, failed_files) if has_curr else None
|
|
||||||
|
|
||||||
# two index lookups
|
|
||||||
# Case 1: if mimetype is still warc/revisit
|
|
||||||
if cdx['mimetype'] == 'warc/revisit' and headers_record:
|
|
||||||
payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files)
|
|
||||||
|
|
||||||
# single lookup cases
|
|
||||||
# case 2: non-revisit
|
|
||||||
elif (has_curr and not has_orig):
|
|
||||||
payload_record = headers_record
|
|
||||||
|
|
||||||
# case 3: identical url revisit, load payload from orig.filename
|
|
||||||
elif (has_orig):
|
|
||||||
payload_record = self._load(cdx, True, failed_files)
|
|
||||||
|
|
||||||
# special case: set header to payload if old-style revisit with missing header
|
|
||||||
if not headers_record:
|
|
||||||
headers_record = payload_record
|
|
||||||
elif headers_record != payload_record:
|
|
||||||
# close remainder of stream as this record only used for (already parsed) headers
|
|
||||||
headers_record.stream.close()
|
|
||||||
|
|
||||||
# special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit
|
|
||||||
if not headers_record.status_headers.headers:
|
|
||||||
headers_record = payload_record
|
|
||||||
|
|
||||||
|
|
||||||
if not headers_record or not payload_record:
|
|
||||||
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
|
|
||||||
|
|
||||||
|
|
||||||
#response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream))
|
|
||||||
#response._stream = payload_record.stream
|
|
||||||
return (cdx, headers_record.status_headers, payload_record.stream)
|
|
||||||
|
|
||||||
|
|
||||||
# done here! just return response
|
|
||||||
# subclasses make override to do additional processing
|
|
||||||
def make_response(self, wbrequest, cdx, status_headers, stream):
|
|
||||||
return self.create_stream_response(status_headers, stream)
|
|
||||||
|
|
||||||
|
|
||||||
# create response from headers and wrapping stream in generator
|
|
||||||
def create_stream_response(self, status_headers, stream):
|
|
||||||
return WbResponse(status_headers, self.create_stream_gen(stream))
|
|
||||||
|
|
||||||
|
|
||||||
# Handle the case where a duplicate of a capture with same digest exists at a different url
|
|
||||||
# Must query the index at that url filtering by matching digest
|
|
||||||
# Raise exception if no matches found
|
|
||||||
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files):
|
|
||||||
ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI')
|
|
||||||
|
|
||||||
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
|
|
||||||
if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')):
|
|
||||||
raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))
|
|
||||||
|
|
||||||
ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date')
|
|
||||||
|
|
||||||
if not ref_target_date:
|
|
||||||
ref_target_date = cdx['timestamp']
|
|
||||||
else:
|
|
||||||
ref_target_date = utils.iso_date_to_timestamp(ref_target_date)
|
|
||||||
|
|
||||||
# clone WbRequest
|
|
||||||
orig_wbreq = copy.copy(wbrequest)
|
|
||||||
orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url)
|
|
||||||
|
|
||||||
orig_wbreq.wb_url.url = ref_target_uri
|
|
||||||
orig_wbreq.wb_url.timestamp = ref_target_date
|
|
||||||
|
|
||||||
# Must also match digest
|
|
||||||
orig_wbreq.query_filter.append('digest:' + cdx['digest'])
|
|
||||||
|
|
||||||
orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
|
|
||||||
|
|
||||||
for cdx in orig_cdx_lines:
|
|
||||||
try:
|
|
||||||
#cdx = cdx_reader.CDXCaptureResult(cdx)
|
|
||||||
#print cdx
|
|
||||||
payload_record = self._load(cdx, False, failed_files)
|
|
||||||
return payload_record
|
|
||||||
|
|
||||||
except wbexceptions.CaptureException as e:
|
|
||||||
pass
|
|
||||||
|
|
||||||
raise wbexceptions.CaptureException('Original for revisit could not be loaded')
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_full(self, filename):
|
|
||||||
# Attempt to resolve cdx file to full path
|
|
||||||
full_url = None
|
|
||||||
for resolver in self.resolvers:
|
|
||||||
full_url = resolver(filename)
|
|
||||||
if full_url:
|
|
||||||
return full_url
|
|
||||||
|
|
||||||
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
|
|
||||||
|
|
||||||
|
|
||||||
# Create a generator reading from a stream, with optional rewriting and final read call
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_stream_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
|
def stream_to_iter(stream):
|
||||||
try:
|
try:
|
||||||
buff = first_buff if first_buff else stream.read()
|
buff = stream.read()
|
||||||
while buff:
|
while buff:
|
||||||
if rewrite_func:
|
|
||||||
buff = rewrite_func(buff)
|
|
||||||
yield buff
|
yield buff
|
||||||
buff = stream.read()
|
buff = stream.read()
|
||||||
|
|
||||||
# For adding a tail/handling final buffer
|
|
||||||
if final_read_func:
|
|
||||||
buff = final_read_func()
|
|
||||||
if buff:
|
|
||||||
yield buff
|
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
stream.close()
|
stream.close()
|
||||||
|
|
||||||
|
def sanitize_content(self, status_headers, stream):
|
||||||
|
# remove transfer encoding chunked and wrap in a dechunking stream
|
||||||
|
if (status_headers.remove_header('transfer-encoding')):
|
||||||
|
stream = ChunkedDataReader(stream)
|
||||||
|
|
||||||
def __str__(self):
|
return (status_headers, stream)
|
||||||
return 'find archive files from ' + str(self.resolvers)
|
|
||||||
|
|
||||||
#=================================================================
class RewritingReplayView(ReplayView):

    def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None):
        ReplayView.__init__(self, resolvers, loader, reporter)
        self.head_insert_view = head_insert_view
        self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
        self.redir_to_exact = redir_to_exact

        # buffer or stream rewritten response
        self.buffer_response = buffer_response


    def _text_content_type(self, content_type):
        for ctype, mimelist in self.REWRITE_TYPES.iteritems():
            if any ((mime in content_type) for mime in mimelist):
                return ctype

        return None


    def make_response(self, wbrequest, cdx, status_headers, stream):
        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        self._redirect_if_needed(wbrequest, cdx)
    def rewrite_content(self, wbrequest, cdx, status_headers, stream):
        urlrewriter = wbrequest.urlrewriter

        rewritten_headers = self.header_rewriter.rewrite(status_headers, urlrewriter)

        # de_chunking in case chunk encoding is broken
        # TODO: investigate further
        de_chunk = False

        # handle transfer-encoding: chunked
        if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
            stream = archiveloader.ChunkedLineReader(stream)
            de_chunk = True

        # transparent, though still may need to dechunk
        if wbrequest.wb_url.mod == 'id_':
            if de_chunk:
                status_headers.remove_header('transfer-encoding')

            return self.create_stream_response(status_headers, stream)

        # non-text content type, just send through with rewritten headers
        # but may need to dechunk
        if rewritten_headers.text_type is None:
            status_headers = rewritten_headers.status_headers
            return self.create_stream_response(status_headers, stream)

        # Handle text rewriting

        # special case -- need to ungzip the body
        if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
            stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())

        # TODO: is this right?
        if rewritten_headers.charset:
            encoding = rewritten_headers.charset
            first_buff = None
        else:
            (encoding, first_buff) = self._detect_charset(stream)

        # if chardet thinks its ascii, use utf-8
        if encoding == 'ascii':
            #encoding = None
            encoding = 'utf-8'

        # Buffering response for html, streaming for others?
        #if rewritten_headers.text_type == 'html':
        #    return self._rewrite_html(encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
        #else:
        #    return self._rewrite_other(rewritten_headers.text_type, encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)

        text_type = rewritten_headers.text_type
        status_headers = rewritten_headers.status_headers

        if text_type == 'html':
            head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None
            rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str)
        elif text_type == 'css':
            rewriter = regex_rewriters.CSSRewriter(urlrewriter)
        elif text_type == 'js':
            rewriter = regex_rewriters.JSRewriter(urlrewriter)
        elif text_type == 'xml':
            rewriter = regex_rewriters.XMLRewriter(urlrewriter)
        else:
            raise Exception('Unknown Text Type for Rewrite: ' + text_type)

        # Create generator for response
        response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)

        if self.buffer_response:
            return self._create_buffer_response(status_headers, response_gen)
        else:
            return WbResponse(status_headers, value = response_gen)

    def rewrite_content(self, wbrequest, cdx, status_headers, stream):
        urlrewriter = wbrequest.urlrewriter

        (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream)

        # no rewriting needed!
        if rewritten_headers.text_type is None:
            response_iter = self.stream_to_iter(stream)
            return WbResponse(rewritten_headers.status_headers, response_iter)

        # do head insert
        if self.head_insert_view:
            head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx)
        else:
            head_insert_str = None

        (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str)

        if self.buffer_response:
            if wbrequest.wb_url.mod == 'id_':
                status_headers.remove_header('content-length')

            return self.buffered_response(status_headers, response_gen)

        return WbResponse(status_headers, response_gen)
    # Buffer rewrite generator and return a response from a string
    # Buffer rewrite iterator and return a response from a string
    def _create_buffer_response(self, status_headers, generator):
    def buffered_response(self, status_headers, iterator):
        out = StringIO.StringIO()

        try:
            for buff in generator:
            for buff in iterator:
                out.write(buff)

        finally:
@ -355,53 +137,9 @@ class RewritingReplayView(ReplayView):

        return WbResponse(status_headers, value = [content])

    # Create rewrite response from record (no Content-Length), may even be chunked by front-end
    def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None):
        def do_rewrite(buff):
            if encoding:
                buff = self._decode_buff(buff, stream, encoding)

            buff = rewriter.rewrite(buff)

            if encoding:
                buff = buff.encode(encoding)

            return buff

        def do_finish():
            return rewriter.close()

        return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)


    def _decode_buff(self, buff, stream, encoding):
        try:
            buff = buff.decode(encoding)
        except UnicodeDecodeError, e:
            # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
            for i in range(3):
                buff += stream.read(1)
                try:
                    buff = buff.decode(encoding)
                    break
                except UnicodeDecodeError:
                    pass
            else:
                raise

        return buff


    def _detect_charset(self, stream):
        buff = stream.read(8192)
        result = chardet.detect(buff)
        print "chardet result: " + str(result)
        return (result['encoding'], buff)


    def _redirect_if_needed(self, wbrequest, cdx):
        is_proxy = wbrequest.is_proxy
        if self.redir_to_exact and not is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
        if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
            new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
            raise wbexceptions.InternalRedirect(new_url)
pywb/rewrite/README.md (new file)
@ -0,0 +1,47 @@
## PyWb Rewrite v0.2

[](https://travis-ci.org/ikreymer/pywb_rewrite)

This package includes the content rewriting component of the pywb wayback tool suite.

This package applies standard content rewriting, in the form of url rewriting, for
HTTP headers, html, css, js and xml content.

Additional domain-specific rewriting is planned, especially for JS, to allow for proper
replay of difficult pages.


### Command-Line Rewriter

To enable easier testing of rewriting, this package includes a command-line rewriter
which will fetch a live url and apply the registered rewriting rules to that url.

After installing with:

`pip install -r requirements.txt`

Run:

`python ./pywb_rewrite/rewrite_live.py http://example.com`

To specify a custom timestamp and prefix:

```
python ./pywb_rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html
```

This will print to stdout the content of `http://example.com` with all urls rewritten relative to
`/mycoll/20141026000102/http://mysite.example.com/path.html`.

Headers are also rewritten. For further details, consult the `get_rewritten` function in
[pywb_rewrite/rewrite_live.py](pywb_rewrite/rewrite_live.py).


### Tests

Rewriting doctests as well as live rewriting tests (subject to change) are provided.
To run the full test suite: `python run-tests.py`
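A minimal sketch of the same rewriting invoked programmatically (the timestamp, prefix, and target url are illustrative, mirroring the command-line example above):

```python
from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.url_rewriter import UrlRewriter

# rewrite everything relative to an (illustrative) wayback path and collection prefix
urlrewriter = UrlRewriter('20141026000102/http://mysite.example.com/path.html', '/mycoll/')

# fetch the live page and apply header + content rewriting
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)

print buff
```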
@ -1,4 +1,4 @@
from wbrequestresponse import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders

#=================================================================
class RewrittenStatusAndHeaders:
@ -14,37 +14,6 @@ class RewrittenStatusAndHeaders:

#=================================================================
class HeaderRewriter:
    """
    # Text with charset
    >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
    {'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
      ('X-Archive-Orig-Content-Length', '5'),
      ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}

    # Redirect
    >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
    {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
      ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}

    # gzip
    >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
    {'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
      ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}

    # Binary
    >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
    {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
      ('Content-Type', 'image/png'),
      ('X-Archive-Orig-Cookie', 'blah'),
      ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}

    Removing Transfer-Encoding always, Was:
      ('Content-Encoding', 'gzip'),
      ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}

    """

    REWRITE_TYPES = {
        'html': ['text/html', 'application/xhtml'],
        'css': ['text/css'],
@ -122,20 +91,3 @@ class HeaderRewriter:
        return (new_headers, removed_header_dict)

import utils
if __name__ == "__main__" or utils.enable_doctests():
    import os
    import pprint
    import url_rewriter

    urlrewriter = url_rewriter.UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

    headerrewriter = HeaderRewriter()

    def test_rewrite(headers, status = '200 OK'):
        rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
        return vars(rewritten)

    import doctest
    doctest.testmod()
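For orientation, a short sketch of `HeaderRewriter` usage, assembled from the doctests above (the wayback path and prefix are illustrative):

```python
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders

urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

# rewrite the header block of a text/html response
rewritten = HeaderRewriter().rewrite(
    StatusAndHeaders('200 OK', [('Content-Type', 'text/html;charset=UTF-8')]),
    urlrewriter)

print rewritten.text_type, rewritten.charset
# html utf-8
```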
@ -12,75 +12,8 @@ from regex_rewriters import JSRewriter, CSSRewriter
# HTMLRewriter -- html parser for custom rewriting, also handlers for script and css
#=================================================================
class HTMLRewriter(HTMLParser):
    r"""
    >>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

    >>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
    <body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>

    >>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
    <body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>

    >>> parse('<input "selected"><img src></div>')
    <input "selected"=""><img src=""></div>

    >>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
    <html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>

    # HTML Entities
    >>> parse('<a href="">› ></div>')
    <a href="">› ></div>

    # Don't rewrite anchors
    >>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
    <HTML><a href="#abc">Text</a></html>

    # Unicode
    >>> parse('<a href="http://испытание.испытание/">испытание</a>')
    <a href="/web/20131226101010/http://испытание.испытание/">испытание</a>

    # Meta tag
    >>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
    <meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">

    >>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
    <meta http-equiv="Content-type" content="text/html; charset=utf-8"/>

    >>> parse('<META http-equiv="refresh" content>')
    <meta http-equiv="refresh" content="">

    # Script tag
    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>

    # Unterminated script tag auto-terminate
    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>

    >>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
    <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>

    >>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
    <div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>

    >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
    <style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>

    # Unterminated style tag auto-terminate
    >>> parse('<style>@import url(styles.css)')
    <style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>

    # Head Insertion
    >>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
    <html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>

    >>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
    /* Insert */<body><div>SomeTest</div>

    >>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
    <link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>

    """

    """
    HTML-Parsing Rewriter
    """

    REWRITE_TAGS = {
@ -307,16 +240,4 @@ class HTMLRewriter(HTMLParser):
        self.out.write(']>')

import utils
if __name__ == "__main__" or utils.enable_doctests():

    url_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

    def parse(data, head_insert = None):
        parser = HTMLRewriter(url_rewriter, head_insert = head_insert)
        print parser.rewrite(data) + parser.close()

    import doctest
    doctest.testmod()
pywb/rewrite/regex_rewriters.py (new file)
@ -0,0 +1,156 @@
import re
import sys
import itertools

from url_rewriter import UrlRewriter

#=================================================================
class RegexRewriter(object):
    @staticmethod
    def comment_out(string):
        return '/*' + string + '*/'

    @staticmethod
    def remove_https(string):
        return string.replace("https", "http")

    @staticmethod
    def add_prefix(prefix):
        return lambda string: prefix + string

    @staticmethod
    def archival_rewrite(rewriter):
        return lambda x: rewriter.rewrite(x)

    @staticmethod
    def replacer(string):
        return lambda x: string

    HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'


    DEFAULT_OP = add_prefix


    def __init__(self, rules):
        #rules = self.create_rules(http_prefix)

        # Build regexstr, concatenating regex list
        regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])

        # ensure it's not middle of a word, wrap in non-capture group
        regex_str = '(?<!\w)(?:' + regex_str + ')'

        self.regex = re.compile(regex_str, re.M)
        self.rules = rules

    def filter(self, m):
        return True

    def rewrite(self, string):
        return self.regex.sub(lambda x: self.replace(x), string)

    def close(self):
        return ''

    def replace(self, m):
        i = 0
        for _, op, count in self.rules:
            i += 1

            full_m = i
            while count > 0:
                i += 1
                count -= 1

            if not m.group(i):
                continue

            # Optional filter to skip matches
            if not self.filter(m):
                return m.group(0)

            # Custom func
            if not hasattr(op, '__call__'):
                op = RegexRewriter.DEFAULT_OP(op)

            result = op(m.group(i))

            # if extracting partial match
            if i != full_m:
                result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]

            return result


#=================================================================
class JSLinkRewriter(RegexRewriter):
    """
    JS Rewriter which rewrites absolute http://, https:// and // urls
    at the beginning of a string
    """
    JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'

    def __init__(self, rewriter, rules = []):
        rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
        super(JSLinkRewriter, self).__init__(rules)

#=================================================================
class JSLocationAndLinkRewriter(JSLinkRewriter):
    """
    JS Rewriter which also rewrites location and domain to the
    specified prefix (default: 'WB_wombat_')
    """

    def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'):
        rules = rules + [
            (r'(?<!/)\blocation\b', prefix, 0),
            (r'(?<=document\.)domain', prefix, 0),
        ]
        super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules)

#=================================================================
# Set 'default' JSRewriter
JSRewriter = JSLocationAndLinkRewriter


#=================================================================
class XMLRewriter(RegexRewriter):
    def __init__(self, rewriter, extra = []):
        rules = self._create_rules(rewriter.get_abs_url())

        RegexRewriter.__init__(self, rules)

    # custom filter to reject 'xmlns' attr
    def filter(self, m):
        attr = m.group(1)
        if attr and attr.startswith('xmlns'):
            return False

        return True

    def _create_rules(self, http_prefix):
        return [
            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
        ]

#=================================================================
class CSSRewriter(RegexRewriter):
    CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"

    def __init__(self, rewriter):
        rules = self._create_rules(rewriter)

        RegexRewriter.__init__(self, rules)


    def _create_rules(self, rewriter):
        return [
            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
        ]
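A short usage sketch of these rewriters, taken from the doctests in the test suite below (the archival prefix and timestamp are illustrative):

```python
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter

# rewrite urls relative to an (illustrative) archival prefix
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')

print JSRewriter(arcrw).rewrite('location = "http://example.com/abc.html"')
# WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"

print CSSRewriter(arcrw).rewrite("background: url('/some/path.html')")
# background: url('/web/20131010im_/http://example.com/some/path.html')
```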
pywb/rewrite/rewrite_content.py (new file)
@ -0,0 +1,151 @@
import chardet

from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriter
from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders

from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader

class RewriteContent:

    DEFAULT_CONTENT_REWRITERS = {
        'header': HeaderRewriter,
        'js': JSRewriter,
        'css': CSSRewriter,
        'xml': XMLRewriter,
        'html': HTMLRewriter
    }


    def __init__(self, rewriters = {}):
        self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())


    def rewrite_headers(self, urlrewriter, status_headers, stream):
        rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)

        # note: since chunking may be broken, approach taken here is to *always* attempt
        # to dechunk if transfer-encoding: chunked is present
        #
        # an alternative may be to serve chunked unless content rewriting is needed
        # todo: possibly revisit this approach

        if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
            stream = ChunkedDataReader(stream)

        return (rewritten_headers, stream)

    def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None):
        # see if we've already rewritten headers
        if isinstance(headers, RewrittenStatusAndHeaders):
            rewritten_headers = headers
        elif isinstance(headers, StatusAndHeaders):
            # otherwise, need to determine if rewriting is even necessary
            (rewritten_headers, stream) = self.rewrite_headers(urlrewriter, headers, stream)

        status_headers = rewritten_headers.status_headers

        # no rewriting needed here
        if rewritten_headers.text_type is None:
            gen = self.stream_to_gen(stream)
            return (status_headers, gen)

        # Handle text content rewriting
        # =========================================================================
        # special case -- need to ungzip the body
        if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
            stream = BufferedReader(stream, 'gzip')

        if rewritten_headers.charset:
            encoding = rewritten_headers.charset
            first_buff = None
        else:
            (encoding, first_buff) = self._detect_charset(stream)

        # if chardet thinks its ascii, use utf-8
        if encoding == 'ascii':
            encoding = 'utf-8'

        text_type = rewritten_headers.text_type

        rewriter_class = self.rewriters.get(text_type)
        if not rewriter_class:
            raise Exception('Unknown Text Type for Rewrite: ' + text_type)


        if text_type == 'html':
            rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str)
        else:
            rewriter = rewriter_class(urlrewriter)

        # Create rewriting generator
        gen = self._rewriting_stream_gen(rewriter, encoding, stream, first_buff)
        return (status_headers, gen)


    # Create rewrite stream, may even be chunked by front-end
    def _rewriting_stream_gen(self, rewriter, encoding, stream, first_buff = None):
        def do_rewrite(buff):
            if encoding:
                buff = self._decode_buff(buff, stream, encoding)

            buff = rewriter.rewrite(buff)

            if encoding:
                buff = buff.encode(encoding)

            return buff

        def do_finish():
            return rewriter.close()

        return self.stream_to_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)


    def _decode_buff(self, buff, stream, encoding):
        try:
            buff = buff.decode(encoding)
        except UnicodeDecodeError, e:
            # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
            for i in range(3):
                buff += stream.read(1)
                try:
                    buff = buff.decode(encoding)
                    break
                except UnicodeDecodeError:
                    pass
            else:
                raise

        return buff


    def _detect_charset(self, stream):
        buff = stream.read(8192)
        result = chardet.detect(buff)
        print "chardet result: " + str(result)
        return (result['encoding'], buff)


    # Create a generator reading from a stream, with optional rewriting and final read call
    @staticmethod
    def stream_to_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
        try:
            buff = first_buff if first_buff else stream.read()
            while buff:
                if rewrite_func:
                    buff = rewrite_func(buff)
                yield buff
                buff = stream.read()

            # For adding a tail/handling final buffer
            if final_read_func:
                buff = final_read_func()
                if buff:
                    yield buff

        finally:
            stream.close()
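A minimal sketch of driving `RewriteContent` directly against an in-memory response (the html snippet and expected output are illustrative; `rewrite_live.py` below wires the same call against the live web):

```python
import StringIO

from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders

urlrewriter = UrlRewriter('20131226101010/http://example.com/page.html', '/web/')

status_headers = StatusAndHeaders('200 OK',
                                  [('Content-Type', 'text/html; charset=utf-8')])
stream = StringIO.StringIO('<html><a href="/other.html">link</a></html>')

# returns the rewritten headers and a generator over the rewritten body
status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)

print ''.join(gen)
# expected (illustrative): href rewritten to /web/20131226101010/http://example.com/other.html
```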
pywb/rewrite/rewrite_live.py (new file)
@ -0,0 +1,68 @@
import urllib2
import os
import sys
import datetime

from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent

"""
Fetch a url from live web and apply rewriting rules
"""

#=================================================================
def get_status_and_stream(url):
    resp = urllib2.urlopen(url)

    headers = []
    for name, value in resp.info().dict.iteritems():
        headers.append((name, value))

    status_headers = StatusAndHeaders('200 OK', headers)
    stream = resp

    return (status_headers, stream)

#=================================================================
def get_rewritten(url, urlrewriter):
    (status_headers, stream) = get_status_and_stream(url)

    status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)

    buff = ''
    for x in gen:
        buff += x

    return (status_headers, buff)

#=================================================================
def main():
    if len(sys.argv) < 2:
        print 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'.format(sys.argv[0])
        exit(1)
    else:
        url = sys.argv[1]

    if len(sys.argv) >= 3:
        wburl_str = sys.argv[2]
        if wburl_str.startswith('/'):
            wburl_str = wburl_str[1:]

        prefix, wburl_str = wburl_str.split('/', 1)
        prefix = '/' + prefix + '/'
    else:
        wburl_str = datetime_to_timestamp(datetime.datetime.now()) + '/http://example.com/path/sample.html'
        prefix = '/pywb_rewrite/'

    urlrewriter = UrlRewriter(wburl_str, prefix)

    status_headers, buff = get_rewritten(url, urlrewriter)

    sys.stdout.write(buff)


#=================================================================
if __name__ == "__main__":
    main()
pywb/rewrite/test/test_rewrite.py (new file)
@ -0,0 +1,266 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

r"""

#=================================================================
# HTML Rewriting
#=================================================================

>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>

>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>

>>> parse('<input "selected"><img src></div>')
<input "selected"=""><img src=""></div>

>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>

# HTML Entities
>>> parse('<a href="">› ></div>')
<a href="">› ></div>

# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
<HTML><a href="#abc">Text</a></html>

# Unicode
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>

# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">

>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>

>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content="">

# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>

# Unterminated script tag auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>

>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>

>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>

>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>

# Unterminated style tag auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>

# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>

>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div>SomeTest</div>

>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>

#=================================================================
# Custom Regex
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'


#=================================================================
# JS Rewriting
#=================================================================

>>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\/\\/example.com/abc.html"'

>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'

>>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"

>>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'

# not rewritten -- to be handled on client side
>>> _test_js(r'location = "/abc.html"')
'WB_wombat_location = "/abc.html"'

>>> _test_js(r'location = /http:\/\/example.com/abc.html/')
'WB_wombat_location = /http:\\/\\/example.com/abc.html/'

>>> _test_js('"/location" == some_location_val; locations = location;')
'"/location" == some_location_val; locations = WB_wombat_location;'

>>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'

# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'


#=================================================================
# XML Rewriting
#=================================================================

>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'

>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'

>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'

#=================================================================
# CSS Rewriting
#=================================================================

>>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010im_/http://example.com/some/path.html')"

>>> _test_css("background: url('../path.html')")
"background: url('/web/20131010im_/http://example.com/path.html')"

>>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010im_/http://domain.com/path.html")'

>>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010im_/http://example.com/file.jpeg)'

>>> _test_css("background: url('')")
"background: url('')"

>>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'

>>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010im_/http://example.com/path.css')"

>>> _test_css("@import url('path.css')")
"@import url('/web/20131010im_/http://example.com/path.css')"

>>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010im_/http://example.com/path.css')"

>>> _test_css("@import \"path.css\"")
'@import "/web/20131010im_/http://example.com/path.css"'

>>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010im_/http://example.com/path.css"'

>>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010im_/http://example.com/url.css"'

>>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010im_/http://example.com/url.css")'

>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'

#=================================================================
# HTTP Headers Rewriting
#=================================================================

# Text with charset
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
  ('X-Archive-Orig-Content-Length', '5'),
  ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}

# Redirect
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
  ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}

# gzip
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
  ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}

# Binary
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
  ('Content-Type', 'image/png'),
  ('X-Archive-Orig-Cookie', 'blah'),
  ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}

Removing Transfer-Encoding always, Was:
  ('Content-Encoding', 'gzip'),
  ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}

"""

#=================================================================
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.header_rewriter import HeaderRewriter

from pywb.utils.statusandheaders import StatusAndHeaders


urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

def parse(data, head_insert = None):
    parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
    print parser.rewrite(data) + parser.close()

arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')


def _test_js(string, extra = []):
    return JSRewriter(arcrw, extra).rewrite(string)

def _test_xml(string):
    return XMLRewriter(arcrw).rewrite(string)

def _test_css(string):
    return CSSRewriter(arcrw).rewrite(string)

headerrewriter = HeaderRewriter()

def _test_headers(headers, status = '200 OK'):
    rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
    return vars(rewritten)


if __name__ == "__main__":
    import doctest
    doctest.testmod()
pywb/rewrite/test/test_rewrite_live.py (new file)
@ -0,0 +1,32 @@
from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.url_rewriter import UrlRewriter

# This module has some rewriting tests against the 'live web'
# As such, the content may change and the test may break

urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')


def test_example_1():
    status_headers, buff = get_rewritten('http://example.com/', urlrewriter)

    # verify header rewriting
    assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers


def test_example_2():
    status_headers, buff = get_rewritten('http://example.com/', urlrewriter)

    # verify header rewriting
    assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers

    assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff


def test_example_3():
    status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)

    assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
@ -6,43 +6,43 @@ from wburl import WbUrl

class UrlRewriter:
    """
    >>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'

    >>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
    'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'

    >>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/other.html'

    >>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'

    >>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
    '/coll/20131112im_/http://example.com/other.html'

    >>> do_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> do_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/20101226101112/http://some-other-site.com'

    >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
    '/2020/http://example.com/other.html'

    >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '')
    '2020/http://example.com/other.html'

    >>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
    '/web/20131010010203/http://example.com/file.html'

    >>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    '#anchor'

    >>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'mailto:example@example.com'

    >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
@ -62,7 +62,6 @@ class UrlRewriter:
    def __init__(self, wburl, prefix):
        self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
        self.prefix = prefix
        self.archivalurl_class = self.wburl.__class__

        #if self.prefix.endswith('/'):
        #    self.prefix = self.prefix[:-1]
@ -74,7 +73,7 @@ class UrlRewriter:

        wburl = self.wburl

        isAbs = any (url.startswith(x) for x in self.PROTOCOLS)
        isAbs = any(url.startswith(x) for x in self.PROTOCOLS)

        # Optimized rewriter for
        # -rel urls that don't start with / and don't contain ../ and no special mod
@ -117,12 +116,11 @@ class UrlRewriter:
        return url


import utils
if __name__ == "__main__" or utils.enable_doctests():
    def test_rewrite(rel_url, base_url, prefix, mod = None):
        rewriter = UrlRewriter(base_url, prefix)
        return rewriter.rewrite(rel_url, mod)

def do_rewrite(rel_url, base_url, prefix, mod = None):
    rewriter = UrlRewriter(base_url, prefix)
    return rewriter.rewrite(rel_url, mod)


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@@ -3,9 +3,38 @@
 import re
 import rfc3987

-import wbexceptions

 # WbUrl : wb archival url representation for WB
+"""
+WbUrl represents the standard wayback archival url format.
+A regular url is a subset of the WbUrl (latest replay).
+
+The WbUrl expresses the common interface for interacting
+with the wayback machine.
+
+The WbUrl may represent one of the following forms:
+
+query form: [/modifier]/[timestamp][-end_timestamp]*/<url>
+
+modifier, timestamp and end_timestamp are optional
+
+*/example.com
+20101112030201*/http://example.com
+2009-2015*/http://example.com
+/cdx/*/http://example.com
+
+url query form: used to indicate query across urls
+same as query form but with a final *
+*/example.com*
+20101112030201*/http://example.com*
+
+
+replay form:
+20101112030201/http://example.com
+20101112030201im_/http://example.com
+
+latest_replay: (no timestamp)
+http://example.com
+"""
+
 class WbUrl:
     """
@@ -38,6 +67,13 @@ class WbUrl:
     >>> repr(WbUrl('*/http://example.com/abc?def=a*'))
     "('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')"

+    >>> repr(WbUrl('2010*/http://example.com/abc?def=a'))
+    "('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')"
+
+    # timestamp range query
+    >>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a'))
+    "('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')"
+
     >>> repr(WbUrl('json/*/http://example.com/abc?def=a'))
     "('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')"

@@ -59,16 +95,16 @@ class WbUrl:
     # ======================
     >>> x = WbUrl('/#$%#/')
     Traceback (most recent call last):
-    BadUrlException: Bad Request Url: http://#$%#/
+    Exception: Bad Request Url: http://#$%#/

     >>> x = WbUrl('/http://example.com:abc/')
     Traceback (most recent call last):
-    BadUrlException: Bad Request Url: http://example.com:abc/
+    Exception: Bad Request Url: http://example.com:abc/
     """

     # Regexs
     # ======================
-    QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)\*/?(.*)$')
+    QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
     REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')

     QUERY = 'query'
@@ -85,13 +121,14 @@ class WbUrl:
         self.type = None
         self.url = ''
         self.timestamp = ''
+        self.end_timestamp = ''
         self.mod = ''

         if not any (f(url) for f in [self._init_query, self._init_replay]):
-            raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
+            raise Exception('Invalid WbUrl: ', url)

         if len(self.url) == 0:
-            raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
+            raise Exception('Invalid WbUrl: ', url)

         # protocol agnostic url -> http://
         #if self.url.startswith('//'):
@@ -105,7 +142,7 @@ class WbUrl:
         matcher = rfc3987.match(self.url.upper(), 'IRI')

         if not matcher:
-            raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
+            raise Exception('Bad Request Url: ' + self.url)

     # Match query regex
     # ======================
@@ -118,7 +155,8 @@ class WbUrl:

         self.mod = res[0]
         self.timestamp = res[1]
-        self.url = res[2]
+        self.end_timestamp = res[2]
+        self.url = res[3]
         if self.url.endswith('*'):
             self.type = self.URL_QUERY
             self.url = self.url[:-1]
@@ -151,6 +189,7 @@ class WbUrl:
         atype = overrides['type'] if 'type' in overrides else self.type
         mod = overrides['mod'] if 'mod' in overrides else self.mod
         timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp
+        end_timestamp = overrides['end_timestamp'] if 'end_timestamp' in overrides else self.end_timestamp
         url = overrides['url'] if 'url' in overrides else self.url

         if atype == self.QUERY or atype == self.URL_QUERY:
@@ -159,6 +198,8 @@ class WbUrl:
                 tsmod += mod + "/"
             if timestamp:
                 tsmod += timestamp
+            if end_timestamp:
+                tsmod += '-' + end_timestamp

             tsmod += "*/" + url
             if atype == self.URL_QUERY:
pywb/utils.py (deleted, 122 lines)
@@ -1,122 +0,0 @@
-import itertools
-import time
-import zlib
-import time
-import datetime
-import calendar
-import re
-
-def peek_iter(iterable):
-    try:
-        first = next(iterable)
-    except StopIteration:
-        return None
-
-    return itertools.chain([first], iterable)
-
-
-def split_prefix(key, prefixs):
-    for p in prefixs:
-        if key.startswith(p):
-            plen = len(p)
-            return (key[:plen], key[plen:])
-
-
-def create_decompressor():
-    return zlib.decompressobj(16 + zlib.MAX_WBITS)
-
-
-#=================================================================
-# Adapted from example at
-class PerfTimer:
-    def __init__(self, perfdict, name):
-        self.perfdict = perfdict
-        self.name = name
-
-    def __enter__(self):
-        self.start = time.clock()
-        return self
-
-    def __exit__(self, *args):
-        self.end = time.clock()
-        if self.perfdict is not None:
-            self.perfdict[self.name] = str(self.end - self.start)
-
-
-#=================================================================
-# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
-# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
-# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
-def rel_request_uri(environ, include_query=1):
-    """
-    Return the requested path, optionally including the query string
-
-    # Simple test:
-    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
-    '/web/example.com'
-
-    # Test all unecoded special chars and double-quote
-    # (double-quote must be encoded but not single quote)
-    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
-    "/web/example.com/0~!+$&'()*+,;=:%22"
-    """
-    from urllib import quote
-    url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
-    if include_query and environ.get('QUERY_STRING'):
-        url += '?' + environ['QUERY_STRING']
-
-    return url
-
-
-#=================================================================
-def unsurt(surt):
-    """
-    # Simple surt
-    >>> unsurt('com,example)/')
-    'example.com)/'
-
-    # Broken surt
-    >>> unsurt('com,example)')
-    'com,example)'
-
-    # Long surt
-    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
-    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
-    """
-
-    try:
-        index = surt.index(')/')
-        parts = surt[0:index].split(',')
-        parts.reverse()
-        host = '.'.join(parts)
-        host += surt[index:]
-        return host
-
-    except ValueError:
-        # May not be a valid surt
-        return surt
-
-
-#=================================================================
-# Support for bulk doctest testing via nose or py.test
-# nosetests --with-doctest
-# py.test --doctest_modules
-
-import sys
-is_in_testtool = any(sys.argv[0].endswith(tool) for tool in ['py.test', 'nosetests'])
-
-def enable_doctests():
-    return is_in_testtool
-
-
-def test_data_dir():
-    import os
-    return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
-
-#=================================================================
-
-if __name__ == "__main__" or enable_doctests():
-    import doctest
-    doctest.testmod()
pywb/utils/README.md (new file, 16 lines)
@@ -0,0 +1,16 @@
## PyWb Utils v0.2 ##

[Build Status](https://travis-ci.org/ikreymer/pywb_utils)

This is a standalone module containing a variety of utils used by the pywb wayback tool suite.

`python run-tests.py` will run all tests

#### Modules

[binsearch.py](pywb_utils/binsearch.py) -- Binary search implementation over text files

[loaders.py](pywb_utils/loaders.py) -- Loading abstraction for http, local file system, as well as buffered and seekable file readers

[timeutils.py](pywb_utils/timeutils.py) -- Utility functions for converting between standard datetime formats and the 14-digit timestamp
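With the split into distinct packages in this commit, the modules listed above are importable from pywb.utils, as the test files further below do; for example:

    from pywb.utils.binsearch import iter_exact
    from pywb.utils.loaders import SeekableTextFileReader
    from pywb.utils.timeutils import timestamp_to_datetime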
pywb/utils/__init__.py (new file, empty)

pywb/utils/binsearch.py (new file, 110 lines)
@@ -0,0 +1,110 @@
"""
Utility functions for performing binary search over a sorted text file
"""

from collections import deque
import itertools


#=================================================================
def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
    """
    Find offset of the line which matches a given 'key' using binary search
    If key is not found, the offset is of the line after the key

    File is subdivided into block_size (default 8192) sized blocks
    Optional compare_func may be specified
    """
    min_ = 0
    max_ = reader.getsize() / block_size

    while max_ - min_ > 1:
        mid = min_ + ((max_ - min_) / 2)
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        if compare_func(key, line) > 0:
            min_ = mid
        else:
            max_ = mid

    return min_ * block_size


#=================================================================
def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
    """
    Perform a binary search for a specified key to within a 'block_size'
    (default 8192) sized block followed by linear search
    within the block to find first matching line.

    When performing linear search, keep track of up to N previous lines before
    first matching line.
    """
    min_ = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(min_)

    if min_ > 0:
        reader.readline()  # skip partial line

    if prev_size > 1:
        prev_deque = deque(maxlen=prev_size)

    line = None

    while True:
        line = reader.readline()
        if not line:
            break
        if compare_func(line, key) >= 0:
            break

        if prev_size == 1:
            prev = line
        elif prev_size > 1:
            prev_deque.append(line)

    def gen_iter(line):
        """
        Create iterator over any previous lines to
        current matched line
        """
        if prev_size == 1:
            yield prev.rstrip()
        elif prev_size > 1:
            for i in prev_deque:
                yield i.rstrip()

        while line:
            yield line.rstrip()
            line = reader.readline()

    return gen_iter(line)


#=================================================================
def iter_prefix(reader, key):
    """
    Creates an iterator which iterates over lines that start with prefix
    'key' in a sorted text file.
    """

    return itertools.takewhile(
        lambda line: line.startswith(key),
        search(reader, key))


#=================================================================
def iter_exact(reader, key, token=' '):
    """
    Create an iterator which iterates over lines where the first field matches
    the 'key', equivalent to a prefix search for key + token.
    Default field terminator/separator is ' '
    """

    return iter_prefix(reader, key + token)
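A quick sketch of how these iterators compose with a seekable reader (the sample cdx path mirrors the tests further below and is illustrative):

    from pywb.utils.binsearch import iter_prefix
    from pywb.utils.loaders import SeekableTextFileReader

    # stream all index lines whose key starts with a surt-ordered prefix
    cdx = SeekableTextFileReader('sample_archive/cdx/iana.cdx')
    for line in iter_prefix(cdx, 'org,iana)/domains'):
        print line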
pywb/utils/bufferedreaders.py (new file, 204 lines)
@@ -0,0 +1,204 @@
import StringIO
import zlib


#=================================================================
def gzip_decompressor():
    """
    Decompressor which can decompress a gzip stream
    """
    return zlib.decompressobj(16 + zlib.MAX_WBITS)


#=================================================================
class BufferedReader(object):
    """
    A buffered line reader which wraps an existing reader.
    Read operations operate on underlying buffer, which is filled to
    block_size (1024 default)

    If an optional decompress type is specified,
    data is fed through the decompressor when read from the buffer.
    Currently supported decompression: gzip

    If decompression fails on first try, data is assumed to not be compressed
    and no exception is thrown. If a failure occurs after data has been
    partially decompressed, the exception is propagated.

    """

    DECOMPRESSORS = {'gzip': gzip_decompressor}

    def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
        self.stream = stream
        self.block_size = block_size

        if decomp_type:
            try:
                self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
            except KeyError:
                raise Exception('Decompression type not supported: ' +
                                decomp_type)
        else:
            self.decompressor = None

        self.buff = None
        self.num_read = 0
        self.max_len = max_len

    def _fillbuff(self, block_size=None):
        if not block_size:
            block_size = self.block_size

        if not self.buff or self.buff.pos >= self.buff.len:
            if self.max_len > 0:
                to_read = min(self.max_len - self.num_read, self.block_size)
            else:
                to_read = self.block_size

            data = self.stream.read(to_read)
            self._process_read(data)

    def _process_read(self, data):
        data = self._decompress(data)
        self.num_read += len(data)
        self.buff = StringIO.StringIO(data)

    def _decompress(self, data):
        if self.decompressor and data:
            try:
                data = self.decompressor.decompress(data)
            except Exception:
                # if first read attempt, assume non-gzipped stream
                if self.num_read == 0:
                    self.decompressor = None
                # otherwise (partly decompressed), something is wrong
                else:
                    raise
        return data

    def read(self, length=None):
        self._fillbuff()
        return self.buff.read(length)

    def readline(self, length=None):
        self._fillbuff()
        return self.buff.readline(length)

    def close(self):
        if self.stream:
            self.stream.close()
            self.stream = None


#=================================================================
class ChunkedDataException(Exception):
    pass


#=================================================================
class ChunkedDataReader(BufferedReader):
    r"""
    A ChunkedDataReader is a BufferedReader which also supports de-chunking
    of the data if it happens to be http 'chunk-encoded'.

    If at any point the chunked header is not available, the stream is
    assumed to not be chunked and no more dechunking occurs.

    Properly formatted chunked data:
    >>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n"));
    >>> c.read() + c.read()
    '1234'

    Non-chunked data:
    >>> ChunkedDataReader(StringIO.StringIO("xyz123!@#")).read()
    'xyz123!@#'

    Starts like chunked data, but isn't:
    >>> c = ChunkedDataReader(StringIO.StringIO("1\r\nxyz123!@#"));
    >>> c.read() + c.read()
    '1\r\nx123!@#'

    Chunked data cut off part way through:
    >>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));
    >>> c.read() + c.read()
    '123412'
    """

    all_chunks_read = False
    not_chunked = False

    # if False, we'll use best-guess fallback for parse errors
    raise_chunked_data_exceptions = False

    def _fillbuff(self, block_size=None):
        if self.not_chunked:
            return BufferedReader._fillbuff(self, block_size)

        if self.all_chunks_read:
            return

        if not self.buff or self.buff.pos >= self.buff.len:
            length_header = self.stream.readline(64)
            self._data = ''

            try:
                self._try_decode(length_header)
            except ChunkedDataException:
                if self.raise_chunked_data_exceptions:
                    raise

                # Can't parse the data as chunked.
                # It's possible that non-chunked data is served
                # with a Transfer-Encoding: chunked.
                # Treat this as non-chunk encoded from here on.
                self._process_read(length_header + self._data)
                self.not_chunked = True

    def _try_decode(self, length_header):
        # decode length header
        try:
            chunk_size = int(length_header.strip().split(';')[0], 16)
        except ValueError:
            raise ChunkedDataException("Couldn't decode length header " +
                                       length_header)

        if not chunk_size:
            # chunk_size 0 indicates end of file
            self.all_chunks_read = True
            #self._process_read('')
            return

        data_len = len(self._data)

        # read chunk
        while data_len < chunk_size:
            new_data = self.stream.read(chunk_size - data_len)

            # if we unexpectedly run out of data,
            # either raise an exception or just stop reading,
            # assuming file was cut off
            if not new_data:
                if self.raise_chunked_data_exceptions:
                    msg = 'Ran out of data before end of chunk'
                    raise ChunkedDataException(msg)
                else:
                    chunk_size = data_len
                    self.all_chunks_read = True

            self._data += new_data
            data_len = len(self._data)

        # if we successfully read a block without running out,
        # it should end in \r\n
        if not self.all_chunks_read:
            clrf = self.stream.read(2)
            if clrf != '\r\n':
                raise ChunkedDataException("Chunk terminator not found.")

        # hand to base class for further processing
        self._process_read(self._data)

if __name__ == "__main__":
    import doctest
    doctest.testmod()
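The gzip fallback described in the BufferedReader docstring can be seen directly; a minimal sketch (gzip wrapper produced via the same 16 + MAX_WBITS wbits convention the module itself uses):

    import StringIO
    import zlib
    from pywb.utils.bufferedreaders import BufferedReader

    # compress some data with a gzip header (wbits = 16 + MAX_WBITS)
    compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    gzipped = compressor.compress('abc\ndef\n') + compressor.flush()

    # a gzip stream is decompressed transparently...
    print BufferedReader(StringIO.StringIO(gzipped), decomp_type='gzip').readline()
    # ...and a plain stream with decomp_type='gzip' falls back to raw bytes
    print BufferedReader(StringIO.StringIO('abc\ndef\n'), decomp_type='gzip').readline()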
pywb/utils/loaders.py (new file, 152 lines)
@@ -0,0 +1,152 @@
"""
This module provides loaders for the local file system and over http,
for local and remote access
"""

import os
import hmac
import urllib2
import time


#=================================================================
# load a reader from http
#=================================================================
class HttpLoader(object):
    """
    Load a file-like reader over http using range requests
    and an optional cookie created via a cookie_maker
    """
    def __init__(self, cookie_maker=None):
        self.cookie_maker = cookie_maker

    def load(self, url, offset, length):
        if length > 0:
            range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
        else:
            range_header = 'bytes={0}-'.format(offset)

        headers = {}
        headers['Range'] = range_header

        if self.cookie_maker:
            headers['Cookie'] = self.cookie_maker.make()

        request = urllib2.Request(url, headers=headers)
        return urllib2.urlopen(request)


#=================================================================
# Signed Cookie-Maker
#=================================================================

class HMACCookieMaker(object):
    """
    Utility class to produce signed HMAC digest cookies
    to be used with each http request
    """
    def __init__(self, key, name, duration=10):
        self.key = key
        self.name = name
        # duration in seconds
        self.duration = duration

    def make(self, extra_id=''):
        expire = str(long(time.time() + self.duration))

        if extra_id:
            msg = extra_id + '-' + expire
        else:
            msg = expire

        hmacdigest = hmac.new(self.key, msg)
        hexdigest = hmacdigest.hexdigest()

        if extra_id:
            cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id,
                                              expire, hexdigest)
        else:
            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)

        return cookie


#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
    """
    Load a file-like reader from the local file system
    """

    def load(self, url, offset, length):
        if url.startswith('file://'):
            url = url[len('file://'):]

        afile = open(url, 'rb')
        afile.seek(offset)

        if length > 0:
            return LimitReader(afile, length)


#=================================================================
# Limit Reader
#=================================================================
class LimitReader(object):
    """
    A reader which will not read more than specified limit
    """

    def __init__(self, stream, limit):
        self.stream = stream
        self.limit = limit

        if not self.limit:
            self.limit = 1

    def read(self, length=None):
        length = min(length, self.limit) if length else self.limit
        buff = self.stream.read(length)
        self.limit -= len(buff)
        return buff

    def readline(self, length=None):
        length = min(length, self.limit) if length else self.limit
        buff = self.stream.readline(length)
        self.limit -= len(buff)
        return buff

    def close(self):
        self.stream.close()


#=================================================================
# Local text file with known size -- used for binsearch
#=================================================================
class SeekableTextFileReader(object):
    """
    A very simple file-like object wrapper that knows its total size,
    via getsize()
    Supports seek() operation.
    Assumed to be a text file. Used for binsearch.
    """
    def __init__(self, filename):
        self.fh = open(filename, 'rb')
        self.filename = filename
        self.size = os.path.getsize(filename)

    def getsize(self):
        return self.size

    def read(self):
        return self.fh.read()

    def readline(self):
        return self.fh.readline()

    def seek(self, offset):
        return self.fh.seek(offset)

    def close(self):
        return self.fh.close()
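A usage sketch for the two loader flavors (the local path is illustrative; FileLoader wraps the handle in a LimitReader only when a positive length is given):

    from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker

    # read 100 bytes starting at offset 0 from a local file
    reader = FileLoader().load('sample_archive/cdx/iana.cdx', 0, 100)
    print len(reader.read())   # at most 100

    # the http loader issues a Range request, optionally with a signed cookie
    http = HttpLoader(cookie_maker=HMACCookieMaker('secret-key', 'cookie-name',
                                                   duration=30))
    #remote = http.load('http://example.com/path/file.warc.gz', 120, 512)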
pywb/utils/statusandheaders.py (new file, 107 lines)
@@ -0,0 +1,107 @@
"""
Representation and parsing of HTTP-style status + headers
"""

import pprint


#=================================================================
class StatusAndHeaders(object):
    """
    Representation of parsed http-style status line and headers
    Status line is the first line of a request/response
    Headers is a list of (name, value) tuples
    An optional protocol which appears on first line may be specified
    """
    def __init__(self, statusline, headers, protocol=''):
        self.statusline = statusline
        self.headers = headers
        self.protocol = protocol

    def get_header(self, name):
        """
        return header (name, value)
        if found
        """
        name_lower = name.lower()
        for value in self.headers:
            if value[0].lower() == name_lower:
                return value[1]

    def remove_header(self, name):
        """
        remove header (case-insensitive)
        return True if header removed, False otherwise
        """
        name_lower = name.lower()
        for index in xrange(len(self.headers) - 1, -1, -1):
            if self.headers[index][0].lower() == name_lower:
                del self.headers[index]
                return True

        return False

    def __repr__(self):
        headers_str = pprint.pformat(self.headers, indent=2)
        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(self.protocol, self.statusline, headers_str)

    def __eq__(self, other):
        return (self.statusline == other.statusline and
                self.headers == other.headers and
                self.protocol == other.protocol)


#=================================================================
class StatusAndHeadersParser(object):
    """
    Parser which consumes a stream supporting readline() to read
    status and headers and return a StatusAndHeaders object
    """
    def __init__(self, statuslist):
        self.statuslist = statuslist

    def parse(self, stream):
        """
        parse stream for status line and headers
        return a StatusAndHeaders object
        """
        statusline = stream.readline().rstrip()

        protocol_status = self.split_prefix(statusline, self.statuslist)

        if not protocol_status:
            msg = 'Expected Status Line - Found: ' + statusline
            raise StatusAndHeadersParserException(msg)

        headers = []

        line = stream.readline().rstrip()
        while line and line != '\r\n':
            name, value = line.split(':', 1)
            header = (name, value.strip())
            headers.append(header)
            line = stream.readline().rstrip()

        return StatusAndHeaders(statusline=protocol_status[1].strip(),
                                headers=headers,
                                protocol=protocol_status[0])

    @staticmethod
    def split_prefix(key, prefixs):
        """
        split key string into prefix and remainder
        for first matching prefix from a list
        """
        for prefix in prefixs:
            if key.startswith(prefix):
                plen = len(prefix)
                return (key[:plen], key[plen:])


#=================================================================
class StatusAndHeadersParserException(Exception):
    """
    status + headers parsing exception
    """
    pass
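A short sketch of the parser on an in-memory http response, following the class definitions above:

    import StringIO
    from pywb.utils.statusandheaders import StatusAndHeadersParser

    raw = ('HTTP/1.0 200 OK\r\n'
           'Content-Type: text/html\r\n'
           'Content-Length: 5\r\n\r\nhello')

    parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
    status_headers = parser.parse(StringIO.StringIO(raw))

    print status_headers.statusline                    # '200 OK'
    print status_headers.get_header('Content-Type')    # 'text/html'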
pywb/utils/test/binsearch_test.py (new file, 52 lines)
@@ -0,0 +1,52 @@
#=================================================================
"""
# binsearch tests

# Prefix Search
>>> print_binsearch_results('org,iana)/domains/root', iter_prefix)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

>>> print_binsearch_results('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz

>>> print_binsearch_results('org,iana)/', iter_exact)
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz

>>> print_binsearch_results('org,iana)/domains/root/db', iter_exact)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz

# Exact Search
>>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact)

>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
"""


#=================================================================
import os
from pywb.utils.binsearch import iter_prefix, iter_exact
from pywb.utils.loaders import SeekableTextFileReader

from pywb import get_test_dir

#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'

def print_binsearch_results(key, iter_func):
    cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')

    for line in iter_func(cdx, key):
        print line


if __name__ == "__main__":
    import doctest
    doctest.testmod()
pywb/utils/test/loaders_test.py (new file, 69 lines)
@@ -0,0 +1,69 @@
#=================================================================
"""
# LimitReader Tests
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'

>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'

>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'

# FileLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100

# SeekableTextFileReader Test
>>> sr = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
>>> sr.getsize()
30399

>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'

#BufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'

#BufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'

>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
"""


#=================================================================
import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader

from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'


def read_multiple(reader, inc_reads):
    result = None
    for x in inc_reads:
        result = reader.read(x)
    return result


def seek_read_full(seekable_reader, offset):
    seekable_reader.seek(offset)
    seekable_reader.readline()  #skip
    return seekable_reader.readline()


if __name__ == "__main__":
    import doctest
    doctest.testmod()
pywb/utils/timeutils.py
@@ -1,20 +1,25 @@
+"""
+utility functions for converting between
+datetime, iso date and 14-digit timestamp
+"""
+
 import re
 import time
 import datetime
 import calendar
+from itertools import imap
+
 #=================================================================
 # str <-> datetime conversion
 #=================================================================
+
-DATE_TIMESPLIT = re.compile('[^\d]')
+DATE_TIMESPLIT = re.compile(r'[^\d]')
+
 TIMESTAMP_14 = '%Y%m%d%H%M%S'
+
 PAD_STAMP_END = '29991231235959'
+
+
 def iso_date_to_datetime(string):
     """
     >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
@@ -28,16 +33,18 @@ def iso_date_to_datetime(string):
     if nums[-1] == '':
         nums = nums[:-1]

-    dt = datetime.datetime(*map(int, nums))
-    return dt
+    the_datetime = datetime.datetime(*imap(int, nums))
+    return the_datetime

-def datetime_to_timestamp(dt):
+
+def datetime_to_timestamp(the_datetime):
     """
     >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
     '20131226101112'
     """

-    return dt.strftime(TIMESTAMP_14)
+    return the_datetime.strftime(TIMESTAMP_14)


 def iso_date_to_timestamp(string):
     """
@@ -52,7 +59,7 @@ def iso_date_to_timestamp(string):


 # default pad is end of range for compatibility
-def pad_timestamp(string, pad_str = PAD_STAMP_END):
+def pad_timestamp(string, pad_str=PAD_STAMP_END):
     """
     >>> pad_timestamp('20')
     '20991231235959'
@@ -76,10 +83,12 @@ def pad_timestamp(string, pad_str = PAD_STAMP_END):
 def timestamp_to_datetime(string):
     """
     >>> timestamp_to_datetime('20131226095010')
-    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
+    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
+tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)

     >>> timestamp_to_datetime('2014')
-    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
+    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
+tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
     """

     # Default pad to end of range for compatibility
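A sketch of the conversions above, with expected values taken from the doctests and the padding convention (pad to end of range):

    import datetime
    from pywb.utils.timeutils import (iso_date_to_timestamp, pad_timestamp,
                                      datetime_to_timestamp)

    print iso_date_to_timestamp('2013-12-26T10:11:12Z')   # '20131226101112'
    print pad_timestamp('2014')                           # '20141231235959'
    print datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    # '20131226101112'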
@@ -1,4 +1,4 @@
-import cdxserver.timeutils as timeutils
+import pywb.utils.timeutils as timeutils

 import wbrequestresponse
 import wbexceptions
pywb/warc/README.md (new file, 22 lines)
@@ -0,0 +1,22 @@
## PyWb Warc v0.2

[Build Status](https://travis-ci.org/ikreymer/pywb_warc)

This is the WARC/ARC record loading component of the pywb wayback tool suite.


This package provides the following facilities:

* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers

* Resolve 'revisit' records from provided index to find a full record with headers and payload content

* Load WARC and ARC records either locally or via http using http 1.1 range requests


### Tests

This package will include a test suite for different WARC and ARC loading formats.

To run: `python run-tests.py`
pywb/warc/__init__.py (new file, empty)
pywb/warc/pathresolvers.py
@@ -1,13 +1,27 @@
 import redis
-import binsearch.binsearch
+
+from pywb.utils.binsearch import iter_exact
+from pywb.utils.loaders import SeekableTextFileReader
+
 import urlparse
 import os
 import logging

-#======================================
-# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
-#======================================
+"""
+The purpose of this module is to 'resolve' a warc/arc filename,
+often found in a CDX file, to a full loadable url.
+
+Supported resolvers are: url prefix, path index lookup and redis
+
+make_best_resolver() attempts to guess the resolver method for given uri
+
+"""
+
+
+#=================================================================
+# PrefixResolver - convert cdx file entry to url with prefix
+# if url contains specified string
+#=================================================================
 class PrefixResolver:
     def __init__(self, prefix, contains):
         self.prefix = prefix
@@ -18,14 +32,15 @@ class PrefixResolver:

     def __repr__(self):
         if self.contains:
-            return "PrefixResolver('{0}', contains = '{1}')".format(self.prefix, self.contains)
+            return ("PrefixResolver('{0}', contains = '{1}')"
+                    .format(self.prefix, self.contains))
         else:
             return "PrefixResolver('{0}')".format(self.prefix)


-#======================================
+#=================================================================
 class RedisResolver:
-    def __init__(self, redis_url, key_prefix = None):
+    def __init__(self, redis_url, key_prefix=None):
         self.redis_url = redis_url
         self.key_prefix = key_prefix if key_prefix else 'w:'
         self.redis = redis.StrictRedis.from_url(redis_url)
@@ -42,14 +57,14 @@ class RedisResolver:
         return "RedisResolver('{0}')".format(self.redis_url)


-#======================================
+#=================================================================
 class PathIndexResolver:
     def __init__(self, pathindex_file):
         self.pathindex_file = pathindex_file
-        self.reader = binsearch.binsearch.FileReader(pathindex_file)
+        self.reader = SeekableTextFileReader(pathindex_file)

     def __call__(self, filename):
-        result = binsearch.binsearch.iter_exact(self.reader, filename, '\t')
+        result = iter_exact(self.reader, filename, '\t')

         def gen_list(result):
             for pathline in result:
@@ -63,6 +78,7 @@ class PathIndexResolver:
         return "PathIndexResolver('{0}')".format(self.pathindex_file)


+#=================================================================
 #TODO: more options (remote files, contains param, etc..)
 # find best resolver given the path
 def make_best_resolver(param):
@@ -80,11 +96,14 @@ def make_best_resolver(param):
     RedisResolver('redis://myhost.example.com:1234/1')

     # a file
-    >>> class_name(make_best_resolver('file://' + os.path.realpath(__file__)))
+    >>> r = make_best_resolver('file://' + os.path.realpath(__file__))
+    >>> r.__class__.__name__
     'PathIndexResolver'

     # a dir
-    >>> class_name(make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__))))
+    >>> path = os.path.realpath(__file__)
+    >>> r = make_best_resolver('file://' + os.path.dirname(path))
+    >>> r.__class__.__name__
     'PrefixResolver'

     """
@@ -99,27 +118,29 @@ def make_best_resolver(param):
     url_parts = urlparse.urlsplit(path)

     if url_parts.scheme == 'redis':
-        logging.info('Adding Redis Index: ' + path)
+        logging.debug('Adding Redis Index: ' + path)
         return RedisResolver(path, arg)

     if url_parts.scheme == 'file':
         path = url_parts.path

         if os.path.isfile(path):
-            logging.info('Adding Path Index: ' + path)
+            logging.debug('Adding Path Index: ' + path)
             return PathIndexResolver(path)

         # non-file paths always treated as prefix for now
         else:
-            logging.info('Adding Archive Path Source: ' + path)
+            logging.debug('Adding Archive Path Source: ' + path)
             return PrefixResolver(path, arg)


 #=================================================================
 def make_best_resolvers(paths):
     """
-    >>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1'])
-    [PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')]
+    >>> r = make_best_resolvers(['http://example.com/warcs/',\
+                                 'redis://example.com:1234/1'])
+    >>> map(lambda x: x.__class__.__name__, r)
+    ['PrefixResolver', 'RedisResolver']
     """
     if hasattr(paths, '__iter__'):
         return map(make_best_resolver, paths)
@@ -127,13 +148,7 @@ def make_best_resolvers(paths):
     return [make_best_resolver(paths)]


-import utils
 #=================================================================
-if __name__ == "__main__" or utils.enable_doctests():
+if __name__ == "__main__":

-    def class_name(obj):
-        return obj.__class__.__name__
-
     import doctest
     doctest.testmod()
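A sketch of resolver selection, mirroring the doctests above (the module name pywb.warc.pathresolvers is assumed from the import in resolvingloader.py below):

    from pywb.warc.pathresolvers import make_best_resolvers

    # guess a resolver type per path: http prefix, redis url, or path-index file
    resolvers = make_best_resolvers(['http://example.com/warcs/',
                                     'redis://example.com:1234/1'])

    # each resolver maps a bare warc filename to candidate full paths
    for resolver in resolvers:
        print resolver.__class__.__name__   # PrefixResolver, RedisResolver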
pywb/warc/recordloader.py (new file, 161 lines)
@@ -0,0 +1,161 @@
import itertools
import urlparse
import collections

from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser

from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader

#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
                                       'type, rec_headers, ' +
                                       'stream, status_headers')


#=================================================================
class ArchiveLoadFailed(Exception):
    def __init__(self, reason, filename=''):
        super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
        #self.filename = filename
        #self.reason = reason

    def status(self):
        return '503 Service Unavailable'


#=================================================================
class ArcWarcRecordLoader:
    # Standard ARC headers
    ARC_HEADERS = ["uri", "ip-address", "creation-date",
                   "content-type", "length"]

    # Since loading a range request,
    # can only determine gzip-ness based on file extension
    # (BufferedReader will however default to non-gzip if
    # decompression fails)
    FORMAT_MAP = {
        '.warc.gz': ('warc', True),
        '.arc.gz': ('arc', True),
        '.warc': ('warc', False),
        '.arc': ('arc', False),
    }

    @staticmethod
    def create_default_loaders(cookie_maker=None):
        http = HttpLoader(cookie_maker)
        file = FileLoader()
        return {
            'http': http,
            'https': http,
            'file': file,
            '': file
        }

    def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
        self.loaders = loaders

        if not self.loaders:
            self.loaders = self.create_default_loaders(cookie_maker)

        self.chunk_size = chunk_size

        self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)

        warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
        self.warc_parser = StatusAndHeadersParser(warc_types)
        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])

    def load(self, url, offset, length):
        url_parts = urlparse.urlsplit(url)

        loader = self.loaders.get(url_parts.scheme)
        if not loader:
            raise ArchiveLoadFailed('Unknown Protocol', url)

        the_format = None

        for ext, iformat in self.FORMAT_MAP.iteritems():
            if url.endswith(ext):
                the_format = iformat
                break

        if the_format is None:
            raise ArchiveLoadFailed('Unknown file format', url)

        (a_format, is_gzip) = the_format

        #decomp = utils.create_decompressor() if is_gzip else None
        decomp_type = 'gzip' if is_gzip else None

        try:
            length = int(length)
        except:
            length = -1

        raw = loader.load(url, long(offset), length)

        stream = BufferedReader(raw, length, self.chunk_size, decomp_type)

        if a_format == 'arc':
            rec_headers = self.arc_parser.parse(stream)
            rec_type = 'response'
            empty = (rec_headers.get_header('length') == 0)

        elif a_format == 'warc':
            rec_headers = self.warc_parser.parse(stream)
            rec_type = rec_headers.get_header('WARC-Type')
            empty = (rec_headers.get_header('Content-Length') == '0')

        # special case: empty w/arc record (hopefully a revisit)
        if empty:
            status_headers = StatusAndHeaders('204 No Content', [])

        # special case: warc records that are not expected to have http headers
        # attempt to add 200 status and content-type
        elif rec_type == 'metadata' or rec_type == 'resource':
            content_type = [('Content-Type',
                             rec_headers.get_header('Content-Type'))]

            status_headers = StatusAndHeaders('200 OK', content_type)

        # special case: http 0.9 response, no status or headers
        #elif rec_type == 'response':
        #    content_type = rec_headers.get_header('Content-Type')
        #    if content_type and (';version=0.9' in content_type):
        #        status_headers = StatusAndHeaders('200 OK', [])

        # response record: parse HTTP status and headers!
        else:
            #(statusline, http_headers) = self.parse_http_headers(stream)
            status_headers = self.http_parser.parse(stream)

        return ArcWarcRecord((a_format, rec_type),
                             rec_headers, stream, status_headers)


#=================================================================
class ARCHeadersParser:
    def __init__(self, headernames):
        self.headernames = headernames

    def parse(self, stream):
        headerline = stream.readline().rstrip()

        parts = headerline.split()

        headernames = self.headernames

        if len(parts) != len(headernames):
            msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            raise ArchiveLoadFailed(msg.format(headernames, parts))

        headers = []

        for name, value in itertools.izip(headernames, parts):
            headers.append((name, value))

        return StatusAndHeaders(statusline='',
                                headers=headers,
                                protocol='ARC/1.0')
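A sketch of loading one record from a cdx entry; the warc path is illustrative, and the offset/length values are taken from the sample cdx lines quoted in binsearch_test.py above (cdx fields S=length, V=offset assumed):

    from pywb.warc.recordloader import ArcWarcRecordLoader

    loader = ArcWarcRecordLoader()
    record = loader.load('sample_archive/warcs/iana.warc.gz',
                         '657746', '2691')

    print record.type                       # ('warc', 'response')
    print record.status_headers.statusline  # http status of the capture
    payload = record.stream.read()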
pywb/warc/resolvingloader.py (new file, 176 lines)
@@ -0,0 +1,176 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pathresolvers import make_best_resolvers


#=================================================================
class ResolvingLoader:
    def __init__(self, paths, record_loader=ArcWarcRecordLoader(),
                 cdx_server=None):

        self.path_resolvers = make_best_resolvers(paths)
        self.record_loader = record_loader
        self.cdx_server = cdx_server

    def resolve_headers_and_payload(self, cdx, failed_files):
        """
        Resolve headers and payload for a given capture
        In the simple case, headers and payload are in the same record.
        In the case of revisit records, the payload and headers may be in
        different records.

        If the original has already been found, lookup original using
        orig. fields in cdx dict.
        Otherwise, call _load_different_url_payload() to get cdx index
        from a different url to find the original record.
        """
        has_curr = (cdx['filename'] != '-')
        has_orig = (cdx.get('orig.filename', '-') != '-')

        # load headers record from cdx['filename'] unless it is '-' (rare)
        headers_record = None
        if has_curr:
            headers_record = self._resolve_path_load(cdx, False, failed_files)

        # two index lookups
        # Case 1: if mimetype is still warc/revisit
        if cdx['mimetype'] == 'warc/revisit' and headers_record:
            payload_record = self._load_different_url_payload(cdx,
                                                              headers_record,
                                                              failed_files)

        # single lookup cases
        # case 2: non-revisit
        elif (has_curr and not has_orig):
            payload_record = headers_record

        # case 3: identical url revisit, load payload from orig.filename
        elif (has_orig):
            payload_record = self._resolve_path_load(cdx, True, failed_files)

        # special case: set header to payload if old-style revisit
        # with missing header
        if not headers_record:
            headers_record = payload_record
        elif headers_record != payload_record:
            # close remainder of stream as this record only used for
            # (already parsed) headers
            headers_record.stream.close()

            # special case: check if headers record is actually empty
            # (eg empty revisit), then use headers from revisit
            if not headers_record.status_headers.headers:
                headers_record = payload_record

        if not headers_record or not payload_record:
            raise ArchiveLoadFailed('Could not load ' + str(cdx))

        return (headers_record.status_headers, payload_record.stream)

    def _resolve_path_load(self, cdx, is_original, failed_files):
        """
        Load specific record based on filename, offset and length
        fields in the cdx.
        If original=True, use the orig.* fields for the cdx

        Resolve the filename to full path using specified path resolvers

        If failed_files list provided, keep track of failed resolve attempts
        """

        if is_original:
            (filename, offset, length) = (cdx['orig.filename'],
                                          cdx['orig.offset'],
                                          cdx['orig.length'])
        else:
            (filename, offset, length) = (cdx['filename'],
                                          cdx['offset'],
                                          cdx['length'])

        # optimization: if same file already failed this request,
        # don't try again
        if failed_files and filename in failed_files:
            raise ArchiveLoadFailed('Skipping Already Failed', filename)

        any_found = False
        last_exc = None
        for resolver in self.path_resolvers:
            possible_paths = resolver(filename)

            if possible_paths:
                for path in possible_paths:
|
||||||
|
any_found = True
|
||||||
|
try:
|
||||||
|
return self.record_loader.load(path, offset, length)
|
||||||
|
|
||||||
|
except Exception as ue:
|
||||||
|
last_exc = ue
|
||||||
|
|
||||||
|
# Unsuccessful if reached here
|
||||||
|
if failed_files:
|
||||||
|
failed_files.append(filename)
|
||||||
|
|
||||||
|
if last_exc:
|
||||||
|
msg = str(last_exc.__class__.__name__)
|
||||||
|
else:
|
||||||
|
msg = 'Archive File Not Found'
|
||||||
|
|
||||||
|
raise ArchiveLoadFailed(msg, filename)
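
    # Each resolver returned by make_best_resolvers() is a callable mapping
    # a (W)ARC filename to a list of candidate full paths, tried in order.
    # A minimal sketch of such a resolver (hypothetical, for illustration):
    #
    #     def prefix_resolver(prefix):
    #         def resolve(filename):
    #             return [prefix + filename]
    #         return resolve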

    def _load_different_url_payload(self, cdx, headers_record, failed_files):
        """
        Handle the case where a duplicate of a capture with the same digest
        exists at a different url.

        If a cdx_server is provided, a query is made for a matching
        url, timestamp and digest.

        Raise an exception if no matches are found.
        """

        ref_target_uri = (headers_record.rec_headers.
                          get_header('WARC-Refers-To-Target-URI'))

        target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

        # Check for unresolved revisit error,
        # if the refers-to target uri is not present or same as the current url
        if not ref_target_uri or (ref_target_uri == target_uri):
            raise ArchiveLoadFailed('Missing Revisit Original')

        ref_target_date = (headers_record.rec_headers.
                           get_header('WARC-Refers-To-Date'))

        if not ref_target_date:
            ref_target_date = cdx['timestamp']
        else:
            ref_target_date = iso_date_to_timestamp(ref_target_date)

        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
                                                cdx['digest'])

        for cdx in orig_cdx_lines:
            try:
                payload_record = self._resolve_path_load(cdx, False,
                                                         failed_files)
                return payload_record

            except ArchiveLoadFailed as e:
                pass

        raise ArchiveLoadFailed('Original for revisit could not be loaded')

    def load_cdx_for_dupe(self, url, timestamp, digest):
        """
        If a cdx_server is available, return a response from the server,
        otherwise an empty list.
        """
        if not self.cdx_server:
            return []

        params = {'url': url,
                  'closest': timestamp,
                  'filter': 'digest:' + digest,
                  'output': 'raw'}

        return self.cdx_server.load_cdx(params)
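
# A usage sketch mirroring load_from_cdx_test() in the tests below
# (the warc directory and cdx line are taken from the shared sample_archive):
#
#     loader = ResolvingLoader('sample_archive/warcs/')
#     cdx = CDXObject('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
#     (status_headers, stream) = loader.resolve_headers_and_payload(cdx, None)
#     print status_headers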
199
pywb/warc/test/test_loading.py
Normal file
@ -0,0 +1,199 @@

"""
Test loading different types of records from a variety of formats

# Load response record from WARC
>>> load_test_archive('example.warc.gz', '333', '1043')
(('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
('WARC-Date', '2014-01-03T03:03:21Z'),
('Content-Length', '1610'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))

# Load revisit record from WARC
>>> load_test_archive('example.warc.gz', '1864', '553')
(('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
('WARC-Date', '2014-01-03T03:03:41Z'),
('Content-Length', '340'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
( 'WARC-Profile',
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))


# Test of record loading based on cdx line
# Print parsed http headers + 2 lines of content
# ==============================================================================

# Test loading from ARC based on cdx line
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>

>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>


# Test loading from WARC based on cdx line
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>

# Test cdx w/ revisit
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>

# Test loading warc created by wget 1.14
>>> load_from_cdx_test('com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 01:29:08 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 01:29:08 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FB4)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>

# Error Handling

# Invalid WARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException

# Invalid ARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException


# Error Expected with revisit -- invalid offset on original
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException

"""

import os
import sys
import pprint

from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.pathresolvers import make_best_resolvers
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.cdx.cdxobject import CDXObject

from pywb import get_test_dir

#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_warc_dir = get_test_dir() + 'warcs/'

def load_test_archive(test_file, offset, length):
    path = test_warc_dir + test_file

    testloader = ArcWarcRecordLoader()

    archive = testloader.load(path, offset, length)

    pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))


def load_from_cdx_test(cdx):
    resolve_loader = ResolvingLoader(test_warc_dir)
    cdx = CDXObject(cdx)
    (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
    print headers
    sys.stdout.write(stream.readline())
    sys.stdout.write(stream.readline())
@ -1,8 +1,7 @@
-import utils
 import wbexceptions

 from wbrequestresponse import WbResponse, StatusAndHeaders
-from cdxserver.cdxserver import CDXException
+from pywb.cdx.cdxserver import CDXException

 import os
 import importlib
@ -10,13 +9,37 @@ import logging


+#=================================================================
+# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
+# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
+# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
+def rel_request_uri(environ, include_query=1):
+    """
+    Return the requested path, optionally including the query string
+
+    # Simple test:
+    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
+    '/web/example.com'
+
+    # Test all unencoded special chars and double-quote
+    # (double-quote must be encoded but not single quote)
+    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
+    "/web/example.com/0~!+$&'()*+,;=:%22"
+    """
+    from urllib import quote
+    url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
+    if include_query and environ.get('QUERY_STRING'):
+        url += '?' + environ['QUERY_STRING']
+
+    return url
+
 #=================================================================
 def create_wb_app(wb_router):
     # Top-level wsgi application
     def application(env, start_response):
         if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
-            env['REL_REQUEST_URI'] = utils.rel_request_uri(env)
+            env['REL_REQUEST_URI'] = rel_request_uri(env)
         else:
             env['REL_REQUEST_URI'] = env['REQUEST_URI']

@ -95,7 +118,7 @@ def main():
         raise

 #=================================================================
-if __name__ == "__main__" or utils.enable_doctests():
+if __name__ == "__main__":
     pass
 else:
     application = main()
@ -1,7 +1,6 @@
-from wburl import WbUrl
+from pywb.rewrite.wburl import WbUrl
-from url_rewriter import UrlRewriter
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.utils.statusandheaders import StatusAndHeaders
-import utils

 import pprint
 #WB Request and Response
@ -182,35 +181,6 @@ class WbResponse:
     def __repr__(self):
         return str(vars(self))

-
-#=================================================================
-class StatusAndHeaders:
-    def __init__(self, statusline, headers, protocol = ''):
-        self.statusline = statusline
-        self.headers = headers
-        self.protocol = protocol
-
-    def get_header(self, name):
-        name_lower = name.lower()
-        for value in self.headers:
-            if (value[0].lower() == name_lower):
-                return value[1]
-
-    def remove_header(self, name):
-        name_lower = name.lower()
-        for x in xrange(len(self.headers) - 1, -1, -1):
-            if self.headers[x][0].lower() == name_lower:
-                del self.headers[x]
-                break
-
-    def __repr__(self):
-        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
-        #return pprint.pformat(self.__dict__)
-
-    def __eq__(self, other):
-        return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol
-
-
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
3
run-tests.py
Normal file
@ -0,0 +1,3 @@
import pytest
result = pytest.main('-v --doctest-module tests/ pywb/')
exit(result)
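
The equivalent command-line invocation (assuming pytest is installed, as setup.py below now requires) would be:

py.test -v --doctest-module tests/ pywb/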
BIN
sample_archive/cdx/iana.cdx.gz
Normal file
Binary file not shown.
BIN
sample_archive/warcs/example-wget-1-14.warc.gz
Normal file
Binary file not shown.
69
sample_archive/warcs/example.arc
Normal file
@ -0,0 +1,69 @@
filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
1 0 LiveWeb Capture
URL IP-address Archive-date Content-type Archive-length

http://example.com/ 93.184.216.119 20140216050221 text/html 1591
HTTP/1.1 200 OK
Accept-Ranges: bytes
Cache-Control: max-age=604800
Content-Type: text/html
Date: Sun, 16 Feb 2014 05:02:20 GMT
Etag: "359670651"
Expires: Sun, 23 Feb 2014 05:02:20 GMT
Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
Server: ECS (sjc/4FCE)
X-Cache: HIT
x-ec-custom-error: 1
Content-Length: 1270

<!doctype html>
<html>
<head>
<title>Example Domain</title>

<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;

}
div {
width: 600px;
margin: 5em auto;
padding: 50px;
background-color: #fff;
border-radius: 1em;
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
body {
background-color: #fff;
}
div {
width: auto;
margin: 0 auto;
border-radius: 0;
padding: 1em;
}
}
</style>
</head>

<body>
<div>
<h1>Example Domain</h1>
<p>This domain is established to be used for illustrative examples in documents. You may use this
domain in examples without prior coordination or asking for permission.</p>
<p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>

BIN
sample_archive/warcs/example.arc.gz
Normal file
Binary file not shown.
10
setup.py
@ -5,18 +5,18 @@ import setuptools
 import glob

 setuptools.setup(name='pywb',
-                 version='0.1',
+                 version='0.2',
                  url='https://github.com/ikreymer/pywb',
                  author='Ilya Kreymer',
                  author_email='ilya@archive.org',
                  long_description=open('README.md').read(),
                  license='GPL',
-                 packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
+                 packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
-                 provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
+                 provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
                  package_data={'pywb': ['ui/*', 'static/*']},
                  data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
                                ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
-                 install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest'],
+                 install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
-                 tests_require=['WebTest', 'pytest'],
+                 # tests_require=['WebTest', 'pytest'],
                  zip_safe=False)
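
With pytest moved into install_requires, a fresh checkout can be exercised end-to-end with, for example:

python setup.py install
python run-tests.py

The data_files entries above copy the sample_archive cdx and warc files that the test suites in this commit load.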
88
tests/test_archivalrouter.py
Normal file
@ -0,0 +1,88 @@
"""
Test Route
# route with relative path
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}

# route with absolute path, running at script /my_pywb
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}


# not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)


# Referer Redirect Test
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']

>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']

>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'

>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

>>> _test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

# Custom collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/other.html'

# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'

# Wrong Host
>>> _test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False

# Right Host
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
'http://example.com:8080/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME + timestamp
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME, bad match
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False


"""

from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler


def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
    env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}

    if http_host:
        env['HTTP_HOST'] = http_host

    routes = [Route(coll, BaseHandler())]

    redir = ReferRedirect(match_host)
    #req = WbRequest.from_uri(request_uri, env)
    rep = redir(env, routes)
    if not rep:
        return False

    return rep.status_headers.get_header('Location')

@ -1,43 +0,0 @@
import os
from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader

test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'

def binsearch_cdx_test(key, iter_func):
    """
    # Prefix Search
    >>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root', iter_exact)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/', iter_exact)
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz

    # Exact Search
    >>> binsearch_cdx_test('org,iaana)/', iter_exact)
    >>> binsearch_cdx_test('org,ibna)/', iter_exact)

    >>> binsearch_cdx_test('org,iana)/time-zones', iter_exact)
    org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
    """

    cdx = FileReader(test_cdx_dir + 'iana.cdx')

    for line in iter_func(cdx, key):
        print line


if __name__ == "__main__":
    import doctest
    doctest.testmod()

@ -1,149 +0,0 @@
from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader
from ..pywb.cdxserver.cdxserver import CDXServer
import os
import sys
import pprint

test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'

def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    """
    # Merge Sort Multiple CDX Sources
    >>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
    org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
    org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz


    # Limit CDX Stream
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz


    # Reverse CDX Stream
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz

    >>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
    org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz

    # No matching results
    >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)


    # Filter cdx
    >>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
    org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
    org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
    org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
    org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
    org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz


    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz


    # Collapse by timestamp
    # unresolved revisits, different statuscode results in an extra repeat
    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz

    # resolved revisits
    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz


    # Sort by closest timestamp + field select output
    >>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
    20140126200826
    20140126200816
    20140126200805
    20140126200912
    20140126200738
    20140126200930
    20140126200718
    20140126200706
    20140126200654
    20140126200625

    >>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -

    >>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -

    # equal dist prefer earlier
    >>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz

    >>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200654
    20140126200706

    >>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200706
    20140126200654


    # Resolve Revisits
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz

    >>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -


    # CDX Server init
    >>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
    >>> pprint.pprint(x.next().items())
    [('urlkey', 'com,example)/'),
     ('timestamp', '20140127171200'),
     ('original', 'http://example.com'),
     ('mimetype', 'text/html'),
     ('statuscode', '200'),
     ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
     ('redirect', '-'),
     ('robotflags', '-'),
     ('length', '1046'),
     ('offset', '334'),
     ('filename', 'dupes.warc.gz')]
    """

    kwparams['url'] = url
    kwparams['output'] = 'text'

    server = CDXServer(sources)
    results = server.load_cdx(**kwparams)

    for x in results:
        sys.stdout.write(x)


if __name__ == "__main__":
    import doctest
    doctest.testmod()

@ -1,7 +1,7 @@
 import webtest
-from ..pywb.pywb_init import pywb_config
+from pywb.pywb_init import pywb_config
-from ..pywb.wbapp import create_wb_app
+from pywb.wbapp import create_wb_app
-from ..pywb.cdxserver.cdxobject import CDXObject
+from pywb.cdx.cdxobject import CDXObject

 class TestWb:
     TEST_CONFIG = 'test_config.yaml'