mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'master' into cdx-server

Kenji Nagahashi 2014-02-25 23:14:15 +00:00
commit 14f4b4d26e
33 changed files with 1120 additions and 342 deletions

.coveragerc Normal file

@ -0,0 +1,8 @@
[run]
omit =
*/test/*
*/tests/*
[report]
exclude_lines =
if __name__ == .__main__.:


@ -4,7 +4,14 @@ python:
# command to install dependencies
install:
- "python setup.py -q install"
- "pip install python-coveralls"
- "pip install pytest-cov"
# command to run tests
#script: nosetests --with-doctest
#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
script: py.test -v --doctest-module ./tests/*.py ./pywb/
#script: py.test -v --doctest-module ./tests/*.py ./pywb/
script:
py.test --cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/
after_success:
coveralls


@ -3,13 +3,13 @@ import re
from wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
#=================================================================
# ArchivalRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRouter:
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
self.routes = routes
self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path
@ -69,24 +69,25 @@ class Route:
if not matcher:
return None
rel_prefix = matcher.group(0)
matched_str = matcher.group(0)
if rel_prefix:
wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/'
wb_url_str = request_uri[len(rel_prefix) + 2:] # remove the '/' + rel_prefix part of uri
if matched_str:
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
else:
wb_prefix = env['SCRIPT_NAME'] + '/'
rel_prefix = env['SCRIPT_NAME'] + '/'
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
coll = matcher.group(self.coll_group)
wbrequest = WbRequest(env,
request_uri = request_uri,
wb_url_str = wb_url_str,
wb_prefix = wb_prefix,
coll = coll,
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '',
wburl_class = self.handler.get_wburl_type())
request_uri=request_uri,
wb_url_str=wb_url_str,
rel_prefix=rel_prefix,
coll=coll,
use_abs_prefix=use_abs_prefix,
wburl_class = self.handler.get_wburl_type(),
urlrewriter_class=UrlRewriter)
# Allow for applying of additional filters
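For illustration only (not part of the commit), a standalone sketch of the rel_prefix / wb_url_str split performed above, with hypothetical values:

matched_str = 'web'
script_name = '/my_pywb'
request_uri = '/web/2013/http://example.com/'

rel_prefix = script_name + '/' + matched_str + '/'
# -> '/my_pywb/web/'
wb_url_str = request_uri[len(matched_str) + 2:]  # strip '/' + matched_str + '/'
# -> '2013/http://example.com/'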


@ -2,6 +2,7 @@
"""
import surt
import urlparse
from cdxobject import CDXException
@ -69,6 +70,109 @@ index.html?a=b?c=)/')
return surt
#=================================================================
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
"""
Canonicalize a url (either with a custom canonicalizer or the
standard canonicalizer, with or without surt).
Then compute the start and end urls of the search range
for a given match type.
Supported match types:
* exact
* prefix
* host
* domain (only available with surt ordering)
Examples below:
# surt ranges
>>> calc_search_range('http://example.com/path/file.html', 'exact')
('com,example)/path/file.html', 'com,example)/path/file.html!')
>>> calc_search_range('http://example.com/path/file.html', 'prefix')
('com,example)/path/file.html', 'com,example)/path/file.htmm')
>>> calc_search_range('http://example.com/path/file.html', 'host')
('com,example)/', 'com,example*')
>>> calc_search_range('http://example.com/path/file.html', 'domain')
('com,example)/', 'com,example-')
special case for tld domain range
>>> calc_search_range('com', 'domain')
('com,', 'com-')
# non-surt ranges
>>> calc_search_range('http://example.com/path/file.html', 'exact', False)
('example.com/path/file.html', 'example.com/path/file.html!')
>>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
('example.com/path/file.html', 'example.com/path/file.htmm')
>>> calc_search_range('http://example.com/path/file.html', 'host', False)
('example.com/', 'example.com0')
# domain range not supported
>>> calc_search_range('http://example.com/path/file.html', 'domain', False)
Traceback (most recent call last):
Exception: matchType=domain unsupported for non-surt
"""
def inc_last_char(x):
return x[0:-1] + chr(ord(x[-1]) + 1)
if not url_canon:
# make new canon
url_canon = UrlCanonicalizer(surt_ordered)
else:
# ensure surt order matches url_canon
surt_ordered = url_canon.surt_ordered
start_key = url_canon(url)
if match_type == 'exact':
end_key = start_key + '!'
elif match_type == 'prefix':
# add trailing slash if url has it
if url.endswith('/') and not start_key.endswith('/'):
start_key += '/'
end_key = inc_last_char(start_key)
elif match_type == 'host':
if surt_ordered:
host = start_key.split(')/')[0]
start_key = host + ')/'
end_key = host + '*'
else:
host = urlparse.urlsplit(url).netloc
start_key = host + '/'
end_key = host + '0'
elif match_type == 'domain':
if not surt_ordered:
raise Exception('matchType=domain unsupported for non-surt')
host = start_key.split(')/')[0]
# if tld, use com, as start_key
# otherwise, stick with com,example)/
if not ',' in host:
start_key = host + ','
else:
start_key = host + ')/'
end_key = host + '-'
else:
raise Exception('Invalid match_type: ' + match_type)
return (start_key, end_key)
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -77,3 +77,34 @@ class CDXObject(OrderedDict):
li = itertools.imap(lambda (n, val): val, self.items())
return ' '.join(li)
#=================================================================
class IDXObject(OrderedDict):
FORMAT = ['urlkey', 'part', 'offset', 'length', 'lineno']
NUM_REQ_FIELDS = len(FORMAT) - 1 # lineno is an optional field
def __init__(self, idxline):
OrderedDict.__init__(self)
idxline = idxline.rstrip()
fields = idxline.split('\t')
if len(fields) < self.NUM_REQ_FIELDS:
msg = 'invalid idx format: {0} fields found, {1} required'
raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))
for header, field in itertools.izip(self.FORMAT, fields):
self[header] = field
self['offset'] = int(self['offset'])
self['length'] = int(self['length'])
lineno = self.get('lineno')
if lineno:
self['lineno'] = int(lineno)
self.idxline = idxline
def __str__(self):
return self.idxline
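A minimal usage sketch; the idx line below is hypothetical, in the tab-separated FORMAT above (module path assumed to be pywb.cdx.cdxobject):

from pywb.cdx.cdxobject import IDXObject

line = 'com,example)/\tpart-00\t3040\t502\t17'
idx = IDXObject(line)
assert idx['urlkey'] == 'com,example)/'
assert idx['offset'] == 3040 and idx['length'] == 502  # coerced to int
assert str(idx) == line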


@ -1,4 +1,4 @@
from cdxobject import CDXObject, AccessException
from cdxobject import CDXObject, IDXObject, AccessException
from pywb.utils.timeutils import timestamp_to_sec
import bisect
@ -56,7 +56,7 @@ def cdx_text_out(cdx, fields):
def cdx_load_and_filter(sources, params):
cdx_iter = load_cdx_streams(sources, params)
cdx_iter = make_cdx_iter(cdx_iter)
cdx_iter = make_obj_iter(cdx_iter, params)
if params.get('proxyAll'):
return cdx_iter
@ -102,9 +102,15 @@ def load_cdx_streams(sources, params):
#=================================================================
# convert text cdx stream to CDXObject
def make_cdx_iter(text_iter):
return itertools.imap(lambda line: CDXObject(line), text_iter)
# convert text cdx stream to CDXObject/IDXObject
def make_obj_iter(text_iter, params):
# already converted
if params.get('showPagedIndex'):
cls = IDXObject
else:
cls = CDXObject
return itertools.imap(lambda line: cls(line), text_iter)
#=================================================================


@ -1,10 +1,13 @@
from canonicalize import UrlCanonicalizer
from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules
from pywb.utils.loaders import is_http
from itertools import chain
import logging
import os
@ -14,8 +17,23 @@ import urlparse
#=================================================================
class BaseCDXServer(object):
def __init__(self, **kwargs):
self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
self.fuzzy_query = kwargs.get('fuzzy_query')
ds_rules = kwargs.get('ds_rules')
surt_ordered = kwargs.get('surt_ordered', True)
# load from domain-specific rules
if ds_rules:
self.url_canon, self.fuzzy_query = (
load_domain_specific_cdx_rules(ds_rules, surt_ordered))
# or custom passed in canonicalizer
else:
self.url_canon = kwargs.get('url_canon')
self.fuzzy_query = kwargs.get('fuzzy_query')
# set default canonicalizer if none set thus far
if not self.url_canon:
self.url_canon = UrlCanonicalizer(surt_ordered)
# set perms checker, if any
self.perms_checker = kwargs.get('perms_checker')
def _check_cdx_iter(self, cdx_iter, params):
@ -66,7 +84,7 @@ class CDXServer(BaseCDXServer):
def __init__(self, paths, **kwargs):
super(CDXServer, self).__init__(**kwargs)
self.sources = create_cdx_sources(paths)
self.sources = create_cdx_sources(paths, kwargs.get('config'))
def load_cdx(self, **params):
# if key not set, assume 'url' is set and needs canonicalization
@ -77,7 +95,14 @@ class CDXServer(BaseCDXServer):
msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg)
params['key'] = self.url_canon(url)
#params['key'] = self.url_canon(url)
match_type = params.get('matchType', 'exact')
key, end_key = calc_search_range(url=url,
match_type=match_type,
url_canon=self.url_canon)
params['key'] = key
params['end_key'] = end_key
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
@ -124,36 +149,29 @@ def create_cdx_server(config, ds_rules_file=None):
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
perms_checker = config.get('perms_checker')
pass_config = config
else:
paths = config
surt_ordered = True
perms_checker = None
pass_config = None
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if ds_rules_file:
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
surt_ordered)
else:
canon, fuzzy = None, None
if not canon:
canon = UrlCanonicalizer(surt_ordered)
if (isinstance(paths, str) and
any(paths.startswith(x) for x in ['http://', 'https://'])):
if isinstance(paths, str) and is_http(paths):
server_cls = RemoteCDXServer
else:
server_cls = CDXServer
return server_cls(paths,
url_canon=canon,
fuzzy_query=fuzzy,
config=pass_config,
surt_ordered=surt_ordered,
ds_rules=ds_rules_file,
perms_checker=perms_checker)
#=================================================================
def create_cdx_sources(paths):
def create_cdx_sources(paths, config=None):
sources = []
if not isinstance(paths, list):
@ -161,13 +179,13 @@ def create_cdx_sources(paths):
for path in paths:
if isinstance(path, CDXSource):
add_cdx_source(sources, path)
add_cdx_source(sources, path, config)
elif isinstance(path, str):
if os.path.isdir(path):
for file in os.listdir(path):
add_cdx_source(sources, path + file)
add_cdx_source(sources, path + file, config)
else:
add_cdx_source(sources, path)
add_cdx_source(sources, path, config)
if len(sources) == 0:
logging.exception('No CDX Sources Found from: ' + str(sources))
@ -176,9 +194,9 @@ def create_cdx_sources(paths):
#=================================================================
def add_cdx_source(sources, source):
def add_cdx_source(sources, source, config):
if not isinstance(source, CDXSource):
source = create_cdx_source(source)
source = create_cdx_source(source, config)
if not source:
return
@ -187,19 +205,20 @@ def add_cdx_source(sources, source):
#=================================================================
def create_cdx_source(filename):
if filename.startswith('http://') or filename.startswith('https://'):
def create_cdx_source(filename, config):
if is_http(filename):
return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'):
return CDXFile(filename)
if filename.endswith('.summary'):
return ZipNumCluster(filename, config)
return None
#TODO: support zipnum
#elif filename.endswith('.summary')
# return ZipNumCDXSource(filename)
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
#=================================================================
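A sketch of driving the new config plumbing end-to-end; paths and values are hypothetical, and the module path is assumed to be pywb.cdx.cdxserver:

from pywb.cdx.cdxserver import create_cdx_server

config = {
    'index_paths': 'sample_archive/cdx/',  # dir of .cdx files; redis:// urls and .summary files also work
    'surt_ordered': True,
    'redis_key_prefix': 'c:',  # read only by RedisCDXSource
}

server = create_cdx_server(config)  # picks CDXServer (local) vs RemoteCDXServer (http)
for cdx in server.load_cdx(url='http://example.com/', matchType='exact', output='raw'):
    print cdx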


@ -1,9 +1,9 @@
from pywb.utils.binsearch import iter_exact, iter_prefix
from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader
import urllib
import urllib2
import itertools
#=================================================================
class CDXSource(object):
@ -24,17 +24,7 @@ class CDXFile(CDXSource):
def load_cdx(self, params):
source = SeekableTextFileReader(self.filename)
match_type = params.get('matchType')
if match_type == 'prefix':
iter_func = iter_prefix
else:
iter_func = iter_exact
key = params.get('key')
return iter_func(source, key)
return iter_range(source, params.get('key'), params.get('end_key'))
def __str__(self):
return 'CDX File - ' + self.filename
@ -90,3 +80,35 @@ class RemoteCDXSource(CDXSource):
def __str__(self):
return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
DEFAULT_KEY_PREFIX = 'c:'
def __init__(self, redis_url, config=None):
import redis
self.redis = redis.StrictRedis.from_url(redis_url)
self.key_prefix = self.DEFAULT_KEY_PREFIX
if config:
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, params):
"""
Load cdx from redis cache, from an ordered list
Currently, there is no support for range queries
Only 'exact' matchType is supported
"""
key = params['key']
# ensure only url/surt is part of key
key = key.split(' ')[0]
cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
# key is not part of list, so prepend to each line
key += ' '
cdx_list = itertools.imap(lambda x: key + x, cdx_list)
return cdx_list
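For context, a hypothetical writer producing entries in the shape load_cdx expects: one sorted set per canonicalized urlkey, each member holding the rest of the cdx line. The ingest format and scoring are assumptions, not part of this commit (redis-py 2.x StrictRedis.zadd signature: score first, then member):

import redis

r = redis.StrictRedis.from_url('redis://localhost:6379/0')
urlkey = 'com,example)/'  # canonicalized surt key
rest = ('20140101000000 http://example.com/ text/html 200 '
        '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 1043 333 example.warc.gz')
# score by timestamp so zrange returns captures in time order (assumption)
r.zadd('c:' + urlkey, 20140101000000, rest)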


@ -132,8 +132,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
#>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),

pywb/cdx/zipnum.py Normal file

@ -0,0 +1,203 @@
import os
import collections
import itertools
import logging
from cStringIO import StringIO
import datetime
from cdxsource import CDXSource
from cdxobject import IDXObject
from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch
#=================================================================
class ZipBlocks:
def __init__(self, part, offset, length, count):
self.part = part
self.offset = offset
self.length = length
self.count = count
#=================================================================
def readline_to_iter(stream):
try:
count = 0
buff = stream.readline()
while buff:
count += 1
yield buff
buff = stream.readline()
finally:
stream.close()
#=================================================================
class ZipNumCluster(CDXSource):
DEFAULT_RELOAD_INTERVAL = 10 # in minutes
DEFAULT_MAX_BLOCKS = 50
def __init__(self, summary, config=None):
loc = None
cookie_maker = None
self.max_blocks = self.DEFAULT_MAX_BLOCKS
reload_ival = self.DEFAULT_RELOAD_INTERVAL
if config:
loc = config.get('zipnum_loc')
cookie_maker = config.get('cookie_maker')
self.max_blocks = config.get('max_blocks', self.max_blocks)
reload_ival = config.get('reload_interval', reload_ival)
if not loc:
splits = os.path.splitext(summary)
loc = splits[0] + '.loc'
self.summary = summary
self.loc_filename = loc
# initial loc map
self.loc_map = {}
self.loc_mtime = 0
self.load_loc()
# reload interval
self.loc_update_time = datetime.datetime.now()
self.reload_interval = datetime.timedelta(minutes=reload_ival)
self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
def load_loc(self):
# check modified time of current file before loading
new_mtime = os.path.getmtime(self.loc_filename)
if (new_mtime == self.loc_mtime):
return
# update loc file mtime
self.loc_mtime = new_mtime
logging.debug('Loading loc from: ' + self.loc_filename)
with open(self.loc_filename) as fh:
for line in fh:
parts = line.rstrip().split('\t')
self.loc_map[parts[0]] = parts[1:]
@staticmethod
def reload_timed(timestamp, val, delta, func):
now = datetime.datetime.now()
if now - timestamp >= delta:
func()
return now
return None
def reload_loc(self):
reload_time = self.reload_timed(self.loc_update_time,
self.loc_map,
self.reload_interval,
self.load_loc)
if reload_time:
self.loc_update_time = reload_time
def lookup_loc(self, part):
return self.loc_map[part]
def load_cdx(self, params):
self.reload_loc()
reader = SeekableTextFileReader(self.summary)
idx_iter = iter_range(reader,
params['key'],
params['end_key'],
prev_size=1)
if params.get('showPagedIndex'):
params['proxyAll'] = True
return idx_iter
else:
blocks = self.idx_to_cdx(idx_iter, params)
def gen_cdx():
for blk in blocks:
for cdx in blk:
yield cdx
return gen_cdx()
def idx_to_cdx(self, idx_iter, params):
blocks = None
ranges = []
for idx in idx_iter:
idx = IDXObject(idx)
if (blocks and blocks.part == idx['part'] and
blocks.offset + blocks.length == idx['offset'] and
blocks.count < self.max_blocks):
blocks.length += idx['length']
blocks.count += 1
ranges.append(idx['length'])
else:
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
blocks = ZipBlocks(idx['part'],
idx['offset'],
idx['length'],
1)
ranges = [blocks.length]
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
def block_to_cdx_iter(self, blocks, ranges, params):
last_exc = None
last_traceback = None
for location in self.lookup_loc(blocks.part):
try:
return self.load_blocks(location, blocks, ranges, params)
except Exception as exc:
last_exc = exc
import sys
last_traceback = sys.exc_info()[2]
if last_exc:
raise last_exc, None, last_traceback
else:
raise Exception('No Locations Found for: ' + blocks.part)
def load_blocks(self, location, blocks, ranges, params):
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
logging.debug(msg.format(b=blocks, loc=location))
reader = self.blk_loader.load(location, blocks.offset, blocks.length)
def decompress_block(range_):
decomp = gzip_decompressor()
buff = decomp.decompress(reader.read(range_))
return readline_to_iter(StringIO(buff))
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
# start bound
iter_ = linearsearch(iter_, params['key'])
# end bound
end = params['end_key']
iter_ = itertools.takewhile(lambda line: line < end, iter_)
return iter_
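A small sketch of the trick decompress_block relies on: a zipnum cluster part is a concatenation of independently gzipped members, so each block can be decompressed on its own given its offset and compressed lengths. The file name, offset and lengths below are hypothetical:

import zlib

ranges = [120, 95]  # compressed block lengths, from consecutive idx lines
with open('cluster-part-00.gz', 'rb') as fh:
    fh.seek(3040)                # blocks.offset
    buff = fh.read(sum(ranges))  # blocks.length

offset = 0
for length in ranges:
    member = buff[offset:offset + length]
    offset += length
    # wbits=16+MAX_WBITS expects a gzip header, as in gzip_decompressor()
    text = zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(member)
    for cdx_line in text.splitlines():
        print cdx_line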


@ -10,19 +10,28 @@ from wbexceptions import WbException, NotFoundException
from views import TextCapturesView
class BaseHandler:
@staticmethod
def get_wburl_type():
return WbUrl
#=================================================================
class BaseHandler(object):
def __call__(self, wbrequest):
return wbrequest
def get_wburl_type(self):
return None
#=================================================================
class WbUrlHandler(BaseHandler):
def get_wburl_type(self):
return WbUrl
#=================================================================
# Standard WB Handler
#=================================================================
class WBHandler(BaseHandler):
def __init__(self, index_reader, replay, html_view = None, search_view = None):
class WBHandler(WbUrlHandler):
def __init__(self, index_reader, replay,
html_view=None, search_view=None):
self.index_reader = index_reader
self.replay = replay
@ -31,7 +40,6 @@ class WBHandler(BaseHandler):
self.html_view = html_view
self.search_view = search_view
def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest)
@ -61,6 +69,7 @@ class WBHandler(BaseHandler):
def __str__(self):
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
#=================================================================
# CDX-Server Handler -- pass all params to cdx server
#=================================================================
@ -75,11 +84,6 @@ class CDXHandler(BaseHandler):
return self.view.render_response(wbrequest, cdx_lines)
@staticmethod
def get_wburl_type():
return None
def __str__(self):
return 'Index Reader: ' + str(self.index_reader)
@ -115,10 +119,6 @@ class StaticHandler(BaseHandler):
except IOError:
raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
@staticmethod
def get_wburl_type():
return None
def __str__(self):
return 'Static files from ' + self.static_path
@ -130,6 +130,7 @@ class DebugEchoEnvHandler(BaseHandler):
def __call__(self, wbrequest):
return WbResponse.text_response(str(wbrequest.env))
#=================================================================
class DebugEchoHandler(BaseHandler):
def __call__(self, wbrequest):
@ -150,5 +151,3 @@ class PerfTimer:
self.end = time.clock()
if self.perfdict is not None:
self.perfdict[self.name] = str(self.end - self.start)


@ -37,7 +37,7 @@ class IndexReader(object):
def load_cdx(self, **params):
return self.cdx_server.load_cdx(**params)
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported')


@ -45,14 +45,14 @@ class ProxyRouter:
return None
wbrequest = WbRequest(env,
request_uri = url,
wb_url_str = url,
wb_prefix = '',
coll = '',
host_prefix = self.hostpaths[0],
wburl_class = self.handler.get_wburl_type(),
url_rewriter_class = ProxyHttpsUrlRewriter,
is_proxy = True)
request_uri=url,
wb_url_str=url,
#rel_prefix=url,
#host_prefix=self.hostpaths[0],
wburl_class=self.handler.get_wburl_type(),
urlrewriter_class=ProxyHttpsUrlRewriter,
use_abs_prefix=False,
is_proxy=True)
return self.handler(wbrequest)


@ -7,7 +7,6 @@ from wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed
#=================================================================
class ReplayView:
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@ -49,6 +48,9 @@ class ReplayView:
# check if redir is needed
self._redirect_if_needed(wbrequest, cdx)
# one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest, status_headers)
response = None
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
@ -148,6 +150,7 @@ class ReplayView:
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
# self-redirect via location
if status_headers.statusline.startswith('3'):
request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location').lower()
@ -156,3 +159,16 @@ class ReplayView:
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest, status_headers):
# at correct timestamp now, but must check for referrer redirect
# indirect self-redirect, via meta-refresh, if referrer is same as current url
if status_headers.statusline.startswith('2'):
# build full url even if using relative-rewriting
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
referrer_url = wbrequest.referrer
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))


@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
class RewriteContent:
@ -54,7 +54,7 @@ class RewriteContent:
# =========================================================================
# special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, decomp_type='gzip')
stream = DecompressingBufferedReader(stream, decomp_type='gzip')
if rewritten_headers.charset:
encoding = rewritten_headers.charset


@ -24,9 +24,9 @@ def test_example_2():
def test_example_3():
status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
#def test_example_3():
# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff


@ -103,10 +103,12 @@ class UrlRewriter:
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def set_base_url(self, newUrl):
self.wburl.url = newUrl
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@staticmethod
def strip_protocol(url):
for protocol in UrlRewriter.PROTOCOLS:


@ -1,9 +1,5 @@
#!/usr/bin/python
import re
import rfc3987
# WbUrl : wb archival url representation for WB
"""
WbUrl represents the standard wayback archival url format.
A regular url is a subset of the WbUrl (latest replay).
@ -34,9 +30,38 @@ replay form:
latest_replay: (no timestamp)
http://example.com
Additionally, the BaseWbUrl provides the base components
(url, timestamp, end_timestamp, modifier, type) which
can be used to provide a custom representation of the
wayback url format.
"""
class WbUrl:
import re
import rfc3987
#=================================================================
class BaseWbUrl(object):
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
def __init__(self, url='', mod='',
timestamp='', end_timestamp='', type=None):
self.url = url
self.timestamp = timestamp
self.end_timestamp = end_timestamp
self.mod = mod
self.type = type
#=================================================================
class WbUrl(BaseWbUrl):
"""
# Replay Urls
# ======================
@ -107,22 +132,14 @@ class WbUrl:
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
DEFAULT_SCHEME = 'http://'
# ======================
def __init__(self, url):
super(WbUrl, self).__init__()
self.original_url = url
self.type = None
self.url = ''
self.timestamp = ''
self.end_timestamp = ''
self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]):
raise Exception('Invalid WbUrl: ', url)
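A quick usage sketch of the refactored class; urls are hypothetical and follow the forms listed in the docstring above:

from pywb.rewrite.wburl import WbUrl

u = WbUrl('20131226101010im_/http://example.com/')
assert u.type == u.REPLAY
assert (u.timestamp, u.mod, u.url) == ('20131226101010', 'im_', 'http://example.com/')

u2 = WbUrl('example.com')  # no timestamp; DEFAULT_SCHEME is prepended
assert u2.type == u2.LATEST_REPLAY
assert u2.url == 'http://example.com'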


@ -1,13 +1,19 @@
"""
Test Route
# route with relative path
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
# Test WbRequest parsed via a Route
# route with relative path, print resulting wbrequest
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False))
{'coll': 'web',
'request_uri': '/web/test.example.com',
'wb_prefix': '/web/',
'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')}
# route with absolute path, running at script /my_pywb
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
# route with absolute path, running at script /my_pywb, print resulting wbrequest
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True))
{'coll': 'web',
'request_uri': '/web/2013im_/test.example.com',
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
# not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
@ -65,7 +71,12 @@ False
"""
from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler
from pywb.handlers import BaseHandler, WbUrlHandler
import pprint
def print_req(req):
varlist = vars(req)
pprint.pprint({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
@ -74,7 +85,7 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
if http_host:
env['HTTP_HOST'] = http_host
routes = [Route(coll, BaseHandler())]
routes = [Route(coll, WbUrlHandler())]
redir = ReferRedirect(match_host)
#req = WbRequest.from_uri(request_uri, env)
@ -85,4 +96,6 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
return rep.status_headers.get_header('Location')
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -0,0 +1,87 @@
"""
# WbRequest Tests
# =================
>>> print_req_from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
>>> print_req_from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
>>> print_req_from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> print_req_from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# WbResponse Tests
# =================
>>> WbResponse.text_response('Test')
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
"""
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.wbrequestresponse import WbRequest, WbResponse
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
response = req_from_uri(request_uri, env, use_abs_prefix)
varlist = vars(response)
print str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
def req_from_uri(request_uri, env={}, use_abs_prefix=False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
rel_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
rel_prefix = '/'
wb_url_str = parts[1]
coll = ''
else:
rel_prefix = '/'
wb_url_str = parts[0]
coll = ''
return WbRequest(env,
request_uri=request_uri,
rel_prefix=rel_prefix,
wb_url_str=wb_url_str,
coll=coll,
wburl_class=WbUrl,
urlrewriter_class=UrlRewriter,
use_abs_prefix=use_abs_prefix)
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -35,6 +35,58 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
return min_ * block_size
#=================================================================
def binsearch(reader, key, compare_func=cmp, block_size=8192):
"""
Perform a binary search for a specified key to within a 'block_size'
(default 8192) granularity, and return first full line found.
"""
min_ = binsearch_offset(reader, key, compare_func, block_size)
reader.seek(min_)
if min_ > 0:
reader.readline() # skip partial line
def gen_iter(line):
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(reader.readline())
#=================================================================
def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
"""
Perform a linear search over iterator until
current_line >= key
optionally also tracking up to N previous lines, which are
returned before the first matched line.
If the end of the stream is reached before a match is found,
nothing is returned (the previous lines are discarded as well).
"""
prev_deque = deque(maxlen=prev_size + 1)
matched = False
for line in iter_:
prev_deque.append(line)
if compare_func(line, key) >= 0:
matched = True
break
# no matches, so return empty iterator
if not matched:
return []
return itertools.chain(prev_deque, iter_)
#=================================================================
def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
"""
@ -45,46 +97,27 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
When performing linear search, keep track of up to N previous lines before
first matching line.
"""
min_ = binsearch_offset(reader, key, compare_func, block_size)
iter_ = binsearch(reader, key, compare_func, block_size)
iter_ = linearsearch(iter_,
key, prev_size=prev_size,
compare_func=compare_func)
return iter_
reader.seek(min_)
if min_ > 0:
reader.readline() # skip partial line
#=================================================================
def iter_range(reader, start, end, prev_size=0):
"""
Creates an iterator which iterates over lines where
start <= line < end (end exclusive)
"""
if prev_size > 1:
prev_deque = deque(max_len=prev_size)
iter_ = search(reader, start, prev_size=prev_size)
line = None
end_iter = itertools.takewhile(
lambda line: line < end,
iter_)
while True:
line = reader.readline()
if not line:
break
if compare_func(line, key) >= 0:
break
if prev_size == 1:
prev = line
elif prev_size > 1:
prev_deque.append(line)
def gen_iter(line):
"""
Create iterator over any previous lines to
current matched line
"""
if prev_size == 1:
yield prev.rstrip()
elif prev_size > 1:
for i in prev_deque:
yield i.rstrip()
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(line)
return end_iter
#=================================================================
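Usage sketch for the new iter_range; the cdx path is hypothetical, and the pattern mirrors the doctests added in the binsearch tests below:

from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader

reader = SeekableTextFileReader('sample_archive/cdx/iana.cdx')
# lines with start <= line < end, plus one preceding line for context
for line in iter_range(reader, 'org,iana)/about', 'org,iana)/about!', prev_size=1):
    print line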


@ -11,7 +11,7 @@ def gzip_decompressor():
#=================================================================
class BufferedReader(object):
class DecompressingBufferedReader(object):
"""
A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to
@ -29,7 +29,7 @@ class BufferedReader(object):
DECOMPRESSORS = {'gzip': gzip_decompressor}
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
def __init__(self, stream, block_size=1024, decomp_type=None):
self.stream = stream
self.block_size = block_size
@ -44,24 +44,19 @@ class BufferedReader(object):
self.buff = None
self.num_read = 0
self.max_len = max_len
def _fillbuff(self, block_size=None):
if not block_size:
block_size = self.block_size
if not self.buff or self.buff.pos >= self.buff.len:
if self.max_len > 0:
to_read = min(self.max_len - self.num_read, self.block_size)
else:
to_read = self.block_size
data = self.stream.read(to_read)
data = self.stream.read(block_size)
self._process_read(data)
def _process_read(self, data):
data = self._decompress(data)
self.num_read += len(data)
self.buff_size = len(data)
self.num_read += self.buff_size
self.buff = StringIO.StringIO(data)
def _decompress(self, data):
@ -78,12 +73,40 @@ class BufferedReader(object):
return data
def read(self, length=None):
"""
Fill the buffer and read some number of bytes
(up to length, if specified).
Fewer than length bytes may be read if the end of the input
is reached or a buffer boundary is hit; if at a boundary, the
subsequent call will fill the buffer anew.
"""
self._fillbuff()
return self.buff.read(length)
def readline(self, length=None):
"""
Fill the buffer and read a full line from it
(up to the specified length, if provided).
If no newline is found at the end, try filling the buffer again
in case the read stopped at a buffer boundary.
"""
self._fillbuff()
return self.buff.readline(length)
linebuff = self.buff.readline(length)
# we may be at a boundary
while not linebuff.endswith('\n'):
if length:
length -= len(linebuff)
if length <= 0:
break
self._fillbuff()
if self.buff_size == 0:
break
linebuff += self.buff.readline(length)
return linebuff
def close(self):
if self.stream:
@ -97,7 +120,7 @@ class ChunkedDataException(Exception):
#=================================================================
class ChunkedDataReader(BufferedReader):
class ChunkedDataReader(DecompressingBufferedReader):
r"""
A ChunkedDataReader is a DecompressingBufferedReader which also supports de-chunking
of the data if it happens to be http 'chunk-encoded'.
@ -133,7 +156,7 @@ class ChunkedDataReader(BufferedReader):
def _fillbuff(self, block_size=None):
if self.not_chunked:
return BufferedReader._fillbuff(self, block_size)
return super(ChunkedDataReader, self)._fillbuff(block_size)
if self.all_chunks_read:
return


@ -9,18 +9,50 @@ import urllib2
import time
def is_http(filename):
return any(filename.startswith(x) for x in ['http://', 'https://'])
#=================================================================
# load a reader from http
#=================================================================
class HttpLoader(object):
class BlockLoader(object):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
a loader which can stream blocks of content
given a uri, offset and optional length.
Currently supports: http/https and file/local file system
"""
def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker
def load(self, url, offset, length):
"""
Determine loading method based on uri
"""
if is_http(url):
return self.load_http(url, offset, length)
else:
return self.load_file(url, offset, length)
def load_file(self, url, offset, length):
"""
Load a file-like reader from the local file system
"""
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
else:
return afile
def load_http(self, url, offset, length):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
@ -71,25 +103,6 @@ class HMACCookieMaker(object):
return cookie
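A usage sketch of the unified BlockLoader with an optional cookie maker; urls and key values are hypothetical, following the same pattern as the updated tests further below:

from pywb.utils.loaders import BlockLoader, HMACCookieMaker

loader = BlockLoader(HMACCookieMaker('secret', 'auth-cookie', 5))
# http(s) urls turn into a Range request ('bytes=41-54' here)
print loader.load('http://example.com/', 41, 14).read()

# local paths (with or without file://) are seek + optional LimitReader
print len(BlockLoader().load('file://sample_archive/cdx/iana.cdx', 0, 100).read())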
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
"""
Load a file-like reader from the local file system
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
#=================================================================
# Limit Reader
#=================================================================


@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
"""
parse stream for status line and headers
return a StatusAndHeaders object
support continuation headers starting with space or tab
"""
statusline = stream.readline().rstrip()
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline
msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, statusline)
headers = []
line = stream.readline().rstrip()
while line and line != '\r\n':
while line:
name, value = line.split(':', 1)
header = (name, value.strip())
name = name.rstrip(' \t')
value = value.lstrip()
next_line = stream.readline().rstrip()
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
value += next_line
next_line = stream.readline().rstrip()
header = (name, value)
headers.append(header)
line = stream.readline().rstrip()
line = next_line
return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers,
@ -107,4 +120,3 @@ class StatusAndHeadersParserException(Exception):
def __init__(self, msg, statusline):
super(StatusAndHeadersParserException, self).__init__(msg)
self.statusline = statusline


@ -9,6 +9,7 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# Exact Search
>>> print_binsearch_results('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
@ -19,18 +20,45 @@ org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3G
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
# Exact Search
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# Exact search -- no matches
>>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact)
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# Range Search (end exclusive)
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
# Range Search -- exact
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Range Search -- exact + 1 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Range Search -- exact + 2 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=2)
org,iana)/_js/2013.1/jquery.js 20140126201248 http://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 544 765491 iana.warc.gz
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
"""
#=================================================================
import os
from pywb.utils.binsearch import iter_prefix, iter_exact
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
from pywb.utils.loaders import SeekableTextFileReader
from pywb import get_test_dir
@ -45,6 +73,13 @@ def print_binsearch_results(key, iter_func):
print line
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
print line
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -10,9 +10,9 @@
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
# FileLoader Tests (includes LimitReader)
# BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100
# SeekableTextFileReader Test
@ -23,25 +23,39 @@
>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
#BufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
# Buffered Reader Tests
#=================================================================
#DecompressingBufferedReader readline()
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'
#BufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
#DecompressingBufferedReader readline() with decompression
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
# test very small block size
>>> dbr = DecompressingBufferedReader(StringIO.StringIO('ABCDEFG\\nHIJKLMN\\nOPQR\\nXYZ'), block_size = 3)
>>> dbr.readline(); dbr.readline(4); dbr.readline(); dbr.readline(); dbr.readline(2); dbr.readline(); dbr.readline()
'ABCDEFG\\n'
'HIJK'
'LMN\\n'
'OPQR\\n'
'XY'
'Z'
''
"""
#=================================================================
import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'


@ -0,0 +1,29 @@
"""
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser
import StringIO
status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959'
#PAD_STAMP_END = '29991231235959'
PAD_6 = '299912'
def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string))
# default pad is end of range for compatibility
def pad_timestamp(string, pad_str=PAD_STAMP_END):
# pad to certain length (default 6)
def _pad_timestamp(string, pad_str=PAD_6):
"""
>>> pad_timestamp('20')
'20991231235959'
>>> _pad_timestamp('20')
'209912'
>>> pad_timestamp('2014')
'20141231235959'
>>> _pad_timestamp('2014')
'201412'
>>> pad_timestamp('20141011')
'20141011235959'
>>> _pad_timestamp('20141011')
'20141011'
>>> pad_timestamp('201410110010')
'20141011001059'
>>> _pad_timestamp('201410110010')
'201410110010'
"""
str_len = len(string)
pad_len = len(pad_str)
return string if str_len >= pad_len else string + pad_str[str_len:]
if str_len < pad_len:
string = string + pad_str[str_len:]
return string
def timestamp_to_datetime(string):
"""
>>> timestamp_to_datetime('20131226095010')
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
# >14-digit -- rest ignored
>>> timestamp_to_datetime('2014122609501011')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 14-digit
>>> timestamp_to_datetime('20141226095010')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 13-digit padding
>>> timestamp_to_datetime('2014122609501')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 12-digit padding
>>> timestamp_to_datetime('201412260950')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 11-digit padding
>>> timestamp_to_datetime('20141226095')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 10-digit padding
>>> timestamp_to_datetime('2014122609')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 9-digit padding
>>> timestamp_to_datetime('201412260')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 8-digit padding
>>> timestamp_to_datetime('20141226')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 7-digit padding
>>> timestamp_to_datetime('2014122')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 6-digit padding
>>> timestamp_to_datetime('201410')
datetime.datetime(2014, 10, 31, 23, 59, 59)
# 5-digit padding
>>> timestamp_to_datetime('20141')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 4-digit padding
>>> timestamp_to_datetime('2014')
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 3-digit padding
>>> timestamp_to_datetime('201')
datetime.datetime(2019, 12, 31, 23, 59, 59)
# 2-digit padding
>>> timestamp_to_datetime('20')
datetime.datetime(2099, 12, 31, 23, 59, 59)
# 1-digit padding
>>> timestamp_to_datetime('2')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 1-digit out-of-range padding
>>> timestamp_to_datetime('3')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 0-digit padding
>>> timestamp_to_datetime('')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# bad month
>>> timestamp_to_datetime('20131709005601')
datetime.datetime(2013, 12, 9, 0, 56, 1)
# all out of range except minutes
>>> timestamp_to_datetime('40001965252477')
datetime.datetime(2999, 12, 31, 23, 24, 59)
"""
# Default pad to end of range for compatibility
return time.strptime(pad_timestamp(string), TIMESTAMP_14)
# pad to 6 digits
string = _pad_timestamp(string, PAD_6)
def clamp(val, min_, max_):
try:
val = int(val)
val = max(min_, min(val, max_))
return val
except:
return max_
def extract(string, start, end, min_, max_):
if len(string) >= end:
return clamp(string[start:end], min_, max_)
else:
return max_
# now parse, clamp to boundary
year = extract(string, 0, 4, 1900, 2999)
month = extract(string, 4, 6, 1, 12)
day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
hour = extract(string, 8, 10, 0, 23)
minute = extract(string, 10, 12, 0, 59)
second = extract(string, 12, 14, 0, 59)
return datetime.datetime(year=year,
month=month,
day=day,
hour=hour,
minute=minute,
second=second)
#return time.strptime(pad_timestamp(string), TIMESTAMP_14)
def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
1420070399
"""
return calendar.timegm(timestamp_to_datetime(string))
return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
if __name__ == "__main__":


@ -56,9 +56,9 @@ class J2TemplateView:
# Filters
@staticmethod
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
value = timeutils.timestamp_to_datetime(value)
return time.strftime(format, value)
return value.strftime(format_)
@staticmethod
def get_host(url):


@ -6,8 +6,8 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
@ -32,24 +32,12 @@ class ArcWarcRecordLoader:
ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"]
@staticmethod
def create_default_loaders(cookie_maker=None):
http = HttpLoader(cookie_maker)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
if not loader:
loader = BlockLoader(cookie_maker)
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
self.loaders = loaders
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
self.chunk_size = chunk_size
self.loader = loader
self.block_size = block_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
@ -60,22 +48,25 @@ class ArcWarcRecordLoader:
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme)
if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url)
#loader = self.loaders.get(url_parts.scheme)
#if not loader:
# raise ArchiveLoadFailed('Unknown Protocol', url)
try:
length = int(length)
except:
length = -1
raw = loader.load(url, long(offset), length)
raw = self.loader.load(url, long(offset), length)
decomp_type = 'gzip'
stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
# Create decompressing stream
stream = DecompressingBufferedReader(stream = raw,
decomp_type = decomp_type,
block_size = self.block_size)
(the_format, rec_headers) = self._load_headers(stream)
(the_format, rec_headers) = self._detect_type_load_headers(stream)
if the_format == 'arc':
rec_type = 'response'
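For reference, a hedged sketch of driving the new load path (the file URL and offset are hypothetical; the unpacking order follows the ArcWarcRecord constructor call below):
(fmt_rectype, rec_headers, stream, status_headers) = \
    ArcWarcRecordLoader().load('file:///path/to/example.warc.gz', offset=0, length=-1)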
@ -111,7 +102,7 @@ class ArcWarcRecordLoader:
return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers)
def _load_headers(self, stream):
def _detect_type_load_headers(self, stream):
"""
Try parsing record as WARC, then try parsing as ARC.
If neither one succeeds, we're out of luck.

View File

@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
except Exception as e:
print 'Exception: ' + e.__class__.__name__
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,99 +1,75 @@
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
import pprint
#WB Request and Response
#=================================================================
class WbRequest:
"""
>>> WbRequest.from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
Represents the main pywb request object.
>>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
Contains various info from the wsgi env, plus additional info
about the request, such as coll, relative prefix,
host prefix, and absolute prefix.
>>> WbRequest.from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> WbRequest.from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
If wburl and url rewriter classes are specified, the request
also contains the url rewriter.
"""
@staticmethod
def from_uri(request_uri, env = {}, use_abs_prefix = False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
wb_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
wb_prefix = '/'
wb_url_str = parts[1]
coll = ''
else:
wb_prefix = '/'
wb_url_str = parts[0]
coll = ''
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else ''
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix)
@staticmethod
def make_host_prefix(env):
try:
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST']
host = env.get('HTTP_HOST')
if not host:
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
return env['wsgi.url_scheme'] + '://' + host
except KeyError:
return ''
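The fallback added here can be sketched with illustrative environ dicts:
>>> WbRequest.make_host_prefix({'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'})
'http://localhost:8080'
>>> WbRequest.make_host_prefix({'wsgi.url_scheme': 'http', 'SERVER_NAME': 'localhost', 'SERVER_PORT': '8080'})
'http://localhost:8080'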
def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll,
host_prefix = '',
wburl_class = WbUrl,
url_rewriter_class = UrlRewriter,
is_proxy = False):
def __init__(self, env,
request_uri=None,
rel_prefix='',
wb_url_str='/',
coll='',
host_prefix='',
use_abs_prefix=False,
wburl_class=None,
urlrewriter_class=None,
is_proxy=False):
self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
self.host_prefix = host_prefix
self.coll = coll
if not host_prefix:
host_prefix = self.make_host_prefix(env)
self.host_prefix = host_prefix
self.rel_prefix = rel_prefix
if use_abs_prefix:
self.wb_prefix = host_prefix + rel_prefix
else:
self.wb_prefix = rel_prefix
self.wb_prefix = host_prefix + wb_prefix
if not wb_url_str:
wb_url_str = '/'
self.wb_url_str = wb_url_str
# wb_url present and not root page
if wb_url_str != '/' and wburl_class:
self.wb_url_str = wb_url_str
self.wb_url = wburl_class(wb_url_str)
self.urlrewriter = url_rewriter_class(self.wb_url, self.wb_prefix)
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
else:
# no wb_url, just store blank wb_url
self.wb_url_str = wb_url_str
self.wb_url = None
self.urlrewriter = None
self.coll = coll
self.referrer = env.get('HTTP_REFERER')
self.is_ajax = self._is_ajax()
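A minimal sketch of the new keyword-based construction (values illustrative; WbUrl and UrlRewriter per the imports at the top of this file):
env = {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}  # illustrative wsgi environ
req = WbRequest(env,
                request_uri='/pywb/20140127171238/http://example.com/',
                rel_prefix='/pywb/',
                wb_url_str='20140127171238/http://example.com/',
                coll='pywb',
                use_abs_prefix=False,
                wburl_class=WbUrl,
                urlrewriter_class=UrlRewriter)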
@ -122,24 +98,19 @@ class WbRequest:
def __repr__(self):
#return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
#return str(vars(self))
varlist = vars(self)
return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
varstr = pprint.pformat(varlist)
return varstr
#=================================================================
class WbResponse:
"""
>>> WbResponse.text_response('Test')
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
Represents a pywb wsgi response object.
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
Holds a status_headers object and a response iter, to be
returned to the wsgi container.
"""
def __init__(self, status_headers, value = []):
self.status_headers = status_headers
self.body = value
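A hedged usage sketch (StatusAndHeaders keyword names inferred from its repr above):
status = StatusAndHeaders(statusline='200 OK',
                          headers=[('Content-Type', 'text/plain')])
resp = WbResponse(status, value=['Test'])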
@ -180,8 +151,3 @@ class WbResponse:
def __repr__(self):
return str(vars(self))
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -75,6 +75,11 @@ class TestWb:
assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
def test_replay_content_length_1(self):
# test larger file, rewritten file (svg!)
resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
assert resp.headers['Content-Length'] == str(len(resp.body))
def test_redirect_1(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
@ -119,6 +124,20 @@ class TestWb:
assert resp.content_type == 'text/css'
def test_referrer_self_redirect(self):
uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
host = 'somehost:8082'
referrer = 'http://' + host + uri
# capture is normally a 200
resp = self.testapp.get(uri)
assert resp.status_int == 200
# redirect causes skip of this capture, redirect to next
resp = self.testapp.get(uri, headers = [('Referer', referrer), ('Host', host)], status = 302)
assert resp.status_int == 302
def test_excluded_content(self):
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
assert resp.status_int == 403