mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

remove obsolete code and tests!

disable test_auto_colls for now until fully supported in new system
This commit is contained in:
Ilya Kreymer 2017-04-24 18:52:15 -07:00
parent 24c968640d
commit 52dc46fe6a
37 changed files with 2 additions and 5864 deletions

View File

@ -1,28 +0,0 @@
### pywb.cdx package
This package contains the CDX processing suite of the pywb wayback tool suite.
The CDX Server loads, filters and transforms cdx from multiple sources in response
to a given query.
#### Sample App
A very simple reference WSGI app is included.
Run: `python -m pywb.cdx.wsgi_cdxserver` to start the app, keyboard interrupt to stop.
The default [config.yaml](config.yaml) points to the sample data directory
and uses port 8080.
The domain-specific rules in [rules.yaml](rules.yaml) are also loaded.
#### CDX Server API Reference
The goal is to provide compatibility with this feature set and more:

View File

@ -1,185 +0,0 @@
import yaml
import re
import logging
import pkg_resources
from six.moves.urllib.parse import urlsplit
from pywb.utils.dsrules import BaseRule, RuleSet
from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from pywb.utils.loaders import to_native_str
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
canon = None
fuzzy = None
# Load Canonicalizer Rules
rules = RuleSet(CDXDomainSpecificRule, 'canonicalize',
if not surt_ordered:
for rule in rules.rules:
if rules:
canon = CustomUrlCanonicalizer(rules, surt_ordered)
# Load Fuzzy Lookup Rules
rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup',
if not surt_ordered:
for rule in rules.rules:
if rules:
fuzzy = FuzzyQuery(rules)
logging.debug('CustomCanonilizer? ' + str(bool(canon)))
logging.debug('FuzzyMatcher? ' + str(bool(canon)))
return (canon, fuzzy)
class CustomUrlCanonicalizer(UrlCanonicalizer):
def __init__(self, rules, surt_ordered=True):
super(CustomUrlCanonicalizer, self).__init__(surt_ordered)
self.rules = rules
def __call__(self, url):
urlkey = super(CustomUrlCanonicalizer, self).__call__(url)
for rule in self.rules.iter_matching(urlkey):
m = rule.regex.match(urlkey)
if not m:
if rule.replace:
return m.expand(rule.replace)
return urlkey
class FuzzyQuery(object):
def __init__(self, rules):
self.rules = rules
def __call__(self, query):
matched_rule = None
urlkey = to_native_str(query.key, 'utf-8')
url = query.url
filter_ = query.filters
output = query.output
for rule in self.rules.iter_matching(urlkey):
m = rule.regex.search(urlkey)
if not m:
matched_rule = rule
groups = m.groups()
for g in groups:
for f in matched_rule.filter:
if not matched_rule:
return None
repl = '?'
if matched_rule.replace:
repl = matched_rule.replace
inx = url.find(repl)
if inx > 0:
url = url[:inx + len(repl)]
if matched_rule.match_type == 'domain':
host = urlsplit(url).netloc
# remove the subdomain
url = host.split('.', 1)[1]
params = query.params
params.update({'url': url,
'matchType': matched_rule.match_type,
'filter': filter_})
if 'reverse' in params:
del params['reverse']
if 'closest' in params:
del params['closest']
if 'end_key' in params:
del params['end_key']
return params
class CDXDomainSpecificRule(BaseRule):
DEFAULT_FILTER = ['~urlkey:{0}']
def __init__(self, name, config):
super(CDXDomainSpecificRule, self).__init__(name, config)
if not isinstance(config, dict):
self.regex = self.make_regex(config)
self.replace = None
self.filter = self.DEFAULT_FILTER
self.match_type = self.DEFAULT_MATCH_TYPE
self.regex = self.make_regex(config.get('match'))
self.replace = config.get('replace')
self.filter = config.get('filter', self.DEFAULT_FILTER)
self.match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
def unsurt(self):
urlkey is assumed to be in surt format by default
In the case of non-surt format, this method is called
to desurt any urls
self.url_prefix = list(map(unsurt, self.url_prefix))
if self.regex:
self.regex = re.compile(unsurt(self.regex.pattern))
if self.replace:
self.replace = unsurt(self.replace)
def make_regex(config):
# just query args
if isinstance(config, list):
string = CDXDomainSpecificRule.make_query_match_regex(config)
# split out base and args
elif isinstance(config, dict):
string = config.get('regex', '')
string += CDXDomainSpecificRule.make_query_match_regex(
config.get('args', []))
# else assume string
string = str(config)
return re.compile(string)
def make_query_match_regex(params_list):
    """Build a regex fragment that matches the given query arguments in order.

    Each name in ``params_list`` is regex-escaped and turned into the
    capture group ``[?&](<name>=[^&]+)``; the groups are then joined
    with ``.*`` so arbitrary text may appear between them.

    :param params_list: iterable of query argument names
    :return: the combined pattern string (empty string for no names)
    """
    # NOTE(review): this is invoked elsewhere in the file as
    # CDXDomainSpecificRule.make_query_match_regex(...), so it presumably
    # belongs inside that class as a @staticmethod -- confirm when the
    # class body is re-nested.
    return '.*'.join(
        '[?&]({0}=[^&]+)'.format(re.escape(name)) for name in params_list)

View File

@ -1,230 +0,0 @@
from pywb.utils.canonicalize import UrlCanonicalizer
from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxops import cdx_load
from pywb.cdx.cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from pywb.cdx.zipnum import ZipNumCluster
from pywb.cdx.cdxobject import CDXObject, CDXException
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
from pywb.utils.loaders import is_http
from itertools import chain
import logging
import os
class BaseCDXServer(object):
def __init__(self, **kwargs):
ds_rules_file = kwargs.get('ds_rules_file')
surt_ordered = kwargs.get('surt_ordered', True)
# load from domain-specific rules
if ds_rules_file:
self.url_canon, self.fuzzy_query = (
load_domain_specific_cdx_rules(ds_rules_file, surt_ordered))
# or custom passed in canonicalizer
self.url_canon = kwargs.get('url_canon')
self.fuzzy_query = kwargs.get('fuzzy_query')
# set default canonicalizer if none set thus far
if not self.url_canon:
self.url_canon = UrlCanonicalizer(surt_ordered)
def _check_cdx_iter(self, cdx_iter, query):
""" Check cdx iter semantics
If `cdx_iter` is empty (no matches), check if fuzzy matching
is allowed, and try it -- otherwise,
throw :exc:`~pywb.utils.wbexception.NotFoundException`
cdx_iter = self.peek_iter(cdx_iter)
if cdx_iter:
return cdx_iter
# check if fuzzy is allowed and ensure that its an
# exact match
if (self.fuzzy_query and
query.allow_fuzzy and
fuzzy_query_params = self.fuzzy_query(query)
if fuzzy_query_params:
return self.load_cdx(**fuzzy_query_params)
msg = 'No Captures found for: ' + query.url
if not query.is_exact:
msg += ' (' + query.match_type + ' query)'
raise NotFoundException(msg, url=query.url)
#def _calc_search_keys(self, query):
# return calc_search_range(url=query.url,
# match_type=query.match_type,
# url_canon=self.url_canon)
def load_cdx(self, **params):
    """Run a cdx query described by keyword parameters.

    The server's canonicalizer is injected into ``params`` under the
    ``_url_canon`` key, a ``CDXQuery`` is built from the result, the
    query is handed to ``_load_cdx_query`` and the resulting iterator is
    passed through ``_check_cdx_iter`` before being returned.

    :param params: query parameters forwarded to ``CDXQuery``
    :return: whatever ``_check_cdx_iter`` returns for the loaded iterator
    """
    params['_url_canon'] = self.url_canon
    query = CDXQuery(params)

    cdx_iter = self._load_cdx_query(query)
    return self._check_cdx_iter(cdx_iter, query)
def _load_cdx_query(self, query): # pragma: no cover
raise NotImplementedError('Implement in subclass')
def peek_iter(iterable):
    """Check whether an iterator yields anything without losing its first item.

    :param iterable: an iterator (it is advanced with ``next``)
    :return: ``None`` if the iterator is already exhausted, otherwise a
        new iterator yielding the consumed first item followed by the
        remaining items
    """
    # NOTE(review): called as self.peek_iter(...) in _check_cdx_iter, so
    # this presumably belongs in BaseCDXServer as a @staticmethod --
    # confirm when the class body is re-nested.
    try:
        first = next(iterable)
    except StopIteration:
        return None

    # put the consumed item back in front of the rest of the stream
    return chain([first], iterable)
class CDXServer(BaseCDXServer):
Top-level cdx server object which maintains a list of cdx sources,
responds to queries and dispatches to the cdx ops for processing
def __init__(self, paths, **kwargs):
super(CDXServer, self).__init__(**kwargs)
# TODO: we could save config in member, so that other
# methods can use it. it's bad for add_cdx_source to take
# config argument.
self._create_cdx_sources(paths, kwargs.get('config'))
def _load_cdx_query(self, query):
load CDX for query parameters ``params``.
``key`` (or ``url``) parameter specifies URL to query,
``matchType`` parameter specifies matching method for ``key``
(default ``exact``).
other parameters are passed down to :func:`cdx_load`.
raises :exc:`~pywb.utils.wbexception.NotFoundException`
if no captures are found.
:param query: query parameters
:type query: :class:`~pywb.cdx.query.CDXQuery`
:rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject`
return cdx_load(self.sources, query)
def _create_cdx_sources(self, paths, config):
build CDXSource instances for each of path in ``paths``.
:param paths: list of sources or single source.
each source may be either string or CDXSource instance. value
of any other types will be silently ignored.
:param config: config object passed to :method:`add_cdx_source`.
self.sources = []
if paths is not None:
if not isinstance(paths, (list, tuple)):
paths = [paths]
for path in paths:
self.add_cdx_source(path, config)
if len(self.sources) == 0:
logging.warn('No CDX Sources configured from paths=%s', paths)
def _add_cdx_source(self, source):
if source is None:
logging.debug('Adding CDX Source: %s', source)
def add_cdx_source(self, source, config):
if isinstance(source, CDXSource):
elif isinstance(source, str):
if os.path.isdir(source):
for fn in os.listdir(source):
os.path.join(source, fn), config))
source, config))
def _create_cdx_source(self, filename, config):
    """Instantiate the cdx source that matches ``filename``.

    The checks are applied in this order:

    * ``is_http(filename)`` is true -> ``RemoteCDXSource``
    * starts with ``redis://`` -> ``RedisCDXSource``
    * ends with ``.cdx`` or ``.cdxj`` -> ``CDXFile``
    * ends with ``.summary`` or ``.idx`` -> ``ZipNumCluster``

    :param filename: path or URI of the source
    :param config: passed on to the redis and zipnum sources only
    :return: the new source, or ``None`` if ``filename`` is not recognized
    """
    if is_http(filename):
        return RemoteCDXSource(filename)

    if filename.startswith('redis://'):
        return RedisCDXSource(filename, config)

    if filename.endswith(('.cdx', '.cdxj')):
        return CDXFile(filename)

    if filename.endswith(('.summary', '.idx')):
        return ZipNumCluster(filename, config)

    # no warning for .loc or .gz (zipnum)
    if not filename.endswith(('.loc', '.gz')):
        # logging.warn is a deprecated alias; use logging.warning
        logging.warning('skipping unrecognized URI: %s', filename)

    return None
class RemoteCDXServer(BaseCDXServer):
A special cdx server that uses a single
It simply proxies the query params to the remote source
and performs no local processing/filtering
def __init__(self, source, **kwargs):
super(RemoteCDXServer, self).__init__(**kwargs)
if isinstance(source, RemoteCDXSource):
self.source = source
elif (isinstance(source, str) and is_http(source)):
self.source = RemoteCDXSource(source, remote_processing=True)
raise Exception('Invalid remote cdx source: ' + str(source))
def _load_cdx_query(self, query):
return cdx_load([self.source], query, process=False)
def create_cdx_server(config, ds_rules_file=None, server_cls=None):
if hasattr(config, 'get'):
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
pass_config = config
paths = config
surt_ordered = True
pass_config = None
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if not server_cls:
if ((isinstance(paths, str) and is_http(paths)) or
isinstance(paths, RemoteCDXSource)):
server_cls = RemoteCDXServer
server_cls = CDXServer
return server_cls(paths,

View File

@ -1,150 +0,0 @@
from pywb.utils.binsearch import iter_range
from pywb.utils.wbexception import AccessException, NotFoundException
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.cdx.query import CDXQuery
from six.moves.urllib.request import urlopen, Request
from six.moves.urllib.error import HTTPError
from six.moves import map
class CDXSource(object):
    """Represents any cdx index source.

    Subclasses must override :meth:`load_cdx`.
    """

    def load_cdx(self, query):  # pragma: no cover
        """Load cdx for ``query``; not implemented in the base class.

        :raises NotImplementedError: always
        """
        raise NotImplementedError('Implement in subclass')
class CDXFile(CDXSource):
Represents a local plain-text .cdx file
def __init__(self, filename):
self.filename = filename
def load_cdx(self, query):
return self._do_load_file(self.filename, query)
def _do_load_file(filename, query):
with open(filename, 'rb') as source:
gen = iter_range(source, query.key,
for line in gen:
yield line
def __str__(self):
return 'CDX File - ' + self.filename
class RemoteCDXSource(CDXSource):
Represents a remote cdx server, to which requests will be proxied.
Only ``url`` and ``match_type`` params are proxied at this time,
the stream is passed through all other filters locally.
def __init__(self, filename, cookie=None, remote_processing=False):
self.remote_url = filename
self.cookie = cookie
self.remote_processing = remote_processing
def load_cdx(self, query):
if self.remote_processing:
remote_query = query
# Only send url and matchType to remote
remote_query = CDXQuery(dict(url=query.url,
urlparams = remote_query.urlencode()
request = Request(self.remote_url + '?' + urlparams)
if self.cookie:
request.add_header('Cookie', self.cookie)
response = urlopen(request)
except HTTPError as e:
if e.code == 403:
raise AccessException('Access Denied')
elif e.code == 404:
# return empty list for consistency with other cdx sources
# will be converted to 404 if no other retry
return []
elif e.code == 400:
raise BadRequestException()
raise WbException('Invalid response from remote cdx server')
return iter(response)
def __str__(self):
if self.remote_processing:
return 'Remote CDX Server: ' + self.remote_url
return 'Remote CDX Source: ' + self.remote_url
class RedisCDXSource(CDXSource):
def __init__(self, redis_url, config=None):
import redis
parts = redis_url.split('/')
if len(parts) > 4:
self.cdx_key = parts[4].encode('utf-8')
redis_url = 'redis://' + parts[2] + '/' + parts[3]
self.cdx_key = None
self.redis_url = redis_url
self.redis = redis.StrictRedis.from_url(redis_url)
self.key_prefix = self.DEFAULT_KEY_PREFIX
def load_cdx(self, query):
    """Load cdx from the redis cache, from an ordered list.

    If ``cdx_key`` is set, treat it as a cdx file and load using
    zrangebylex (supports all match types).
    Otherwise, assume a key per url and load all entries for that key
    (only exact match supported).

    :param query: the query; its ``key`` is used for the per-url lookup
    :return: the result of ``load_sorted_range`` or ``load_single_key``
    """
    if self.cdx_key:
        return self.load_sorted_range(query, self.cdx_key)

    return self.load_single_key(query.key)
def load_sorted_range(self, query, cdx_key):
cdx_list = self.redis.zrangebylex(cdx_key,
b'[' + query.key,
b'(' + query.end_key)
return iter(cdx_list)
def load_single_key(self, key):
    """Load every entry stored under a single per-url redis key.

    :param key: bytes search key; only the part before the first space
        is used
    :return: list of lines, each the url key plus a space followed by
        one member returned by ``zrange``
    """
    # ensure only url/surt is part of key
    key = key.split(b' ')[0]
    members = self.redis.zrange(self.key_prefix + key, 0, -1)

    # key is not part of list, so prepend to each line
    prefix = key + b' '
    return [prefix + member for member in members]
def __str__(self):
return 'Redis - ' + self.redis_url

View File

@ -1,40 +0,0 @@
Load Rules
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
# Fuzzy Query Args Builder
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
# Fuzzy Match Query + Args
# list
>>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern
# dict
>>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern
# string
>>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern
from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
if __name__ == "__main__":
import doctest

View File

@ -1,228 +0,0 @@
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# From & To
>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], from_ts='2013', to='2013')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], from_ts='2014')
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://example.com/
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# No matching -- limit=1
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# Filter cdx (default: regex)
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter Alt field name
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter -- no field specified, match regex on entire line
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter -- no such field, no matches
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css
# Filter exact -- (* prefix)
>>> cdx_ops_test(url = 'http://example.com*', sources = [test_cdx_dir], filter = '=urlkey:com,example)/?example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter exact invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = ['!=urlkey:com,example)/?example=1', '!=urlkey:com,example)/?example=2', '!=urlkey:com,example)/?example=3'])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Filter contains
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
# In case of both reverse and closest, closest takes precedence
# 'reverse closest' not supported at this time
# if it is, this test will reflect the change
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True)
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
from pywb.cdx.cdxserver import CDXServer
import os
import sys
import six
from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/'
def cdx_ops_test_data(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    """Run a cdx query against ``sources`` and return the results as a list.

    :param url: url to query; stored in the query params under ``url``
    :param sources: cdx sources handed to ``CDXServer`` (the default list
        is never mutated here)
    :param kwparams: additional query parameters; ``output`` defaults to
        ``'cdxobject'`` when not supplied
    :return: list of everything yielded by ``CDXServer.load_cdx``
    """
    kwparams['url'] = url
    kwparams.setdefault('output', 'cdxobject')

    server = CDXServer(sources)
    return list(server.load_cdx(**kwparams))
def cdx_ops_test(*args, **kwargs):
results = cdx_ops_test_data(*args, **kwargs)
fields = kwargs.get('fields')
if fields:
fields = fields.split(',')
for x in results:
if not isinstance(x, str):
l = x.to_text(fields).replace('\t', ' ')
l = x
def test_cdxj_resolve_revisit():
# Resolve Revisit -- cdxj minimal -- output also json
results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
assert(len(results) == 2)
assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})
assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"})
def test_cdxj_resolve_revisit_2():
# Resolve Revisit -- cdxj minimal -- output also json
results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True)
assert(len(results) == 2)
assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})
assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})
if __name__ == "__main__":
import doctest

View File

@ -1,117 +0,0 @@
import yaml
from pywb.cdx.cdxserver import create_cdx_server, CDXServer, RemoteCDXServer
from pywb.cdx.cdxsource import CDXFile, RemoteCDXSource, RedisCDXSource
from pywb.cdx.zipnum import ZipNumCluster
from pywb import get_test_dir
yaml_config = r"""
# local cdx paths
- {0}cdx/example.cdx
# simple remote cdx source, assumes no filtering
- http://cdxserver.example.com/cdx
# customized remote cdx server
- !!python/object:pywb.cdx.cdxsource.RemoteCDXSource {{
remote_url: 'http://cdxserver.example.com/cdx',
cookie: custom_token=value,
remote_processing: true,
# example redis cdx source
- redis://redis.example.com:6379/0
- {0}zipcdx/zipnum-sample.idx
index_paths: http://cdxserver.example.com/cdx
test_3: http://cdxserver.example.com/cdx
test_4: !!python/object:pywb.cdx.cdxsource.RemoteCDXSource {{
remote_url: 'http://cdxserver.example.com/cdx',
cookie: custom_token=value,
remote_processing: true,
test_5: {0}cdx/example.cdx
index_paths: invalid://abc
def test_cdxserver_config():
config = yaml.load(yaml_config)
cdxserver = create_cdx_server(config.get('test_1'))
assert(isinstance(cdxserver, CDXServer))
sources = cdxserver.sources
assert len(sources) == 5
assert type(sources[0]) == CDXFile
assert sources[0].filename.endswith('example.cdx')
# remote source with no remote processing
assert type(sources[1]) == RemoteCDXSource
assert sources[1].remote_url == 'http://cdxserver.example.com/cdx'
assert sources[1].remote_processing == False
# remote cdx server with processing
assert type(sources[2]) == RemoteCDXSource
assert sources[2].remote_url == 'http://cdxserver.example.com/cdx'
assert sources[2].remote_processing == True
# redis source
assert type(sources[3]) == RedisCDXSource
assert sources[3].redis_url == 'redis://redis.example.com:6379/0'
assert type(sources[4]) == ZipNumCluster
assert sources[4].summary.endswith('zipnum-sample.idx')
assert sources[4].loc_resolver.loc_filename.endswith('zipnum-sample.loc')
def assert_remote_cdxserver(config_name):
    """Assert that the named config entry builds a RemoteCDXServer.

    :param config_name: key of the entry in ``yaml_config`` to build from
    """
    # Explicit Loader: required by PyYAML 6, and the full Loader is needed
    # for the !!python/object tags in this module's own yaml_config.
    config = yaml.load(yaml_config, Loader=yaml.Loader)
    cdxserver = create_cdx_server(config.get(config_name))
    assert(isinstance(cdxserver, RemoteCDXServer))

    source = cdxserver.source

    # remote cdx server with remote processing
    assert type(source) == RemoteCDXSource
    assert source.remote_url == 'http://cdxserver.example.com/cdx'
    assert source.remote_processing == True
def test_remote_index_path():
def test_no_index_path_remote():
def test_explicit_remote_source():
def test_single_cdx():
    """The ``test_5`` entry yields a CDXServer with one CDXFile source."""
    # Explicit Loader: yaml.load() without one is rejected by PyYAML 6.
    config = yaml.load(yaml_config, Loader=yaml.Loader)
    cdxserver = create_cdx_server(config.get('test_5'))
    assert(isinstance(cdxserver, CDXServer))

    sources = cdxserver.sources
    assert len(sources) == 1

    assert type(sources[0]) == CDXFile
    assert sources[0].filename.endswith('example.cdx')
def test_invalid_config():
    """The ``test_6`` entry still yields a CDXServer, but with no sources."""
    # Explicit Loader: yaml.load() without one is rejected by PyYAML 6.
    config = yaml.load(yaml_config, Loader=yaml.Loader)
    cdxserver = create_cdx_server(config.get('test_6'))
    assert(isinstance(cdxserver, CDXServer))

    sources = cdxserver.sources
    assert len(sources) == 0

View File

@ -1,78 +0,0 @@
>>> redis_cdx(redis_cdx_server, 'http://example.com')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
>>> redis_cdx(redis_cdx_server_key, 'http://example.com')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
from fakeredis import FakeStrictRedis
from mock import patch
from warcio.timeutils import timestamp_to_sec
from pywb.cdx.cdxsource import RedisCDXSource
from pywb.cdx.cdxserver import CDXServer
from pywb import get_test_dir
import sys
import os
test_cdx_dir = os.path.join(get_test_dir(), 'cdx/')
def load_cdx_into_redis(source, filename, key=None):
    """Feed every line of a cdx file under test_cdx_dir to zadd_cdx.

    :param source: object handed through to zadd_cdx
    :param filename: file name appended to test_cdx_dir
    :param key: optional key handed through to zadd_cdx
    """
    # load a cdx into mock redis
    cdx_path = test_cdx_dir + filename
    with open(cdx_path, 'rb') as cdx_file:
        for cdx_line in cdx_file:
            zadd_cdx(source, cdx_line, key)
def zadd_cdx(source, cdx, key):
    """Add one raw cdx line (bytes) to ``source.redis``.

    With ``key`` set, the whole line is added to that one sorted set with
    score 0 and nothing else is written. Otherwise the line is split on
    its first two spaces into urlkey, timestamp and remainder, and
    ``timestamp + b' ' + remainder`` is added under
    ``source.key_prefix + urlkey`` with the timestamp in seconds as score.
    """
    if key:
        source.redis.zadd(key, 0, cdx)
        # single-key mode is done; do not also populate the per-url set
        return

    parts = cdx.split(b' ', 2)

    urlkey = parts[0]
    timestamp = parts[1]
    rest = timestamp + b' ' + parts[2]

    score = timestamp_to_sec(timestamp.decode('utf-8'))
    source.redis.zadd(source.key_prefix + urlkey, score, rest)
@patch('redis.StrictRedis', FakeStrictRedis)
def init_redis_server():
    """Return a CDXServer over a RedisCDXSource filled from test_cdx_dir.

    Every ``.cdx`` file in the directory is loaded without an explicit key.
    """
    source = RedisCDXSource('redis://')

    cdx_names = (name for name in os.listdir(test_cdx_dir)
                 if name.endswith('.cdx'))
    for name in cdx_names:
        load_cdx_into_redis(source, name)

    return CDXServer([source])
@patch('redis.StrictRedis', FakeStrictRedis)
def init_redis_server_key_file():
    """Return a CDXServer over a RedisCDXSource filled from test_cdx_dir.

    Every ``.cdx`` file in the directory is loaded with ``source.cdx_key``
    passed as the key.
    """
    source = RedisCDXSource('redis://')

    cdx_names = (name for name in os.listdir(test_cdx_dir)
                 if name.endswith('.cdx'))
    for name in cdx_names:
        load_cdx_into_redis(source, name, source.cdx_key)

    return CDXServer([source])
# Query cdx_server.load_cdx() for `url` (extra keyword params are passed
# through) and iterate over the returned cdx entries.
# NOTE(review): the body of the `for` loop is not present in this chunk;
# the doctests above show each cdx line being printed -- confirm.
def redis_cdx(cdx_server, url, **params):
cdx_iter = cdx_server.load_cdx(url=url, **params)
for cdx in cdx_iter:
redis_cdx_server = init_redis_server()
redis_cdx_server_key = init_redis_server_key_file()

View File

@ -1,243 +0,0 @@
>>> zip_ops_test(url='http://iana.org')
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# test idx index (tabs replaced with 4 spaces)
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True)
org,iana)/dnssec 20140126201307 zipnum 8517 373 35
org,iana)/domains/int 20140126201239 zipnum 8890 355 36
org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37
>>> zip_ops_test(url='http://iana.org/domains/*')
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# first page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
com,example)/ 20140127171200 zipnum 0 275 1
org,iana)/ 20140127171238 zipnum 275 328 2
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4
# first page -- simplified query
>>> zip_ops_test(url='*.iana.org/path_part_ignored/', showPagedIndex=True, pageSize=4)
com,example)/ 20140127171200 zipnum 0 275 1
org,iana)/ 20140127171238 zipnum 275 328 2
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4
# next page + json
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1)
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1150, "length": 235, "lineno": 5}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1385, "length": 307, "lineno": 6}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8}
# last page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9)
org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37
org,iana)/time-zones 20140126200737 zipnum 9631 166 38
# last page cdx
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9)
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# last page reverse -- not yet supported
#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9)
#org,iana)/time-zones 20140126200737 zipnum 9623 145 38
#org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
# last page reverse CDX
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9)
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# last url prefix
>>> zip_ops_test(url='http://iana.org/time-zones*')
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# last url prefix w/ slash
>>> zip_ops_test(url='http://iana.org/time-zones/*')
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# last url exact
>>> zip_ops_test(url='http://iana.org/time-zones/Y')
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# invalid page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
CDXException: Page 10 invalid: First Page is 0, Last Page is 9
>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.aaa/
>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.aaa/ (domain query)
# list last index line, as we don't know if there are any captures at end
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
org,iana)/time-zones 20140126200737 zipnum 9631 166 38
# read cdx to find no captures
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.zz/ (domain query)
# Invalid .idx files or missing loc
>>> zip_test_err(url='http://example.com/', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
Exception: No Locations Found for: foo
>>> zip_test_err(url='http://example.zz/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
Exception: No Locations Found for: foo2
from test_cdxops import cdx_ops_test, cdx_ops_test_data
from pywb import get_test_dir
from pywb.cdx.cdxserver import CDXServer
import shutil
import tempfile
import os
import json
import pytest
test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'
def zip_ops_test_data(url, **kwargs):
    """Query the sample zipnum index; return the first result parsed as JSON."""
    first_result = cdx_ops_test_data(url, test_zipnum, **kwargs)[0]
    return json.loads(first_result)
def zip_ops_test(url, **kwargs):
    """Run cdx_ops_test for ``url`` against the sample zipnum index."""
    cdx_ops_test(url, test_zipnum, **kwargs)
def zip_test_err(url, **kwargs):
    """Run cdx_ops_test for ``url`` against the ``zipnum-bad.idx`` index."""
    bad_index = get_test_dir() + 'zipcdx/zipnum-bad.idx'
    cdx_ops_test(url, bad_index, **kwargs)
# Copies the sample .idx and .cdx.gz into a temp dir, builds a CDXServer
# over the copied index with a 'shard_index_loc' config, then checks a
# page-count query and a plain query.
# NOTE(review): several lines are not present in this chunk (creation of
# `config`, and the continuation arguments of dict(...), CDXServer(...) and
# the first load_cdx(...) call) -- restore from the original file before
# running.
def test_zip_prefix_load():
tmpdir = tempfile.mkdtemp()
shutil.copy(test_zipnum, tmpdir)
# the shard is copied under the name 'zipnum'
shutil.copy(get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz',
os.path.join(tmpdir, 'zipnum'))
config['shard_index_loc'] = dict(match='(.*)',
server = CDXServer(os.path.join(tmpdir, 'zipnum-sample.idx'),
# Test Page Count
results = server.load_cdx(url='iana.org/',
results = list(results)
assert len(results) == 1, results
assert json.loads(results[0]) == {"blocks": 38, "pages": 4, "pageSize": 10}
# Test simple query
results = server.load_cdx(url='iana.org/')
results = list(results)
assert len(results) ==3, results
assert '20140126200624' in results[0]
assert '20140127171238' in results[1]
assert 'warc/revisit' in results[2]
def test_blocks_def_page_size():
    """Page info for an exact-url query with no explicit page size."""
    # Pages -- default page size
    page_info = zip_ops_test_data(url='http://iana.org/domains/example',
                                  matchType='exact',
                                  showNumPages=True)
    assert page_info == {"blocks": 1, "pages": 1, "pageSize": 10}
def test_blocks_def_size_2():
    """Page info for a domain query with no explicit page size."""
    page_info = zip_ops_test_data(url='http://iana.org/domains/',
                                  matchType='domain',
                                  showNumPages=True)
    assert page_info == {"blocks": 38, "pages": 4, "pageSize": 10}
def test_blocks_set_page_size():
    """Page info for a domain query with pageSize=4."""
    # set page size
    page_info = zip_ops_test_data(url='http://iana.org/domains/',
                                  matchType='domain',
                                  pageSize=4,
                                  showNumPages=True)
    assert page_info == {"blocks": 38, "pages": 10, "pageSize": 4}
def test_blocks_alt_q():
    """Page info for a ``*.`` wildcard url with pageSize given as a string."""
    # set page size -- alt domain query
    page_info = zip_ops_test_data(url='*.iana.org',
                                  pageSize='4',
                                  showNumPages=True)
    assert page_info == {"blocks": 38, "pages": 10, "pageSize": 4}
def test_blocks_secondary_match():
    """A url with no captures reports zero blocks and pages."""
    # page size for non-existent, but secondary index match
    page_info = zip_ops_test_data(url='iana.org/domains/int/blah',
                                  pageSize=4,
                                  showNumPages=True)
    assert page_info == {"blocks": 0, "pages": 0, "pageSize": 4}
def test_blocks_no_match():
    """A wildcard url with no index match reports zero blocks and pages."""
    # page size for non-existent, no secondary index match
    page_info = zip_ops_test_data(url='*.foo.bar', showNumPages=True)
    assert page_info == {"blocks": 0, "pages": 0, "pageSize": 10}
def test_blocks_zero_pages():
    """A domain query for http://aaa.zz/ reports zero blocks and pages."""
    # read cdx to find 0 pages
    page_info = zip_ops_test_data(url='http://aaa.zz/',
                                  matchType='domain',
                                  showNumPages=True)
    assert page_info == {"blocks": 0, "pages": 0, "pageSize": 10}
# Errors
def test_err_file_not_found():
    """A query against the bad index is expected to raise IOError."""
    query = dict(url='http://iana.org/x', matchType='exact')
    with pytest.raises(IOError):
        zip_test_err(**query) # doctest: +IGNORE_EXCEPTION_DETAIL
if __name__ == "__main__":
import doctest

View File

@ -1,353 +0,0 @@
import os
import collections
import itertools
import logging
from io import BytesIO
import datetime
import json
import six
from six.moves import map
from pywb.cdx.cdxsource import CDXSource
from pywb.cdx.cdxobject import IDXObject, CDXException
from pywb.utils.loaders import BlockLoader, read_last_line
from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search
class ZipBlocks(object):
    """Mutable record describing a run of blocks inside one shard part.

    idx_to_cdx() grows ``length`` and ``count`` in place while it merges
    adjacent index entries.
    """
    def __init__(self, part, offset, length, count):
        self.part = part      # part name, passed to the location resolver
        self.offset = offset  # start offset handed to the block loader
        self.length = length  # total length of the merged run
        self.count = count    # number of index entries merged so far
#TODO: see if these could be combined with warc path resolvers
class LocMapResolver(object):
    """ Lookup shards based on a file mapping
    shard name to one or more paths. The entries are
    tab delimited.
    """
    def __init__(self, loc_summary, loc_filename):
        """
        :param loc_summary: path of the index summary; used to derive the
            map file name when ``loc_filename`` is empty
        :param loc_filename: path of the map file, or a falsy value
        """
        # initial loc map
        self.loc_map = {}
        self.loc_mtime = 0
        if not loc_filename:
            # default: summary path with its extension replaced by .loc
            splits = os.path.splitext(loc_summary)
            loc_filename = splits[0] + '.loc'
        self.loc_filename = loc_filename

    def load_loc(self):
        """(Re)read the map file, unless its mtime matches the last load."""
        # check modified time of current file before loading
        new_mtime = os.path.getmtime(self.loc_filename)
        if (new_mtime == self.loc_mtime):
            # unchanged since the last load: keep the current map
            return

        # update loc file mtime
        self.loc_mtime = new_mtime

        local_dir = os.path.dirname(self.loc_filename)

        def res_path(pathname):
            # entries without a scheme are relative to the map file's dir
            if '://' not in pathname:
                pathname = os.path.join(local_dir, pathname)
            return pathname

        logging.debug('Loading loc from: ' + self.loc_filename)
        with open(self.loc_filename, 'r') as fh:
            for line in fh:
                parts = line.rstrip().split('\t')

                paths = [res_path(pathname) for pathname in parts[1:]]
                self.loc_map[parts[0]] = paths

    def __call__(self, part, query):
        """Return the list of paths for ``part``; raises KeyError if unknown."""
        return self.loc_map[part]
class LocPrefixResolver(object):
    """ Use a prefix lookup, where the prefix can either be a fixed
    string or can be a regex replacement of the index summary path
    """
    def __init__(self, loc_summary, loc_config):
        """
        :param loc_summary: path of the index summary; its directory
            (with a trailing '/') is the string the regex is applied to
        :param loc_config: dict with a required ``replace`` entry and an
            optional ``match`` pattern
        """
        import re
        loc_replace = loc_config['replace']
        loc_summary = os.path.dirname(loc_summary) + '/'

        loc_match = loc_config.get('match')
        if loc_match is None:
            # The default pattern consumes the whole path. Since Python
            # 3.7 re.sub() also replaces the empty match left at the end
            # of the string, which would emit the replacement twice, so
            # the default substitution is limited to one replacement.
            self.prefix = re.sub('().*', loc_replace, loc_summary, count=1)
        else:
            self.prefix = re.sub(loc_match, loc_replace, loc_summary)

    def load_loc(self):
        """No-op: the prefix is computed once in __init__."""
        pass

    def __call__(self, part, query):
        """Return a one-element list: the prefix followed by ``part``."""
        return [self.prefix + part]
class ZipNumCluster(CDXSource):
def __init__(self, summary, config=None):
    """Set up the cluster for the index summary at ``summary``.

    ``config`` may supply ``shard_index_loc``, ``cookie_maker``,
    ``max_blocks`` and ``reload_interval``; anything absent falls back to
    the class defaults.
    """
    self.max_blocks = self.DEFAULT_MAX_BLOCKS

    self.loc_resolver = None

    loc = None
    cookie_maker = None
    reload_ival = self.DEFAULT_RELOAD_INTERVAL

    if config:
        loc = config.get('shard_index_loc')
        cookie_maker = config.get('cookie_maker')

        self.max_blocks = config.get('max_blocks', self.max_blocks)

        reload_ival = config.get('reload_interval', reload_ival)

    # A dict selects the prefix resolver; anything else (including None)
    # is treated as a map file name. The two are mutually exclusive, so
    # the dict-based resolver must not be overwritten.
    if isinstance(loc, dict):
        self.loc_resolver = LocPrefixResolver(summary, loc)
    else:
        self.loc_resolver = LocMapResolver(summary, loc)

    self.summary = summary

    # reload interval
    self.loc_update_time = datetime.datetime.now()
    self.reload_interval = datetime.timedelta(minutes=reload_ival)

    self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
# @staticmethod
# def reload_timed(timestamp, val, delta, func):
# now = datetime.datetime.now()
# if now - timestamp >= delta:
# func()
# return now
# return None
# def reload_loc(self):
# reload_time = self.reload_timed(self.loc_update_time,
# self.loc_map,
# self.reload_interval,
# self.load_loc)
# if reload_time:
# self.loc_update_time = reload_time
def load_cdx(self, query):
    """Return the result of _do_load_cdx() for this cluster's summary file."""
    return self._do_load_cdx(self.summary, query)
def _do_load_cdx(self, filename, query):
reader = open(filename, 'rb')
idx_iter = self.compute_page_range(reader, query)
if query.secondary_index_only or query.page_count:
return idx_iter
blocks = self.idx_to_cdx(idx_iter, query)
def gen_cdx():
for blk in blocks:
for cdx in blk:
yield cdx
return gen_cdx()
def _page_info(self, pages, pagesize, blocks):
info = dict(pages=pages,
return json.dumps(info) + '\n'
# Generator over the index summary `reader` for `query`: yields either a
# single page-info JSON line (when query.page_count is set) or the raw idx
# lines belonging to the requested page.
# NOTE(review): this chunk is missing lines from this method -- at least the
# `try:` for each `except StopIteration:`, some `else:` / `return`
# statements and the continuation of the iter_range(...) call. The
# comments below describe only what is visible; restore the original
# before relying on control flow.
def compute_page_range(self, reader, query):
# page size: from the query, else self.max_blocks; coerced to int
pagesize = query.page_size
if not pagesize:
pagesize = self.max_blocks
pagesize = int(pagesize)
last_line = None
# Get End
end_iter = search(reader, query.end_key, prev_size=1)
end_line = six.next(end_iter)
except StopIteration:
# nothing found for end_key: use the last line of the index instead
last_line = read_last_line(reader)
end_line = last_line
# Get Start
first_iter = iter_range(reader,
first_line = six.next(first_iter)
except StopIteration:
if end_line == last_line and query.key >= last_line:
first_line = last_line
if query.page_count:
yield self._page_info(0, pagesize, 0)
first = IDXObject(first_line)
end = IDXObject(end_line)
# number of index lines between first and end, from their 'lineno' fields
blocks = end['lineno'] - first['lineno']
total_pages = int(blocks / pagesize) + 1
blocks = -1
total_pages = 1
if query.page_count:
# same line, so actually need to look at cdx
# to determine if it exists
if blocks == 0:
block_cdx_iter = self.idx_to_cdx([first_line], query)
block = six.next(block_cdx_iter)
cdx = six.next(block)
except StopIteration:
total_pages = 0
blocks = -1
yield self._page_info(total_pages, pagesize, blocks + 1)
curr_page = query.page
# valid pages are 0 .. total_pages - 1
if curr_page >= total_pages or curr_page < 0:
msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
raise CDXException(msg.format(curr_page, total_pages - 1))
startline = curr_page * pagesize
endline = startline + pagesize - 1
if blocks >= 0:
endline = min(endline, blocks)
# first_line was already pulled from first_iter above, so page 0 yields
# it explicitly; startline is then decremented before slicing
if curr_page == 0:
yield first_line
startline -= 1
idxiter = itertools.islice(first_iter, startline, endline)
for idx in idxiter:
yield idx
def search_by_line_num(self, reader, line):  # pragma: no cover
    """Yield the first result of search() over ``reader`` for ``line``.

    Lines are compared by the integer after their last tab. Nothing is
    yielded if the search has no result.
    """
    def line_cmp(line1, line2):
        line1_no = int(line1.rsplit(b'\t', 1)[-1])
        line2_no = int(line2.rsplit(b'\t', 1)[-1])
        # three-way comparison; the cmp() builtin is gone in Python 3
        return (line1_no > line2_no) - (line1_no < line2_no)

    line_iter = search(reader, line, compare_func=line_cmp)

    # A bare next() on an empty iterator would leak StopIteration out of
    # this generator, which PEP 479 turns into RuntimeError.
    for found in line_iter:
        yield found
        break
# Generator: walks the idx lines in `idx_iter`, groups them into ZipBlocks
# runs, and yields one block_to_cdx_iter(...) result per run.
# NOTE(review): lines are missing from this chunk -- at least an `else:`
# before the flush-and-start-new-run branch, the ranges update for the
# merge branch, and the remaining ZipBlocks(...) arguments. Confirm
# against the original.
def idx_to_cdx(self, idx_iter, query):
blocks = None
ranges = []
for idx in idx_iter:
idx = IDXObject(idx)
# merge condition: same part, starts exactly where the current run
# ends, and the run has fewer than max_blocks entries
if (blocks and blocks.part == idx['part'] and
blocks.offset + blocks.length == idx['offset'] and
blocks.count < self.max_blocks):
blocks.length += idx['length']
blocks.count += 1
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, query)
blocks = ZipBlocks(idx['part'],
ranges = [blocks.length]
# flush the final run, if any
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, query)
# Resolves the locations for blocks.part through self.loc_resolver and
# returns load_blocks(...) for a location; the last exception seen is
# re-raised with its traceback, otherwise a 'No Locations Found' Exception
# is raised.
# NOTE(review): the `try:` lines matching the `except` below, and the
# handler for the first loc_resolver call, are not present in this chunk --
# confirm the exact control flow against the original.
def block_to_cdx_iter(self, blocks, ranges, query):
last_exc = None
last_traceback = None
locations = self.loc_resolver(blocks.part, query)
raise Exception('No Locations Found for: ' + blocks.part)
for location in self.loc_resolver(blocks.part, query):
return self.load_blocks(location, blocks, ranges, query)
except Exception as exc:
# remember the failure along with its traceback for the re-raise below
last_exc = exc
import sys
last_traceback = sys.exc_info()[2]
if last_exc:
six.reraise(Exception, last_exc, last_traceback)
#raise last_exc
raise Exception('No Locations Found for: ' + blocks.part)
def load_blocks(self, location, blocks, ranges, query):
    """ Load one or more blocks of compressed cdx lines, return
    a line iterator which decompresses and returns one line at a time,
    bounded by query.key and query.end_key
    """
    if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
        msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
        logging.debug(msg.format(b=blocks, loc=location))

    reader = self.blk_loader.load(location, blocks.offset, blocks.length)

    def decompress_block(range_):
        # each range is read and decompressed with its own decompressor
        decomp = gzip_decompressor()
        buff = decomp.decompress(reader.read(range_))
        for line in BytesIO(buff):
            yield line

    iter_ = itertools.chain(*map(decompress_block, ranges))

    # start bound
    iter_ = linearsearch(iter_, query.key)

    # end bound
    iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_)
    return iter_
# Human-readable description starting with 'ZipNum Cluster: ' and the
# summary path.
# NOTE(review): the format() call is cut off in this chunk; the argument for
# the second placeholder is not visible -- restore from the original.
def __str__(self):
return 'ZipNum Cluster: {0}, {1}'.format(self.summary,

View File

@ -1,245 +0,0 @@
from six.moves.urllib.parse import urlsplit, urlunsplit, quote
import re
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
# ArchivalRouter -- route WB requests in archival mode
class ArchivalRouter(object):
def __init__(self, routes, **kwargs):
    """Store the routes and read the optional settings from ``kwargs``.

    Recognised keys: ``port``, ``abs_path``, ``home_view``, ``error_view``,
    ``info_view`` and ``config`` (a dict that may hold
    ``urlrewriter_class`` and ``enable_coll_info``).
    """
    self.routes = routes

    # optional port setting may be ignored by wsgi container
    self.port = kwargs.get('port')

    self.fallback = ReferRedirect()

    self.abs_path = kwargs.get('abs_path')

    self.home_view = kwargs.get('home_view')
    self.error_view = kwargs.get('error_view')
    self.info_view = kwargs.get('info_view')

    settings = kwargs.get('config', {})
    self.urlrewriter_class = settings.get('urlrewriter_class', UrlRewriter)
    self.enable_coll_info = settings.get('enable_coll_info', False)
# WSGI-style entry: resolves the relative request uri, hands the request to
# a route whose is_handling() returns a matcher, otherwise serves the home
# page, the collection info view, or the fallback.
# NOTE(review): the continuation lines of the parse_request(...) and
# info_view.render_response(...) calls are not present in this chunk.
def __call__(self, env):
request_uri = self.ensure_rel_uri_set(env)
for route in self.routes:
matcher, coll = route.is_handling(request_uri)
if matcher:
wbrequest = self.parse_request(route, env, matcher,
coll, request_uri,
return route.handler(wbrequest)
# Default Home Page
if request_uri in ['/', '/index.html', '/index.htm']:
return self.render_home_page(env)
# collection info is served only when enabled via config
if self.enable_coll_info and request_uri in ['/collinfo.json']:
params = env.get('pywb.template_params', {})
host = WbRequest.make_host_prefix(env)
return self.info_view.render_response(env=env, host=host, routes=self.routes,
# no route matched: defer to the fallback handler, if one is set
return self.fallback(env, self) if self.fallback else None
# Builds the request object for a matched route via route.request_class,
# applies the route's filters, and returns it.
# NOTE(review): the rest of the signature, the `else:` between the two
# wb_url_str assignments and the remaining request_class(...) arguments
# are not present in this chunk.
def parse_request(self, route, env, matcher, coll, request_uri,
matched_str = matcher.group(0)
rel_prefix = env.get('SCRIPT_NAME', '') + '/'
if matched_str:
rel_prefix += matched_str + '/'
# remove the '/' + rel_prefix part of uri
wb_url_str = request_uri[len(matched_str) + 2:]
# the request_uri is the wb_url, since no coll
wb_url_str = request_uri[1:]
wbrequest = route.request_class(env,
# Allow for applying of additional filters
route.apply_filters(wbrequest, matcher)
return wbrequest
def render_home_page(self, env):
    """Render the home view, or return None when no home view is set.

    The routes and any ``pywb.template_params`` found in ``env`` are passed
    to the view as keyword arguments.
    """
    if not self.home_view:
        return None

    params = env.get('pywb.template_params', {})
    return self.home_view.render_response(env=env, routes=self.routes, **params)
# adapted from wsgiref.request_uri, but doesn't include domain name
# and allows all characters which are allowed in the path segment
# according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here:
# http://stackoverflow.com/questions/4669692/
# valid-characters-for-directory-part-of-a-url-for-short-links
def ensure_rel_uri_set(env):
    """ Return the full requested path, including the query string

    The value is cached in ``env['REL_REQUEST_URI']``: an existing entry is
    returned as is; otherwise it is taken from ``REQUEST_URI`` (only when
    ``SCRIPT_NAME`` is empty) or rebuilt from ``PATH_INFO`` and
    ``QUERY_STRING``.
    """
    # NOTE(review): __call__ invokes this as self.ensure_rel_uri_set(env)
    # although only `env` is accepted -- presumably a @staticmethod in the
    # original file; confirm.
    if 'REL_REQUEST_URI' in env:
        return env['REL_REQUEST_URI']

    if not env.get('SCRIPT_NAME') and env.get('REQUEST_URI'):
        # REL_REQUEST_URI is known to be absent here, so it has to be set
        # from REQUEST_URI before it can be returned
        env['REL_REQUEST_URI'] = env['REQUEST_URI']
        return env['REL_REQUEST_URI']

    url = quote(env.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
    query = env.get('QUERY_STRING')
    if query:
        url += '?' + query

    env['REL_REQUEST_URI'] = url
    return url
# Route by matching regex (or fixed prefix)
# of request uri (excluding first '/')
# A single route: a compiled regex matched against the request uri (minus
# its leading '/'), plus the handler and per-route settings read from
# `config`.
# NOTE(review): lines are missing from this chunk -- the lookahead constant
# the first comment refers to, the rest of the __init__ signature
# (`request_class`, `lookahead` are used but not declared here), the
# `else:` before re.compile(''), and the end of apply_filters. Confirm
# against the original.
class Route(object):
# match up to next / or ? or end
def __init__(self, regex, handler, config=None,
config = config or {}
self.path = regex
if regex:
self.regex = re.compile(regex + lookahead)
self.regex = re.compile('')
self.handler = handler
self.request_class = request_class
# collection id from regex group (default 0)
self.coll_group = int(config.get('coll_group', 0))
self.cookie_scope = config.get('cookie_scope')
self.rewrite_opts = config.get('rewrite_opts', {})
self.user_metadata = config.get('metadata', {})
# Returns (matcher, coll) when the route's regex matches the uri without
# its first character, else (None, None); coll is regex group
# self.coll_group.
def is_handling(self, request_uri):
matcher = self.regex.match(request_uri[1:])
if not matcher:
return None, None
coll = matcher.group(self.coll_group)
return matcher, coll
# Formats each configured filter with the last regex group of the match.
# NOTE(review): what is done with filter_str is not visible in this chunk.
def apply_filters(self, wbrequest, matcher):
for filter in self.filters:
last_grp = len(matcher.groups())
filter_str = filter.format(matcher.group(last_grp))
# Reads the 'filters' list from config (default: empty list).
def _custom_init(self, config):
self.filters = config.get('filters', [])
# ReferRedirect -- redirect urls that have 'fallen through'
# based on the referrer settings
# Fallback handler: uses the Referer header to find the route the referring
# page was served from, rewrites the current request uri against it, and
# either re-dispatches (POST) or answers with a 302 redirect.
# NOTE(review): lines are missing from this chunk -- at least the remaining
# urlunsplit((...)) tuple members, and possibly a `break` at the end of the
# route loop. Confirm against the original.
class ReferRedirect:
def __call__(self, env, the_router):
referrer = env.get('HTTP_REFERER')
routes = the_router.routes
# ensure there is a referrer
if referrer is None:
return None
# get referrer path name
ref_split = urlsplit(referrer)
# require that referrer starts with current Host, if any
curr_host = env.get('HTTP_HOST')
if curr_host and curr_host != ref_split.netloc:
return None
path = ref_split.path
app_path = env.get('SCRIPT_NAME', '')
if app_path:
# must start with current app name, if not root
if not path.startswith(app_path):
return None
path = path[len(app_path):]
ref_route = None
ref_request = None
# find the route that handles the referrer path
for route in routes:
matcher, coll = route.is_handling(path)
if matcher:
ref_request = the_router.parse_request(route, env,
matcher, coll, path)
ref_route = route
# must have matched one of the routes with a urlrewriter
if not ref_request or not ref_request.urlrewriter:
return None
rewriter = ref_request.urlrewriter
rel_request_uri = env['REL_REQUEST_URI']
timestamp_path = '/' + rewriter.wburl.timestamp + '/'
# check if timestamp is already part of the path
if rel_request_uri.startswith(timestamp_path):
# remove timestamp but leave / to make host relative url
# 2013/path.html -> /path.html
rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]
rewritten_url = rewriter.rewrite(rel_request_uri)
# if post, can't redirect as that would lose the post data
# (can't use 307 because FF will show confirmation warning)
if ref_request.method == 'POST':
new_wb_url = WbUrl(rewritten_url[len(rewriter.prefix):])
ref_request.wb_url.url = new_wb_url.url
return ref_route.handler(ref_request)
final_url = urlunsplit((ref_split.scheme,
return WbResponse.redir_response(final_url, status='302 Temp Redirect')

View File

@ -1,23 +0,0 @@
from pywb.rewrite.wburl import WbUrl
class BaseHandler(object):
    """
    Represents a base handler class that handles any request
    """
    def __call__(self, wbrequest):  # pragma: no cover
        """Handle ``wbrequest``; must be overridden by derived classes."""
        raise NotImplementedError('Need to implement in derived class')

    def get_wburl_type(self):
        """Return the wburl class this handler expects; None here."""
        return None
class WbUrlHandler(BaseHandler):
    """
    Represents a handler which assumes the request contains a WbUrl
    Ensure that the WbUrl is parsed in the request
    """
    def get_wburl_type(self):
        """Return the WbUrl class."""
        return WbUrl

View File

@ -1,62 +0,0 @@
try: # pragma: no cover
import uwsgi
uwsgi_cache = True
except ImportError:
uwsgi_cache = False
from redis import StrictRedis
from pywb.utils.loaders import to_native_str
class UwsgiCache(object):  # pragma: no cover
    """Mapping-style wrapper over the uwsgi module's cache functions."""
    def __setitem__(self, item, value):
        uwsgi.cache_update(item, value)

    def __getitem__(self, item):
        return uwsgi.cache_get(item)

    def __contains__(self, item):
        return uwsgi.cache_exists(item)

    def __delitem__(self, item):
        # the deletion counterpart of cache_update / cache_get
        uwsgi.cache_del(item)
class DefaultCache(dict):
    """dict whose item lookup returns None for a missing key, not KeyError."""
    def __getitem__(self, item):
        return self.get(item)
class RedisCache(object):
    """Mapping-style cache kept in the fields of a single redis hash."""
    def __init__(self, redis_url):
        # must be of the form redis://host:port/db/key
        # the last path segment is the hash key, the rest the server url
        server_url, hash_key = redis_url.rsplit('/', 1)
        self.redis = StrictRedis.from_url(server_url)
        self.key = hash_key

    def __setitem__(self, item, value):
        self.redis.hset(self.key, item, value)

    def __getitem__(self, item):
        stored = self.redis.hget(self.key, item)
        return to_native_str(stored, 'utf-8')

    def __contains__(self, item):
        return self.redis.hexists(self.key, item)

    def __delitem__(self, item):
        self.redis.hdel(self.key, item)
def create_cache(redis_url_key=None):
    """Pick a cache implementation.

    A RedisCache when ``redis_url_key`` is given, else a UwsgiCache when
    the uwsgi module was importable, else a DefaultCache.
    """
    if redis_url_key:
        return RedisCache(redis_url_key)

    if not uwsgi_cache:
        return DefaultCache()

    return UwsgiCache()  # pragma: no cover

View File

@ -1,231 +0,0 @@
from pywb.utils.wbexception import BadRequestException
from warcio.timeutils import http_date_to_timestamp
from warcio.timeutils import timestamp_to_http_date
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.wburl import WbUrl
import six
LINK_FORMAT = 'application/link-format'
# Request mixin: _parse_extra() marks the request as a timegate via
# self.options['is_timegate'] and reads the Accept-Datetime header.
# NOTE(review): lines are missing from this chunk -- the bodies of the guard
# `if`s (presumably early returns), the `try:` matching `except Exception:`,
# the end of the BadRequestException(...) call, and the statement the final
# comment refers to. Confirm against the original.
class MementoReqMixin(object):
def _parse_extra(self):
if not self.wb_url:
if self.wb_url.type != self.wb_url.LATEST_REPLAY:
self.options['is_timegate'] = True
accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME')
if not accept_datetime:
# convert the HTTP date from the header into a timestamp
timestamp = http_date_to_timestamp(accept_datetime)
except Exception:
raise BadRequestException('Invalid Accept-Datetime: ' +
# note: this changes from LATEST_REPLAY -> REPLAY
class MementoRequest(MementoReqMixin, WbRequest):
class MementoRespMixin(object):
# Adds the Memento-related response headers: 'Vary' for timegates and a
# 'Link' header assembled from original / timegate links.
# NOTE(review): lines are missing from this chunk -- early `return`s after
# the guards, several `else:` branches, the remaining arguments of
# get_new_url(...), and the statements that use http_date and canon_link.
# Comments below describe only what is visible.
def _init_derived(self, params):
wbrequest = params.get('wbrequest')
is_redirect = params.get('memento_is_redir', False)
cdx = params.get('cdx')
if not wbrequest or not wbrequest.wb_url:
mod = wbrequest.options.get('replay_mod', '')
#is_top_frame = wbrequest.wb_url.is_top_frame
is_top_frame = wbrequest.options.get('is_top_frame', False)
# a top-frame request is never treated as a timegate
is_timegate = (wbrequest.options.get('is_timegate', False) and
not is_top_frame)
if is_timegate:
self.status_headers.replace_header('Vary', 'accept-datetime')
# Determine if memento:
is_memento = False
is_original = False
# if no cdx included, not a memento, unless top-frame special
if not cdx:
# special case: include the headers but except Memento-Datetime
# since this is really an intermediate resource
if is_top_frame:
is_memento = True
# otherwise, if in proxy mode, then always a memento
elif wbrequest.options['is_proxy']:
is_memento = True
is_original = True
# otherwise only if timestamp replay (and not a timegate)
#elif not is_timegate:
# is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY)
elif not is_redirect:
is_memento = (wbrequest.wb_url.is_replay())
link = []
req_url = wbrequest.wb_url.url
if is_memento or is_timegate:
url = req_url
# timestamp and url come from the cdx entry when one is present
if cdx:
ts = cdx['timestamp']
url = cdx['url']
# for top frame
elif wbrequest.wb_url.timestamp:
ts = wbrequest.wb_url.timestamp
ts = None
if ts:
http_date = timestamp_to_http_date(ts)
if is_memento:
canon_link = wbrequest.urlrewriter.get_new_url(mod=mod,
# set in replay_views -- Must set content location
#if is_memento and is_timegate:
# self.status_headers.headers.append(('Content-Location',
# canon_link))
# don't set memento link for very long urls...
if len(canon_link) < 512:
if is_original and is_timegate:
link.append(self.make_link(req_url, 'original timegate'))
link.append(self.make_link(req_url, 'original'))
# for now, include timemap only in non-proxy mode
if not wbrequest.options['is_proxy'] and (is_memento or is_timegate):
if is_memento and not is_timegate:
timegate = wbrequest.urlrewriter.get_new_url(mod=mod, timestamp='')
link.append(self.make_link(timegate, 'timegate'))
# all collected links go into one comma-separated Link header
link = ', '.join(link)
self.status_headers.replace_header('Link', link)
def make_link(self, url, type):
    """Format one Link header value of the form ``<url>; rel="type"``.

    :param url: target URI placed between the angle brackets
    :param type: relation name placed in the ``rel`` attribute
    :return: the formatted link value
    """
    link_template = '<{0}>; rel="{1}"'
    return link_template.format(url, type)
def make_memento_link(self, url, type_, dt):
    """Format a Link header value that also carries a ``datetime`` attribute.

    :param url: target URI placed between the angle brackets
    :param type_: relation name placed in the ``rel`` attribute
    :param dt: value placed in the ``datetime`` attribute
    :return: ``<url>; rel="type_"; datetime="dt"``
    """
    link_template = '<{0}>; rel="{1}"; datetime="{2}"'
    return link_template.format(url, type_, dt)
def make_timemap_link(self, wbrequest):
format_ = '<{0}>; rel="timemap"; type="{1}"'
url = wbrequest.urlrewriter.get_new_url(mod='timemap',
return format_.format(url, LINK_FORMAT)
class MementoResponse(MementoRespMixin, WbResponse):
# Build one `<url>; rel="..."; datetime="..."` timemap entry for the capture
# described by `cdx`.
#   cdx      -- mapping read for its 'url' and 'timestamp' keys
#   prefix   -- string prepended to the wburl string to form the link target
#   datetime -- datetime attribute value; when falsy it is derived from
#               cdx['timestamp'] via timestamp_to_http_date()
#   rel      -- relation name (default 'memento')
#   end      -- text appended after the entry (default ',\n')
#   mod      -- not referenced on any line visible here
# NOTE(review): the WbUrl.to_wburl_str(...) call below is cut off after its
# first argument in this listing; its remaining arguments (presumably where
# `mod` is used) must be restored from the upstream file.
def make_timemap_memento_link(cdx, prefix, datetime=None,
rel='memento', end=',\n', mod=''):
memento = '<{0}>; rel="{1}"; datetime="{2}"' + end
string = WbUrl.to_wburl_str(url=cdx['url'],
url = prefix + string
if not datetime:
datetime = timestamp_to_http_date(cdx['timestamp'])
return memento.format(url, rel, datetime)
def make_timemap(wbrequest, cdx_lines):
prefix = wbrequest.wb_prefix
url = wbrequest.wb_url.url
mod = wbrequest.options.get('replay_mod', '')
# get first memento as it'll be used for 'from' field
first_cdx = six.next(cdx_lines)
from_date = timestamp_to_http_date(first_cdx['timestamp'])
except StopIteration:
first_cdx = None
if first_cdx:
# timemap link
timemap = ('<{0}>; rel="self"; ' +
'type="application/link-format"; from="{1}",\n')
yield timemap.format(prefix + wbrequest.wb_url.to_str(),
# original link
original = '<{0}>; rel="original",\n'
yield original.format(url)
# timegate link
timegate = '<{0}>; rel="timegate",\n'
timegate_url= WbUrl.to_wburl_str(url=url,
yield timegate.format(prefix + timegate_url)
if not first_cdx:
# terminating timemap link, no from
timemap = ('<{0}>; rel="self"; type="application/link-format"')
yield timemap.format(prefix + wbrequest.wb_url.to_str())
# first memento link
yield make_timemap_memento_link(first_cdx, prefix,
datetime=from_date, mod=mod)
prev_cdx = None
for cdx in cdx_lines:
if prev_cdx:
yield make_timemap_memento_link(prev_cdx, prefix, mod=mod)
prev_cdx = cdx
# last memento link, if any
if prev_cdx:
yield make_timemap_memento_link(prev_cdx, prefix, end='', mod=mod)

View File

@ -1,463 +0,0 @@
from __future__ import absolute_import
from pywb.framework.wbrequestresponse import WbResponse, WbRequest
from pywb.framework.archivalrouter import ArchivalRouter
from six.moves.urllib.parse import urlsplit
from six import iteritems
import base64
import socket
import ssl
from io import BytesIO
from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.utils.wbexception import BadRequestException
from warcio.bufferedreaders import BufferedReader
from warcio.utils import to_native_str
from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
from tempfile import SpooledTemporaryFile
class ProxyArchivalRouter(ArchivalRouter):
    """
    A router which combines both archival and proxy modes support

    First, request is treated as a proxy request using ProxyRouter
    Second, if not handled by the router, it is treated as a regular
    archival mode request.
    """

    def __init__(self, routes, **kwargs):
        """Initialise the archival router and a ProxyRouter over the same routes.

        :param routes: route list handed to both routers
        :param kwargs: extra options, forwarded unchanged to both routers
        """
        super(ProxyArchivalRouter, self).__init__(routes, **kwargs)
        self.proxy = ProxyRouter(routes, **kwargs)

    def __call__(self, env):
        """Route one request described by the WSGI environ ``env``.

        :return: the first truthy response from the proxy router or, failing
            that, from the archival router; ``None`` when neither handles it.
        """
        # Proxy mode gets the first chance at the request.
        response = self.proxy(env)
        if response:
            return response

        # Not handled as a proxy request: fall back to archival routing.
        response = super(ProxyArchivalRouter, self).__call__(env)
        if response:
            return response

        return None
class ProxyRouter(object):
A router which supports http proxy mode requests
Handles requests of the form: GET http://example.com
The router returns latest capture by default.
However, if Memento protocol support is enabled,
the memento Accept-Datetime header can be used
to select specific capture.
See: http://www.mementoweb.org/guide/rfc/#Pattern1.3
for more details.
DEF_MAGIC_NAME = 'pywb.proxy'
CERT_DL_PEM = '/pywb-ca.pem'
CERT_DL_P12 = '/pywb-ca.p12'
CA_ROOT_FILE = './ca/pywb-ca.pem'
CA_ROOT_NAME = 'pywb https proxy replay CA'
CA_CERTS_DIR = './ca/certs/'
EXTRA_HEADERS = {'cache-control': 'no-cache',
'connection': 'close',
def __init__(self, routes, **kwargs):
self.error_view = kwargs.get('error_view')
proxy_options = kwargs.get('config', {})
if proxy_options:
proxy_options = proxy_options.get('proxy_options', {})
self.magic_name = proxy_options.get('magic_name')
if not self.magic_name:
self.magic_name = self.DEF_MAGIC_NAME
proxy_options['magic_name'] = self.magic_name
self.extra_headers = proxy_options.get('extra_headers')
if not self.extra_headers:
self.extra_headers = self.EXTRA_HEADERS
proxy_options['extra_headers'] = self.extra_headers
res_type = proxy_options.get('cookie_resolver', True)
if res_type == 'auth' or not res_type:
self.resolver = ProxyAuthResolver(routes, proxy_options)
elif res_type == 'ip':
self.resolver = IPCacheResolver(routes, proxy_options)
#elif res_type == True or res_type == 'cookie':
# self.resolver = CookieResolver(routes, proxy_options)
self.resolver = CookieResolver(routes, proxy_options)
self.use_banner = proxy_options.get('use_banner', True)
self.use_wombat = proxy_options.get('use_client_rewrite', True)
self.proxy_cert_dl_view = proxy_options.get('proxy_cert_download_view')
if not proxy_options.get('enable_https_proxy'):
self.ca = None
from certauth.certauth import CertificateAuthority
except ImportError: #pragma: no cover
print('HTTPS proxy is not available as the "certauth" module ' +
'is not installed')
print('Please install via "pip install certauth" ' +
'to enable HTTPS support')
self.ca = None
# HTTPS Only Options
ca_file = proxy_options.get('root_ca_file', self.CA_ROOT_FILE)
# attempt to create the root_ca_file if doesn't exist
# (generally recommended to create this separately)
ca_name = proxy_options.get('root_ca_name', self.CA_ROOT_NAME)
certs_dir = proxy_options.get('certs_dir', self.CA_CERTS_DIR)
self.ca = CertificateAuthority(ca_file=ca_file,
self.use_wildcard = proxy_options.get('use_wildcard_certs', True)
def __call__(self, env):
is_https = (env['REQUEST_METHOD'] == 'CONNECT')
# for non-https requests, check non-proxy urls
if not is_https:
url = env['REL_REQUEST_URI']
if not url.startswith(('http://', 'https://')):
return None
env['pywb.proxy_scheme'] = 'http'
route = None
coll = None
matcher = None
response = None
ts = None
# check resolver, for pre connect resolve
if self.resolver.pre_connect:
route, coll, matcher, ts, response = self.resolver.resolve(env)
if response:
return response
# do connect, then get updated url
if is_https:
response = self.handle_connect(env)
if response:
return response
url = env['REL_REQUEST_URI']
parts = urlsplit(env['REL_REQUEST_URI'])
hostport = parts.netloc.split(':', 1)
env['pywb.proxy_host'] = hostport[0]
env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''
env['pywb.proxy_req_uri'] = parts.path
if parts.query:
env['pywb.proxy_req_uri'] += '?' + parts.query
env['pywb.proxy_query'] = parts.query
if self.resolver.supports_switching:
env['pywb_proxy_magic'] = self.magic_name
# route (static) and other resources to archival replay
if env['pywb.proxy_host'] == self.magic_name:
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
# special case for proxy install
response = self.handle_cert_install(env)
if response:
return response
return None
# check resolver, post connect
if not self.resolver.pre_connect:
route, coll, matcher, ts, response = self.resolver.resolve(env)
if response:
return response
rel_prefix = ''
custom_prefix = env.get('HTTP_PYWB_REWRITE_PREFIX', '')
if custom_prefix:
host_prefix = custom_prefix
urlrewriter_class = UrlRewriter
abs_prefix = True
# always rewrite to absolute here
rewrite_opts = dict(no_match_rel=True)
host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
urlrewriter_class = SchemeOnlyUrlRewriter
abs_prefix = False
rewrite_opts = {}
# special case for proxy calendar
if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
url = env['pywb.proxy_req_uri'][1:]
rel_prefix = '/'
if ts is not None:
url = ts + '/' + url
wbrequest = route.request_class(env,
if matcher:
route.apply_filters(wbrequest, matcher)
# full rewrite and banner
if self.use_wombat and self.use_banner:
wbrequest.wb_url.mod = ''
elif self.use_banner:
# banner only, no rewrite
wbrequest.wb_url.mod = 'bn_'
# unaltered, no rewrite or banner
wbrequest.wb_url.mod = 'uo_'
response = route.handler(wbrequest)
if not response:
return None
# add extra headers for replay responses
if wbrequest.wb_url and wbrequest.wb_url.is_replay():
for name, value in iteritems(self.extra_headers):
response.status_headers.replace_header(name, value)
# check for content-length
res = response.status_headers.get_header('content-length')
if int(res) > 0:
return response
# need to either chunk or buffer to get content-length
if env.get('SERVER_PROTOCOL') == 'HTTP/1.1':
response.status_headers.headers.append(('Transfer-Encoding', 'chunked'))
response.body = self._chunk_encode(response.body)
response.body = self._buffer_response(response.status_headers,
return response
def _chunk_encode(orig_iter):
for chunk in orig_iter:
if not len(chunk):
chunk_len = b'%X\r\n' % len(chunk)
yield chunk_len
yield chunk
yield b'\r\n'
yield b'0\r\n\r\n'
# Drain `iterator` while counting its total size in bytes, using a
# SpooledTemporaryFile (in-memory up to ProxyRouter.BUFF_RESPONSE_MEM_SIZE)
# as the buffer, then return RewriteContent.stream_to_gen() over that file.
# Takes no `self` yet is invoked as self._buffer_response(...) -- presumably a
# staticmethod whose decorator line was lost; confirm upstream.
# NOTE(review): lines are missing from this listing. Nothing visible writes
# `buff` into `out`, rewinds `out`, or uses `content_length_str` / touches
# `status_headers` after the "remove existing content length" comment.
# Restore from the upstream file before relying on this block.
def _buffer_response(status_headers, iterator):
out = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE)
size = 0
for buff in iterator:
size += len(buff)
content_length_str = str(size)
# remove existing content length
return RewriteContent.stream_to_gen(out)
def get_request_socket(self, env):
if not self.ca:
return None
sock = None
if env.get('uwsgi.version'): # pragma: no cover
import uwsgi
fd = uwsgi.connection_fd()
conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
sock = socket.socket(_sock=conn)
sock = conn
except Exception as e:
elif env.get('gunicorn.socket'): # pragma: no cover
sock = env['gunicorn.socket']
if not sock:
# attempt to find socket from wsgi.input
input_ = env.get('wsgi.input')
if input_:
if hasattr(input_, '_sock'): # pragma: no cover
raw = input_._sock
sock = socket.socket(_sock=raw) # pragma: no cover
elif hasattr(input_, 'raw'):
sock = input_.raw._sock
return sock
def handle_connect(self, env):
sock = self.get_request_socket(env)
if not sock:
return WbResponse.text_response('HTTPS Proxy Not Supported',
'405 HTTPS Proxy Not Supported')
sock.send(b'HTTP/1.0 200 Connection Established\r\n')
sock.send(b'Proxy-Connection: close\r\n')
sock.send(b'Server: pywb proxy\r\n')
hostname, port = env['REL_REQUEST_URI'].split(':')
if not self.use_wildcard:
certfile = self.ca.cert_for_host(hostname)
certfile = self.ca.get_wildcard_cert(hostname)
ssl_sock = ssl.wrap_socket(sock,
env['pywb.proxy_ssl_sock'] = ssl_sock
buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)
statusline = to_native_str(buffreader.readline().rstrip())
except Exception as se:
raise BadRequestException(se.message)
statusparts = statusline.split(' ')
if len(statusparts) < 3:
raise BadRequestException('Invalid Proxy Request: ' + statusline)
env['REQUEST_METHOD'] = statusparts[0]
env['REL_REQUEST_URI'] = ('https://' +
env['REL_REQUEST_URI'].replace(':443', '') +
env['SERVER_PROTOCOL'] = statusparts[2].strip()
env['pywb.proxy_scheme'] = 'https'
env['pywb.proxy_host'] = hostname
env['pywb.proxy_port'] = port
env['pywb.proxy_req_uri'] = statusparts[1]
queryparts = env['REL_REQUEST_URI'].split('?', 1)
env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
env['pywb.proxy_query'] = env['QUERY_STRING']
while True:
line = to_native_str(buffreader.readline())
if line:
line = line.rstrip()
if not line:
parts = line.split(':', 1)
if len(parts) < 2:
name = parts[0].strip()
value = parts[1].strip()
name = name.replace('-', '_').upper()
if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
name = 'HTTP_' + name
env[name] = value
env['wsgi.input'] = buffreader
#remain = buffreader.rem_length()
#if remain > 0:
#remainder = buffreader.read()
#env['wsgi.input'] = BufferedReader(BytesIO(remainder))
#remainder = buffreader.read(self.BLOCK_SIZE)
#env['wsgi.input'] = BufferedReader(ssl_sock,
# block_size=self.BLOCK_SIZE,
# starting_data=remainder)
def handle_cert_install(self, env):
if env['pywb.proxy_req_uri'] in ('/', '/index.html', '/index.html'):
available = (self.ca is not None)
if self.proxy_cert_dl_view:
return (self.proxy_cert_dl_view.
elif env['pywb.proxy_req_uri'] == self.CERT_DL_PEM:
if not self.ca:
return None
buff = b''
with open(self.ca.ca_file, 'rb') as fh:
buff = fh.read()
content_type = 'application/x-x509-ca-cert'
headers = [('Content-Length', str(len(buff)))]
return WbResponse.bin_stream([buff],
elif env['pywb.proxy_req_uri'] == self.CERT_DL_P12:
if not self.ca:
return None
buff = self.ca.get_root_PKCS12()
content_type = 'application/x-pkcs12'
headers = [('Content-Length', str(len(buff)))]
return WbResponse.bin_stream([buff],

View File

@ -1,374 +0,0 @@
from pywb.framework.wbrequestresponse import WbResponse
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.wbexception import WbException
from pywb.rewrite.wburl import WbUrl
from pywb.framework.cache import create_cache
from pywb.framework.basehandlers import WbUrlHandler
from six.moves.urllib.parse import parse_qs, urlsplit
import six
from warcio.statusandheaders import StatusAndHeaders
from warcio.utils import to_native_str
import base64
import os
import json
class BaseCollResolver(object):
    """Base class for proxy-mode collection resolvers.

    Subclasses provide ``get_proxy_coll_ts(env)``, returning
    ``(collection_name, timestamp)``, and
    ``select_coll_response(env, default_coll)``, returning the response to
    send when no usable collection is known. ``resolve`` combines them with
    the configured routes.
    """

    def __init__(self, routes, config):
        """
        :param routes: iterable of routes, each offering ``is_handling(path)``
        :param config: mapping read for the ``use_default_coll`` option
        """
        self.routes = routes
        self.use_default_coll = config.get('use_default_coll')

    @property
    def pre_connect(self):
        """Whether resolution happens before a CONNECT is handled.

        Read as a plain attribute by the proxy router, hence a property.
        """
        return False

    def resolve(self, env):
        """Map a proxy request to a route.

        :param env: WSGI environ of the request
        :return: ``(route, coll, matcher, ts, response)``; when ``response``
            is not ``None`` it must be sent and the other items are ``None``.
        :raises Exception: if ``use_default_coll`` is truthy but not a string
            and the request names no collection.
        """
        route = None
        coll = None
        matcher = None
        ts = None

        proxy_coll, ts = self.get_proxy_coll_ts(env)

        # invalid parsing
        if proxy_coll == '':
            return None, None, None, None, self.select_coll_response(env, proxy_coll)

        if proxy_coll is None and isinstance(self.use_default_coll, str):
            proxy_coll = self.use_default_coll

        if proxy_coll:
            path = '/' + proxy_coll + '/'

            for r in self.routes:
                matcher, c = r.is_handling(path)
                if matcher:
                    route = r
                    coll = c
                    # Stop at the first match so `matcher` is not overwritten
                    # by the result of a later, non-matching route.
                    break

            # if no match, return coll selection response
            if not route:
                return None, None, None, None, self.select_coll_response(env, proxy_coll)

        # a bare `use_default_coll: true` used to pick the first collection
        elif self.use_default_coll:
            raise Exception('use_default_coll: true no longer supported, please specify collection name')

        # otherwise, return the appropriate coll selection response
        else:
            return None, None, None, None, self.select_coll_response(env, proxy_coll)

        return route, coll, matcher, ts, None
class ProxyAuthResolver(BaseCollResolver):
    """Resolver that takes the collection from HTTP proxy Basic auth.

    The user-name part of the ``Proxy-Authorization: Basic ...`` credentials
    is used as the collection name; no timestamp is ever selected.
    """

    DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'

    def __init__(self, routes, config):
        """
        :param routes: routes handed to the base resolver
        :param config: mapping read for ``auth_msg`` (the realm / body text)
        """
        super(ProxyAuthResolver, self).__init__(routes, config)
        self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)

    @property
    def pre_connect(self):
        """This resolver runs before a CONNECT is handled."""
        return True

    @property
    def supports_switching(self):
        """This resolver does not support switching collections."""
        return False

    def get_proxy_coll_ts(self, env):
        """Return ``(collection, None)`` read from the Proxy-Authorization header.

        ``(None, None)`` when the header is absent; the collection is ``''``
        when the header cannot be parsed.
        """
        proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')

        if not proxy_auth:
            return None, None

        proxy_coll = self.read_basic_auth_coll(proxy_auth)
        return proxy_coll, None

    def select_coll_response(self, env, default_coll=None):
        """Build the 407 response asking the client for proxy credentials."""
        proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)

        headers = [('Content-Type', 'text/plain'),
                   ('Proxy-Authenticate', proxy_msg)]

        status_headers = StatusAndHeaders('407 Proxy Authentication', headers)

        value = self.auth_msg

        return WbResponse(status_headers, value=[value.encode('utf-8')])

    @staticmethod
    def read_basic_auth_coll(value):
        """Extract the user name from a ``Basic <base64>`` header value.

        Static because it takes no ``self`` yet is invoked through the
        instance.

        :param value: raw Proxy-Authorization header value
        :return: the text before the first ``:`` of the decoded credentials,
            or ``''`` when the value is not well-formed Basic auth.
        """
        parts = value.split(' ')
        if parts[0].lower() != 'basic':
            return ''

        if len(parts) != 2:
            return ''

        # The header is client-controlled: malformed base64 is treated like
        # any other unparseable value instead of escaping as an exception.
        try:
            user_pass = base64.b64decode(parts[1].encode('utf-8'))
        except (TypeError, ValueError):
            return ''

        return to_native_str(user_pass.split(b':')[0])
class IPCacheResolver(BaseCollResolver):
def __init__(self, routes, config):
super(IPCacheResolver, self).__init__(routes, config)
self.cache = create_cache(config.get('redis_cache_key'))
self.magic_name = config['magic_name']
@property
def supports_switching(self):
    """This resolver does not support switching collections.

    Exposed as a property because the proxy router reads it as a plain
    attribute (``self.resolver.supports_switching``); as an ordinary method
    that test would always be truthy.
    """
    return False
def _get_ip(self, env):
ip = env['REMOTE_ADDR']
qs = env.get('pywb.proxy_query')
if qs:
res = parse_qs(qs)
if 'ip' in res:
ip = res['ip'][0]
return ip
def select_coll_response(self, env, default_coll=None):
raise WbException('Invalid Proxy Collection Specified: ' + str(default_coll))
# Return the (collection, timestamp) pair cached for the client's ip, after
# applying any changes requested in the env['pywb.proxy_query'] query string:
#   ip=<addr>   -- use <addr> instead of REMOTE_ADDR as the cache key
#   delete      -- remove the cached '<ip>:c' and '<ip>:t' entries
#   coll=<name> -- store <name> under '<ip>:c'
#   ts=<stamp>  -- store <stamp> under '<ip>:t'
# The values are then read back from self.cache and returned.
# NOTE(review): indentation is lost in this listing. As written, a query
# carrying both `delete` and `coll`/`ts` deletes and then re-sets the entries;
# confirm against the upstream file whether an `else:` between the two was
# dropped.
# NOTE(review): the ip/query parsing at the top duplicates _get_ip().
def get_proxy_coll_ts(self, env):
ip = env['REMOTE_ADDR']
qs = env.get('pywb.proxy_query')
if qs:
res = parse_qs(qs)
if 'ip' in res:
ip = res['ip'][0]
if 'delete' in res:
del self.cache[ip + ':c']
del self.cache[ip + ':t']
if 'coll' in res:
self.cache[ip + ':c'] = res['coll'][0]
if 'ts' in res:
self.cache[ip + ':t'] = res['ts'][0]
coll = self.cache[ip + ':c']
ts = self.cache[ip + ':t']
return coll, ts
def resolve(self, env):
server_name = env['pywb.proxy_host']
if self.magic_name in server_name:
response = self.handle_magic_page(env)
if response:
return None, None, None, None, response
return super(IPCacheResolver, self).resolve(env)
def handle_magic_page(self, env):
coll, ts = self.get_proxy_coll_ts(env)
ip = self._get_ip(env)
res = json.dumps({'ip': ip, 'coll': coll, 'ts': ts})
return WbResponse.text_response(res, content_type='application/json')
class CookieResolver(BaseCollResolver):
SESH_COOKIE_NAME = '__pywb_proxy_sesh'
def __init__(self, routes, config):
super(CookieResolver, self).__init__(routes, config)
self.magic_name = config['magic_name']
self.sethost_prefix = '-sethost.' + self.magic_name + '.'
self.set_prefix = '-set.' + self.magic_name
self.cookie_name = config.get('cookie_name', self.SESH_COOKIE_NAME)
self.proxy_select_view = config.get('proxy_select_view')
self.extra_headers = config.get('extra_headers')
self.cache = create_cache()
@property
def supports_switching(self):
    """This resolver supports switching collections.

    Exposed as a property because the proxy router reads it as a plain
    attribute (``self.resolver.supports_switching``).
    """
    return True
def get_proxy_coll_ts(self, env):
coll, ts, sesh_id = self.get_coll(env)
return coll, ts
def select_coll_response(self, env, default_coll=None):
return self.make_magic_response('auto',
def resolve(self, env):
server_name = env['pywb.proxy_host']
if ('.' + self.magic_name) in server_name:
response = self.handle_magic_page(env)
if response:
return None, None, None, None, response
return super(CookieResolver, self).resolve(env)
def handle_magic_page(self, env):
request_url = env['REL_REQUEST_URI']
parts = urlsplit(request_url)
server_name = env['pywb.proxy_host']
path_url = parts.path[1:]
if parts.query:
path_url += '?' + parts.query
if server_name.startswith('auto'):
coll, ts, sesh_id = self.get_coll(env)
if coll:
return self.make_sethost_cookie_response(sesh_id,
return self.make_magic_response('select', path_url, env)
elif server_name.startswith('query.'):
wb_url = WbUrl(path_url)
# only dealing with specific timestamp setting
if wb_url.is_query():
return None
coll, ts, sesh_id = self.get_coll(env)
if not coll:
return self.make_magic_response('select', path_url, env)
self.set_ts(sesh_id, wb_url.timestamp)
return self.make_redir_response(wb_url.url)
elif server_name.endswith(self.set_prefix):
old_sesh_id = extract_client_cookie(env, self.cookie_name)
sesh_id = self.create_renew_sesh_id(old_sesh_id)
if sesh_id != old_sesh_id:
headers = self.make_cookie_headers(sesh_id, self.magic_name)
headers = None
coll = server_name[:-len(self.set_prefix)]
# set sesh value
self.set_coll(sesh_id, coll)
return self.make_sethost_cookie_response(sesh_id, path_url, env,
elif self.sethost_prefix in server_name:
inx = server_name.find(self.sethost_prefix)
sesh_id = server_name[:inx]
domain = server_name[inx + len(self.sethost_prefix):]
headers = self.make_cookie_headers(sesh_id, domain)
full_url = env['pywb.proxy_scheme'] + '://' + domain
full_url += '/' + path_url
return self.make_redir_response(full_url, headers=headers)
elif 'select.' in server_name:
coll, ts, sesh_id = self.get_coll(env)
route_temp = '-set.' + self.magic_name + '/' + path_url
return (self.proxy_select_view.
# msg = 'Invalid Magic Path: ' + url
# print msg
# return WbResponse.text_response(msg, status='404 Not Found')
def make_cookie_headers(self, sesh_id, domain):
    """Build the Set-Cookie header that stores the session id.

    :param sesh_id: session id used as the cookie value
    :param domain: domain the cookie is scoped to (emitted with a leading dot)
    :return: one-element list holding the ``('Set-Cookie', value)`` pair
    """
    cookie = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'.format(
        self.cookie_name, sesh_id, domain)
    return [('Set-Cookie', cookie)]
def make_sethost_cookie_response(self, sesh_id, path_url,
env, headers=None):
if '://' not in path_url:
path_url = 'http://' + path_url
path_parts = urlsplit(path_url)
new_url = path_parts.path[1:]
if path_parts.query:
new_url += '?' + path_parts.query
return self.make_magic_response(sesh_id + '-sethost', new_url, env,
def make_magic_response(self, prefix, url, env,
                        suffix=None, headers=None):
    """Redirect to ``<scheme>://<prefix>.<magic_name>[.<suffix>]/<url>``.

    :param prefix: host label placed in front of the magic name
    :param url: path (and query) appended after the host
    :param env: WSGI environ, read for ``pywb.proxy_scheme``
    :param suffix: optional host part appended after the magic name
    :param headers: optional headers passed on to the redirect response
    :return: whatever ``self.make_redir_response`` returns for that url
    """
    pieces = [env['pywb.proxy_scheme'], '://', prefix, '.', self.magic_name]
    if suffix:
        pieces.extend(('.', suffix))
    pieces.extend(('/', url))

    return self.make_redir_response(''.join(pieces), headers=headers)
def set_coll(self, sesh_id, coll):
self.cache[sesh_id + ':c'] = coll
def set_ts(self, sesh_id, ts):
    """Record, or clear, the timestamp selected for a session.

    :param sesh_id: session id; the entry is cached under ``sesh_id + ':t'``
    :param ts: timestamp to remember; a falsy value clears the entry
    """
    key = sesh_id + ':t'
    if ts:
        self.cache[key] = ts
    # this ensures that omitting timestamp will reset to latest
    # capture by deleting the cache entry
    # (only when an entry exists: a dict-backed cache raises KeyError when
    # asked to delete a missing key)
    elif key in self.cache:
        del self.cache[key]
def get_coll(self, env):
    """Look up the collection and timestamp cached for the request's session.

    :param env: WSGI environ carrying the session cookie
    :return: ``(coll, ts, sesh_id)``; ``coll`` and ``ts`` are ``None`` when
        the request carries no session cookie.
    """
    sesh_id = extract_client_cookie(env, self.cookie_name)
    if not sesh_id:
        return None, None, sesh_id

    cached_coll = self.cache[sesh_id + ':c']
    cached_ts = self.cache[sesh_id + ':t']
    return cached_coll, cached_ts, sesh_id
def create_renew_sesh_id(self, sesh_id, force=False):
    """Return a session id, reusing ``sesh_id`` when it is still known.

    :param sesh_id: id taken from the client cookie (may be falsy)
    :param force: when true, always generate a new id
    :return: ``sesh_id`` if it has a cached ``':c'`` entry and ``force`` is
        false; otherwise a new id made from 5 random bytes, base32-encoded
        and lower-cased.
    """
    if sesh_id:
        known = (sesh_id + ':c') in self.cache
        if known and not force:
            return sesh_id

    fresh = base64.b32encode(os.urandom(5)).lower()
    return to_native_str(fresh)
def make_redir_response(self, url, headers=None):
    """Build a redirect to ``url`` carrying the configured extra headers.

    :param url: redirect target
    :param headers: optional list of ``(name, value)`` pairs to include; the
        list is copied, never modified
    :return: the result of ``WbResponse.redir_response``
    """
    # Work on a copy so the caller's list does not grow on every call.
    headers = list(headers) if headers else []

    if self.extra_headers:
        headers.extend(six.iteritems(self.extra_headers))

    return WbResponse.redir_response(url, headers=headers)

View File

@ -1,135 +0,0 @@
# Test WbRequest parsed via a Route
# route with relative path, print resulting wbrequest
>>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''})
{'coll': 'web',
'request_uri': '/web/test.example.com',
'wb_prefix': '/web/',
'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')}
# route with absolute path, running at script /my_pywb, print resulting wbrequest
>>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'coll': 'web',
'request_uri': '/web/2013im_/test.example.com',
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
# route with no collection
>>> _test_route_req(Route('', BaseHandler()), {'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'})
{'coll': '',
'request_uri': 'http://example.com',
'wb_prefix': '/pywb/',
'wb_url': None}
# not matching route -- skipped
>>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''})
# Test Refer Redirects
>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
>>> _test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
# Custom collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
# Wrong Host
>>> _test_redir('http://example.com:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
# Right Host
>>> _test_redir('http://example.com:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html')
# With custom SCRIPT_NAME
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
# With custom SCRIPT_NAME + timestamp
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
# With custom SCRIPT_NAME, bad match
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
# With no collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
# With SCRIPT_NAME but no collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
>>> _test_redir('http://localhost:8080/', '/some/example/other.html', 'http://localhost:8080/user/coll/http://example.com/path/page.html', '/user/coll', coll='')
## Test ensure_rel_uri_set
# Simple test:
>>> ArchivalRouter.ensure_rel_uri_set({'PATH_INFO': '/pywb/example.com'})
# Test all unencoded special chars and double-quote
# (double-quote must be encoded but not single quote)
>>> ArchivalRouter.ensure_rel_uri_set({'PATH_INFO': "/pywb/example.com/0~!+$&'()*+,;=:\\\""})
from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
import pprint
from six.moves.urllib.parse import urlsplit
def _test_route_req(route, env, abs_path=False):
matcher, coll = route.is_handling(env['REL_REQUEST_URI'])
if not matcher:
the_router = ArchivalRouter([route], abs_path=abs_path)
req = the_router.parse_request(route, env, matcher, coll, env['REL_REQUEST_URI'], abs_path)
varlist = vars(req)
the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll'))
def _test_redir(match_host, request_uri, referrer, script_name='', coll='coll'):
    """Run ReferRedirect for one request and report where it redirects.

    :param match_host: url whose netloc becomes the request's HTTP_HOST
    :param request_uri: value for REL_REQUEST_URI
    :param referrer: value for HTTP_REFERER
    :param script_name: value for SCRIPT_NAME
    :param coll: collection name of the single configured route
    :return: the ``Location`` header of the redirect, or ``False`` when no
        redirect response is produced.
    """
    env = {
        'REL_REQUEST_URI': request_uri,
        'HTTP_REFERER': referrer,
        'SCRIPT_NAME': script_name,
        'HTTP_HOST': urlsplit(match_host).netloc,
    }

    the_router = ArchivalRouter([Route(coll, WbUrlHandler())])
    rep = ReferRedirect()(env, the_router)

    if rep:
        return rep.status_headers.get_header('Location')
    return False
if __name__ == "__main__":
import doctest

View File

@ -1,178 +1,6 @@
# WbRequest Tests
# =================
#>>> get_req_from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
#>>> get_req_from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
#>>> get_req_from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# ajax
#>>> get_req_from_uri('', {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'})
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
#>>> get_req_from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
#>>> get_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, default to http (shouldn't happen per WSGI standard)
#>>> get_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'http://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# Referrer extraction
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url
# incorrect referer
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://other.example.com/web/2011/blah.example.com/'}).extract_referrer_wburl_str()
# no referer
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}).extract_referrer_wburl_str()
# range requests
>>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='bytes=10-100')).extract_range()
('http://example.com', 10, 100, True)
>>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='bytes=0-')).extract_range()
('http://example.com', 0, '', True)
>>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=0-65535').extract_range()
('http://www.googlevideo.com/videoplayback?id=123', 0, 65535, False)
>>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=100-200').extract_range()
('http://www.googlevideo.com/videoplayback?id=123', 100, 200, False)
# invalid range requests
>>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='10-20')).extract_range()
>>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='A-5')).extract_range()
>>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=100-').extract_range()
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.framework.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
def get_req_from_uri(request_uri, env=None, use_abs_prefix=False):
    """Build a request via ``req_from_uri`` and return its key fields.

    :param request_uri: request path to parse
    :param env: WSGI-style environ dict; a fresh dict is used when omitted
    :param use_abs_prefix: passed through unchanged to ``req_from_uri``
    :return: dict holding the request's ``request_uri``, ``wb_prefix``,
        ``wb_url`` and ``coll`` attributes
    """
    # A shared ``{}`` default would be written to by the request
    # constructor (it stores data in env) and so leak state between calls.
    if env is None:
        env = {}

    request = req_from_uri(request_uri, env, use_abs_prefix)
    attrs = vars(request)
    return {key: attrs[key]
            for key in ('request_uri', 'wb_prefix', 'wb_url', 'coll')}
def req_from_uri(request_uri, env={}, use_abs_prefix=False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
rel_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
rel_prefix = '/'
wb_url_str = parts[1]
coll = ''
rel_prefix = '/'
wb_url_str = parts[0]
coll = ''
return WbRequest(env,
def test_req_1():
    """Collection prefix 'save' followed by an '_embed' path segment."""
    uri = '/save/_embed/example.com/?a=b'
    req = get_req_from_uri(uri)

    expected_wb_url = "('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b')"
    assert repr(req['wb_url']) == expected_wb_url
    assert req['coll'] == 'save'
    assert req['wb_prefix'] == '/save/'
    assert req['request_uri'] == uri
def test_req_2():
    """Numeric collection name with a timestamp + 'im_' modifier wburl."""
    uri = '/2345/20101024101112im_/example.com/?b=c'
    req = get_req_from_uri(uri)

    expected_wb_url = "('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c')"
    assert repr(req['wb_url']) == expected_wb_url
    assert req['coll'] == '2345'
    assert req['wb_prefix'] == '/2345/'
    assert req['request_uri'] == uri
def test_req_3():
    """A first path segment that looks like a year is taken as the collection."""
    uri = '/2010/example.com'
    req = get_req_from_uri(uri)

    expected_wb_url = "('latest_replay', '', '', 'http://example.com', 'http://example.com')"
    assert repr(req['wb_url']) == expected_wb_url
    assert req['coll'] == '2010'
    assert req['wb_prefix'] == '/2010/'
    assert req['request_uri'] == uri
def test_req_4():
    """Ajax request: empty uri argument, the path comes from REL_REQUEST_URI."""
    # ajax
    env = {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'}
    req = get_req_from_uri('', env)

    expected_wb_url = "('latest_replay', '', '', 'http://example.com', 'http://example.com')"
    assert repr(req['wb_url']) == expected_wb_url
    assert req['coll'] == '2010'
    assert req['wb_prefix'] == '/2010/'
    assert req['request_uri'] == '/2010/example.com'
def test_req_5():
    """A '../' relative uri yields no collection and the root prefix."""
    uri = '../example.com'
    req = get_req_from_uri(uri)

    expected_wb_url = "('latest_replay', '', '', 'http://example.com', 'http://example.com')"
    assert repr(req['wb_url']) == expected_wb_url
    assert req['coll'] == ''
    assert req['wb_prefix'] == '/'
    assert req['request_uri'] == uri
def test_req_6():
    """Absolute prefix is built from the env's url scheme and host."""
    # Abs path
    env = {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}
    req = get_req_from_uri('/2010/example.com', env, use_abs_prefix=True)

    expected_wb_url = "('latest_replay', '', '', 'http://example.com', 'http://example.com')"
    assert repr(req['wb_url']) == expected_wb_url
    assert req['coll'] == '2010'
    assert req['wb_prefix'] == 'https://localhost:8080/2010/'
    assert req['request_uri'] == '/2010/example.com'
def test_req_7():
    """Absolute prefix falls back to 'http' when env carries no url scheme."""
    # No Scheme, default to http (shouldn't happen per WSGI standard)
    env = {'HTTP_HOST': 'localhost:8080'}
    req = get_req_from_uri('/2010/example.com', env, use_abs_prefix=True)

    expected_wb_url = "('latest_replay', '', '', 'http://example.com', 'http://example.com')"
    assert repr(req['wb_url']) == expected_wb_url
    assert req['coll'] == '2010'
    assert req['wb_prefix'] == 'http://localhost:8080/2010/'
    assert req['request_uri'] == '/2010/example.com'
#Response tests
def test_resp_1():
resp = vars(WbResponse.text_response('Test'))

View File

@ -1,57 +0,0 @@
from pywb.framework.wsgi_wrappers import init_app
from pywb.utils.wbexception import AccessException
import webtest
class TestOkApp:
    """Callable fixture: given an env, hands back a minimal WSGI app."""

    def __call__(self, env):
        def response(environ, start_response):
            # no headers, one body chunk
            start_response('200 OK', [])
            body = [b'Test']
            return body

        return response
class TestErrApp:
    """Callable fixture that always fails with a plain Exception."""

    def __call__(self, env):
        # generic, non-pywb error
        raise Exception('Test Unexpected Error')
class TestCustomErrApp:
    """Callable fixture that always fails with an AccessException."""

    def __call__(self, env):
        # pywb-specific error type
        raise AccessException('Forbidden Test')
def initer(app_class):
    """Wrap ``app_class`` in an init function for ``init_app``.

    The returned function accepts an optional config, ignores it, and
    returns a new ``app_class`` instance.
    """
    def init(config=None):
        app = app_class()
        return app

    return init
def test_ok_app():
    """A well-behaved app answers 200 with its body when wrapped by init_app."""
    wrapped = init_app(initer(TestOkApp), load_yaml=False)
    client = webtest.TestApp(wrapped)

    resp = client.get('/')

    assert resp.status_int == 200
    assert b'Test' in resp.body, resp.body
def test_err_app():
    """An unexpected exception is reported as a 500 carrying its message."""
    wrapped = init_app(initer(TestErrApp), load_yaml=False)
    client = webtest.TestApp(wrapped)

    resp = client.get('/abc', expect_errors=True)

    assert resp.status_int == 500
    assert b'500 Internal Server Error Error: Test Unexpected Error' in resp.body
def test_custom_err_app():
    """An AccessException is reported as a 403 carrying its message."""
    wrapped = init_app(initer(TestCustomErrApp), load_yaml=False)
    client = webtest.TestApp(wrapped)

    resp = client.get('/abc', expect_errors=True)

    assert resp.status_int == 403
    assert b'403 Access Denied Error: Forbidden Test' in resp.body

View File

@ -1,204 +1,8 @@
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import extract_post_query, append_post_query
from io import BytesIO
import pprint
import re
import json
class WbRequest(object):
Represents the main pywb request object.
Contains various info from wsgi env, add additional info
about the request, such as coll, relative prefix,
host prefix, absolute prefix.
If a wburl and url rewriter classes are specified, the class
also contains the url rewriter.
def make_host_prefix(env):
host = env.get('HTTP_HOST')
if not host:
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
return env.get('wsgi.url_scheme', 'http') + '://' + host
except KeyError:
return ''
def __init__(self, env,
self.env = env
if request_uri:
self.request_uri = request_uri
self.request_uri = env.get('REL_REQUEST_URI')
self.method = self.env.get('REQUEST_METHOD')
self.coll = coll
self.final_mod = ''
if not host_prefix:
host_prefix = self.make_host_prefix(env)
self.host_prefix = host_prefix
self.rel_prefix = rel_prefix
if use_abs_prefix:
self.wb_prefix = host_prefix + rel_prefix
self.wb_prefix = rel_prefix
if not wb_url_str:
wb_url_str = '/'
self.wb_url_str = wb_url_str
# wb_url present and not root page
if wb_url_str != '/' and wburl_class:
self.wb_url = wburl_class(wb_url_str)
self.urlrewriter = urlrewriter_class(self.wb_url,
host_prefix + rel_prefix,
env.get('SCRIPT_NAME', '/'),
# no wb_url, just store blank wb_url
self.wb_url = None
self.urlrewriter = None
self.referrer = env.get('HTTP_REFERER')
self.options = dict()
self.options['is_ajax'] = self._is_ajax()
self.options['is_proxy'] = is_proxy or env.get('pywb_proxy_magic')
self.query_filter = []
self.custom_params = {}
self.user_metadata = user_metadata
self.rewrite_opts = rewrite_opts
env['X_PERF'] = {}
self.custom_params['noredir'] = True
def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH')
value = value or self.env.get('HTTP_X_PYWB_REQUESTED_WITH')
if value and value.lower() == 'xmlhttprequest':
return True
return False
# Raw string literals: ``\d`` must reach the regex engine as-is. In a plain
# literal it is an invalid string escape that newer Python versions warn on.
# The patterns themselves are unchanged.

# googlevideo videoplayback url carrying a 'range=<start>-<end>' query arg;
# group 1 is the whole arg (with its '&' or '?'), groups 2/3 the bounds
RANGE_ARG_RX = re.compile(r'.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')

# 'bytes=<start>-[<end>]' Range header value; the end bound is optional
RANGE_HEADER = re.compile(r'bytes=(\d+)-(\d+)?')
def extract_range(self):
url = self.wb_url.url
use_206 = False
start = None
end = None
range_h = self.env.get('HTTP_RANGE')
if range_h:
m = self.RANGE_HEADER.match(range_h)
if m:
start = m.group(1)
end = m.group(2)
use_206 = True
m = self.RANGE_ARG_RX.match(url)
if m:
start = m.group(2)
end = m.group(3)
url = url[:m.start(1)] + url[m.end(1):]
use_206 = False
if not start:
return None
start = int(start)
self.custom_params['noredir'] = True
if end:
end = int(end)
end = ''
result = (url, start, end, use_206)
return result
def __repr__(self):
varlist = vars(self)
varstr = pprint.pformat(varlist)
return varstr
def _parse_extra(self):
def extract_referrer_wburl_str(self):
if not self.referrer:
return None
if not self.referrer.startswith(self.host_prefix + self.rel_prefix):
return None
wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):]
return wburl_str
def normalize_post_query(self):
if self.method != 'POST':
if not self.wb_url:
mime = self.env.get('CONTENT_TYPE', '')
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']
buffered_stream = BytesIO()
post_query = extract_post_query('POST', mime, length, stream,
if post_query:
self.env['wsgi.input'] = buffered_stream
self.wb_url.url = append_post_query(self.wb_url.url, post_query)
class WbResponse(object):

View File

@ -1,188 +0,0 @@
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import load_yaml_config
from pywb.utils.loaders import load_yaml_config
from warcio.utils import to_native_str
from pywb.framework.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders
import os
import logging
class WSGIApp(object):
def __init__(self, wb_router, fallback_app=None):
self.wb_router = wb_router
self.fallback_app = fallback_app
# Top-level wsgi application
def __call__(self, env, start_response):
return self.handle_connect(env, start_response)
return self.handle_methods(env, start_response)
def handle_connect(self, env, start_response):
def ssl_start_response(statusline, headers):
ssl_sock = env.get('pywb.proxy_ssl_sock')
if not ssl_sock:
start_response(statusline, headers)
env['pywb.proxy_statusline'] = statusline
status_line = 'HTTP/1.1 ' + statusline + '\r\n'
for name, value in headers:
line = name + ': ' + value + '\r\n'
resp_iter = self.handle_methods(env, ssl_start_response)
ssl_sock = env.get('pywb.proxy_ssl_sock')
if not ssl_sock:
return resp_iter
for obj in resp_iter:
if obj:
start_response(env['pywb.proxy_statusline'], [])
return []
def handle_methods(self, env, start_response):
wb_router = self.wb_router
response = None
response = wb_router(env)
if not response:
if self.fallback_app:
return self.fallback_app(env, start_response)
msg = 'No handler for "{0}".'.format(env['REL_REQUEST_URI'])
raise NotFoundException(msg)
except WbException as e:
response = self.handle_exception(env, e, False)
except Exception as e:
response = self.handle_exception(env, e, True)
return response(env, start_response)
def handle_exception(self, env, exc, print_trace):
error_view = None
if hasattr(self.wb_router, 'error_view'):
error_view = self.wb_router.error_view
if hasattr(exc, 'status'):
status = exc.status()
status = '500 Internal Server Error'
if hasattr(exc, 'url'):
err_url = exc.url
err_url = None
if len(exc.args):
err_msg = exc.args[0]
if print_trace:
import traceback
err_details = traceback.format_exc()
err_details = None
if error_view:
if err_url and isinstance(err_url, str):
err_url = to_native_str(err_url, 'utf-8')
if err_msg and isinstance(err_msg, str):
err_msg = to_native_str(err_msg, 'utf-8')
return error_view.render_response(exc_type=type(exc).__name__,
msg = status + ' Error: '
if err_msg:
msg += err_msg
#msg = msg.encode('utf-8', 'ignore')
return WbResponse.text_response(msg,
DEFAULT_CONFIG_FILE = 'config.yaml'
def init_app(init_func, load_yaml=True, config_file=None, config=None):
config = config or {}
if load_yaml:
# env setting overrides all others
env_config = os.environ.get('PYWB_CONFIG_FILE')
if env_config:
config_file = env_config
if not config_file:
if os.path.isfile(config_file):
config = load_yaml_config(config_file)
wb_router = init_func(config)
msg = '*** pywb app init FAILED config from "%s"!\n'
logging.exception(msg, init_func.__name__)
msg = '*** pywb app inited with config from "%s"!\n'
logging.debug(msg, init_func.__name__)
return WSGIApp(wb_router)
def start_wsgi_ref_server(the_app, name, port): # pragma: no cover
from wsgiref.simple_server import make_server, WSGIServer
from six.moves.socketserver import ThreadingMixIn
# disable is_hop_by_hop restrictions
import wsgiref.handlers
wsgiref.handlers.is_hop_by_hop = lambda x: False
if port is None:
logging.info('Starting %s on port %s', name, port)
class ThreadingWSGIServer(ThreadingMixIn, WSGIServer):
httpd = make_server('', port, the_app, ThreadingWSGIServer)
except KeyboardInterrupt as ex:
logging.info('Stopping %s', name)

View File

View File

@ -1,85 +0,0 @@
from pywb.utils.wbexception import AccessException
def make_perms_cdx_filter(perms_policy, wbrequest):
    """
    Called internally to turn a perms policy and a request into a
    filter which can be applied on the cdx.

    :param perms_policy: callable invoked with the request; returns a
        perms checker, or a falsy value when there is nothing to apply
    :param wbrequest: the request handed to the policy
    :return: the filter from ``_create_cdx_perms_filter``, or None
    """
    checker = perms_policy(wbrequest)
    return _create_cdx_perms_filter(checker) if checker else None
def _create_cdx_perms_filter(perms_checker):
Return a function which will filter the cdx given
a Perms object.
:param perms_checker: a Perms object which implements the
allow_url_lookup() and access_check_capture() methods
def perms_filter_op(cdx_iter, query):
filter out those cdx records that user doesn't have access to,
by consulting :param perms_checker:.
:param cdx_iter: cdx record source iterable
:param query: request parameters (CDXQuery)
:param perms_checker: object implementing permission checker
if not perms_checker.allow_url_lookup(query.key):
if query.is_exact:
raise AccessException('Excluded')
for cdx in cdx_iter:
cdx = perms_checker.access_check_capture(cdx)
if cdx:
yield cdx
return perms_filter_op
def allow_all_perms_policy(wbrequest):
    """
    Perms policy that always hands back a default Perms object,
    which allows everything.

    A new perms object is created per request and may hold request
    state if needed. The same object may be consulted for several
    queries (such as for each cdx line) within that request.
    """
    allow_all = Perms()
    return allow_all
class Perms(object):
    """
    Base perms checker: allows every lookup and every capture.
    """

    def allow_url_lookup(self, key):
        """
        Return true/false depending on whether the urlkey
        (canonicalized url) may be looked up.

        Default: allow all
        """
        return True

    def access_check_capture(self, cdx):
        """
        Allow or deny including the given cdx capture (dict) in the result.

        Return None to reject it, or return the cdx, modified to drop
        any fields that need to be restricted.

        Default: the cdx line is allowed without modifications
        """
        return cdx

View File

@ -1,67 +0,0 @@
from pywb.utils.canonicalize import UrlCanonicalizer
from pywb.utils.wbexception import NotFoundException
from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.wbrequestresponse import WbResponse
BLOCK = '["block"]'
ALLOW = '["allow"]'
RESPONSE_TYPE = 'application/json'
NOT_FOUND = 'Please specify a url to check for access'
class PermsHandler(WbUrlHandler):
def __init__(self, perms_policy, url_canon):
self.perms_policy = perms_policy
self.url_canon = url_canon
def __call__(self, wbrequest):
perms_checker = self.perms_policy(wbrequest)
if wbrequest.wb_url:
return self.check_single_url(wbrequest, perms_checker)
# elif wbrequest.env['REQUEST_METHOD'] == 'POST':
# return self.check_bulk(wbrequest, perms_checker)
raise NotFoundException(NOT_FOUND)
def check_single_url(self, wbrequest, perms_checker):
urlkey = self.url_canon(wbrequest.wb_url.url)
urlkey = urlkey.encode('utf-8')
if not perms_checker.allow_url_lookup(urlkey):
response_text = BLOCK
response_text = ALLOW
#TODO: other types of checking
return WbResponse.text_response(response_text,
# def check_bulk_urls(self, wbrequest, perms_checker):
# pass
def create_perms_checker_app(config):
    """
    Create the standalone permissions checker app,
    served under the '/check-access' route.

    Reads 'port', 'perms_policy' and 'surt_ordered' (default True)
    from config.
    """
    surt_ordered = config.get('surt_ordered', True)
    handler = PermsHandler(config.get('perms_policy'),
                           UrlCanonicalizer(surt_ordered))

    return ArchivalRouter([Route('check-access', handler)],
                          port=config.get('port'))

View File

@ -1,99 +0,0 @@
from gevent.monkey import patch_all; patch_all()
import requests
from pywb.framework.archivalrouter import Route
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from warcio.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize
from warcio.timeutils import http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject
from io import BytesIO
from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest
from six.moves.urllib.parse import quote
# ============================================================================
class PlatformRoute(Route):
def apply_filters(self, wbrequest, matcher):
wbrequest.matchdict = matcher.groupdict()
# ============================================================================
class PlatformHandler(RewriteHandler):
def __init__(self, config):
super(PlatformHandler, self).__init__(config)
self.upstream_url = config.get('upstream_url')
self.loader = ArcWarcRecordLoader()
framed = config.get('framed_replay')
self.content_rewriter = RewriteContent(is_framed_replay=framed)
def render_content(self, wbrequest):
if wbrequest.wb_url.mod == 'vi_':
return self._get_video_info(wbrequest)
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
if ref_wburl_str:
wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url
urlkey = canonicalize(wbrequest.wb_url.url)
url = wbrequest.wb_url.url
inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
req_data = inputreq.reconstruct_request(url)
headers = {'Content-Length': len(req_data),
'Content-Type': 'application/request'}
if wbrequest.wb_url.is_latest_replay():
closest = 'now'
closest = wbrequest.wb_url.timestamp
upstream_url = self.upstream_url.format(url=quote(url),
r = requests.post(upstream_url,
record = self.loader.parse_record_stream(r.raw)
cdx = CDXObject()
cdx['urlkey'] = urlkey
cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
cdx['url'] = url
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
status_headers, gen, is_rw = result
return self._make_response(wbrequest, *result)
if __name__ == "__main__":
from gevent.wsgi import WSGIServer
from pywb.apps.wayback import application
server = WSGIServer(('', 8090), application)

View File

@ -1,32 +0,0 @@
### pywb.warc
This is the WARC/ARC record loading component of the pywb wayback tool suite.
The package provides the following facilities:
* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers
* Resolve 'revisit' records from provided index to find a full record with headers and payload content
* Load WARC/ARC records either locally or via http using http 1.1 range requests
When loading archived content, the format type (WARC vs ARC) is detected automatically
and compressed ARCs/WARCs are decompressed automatically.
No assumption is made about format based on filename, content type
or other external parameters other than the content itself.
### Tests
This package includes a test suite for loading a variety of WARC and ARC records.
Tests so far:
* Compressed WARC, ARC Records
* Uncompressed ARC Records
* Compressed WARC created by wget 1.14
* Same Url revisit record resolving
* Different url revisit record resolving

View File

@ -1,62 +0,0 @@
from pywb.cdx.cdxserver import create_cdx_server
from pywb.utils.wbexception import NotFoundException
from pywb.framework.basehandlers import BaseHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.webapp.query_handler import QueryHandler
from six.moves.urllib.parse import parse_qs
import json
import six
class CDXAPIHandler(BaseHandler):
Handler which passes wsgi request to cdx server and
returns a text-based cdx api
def __init__(self, index_handler):
self.index_handler = index_handler
def __call__(self, wbrequest):
params = self.extract_params_from_wsgi_env(wbrequest.env)
cdx_iter = self.index_handler.load_cdx(wbrequest, params)
except NotFoundException:
msg = 'No Captures found for: ' + params.get('url')
if params.get('output') == 'json':
msg = json.dumps(dict(error=msg))
return WbResponse.text_response(msg, content_type=content_type,
status='404 Not Found')
return WbResponse.text_stream(cdx_iter,
def extract_params_from_wsgi_env(env):
    """ utility function to extract the cdx query params
    from a WSGI environment dictionary

    :param env: WSGI environ dict; a missing ``QUERY_STRING`` is treated
        like an empty query string
    :return: dict of params; every value is a single string, except
        ``filter`` which stays a list. ``output`` is always present and
        is either 'text' or 'json'
    """
    # WSGI allows QUERY_STRING to be absent, so don't index it directly
    params = parse_qs(env.get('QUERY_STRING', ''))

    # parse_qs produces arrays for single values
    # cdx processing expects singleton params for all params,
    # except filters, so convert here
    # use first value of the list
    # (only values of existing keys are replaced, so iterating is safe)
    for name, val in params.items():
        if name != 'filter':
            params[name] = val[0]

    # a missing or unsupported output format falls back to text
    if params.get('output') not in ('text', 'json'):
        params['output'] = 'text'

    return params

View File

@ -1,195 +1,14 @@
import pkgutil
import mimetypes
import time
import logging
from datetime import datetime
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_timestamp
from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import LocalFileLoader
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper
from pywb.webapp.views import J2TemplateView, init_view
from pywb.webapp.replay_views import ReplayView
from pywb.framework.memento import MementoResponse
class SearchPageWbUrlHandler(WbUrlHandler):
Loads a default search page html template to be shown when
the wb_url is empty
def __init__(self, config):
self.search_view = init_view(config, 'search_html')
self.is_frame_mode = config.get('framed_replay', False)
self.frame_mod = 'tf_'
self.replay_mod = ''
self.response_class = WbResponse
if self.is_frame_mode:
#html = config.get('frame_insert_html', 'templates/frame_insert.html')
#self.search_view = J2TemplateView(html, config.get('jinja_env'))
self.frame_insert_view = init_view(config, 'frame_insert_html')
self.banner_html = config.get('banner_html', 'banner.html')
if config.get('enable_memento', False):
self.response_class = MementoResponse
if self.is_frame_mode == 'inverse':
self.frame_mod = ''
self.replay_mod = 'mp_'
self.frame_insert_view = None
self.banner_html = None
def render_search_page(self, wbrequest, **kwargs):
return self.search_view.render_response(wbrequest=wbrequest,
def __call__(self, wbrequest):
# root search page
if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest)
wbrequest.options['replay_mod'] = self.replay_mod
wbrequest.options['frame_mod'] = self.frame_mod
# render top level frame if in frame mode
# (not supported in proxy mode)
if (self.is_frame_mode and wbrequest.wb_url and
not wbrequest.wb_url.is_query() and
not wbrequest.options['is_proxy']):
if wbrequest.wb_url.mod == self.frame_mod:
wbrequest.options['is_top_frame'] = True
return self.get_top_frame_response(wbrequest)
wbrequest.options['is_framed'] = True
wbrequest.final_mod = self.frame_mod
wbrequest.options['is_framed'] = False
return self.handle_request(wbrequest)
except NotFoundException as nfe:
return self.handle_not_found(wbrequest, nfe)
def get_top_frame_params(self, wbrequest, mod):
embed_url = wbrequest.wb_url.to_str(mod=mod)
if wbrequest.wb_url.timestamp:
timestamp = wbrequest.wb_url.timestamp
timestamp = datetime_to_timestamp(datetime.utcnow())
params = dict(embed_url=embed_url,
return params
def get_top_frame_response(self, wbrequest):
    """Render the top-level frame page for ``wbrequest``.

    The frame insert template is rendered with the params from
    ``get_top_frame_params`` (using the replay modifier), encoded as
    utf-8 and wrapped in a '200 OK' text/html response of
    ``self.response_class``.
    """
    frame_params = self.get_top_frame_params(wbrequest, mod=self.replay_mod)

    status_headers = StatusAndHeaders('200 OK',
                                      [('Content-Type', 'text/html')])

    html = self.frame_insert_view.render_to_string(**frame_params)

    return self.response_class(status_headers,
                               [html.encode('utf-8')],
                               wbrequest=wbrequest)
# Standard WB Handler
class WBHandler(SearchPageWbUrlHandler):
def __init__(self, query_handler, config=None):
super(WBHandler, self).__init__(config)
self.index_reader = query_handler
self.not_found_view = init_view(config, 'not_found_html')
self.replay = self._init_replay_view(config)
self.fallback_handler = None
self.fallback_name = config.get('fallback')
def _init_replay_view(self, config):
cookie_maker = config.get('cookie_maker')
record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker)
paths = config.get('archive_paths')
resolving_loader = ResolvingLoader(PathResolverMapper()(paths),
return ReplayView(resolving_loader, config)
def resolve_refs(self, handler_dict):
if self.fallback_name:
self.fallback_handler = handler_dict.get(self.fallback_name)
logging.debug('Fallback Handler: ' + self.fallback_name)
def handle_request(self, wbrequest):
cdx_lines, output = self.index_reader.load_for_request(wbrequest)
if output != 'text' and wbrequest.wb_url.is_replay():
return self.handle_replay(wbrequest, cdx_lines)
return self.handle_query(wbrequest, cdx_lines, output)
def handle_query(self, wbrequest, cdx_lines, output):
return self.index_reader.make_cdx_response(wbrequest,
def handle_replay(self, wbrequest, cdx_lines):
cdx_callback = self.index_reader.cdx_load_callback(wbrequest)
return self.replay.render_content(wbrequest,
def handle_not_found(self, wbrequest, nfe):
# check fallback: only for replay queries and not for identity
if (self.fallback_handler and
not wbrequest.wb_url.is_query() and
not wbrequest.wb_url.is_identity):
return self.fallback_handler(wbrequest)
# if capture query, just return capture page
if wbrequest.wb_url.is_query():
output = self.index_reader.get_output_type(wbrequest.wb_url)
return self.index_reader.make_cdx_response(wbrequest, iter([]), output)
return self.not_found_view.render_response(status='404 Not Found',
# Static Content Handler
class StaticHandler(BaseHandler):
class StaticHandler(object):
def __init__(self, static_path):
@ -234,15 +53,3 @@ class StaticHandler(BaseHandler):
# Debug Handlers
class DebugEchoEnvHandler(BaseHandler): # pragma: no cover
def __call__(self, wbrequest):
return WbResponse.text_response(str(wbrequest.env))
class DebugEchoHandler(BaseHandler): # pragma: no cover
def __call__(self, wbrequest):
return WbResponse.text_response(str(wbrequest))

View File

@ -1,241 +0,0 @@
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.cache import create_cache
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.webapp.handlers import StaticHandler, SearchPageWbUrlHandler
from pywb.webapp.views import HeadInsertView
from pywb.utils.wbexception import LiveResourceException
import json
import hashlib
class RewriteHandler(SearchPageWbUrlHandler):
LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60'
YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json'
def __init__(self, config):
super(RewriteHandler, self).__init__(config)
proxyhostport = config.get('proxyhostport')
live_rewriter_cls = config.get('live_rewriter_cls', LiveRewriter)
self.live_fetcher = live_rewriter_cls(is_framed_replay=self.is_frame_mode,
self.recording = self.live_fetcher.is_recording()
self.head_insert_view = HeadInsertView.init_from_config(config)
self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE)
self.verify = config.get('verify_ssl', True)
self.ydl = None
self._cache = None
def handle_request(self, wbrequest):
if wbrequest.wb_url.is_query():
type_ = wbrequest.wb_url.LATEST_REPLAY
url = wbrequest.urlrewriter.get_new_url(type=type_, timestamp='')
return WbResponse.redir_response(url)
if wbrequest.options['is_ajax']:
wbrequest.urlrewriter.rewrite_opts['is_ajax'] = True
return self.render_content(wbrequest)
except Exception as exc:
import traceback
err_details = traceback.format_exc()
url = wbrequest.wb_url.url
msg = 'Could not load the url from the live web: ' + url
raise LiveResourceException(msg=msg, url=url)
def _live_request_headers(self, wbrequest):
return {}
def _skip_recording(self, wbrequest):
return False
def render_content(self, wbrequest):
if wbrequest.wb_url.mod == 'vi_':
return self._get_video_info(wbrequest)
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
req_headers = self._live_request_headers(wbrequest)
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
skip_recording = self._skip_recording(wbrequest)
use_206 = False
url = None
rangeres = None
readd_range = False
cache_key = None
if self.recording and not skip_recording:
rangeres = wbrequest.extract_range()
if rangeres:
url, start, end, use_206 = rangeres
# if bytes=0- Range request,
# simply remove the range and still proxy
if start == 0 and not end and use_206:
wbrequest.wb_url.url = url
del wbrequest.env['HTTP_RANGE']
readd_range = True
# disables proxy
skip_recording = True
# sets cache_key only if not already cached
cache_key = self._get_cache_key('r:', url)
result = self.live_fetcher.fetch_request(wbrequest.wb_url.url,
wbresponse = self._make_response(wbrequest, *result)
if readd_range:
content_length = (wbresponse.status_headers.
content_length = int(content_length)
wbresponse.status_headers.add_range(0, content_length,
except (ValueError, TypeError):
if self.recording and cache_key:
self._add_rec_ping(cache_key, url, wbrequest, wbresponse)
if rangeres:
referrer = wbrequest.env.get('REL_REFERER')
# also ping video info
if referrer:
resp = self._get_video_info(wbrequest,
print('Error getting video info')
return wbresponse
def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
    """Wrap the fetched status/headers and body generator in a WbResponse.

    When a live cookie format is configured and the env holds a
    'pywb.cdx' entry, the recorded timestamp is passed on through a
    Set-Cookie header so the client side can read it (framed mode uses
    it to update the frame banner).

    ``is_rewritten`` is accepted but not used here.
    """
    cdx = wbrequest.env.get('pywb.cdx') if self.live_cookie else None

    if cdx:
        cookie = self.live_cookie.format(cdx['timestamp'])
        status_headers.headers.append(('Set-Cookie', cookie))

    return WbResponse(status_headers, gen)
def _get_cache_key(self, prefix, url):
if not self._cache:
self._cache = create_cache()
key = self.create_cache_key(prefix, url)
if key in self._cache:
return None
return key
def create_cache_key(prefix, url):
hash_ = hashlib.md5()
key = hash_.hexdigest()
key = prefix + key
return key
def _add_rec_ping(self, key, url, wbrequest, wbresponse):
def do_ping():
headers = self._live_request_headers(wbrequest)
headers['Connection'] = 'close'
# mark as pinged
self._cache[key] = '1'
self.live_fetcher.fetch_async(url, headers)
del self._cache[key]
def wrap_buff_gen(gen):
for x in gen:
yield x
wbresponse.body = wrap_buff_gen(wbresponse.body)
return wbresponse
def _get_video_info(self, wbrequest, info_url=None, video_url=None):
if not video_url:
video_url = wbrequest.wb_url.url
if not info_url:
info_url = wbrequest.wb_url.url
cache_key = None
if self.recording:
cache_key = self._get_cache_key('v:', video_url)
info = self.live_fetcher.get_video_info(video_url)
if info is None: #pragma: no cover
msg = ('youtube-dl is not installed, pip install youtube-dl to ' +
'enable improved video proxy')
return WbResponse.text_response(text=msg, status='404 Not Found')
#if info and info.formats and len(info.formats) == 1:
content_type = self.YT_DL_TYPE
metadata = json.dumps(info)
if (self.recording and cache_key):
headers = self._live_request_headers(wbrequest)
headers['Content-Type'] = content_type
if info_url.startswith('https://'):
info_url = info_url.replace('https', 'http', 1)
response = self.live_fetcher.add_metadata(info_url, headers, metadata)
self._cache[cache_key] = '1'
return WbResponse.text_response(metadata, content_type=content_type)

View File

@ -1,387 +0,0 @@
from pywb.utils.loaders import load_yaml_config
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.proxy import ProxyArchivalRouter
from pywb.framework.wbrequestresponse import WbRequest
from pywb.framework.memento import MementoRequest
from pywb.framework.basehandlers import BaseHandler
from pywb.webapp.views import J2TemplateView
from pywb.webapp.views import J2HtmlCapturesView, init_view
from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.webapp.query_handler import QueryHandler
from pywb.webapp.handlers import WBHandler
from pywb.webapp.handlers import StaticHandler
from pywb.webapp.handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.webapp.cdx_api_handler import CDXAPIHandler
from pywb import DEFAULT_CONFIG
import os
import logging
import six
class DictChain(object):
    """Layered view over several dicts.

    Lookups try each dict in the order given and use the first value
    that is not None; writes always go to the first dict.
    """

    def __init__(self, *dicts):
        self.dicts = dicts

    def get(self, key, default_val=None):
        """Return the first non-None value for ``key``, else ``default_val``."""
        candidates = (layer.get(key) for layer in self.dicts)
        return next((found for found in candidates if found is not None),
                    default_val)

    def __contains__(self, key):
        # a key whose value is None in every layer counts as absent
        return self.get(key) is not None

    def __getitem__(self, key):
        # missing keys yield None rather than raising KeyError
        return self.get(key)

    def __setitem__(self, key, value):
        first = self.dicts[0]
        first[key] = value
def create_wb_handler(query_handler, config):
# Builds the replay handler for a collection.
# The handler class is read from config['wb_handler_class'] and falls back
# to WBHandler when that key is not set.
wb_handler_class = config.get('wb_handler_class', WBHandler)
# NOTE(review): the constructor call below is cut off in this view (its
# arguments and closing parenthesis are not visible) -- confirm against
# the full file before editing.
wb_handler = wb_handler_class(
return wb_handler
def create_live_handler(config):
    """Instantiate the live-web handler for *config*.

    The class is taken from ``config['wb_handler_class']`` and defaults to
    ``RewriteHandler``; it is constructed with *config* as its only argument.
    """
    handler_factory = config.get('wb_handler_class', RewriteHandler)
    return handler_factory(config)
def init_route_config(value, config):
    """Wrap a collection entry in a ``DictChain`` backed by *config*.

    A bare string or list is shorthand for ``{'index_paths': value}``.
    """
    if isinstance(value, (str, list)):
        value = {'index_paths': value}
    return DictChain(value, config)
def init_collection(route_config):
# Builds the QueryHandler for one collection from its route config.
# Reads the optional 'domain_specific_rules' and 'server_cls' settings and
# creates the 'query_html' captures view through init_view().
ds_rules_file = route_config.get('domain_specific_rules', None)
html_view = init_view(route_config, 'query_html', J2HtmlCapturesView)
server_cls = route_config.get('server_cls')
# NOTE(review): the remaining arguments of this call are not visible in
# this view -- confirm against the full file.
query_handler = QueryHandler.init_from_config(route_config,
return query_handler
def add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler,
# NOTE(review): the rest of the parameter list is cut off in this view; the
# body uses a `route_class` name that presumably comes from it -- confirm.
# Appends a CDX API route (a CDXAPIHandler wrapping query_handler) to routes.
# if bool, use -cdx suffix, else use custom string
# as the suffix
if isinstance(cdx_api_suffix, bool):
name += '-cdx'
# NOTE(review): per the comment above, the next line is presumably the
# `else` branch, but no `else:` is visible here -- verify.
name += str(cdx_api_suffix)
logging.debug('Adding CDX API Handler: ' + name)
routes.append(route_class(name, CDXAPIHandler(query_handler)))
def create_cdx_server_app(passed_config):
# NOTE(review): the three lines below read like docstring text whose quote
# delimiters are not visible in this view.
Create a cdx server api-only app
For each collection, create a /<coll>-cdx access point
which follows the cdx api
# passed_config is consulted before the packaged defaults
defaults = load_yaml_config(DEFAULT_CONFIG)
config = DictChain(passed_config, defaults)
collections = config.get('collections', {})
static_routes = {}
# collections based on file system
if config.get('enable_auto_colls', True):
colls_loader_cls = config.get('colls_loader_cls', DirectoryCollsLoader)
dir_loader = colls_loader_cls(config, static_routes, collections)
# NOTE(review): dir_loader is built but never invoked in the lines visible
# here -- confirm whether a call is missing from this view.
routes = []
# one CDX API route per configured collection
for name, value in six.iteritems(collections):
route_config = init_route_config(value, config)
query_handler = init_collection(route_config)
# 'enable_cdx_api' defaults to True here
cdx_api_suffix = route_config.get('enable_cdx_api', True)
add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler)
return ArchivalRouter(routes)
class DirectoryCollsLoader(object):
    """Collection loader holding the app config and the shared registries."""

    def __init__(self, config, static_routes, colls):
        # keep references (not copies) to the three objects handed in
        (self.config,
         self.static_routes,
         self.colls) = (config, static_routes, colls)
def __call__(self):
# Registers the shared static dir (when configured and present on disk),
# then scans 'collections_root' for per-collection entries.
# Returns self.colls.
colls = self.colls
static_dir = self.config.get('paths')['static_path']
static_shared_prefix = self.config.get('static_shared_prefix')
if static_dir and static_shared_prefix and os.path.isdir(static_dir):
# stored as an absolute path with a trailing separator
static_dir = os.path.abspath(static_dir) + os.path.sep
self.static_routes[static_shared_prefix] = static_dir
root_dir = self.config.get('collections_root', '')
# nothing to scan: hand back the collections unchanged
if not root_dir or not os.path.isdir(root_dir):
return colls
for name in os.listdir(root_dir):
full = os.path.join(root_dir, name)
# NOTE(review): the body of this check is not visible in this view
# (presumably non-directories are skipped) -- confirm.
if not os.path.isdir(full):
coll_config = self.load_coll_dir(full, name)
if coll_config:
# if already exists, override existing config with coll specific
# NOTE(review): only one assignment is visible after this test, so the
# "already exists" and "new" branches cannot be told apart here -- confirm.
if name in colls:
colls[name] = coll_config
return colls
def _norm_path(self, root_dir, path):
result = os.path.normpath(os.path.join(root_dir, path))
return result
def _add_dir_if_exists(self, coll, root_dir, dir_key, required=False):
# Resolves the directory setting `dir_key` for a collection.
# Returns True only when the default dir config['paths'][dir_key] exists
# under root_dir and was stored into coll; returns False otherwise.
# Raises Exception when that default dir is missing and `required` is set.
curr_val = coll.get(dir_key)
if curr_val:
# add collection path only if relative path, and not a url
if '://' not in curr_val and not os.path.isabs(curr_val):
coll[dir_key] = self._norm_path(root_dir, curr_val) + os.path.sep
# NOTE(review): indentation is lost in this view, so whether this return
# belongs to the outer or the inner `if` cannot be read off -- confirm.
return False
thedir = self.config.get('paths')[dir_key]
fulldir = os.path.join(root_dir, thedir)
if os.path.isdir(fulldir):
# stored as an absolute path with a trailing separator
fulldir = os.path.abspath(fulldir) + os.path.sep
coll[dir_key] = fulldir
return True
elif required:
msg = 'Dir "{0}" does not exist for "{1}"'.format(fulldir, dir_key)
raise Exception(msg)
return False
def load_yaml_file(self, root_dir, filename):
    """Load ``root_dir/filename`` via load_yaml_config, or ``{}`` if no such file."""
    full_path = os.path.join(root_dir, filename)
    return load_yaml_config(full_path) if os.path.isfile(full_path) else {}
def load_coll_dir(self, root_dir, name):
# Assembles the config dict for the collection stored in root_dir:
# config.yaml + metadata.yaml, resolved index/archive/static dirs and
# per-collection template overrides. Returns that dict.
# Load config.yaml
coll_config = self.load_yaml_file(root_dir, 'config.yaml')
# Load metadata.yaml
metadata = self.load_yaml_file(root_dir, 'metadata.yaml')
coll_config['metadata'] = metadata
# required=True: _add_dir_if_exists raises if the default dir is missing
self._add_dir_if_exists(coll_config, root_dir, 'index_paths', True)
# inherit these properties from base, in case archive_paths is shared
shared_config = DictChain(coll_config, self.config)
self._add_dir_if_exists(shared_config, root_dir, 'archive_paths', True)
# a per-collection static dir is exposed under 'static/<name>'
if self._add_dir_if_exists(coll_config, root_dir, 'static_path', False):
self.static_routes['static/' + name] = coll_config['static_path']
# Custom templates dir
templates_dir = self.config.get('paths').get('templates_dir')
if templates_dir:
template_dir = os.path.join(root_dir, templates_dir)
# Check all templates
template_files = self.config.get('paths')['template_files']
for tname, tfile in six.iteritems(template_files):
if tname in coll_config:
# Already set
coll_config[tname] = self._norm_path(root_dir, coll_config[tname])
# If templates override dir
elif templates_dir:
full = os.path.join(template_dir, tfile)
# only override when the template file actually exists
if os.path.isfile(full):
coll_config[tname] = full
return coll_config
def create_wb_router(passed_config=None):
# Builds the top-level router from passed_config layered over the packaged
# defaults: static routes, one route per collection, optional CDX API and
# debug routes, and optional http proxy support.
# NOTE(review): several calls and branches in this function are cut off in
# this view (flagged inline) -- read it against the full file.
passed_config = passed_config or {}
defaults = load_yaml_config(DEFAULT_CONFIG)
config = DictChain(passed_config, defaults)
routes = []
port = config.get('port')
collections = config.get('collections', {})
static_routes = config.get('static_routes', {})
root_route = None
# collections based on file system
if config.get('enable_auto_colls', True):
colls_loader_cls = config.get('colls_loader_cls', DirectoryCollsLoader)
dir_loader = colls_loader_cls(config, static_routes, collections)
# NOTE(review): dir_loader is never invoked in the visible lines -- confirm.
if config.get('enable_memento', False):
request_class = MementoRequest
# NOTE(review): presumably the `else` branch; no `else:` is visible -- verify.
request_class = WbRequest
# store live and replay handlers
handler_dict = {}
# setup template globals
templates_dirs = config['templates_dirs']
# NOTE(review): remaining arguments of this call are not visible.
jinja_env = J2TemplateView.init_shared_env(paths=templates_dirs,
jinja_env.globals.update(config.get('template_globals', {}))
for static_name, static_path in six.iteritems(static_routes):
routes.append(Route(static_name, StaticHandler(static_path)))
for name, value in six.iteritems(collections):
# a ready-made handler instance is routed as-is
if isinstance(value, BaseHandler):
handler_dict[name] = value
new_route = Route(name, value, config=config)
# NOTE(review): here and in the two repeats below, only one statement is
# visible after the `name != ''` test; the other branch is presumably cut
# off -- confirm.
if name != '':
root_route = new_route
route_config = init_route_config(value, config)
route_class = route_config.get('route_class', Route)
# the special index path '$liveweb' selects the live handler
if route_config.get('index_paths') == '$liveweb':
live = create_live_handler(route_config)
handler_dict[name] = live
new_route = route_class(name, live, config=route_config)
if name != '':
root_route = new_route
query_handler = init_collection(route_config)
# NOTE(review): arguments of this call are not visible.
wb_handler = create_wb_handler(
handler_dict[name] = wb_handler
logging.debug('Adding Collection: ' + name)
# NOTE(review): remaining arguments of this call are not visible.
new_route = route_class(name, wb_handler,
if name != '':
root_route = new_route
# cdx query handler
cdx_api_suffix = route_config.get('enable_cdx_api', False)
if cdx_api_suffix:
# NOTE(review): remaining arguments of this call are not visible.
add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler,
if config.get('debug_echo_env', False):
routes.append(Route('echo_env', DebugEchoEnvHandler()))
if config.get('debug_echo_req', False):
routes.append(Route('echo_req', DebugEchoHandler()))
# NOTE(review): the bodies of the next `if` and of the `hasattr` check
# below are not visible in this view.
if root_route:
# resolve any cross handler references
for route in routes:
if hasattr(route.handler, 'resolve_refs'):
# default to regular archival mode
router = ArchivalRouter
if config.get('enable_http_proxy', False):
router = ProxyArchivalRouter
view = init_view(config, 'proxy_select_html')
# proxy views are written back into the caller's passed_config
if 'proxy_options' not in passed_config:
passed_config['proxy_options'] = {}
if view:
passed_config['proxy_options']['proxy_select_view'] = view
view = init_view(config, 'proxy_cert_download_html')
if view:
passed_config['proxy_options']['proxy_cert_download_view'] = view
# Finally, create wb router
# NOTE(review): the argument list of this call is only partly visible
# (no closing parenthesis in this view).
return router(
abs_path=config.get('absolute_paths', True),
home_view=init_view(config, 'home_html'),
error_view=init_view(config, 'error_html'),
info_view=init_view(config, 'info_json'),

View File

@ -1,172 +0,0 @@
from pywb.utils.dsrules import DEFAULT_RULES_FILE
from pywb.perms.perms_filter import make_perms_cdx_filter
from pywb.framework.wbrequestresponse import WbResponse
from pywb.cdx.cdxserver import create_cdx_server
from pywb.webapp.views import MementoTimemapView
class QueryHandler(object):
# NOTE(review): the four lines below read like class docstring text whose
# quote delimiters (and final line) are not visible in this view.
Main interface for querying the index (currently only CDX) from a
source server (currently a cdx server)
Creates an appropriate query based on wbrequest type info and outputs
a returns a view for the cdx, either a raw cdx iter, an html view,
def __init__(self, cdx_server, html_query_view=None, perms_policy=None):
self.cdx_server = cdx_server
self.perms_policy = perms_policy
# output type name -> view object used by make_cdx_response()
self.views = {}
if html_query_view:
self.views['html'] = html_query_view
# NOTE(review): indentation is lost here; presumably the timemap view is
# registered regardless of html_query_view -- confirm.
self.views['timemap'] = MementoTimemapView()
def init_from_config(config,
# NOTE(review): the rest of the parameter list is cut off in this view; the
# body uses `server_cls`, `ds_rules_file` and `html_view`, which presumably
# come from it -- confirm. There is no `self`, so a decorator line may also
# be missing from this view.
# Creates the cdx server for `config` and wraps it in a QueryHandler.
perms_policy = None
# config may be something without a dict-style get(); only read the
# optional settings when it has one
if hasattr(config, 'get'):
perms_policy = config.get('perms_policy')
server_cls = config.get('server_cls', server_cls)
cdx_server = create_cdx_server(config, ds_rules_file, server_cls)
return QueryHandler(cdx_server, html_view, perms_policy)
def get_output_type(self, wb_url):
    """Pick the cdx output format for *wb_url*.

    Returns ``'text'`` for the ``cdx_`` modifier, ``'timemap'`` for the
    ``timemap`` modifier, ``'html'`` for query urls and ``'cdxobject'``
    for everything else.
    """
    # cdx server only supports text and cdxobject for now
    if wb_url.mod == 'cdx_':
        output = 'text'
    elif wb_url.mod == 'timemap':
        output = 'timemap'
    elif wb_url.is_query():
        output = 'html'
    else:
        # fallback only -- must not overwrite the choices made above
        output = 'cdxobject'
    return output
def load_for_request(self, wbrequest):
# Builds the cdx query for wbrequest.wb_url and runs it.
# Returns a (cdx_iter, output) pair, where output is the type chosen by
# get_output_type().
wb_url = wbrequest.wb_url
output = self.get_output_type(wb_url)
# init standard params
params = self.get_query_params(wb_url)
params['allowFuzzy'] = True
params['url'] = wb_url.url
params['output'] = output
# get metadata
if wb_url.mod == 'vi_':
# matching metadata explicitly with special scheme
# the original scheme is replaced by 'metadata://'
schema, rest = wb_url.url.split('://', 1)
params['url'] = 'metadata://' + rest
# NOTE(review): further lines of this branch may be missing from this view.
cdx_iter = self.load_cdx(wbrequest, params)
return cdx_iter, output
def load_cdx(self, wbrequest, params):
# Adds request-specific settings to params (custom filter, collection,
# permissions filter) and queries self.cdx_server with them.
# Returns whatever cdx_server.load_cdx(**params) returns.
if wbrequest:
# add any custom filter from the request
if wbrequest.query_filter:
filters = params.get('filter')
# NOTE(review): the statement under `if filters:` and any `else:` are not
# visible in this view; only the plain assignment below is -- confirm how
# existing filters are combined.
if filters:
params['filter'] = wbrequest.query_filter
params['coll'] = wbrequest.coll
# NOTE(review): the body of this check is not visible in this view.
if wbrequest.custom_params:
if self.perms_policy:
perms_op = make_perms_cdx_filter(self.perms_policy, wbrequest)
if perms_op:
params['custom_ops'] = [perms_op]
cdx_iter = self.cdx_server.load_cdx(**params)
return cdx_iter
def make_cdx_response(self, wbrequest, cdx_iter, output, **kwargs):
    """Turn a cdx iterator into a response for the requested *output*.

    When a view is registered for a non-text *output*, that view renders
    the response; in every other case the iterator is streamed as text.
    """
    # anything other than 'text' is assumed to iterate CDXObjects
    wants_view = bool(output) and output != 'text'
    view = self.views.get(output) if wants_view else None
    if not view:
        return WbResponse.text_stream(cdx_iter)
    return view.render_response(wbrequest, cdx_iter, **kwargs)
def cdx_load_callback(self, wbrequest):
    """Return a one-argument loader bound to *wbrequest*.

    The returned callable sets ``params['output']`` to ``'cdxobject'``
    (mutating the dict it is given) and delegates to ``self.load_cdx``.
    """
    def load_cdx(params):
        params['output'] = 'cdxobject'
        cdx_iter = self.load_cdx(wbrequest, params)
        return cdx_iter

    return load_cdx
def get_query_params(self,
wburl, limit=150000,
# NOTE(review): the rest of the parameter list is cut off in this view; the
# body uses `collapse_time` and `replay_closest`, which presumably come
# from it -- confirm.
#if wburl.type == wburl.URL_QUERY:
# raise NotImplementedError('Url Query Not Yet Supported')
# NOTE(review): four parameter dicts follow, but the keys that select
# between them and the closing brackets are not visible in this view, so
# which dict applies to which wburl type cannot be read off here.
return {
{'collapseTime': collapse_time,
'filter': ['!statuscode:(500|502|504)'],
'from': wburl.timestamp,
'to': wburl.end_timestamp,
'limit': limit,
'matchType': 'exact',
{'collapse': 'urlkey',
'matchType': 'prefix',
'showGroupCount': True,
'showUniqCount': True,
'lastSkipTimestamp': True,
'limit': limit,
# NOTE(review): the continuation of this field list is not visible.
'fl': ('urlkey,original,timestamp,' +
'filter': [],
{'sort': 'closest',
'filter': ['!statuscode:(500|502|504)'],
'limit': replay_closest,
'closest': wburl.timestamp,
'resolveRevisits': True,
'matchType': 'exact',
{'sort': 'reverse',
# Not appropriate as default
# Should be an option to configure status code filtering in general
# 'filter': ['statuscode:[23]..|-'],
'filter': [],
'limit': '1',
'resolveRevisits': True,
'matchType': 'exact',

View File

@ -1,92 +0,0 @@
from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader
from pywb.framework.cache import create_cache
from tempfile import NamedTemporaryFile, mkdtemp
import yaml
import os
from shutil import rmtree
import atexit
class RangeCache(object):
    """Holds the cache returned by ``create_cache()`` and an optional temp dir."""

    def __init__(self):
        # no temp dir until one is actually needed
        self.temp_dir = None
        self.cache = create_cache()
def cleanup(self):
    """Delete the temp directory, if one is set, and forget about it."""
    doomed = self.temp_dir
    if doomed:  # pragma: no cover
        print('Removing: ' + doomed)
        # second argument is ignore_errors
        rmtree(doomed, True)
        self.temp_dir = None
def handle_range(self, wbrequest, key, wbresponse_func,
url, start, end, use_206):
# Serves a byte range of a response.
# On a cache miss the full response is obtained from wbresponse_func() and
# spooled to a temp file; the range is then read back from that file.
# Returns a (status_headers, body_iterator) pair.
# key must be set
if key not in self.cache:
# ask the replay code not to redirect while the full body is fetched
wbrequest.custom_params['noredir'] = True
response = wbresponse_func()
# only cache 200 responses
if not response.status_headers.get_statuscode().startswith('200'):
return response.status_headers, response.body
# temp dir is created on first use
if not self.temp_dir:
self.temp_dir = mkdtemp(prefix='_pywbcache')
with NamedTemporaryFile(delete=False, dir=self.temp_dir) as fh:
# NOTE(review): the loop body (presumably writing each chunk to fh) is not
# visible in this view.
for obj in response.body:
name = fh.name
# NOTE(review): remaining keys of this spec dict are not visible; later
# code reads spec['headers'].
spec = dict(name=fh.name,
self.cache[key] = yaml.dump(spec)
# NOTE(review): presumably the cache-hit (`else`) path; no `else:` is
# visible. yaml.load() without an explicit safe Loader can construct
# arbitrary objects -- flagging for review if cache contents can ever be
# influenced externally.
spec = yaml.load(self.cache[key])
spec['headers'] = [tuple(x) for x in spec['headers']]
filelen = os.path.getsize(spec['name'])
# bytes available from `start` to the end of the file
maxlen = filelen - start
if end:
# `end` is inclusive, hence the + 1
maxlen = min(maxlen, end - start + 1)
def read_range():
with open(spec['name'], 'rb') as fh:
# NOTE(review): no seek to `start` is visible before the reader is
# wrapped -- confirm whether a line is missing from this view.
fh = LimitReader.wrap_stream(fh, maxlen)
while True:
buf = fh.read()
# NOTE(review): the statement under this check (presumably a break) is
# not visible in this view.
if not buf:
yield buf
status_headers = StatusAndHeaders('200 OK', spec['headers'])
if use_206:
# NOTE(review): remaining arguments of this call are not visible.
StatusAndHeaders.add_range(status_headers, start,
status_headers.replace_header('Content-Length', str(maxlen))
return status_headers, read_range()
# module-level RangeCache instance, created at import time
range_cache = RangeCache()

View File

@ -1,392 +0,0 @@
import re
import logging
from io import BytesIO
from six.moves.urllib.parse import urlsplit
from itertools import chain
from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader
from warcio.timeutils import timestamp_now
from warcio.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.webapp.views import HeadInsertView
from pywb.webapp.rangecache import range_cache
class CaptureException(WbException):
    """
    Signals a problem with one specific capture.

    It is meant to be caught so that another capture can be tried when
    possible; otherwise it results in a 502.
    """
    def status(self):
        status_line = '502 Internal Server Error'
        return status_line
class ReplayView(object):
    # Splits a url into an optional "scheme: + slashes + www<digits>." prefix
    # (group 1) and the remainder (group 2).  Written as a raw string so that
    # \w, \d and \. reach the regex engine without relying on invalid string
    # escapes; the compiled pattern is unchanged.
    STRIP_SCHEME_WWW = re.compile(r'^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$',
                                  re.MULTILINE)
def __init__(self, content_loader, config):
    """Configure replay from *config* (an object with a dict-style ``get``).

    Settings read: ``framed_replay``, ``buffer_response`` (default True),
    ``buffer_max_size`` (default 16384), ``redir_to_exact`` (default True),
    ``enable_memento`` (default False), ``enable_ranges`` (default True)
    and ``reporter``.
    """
    self.content_loader = content_loader

    framed = config.get('framed_replay')
    self.content_rewriter = RewriteContent(is_framed_replay=framed)

    self.head_insert_view = HeadInsertView.init_from_config(config)

    self.buffer_response = config.get('buffer_response', True)
    self.buffer_max_size = config.get('buffer_max_size', 16384)

    self.redir_to_exact = config.get('redir_to_exact', True)

    # Memento responses only when explicitly enabled, plain WbResponse
    # otherwise.  The two assignments are alternatives, not a sequence.
    if config.get('enable_memento', False):
        self.response_class = MementoResponse
    else:
        self.response_class = WbResponse

    self.enable_range_cache = config.get('enable_ranges', True)

    # optional callback, invoked as reporter(wbrequest, cdx, response)
    self._reporter = config.get('reporter')
def render_content(self, wbrequest, cdx_lines, cdx_loader):
# Tries the cdx entries in order and returns the first response produced.
# Raises the last capture error seen, or NotFoundException when no entry
# produced either a response or an error.
last_e = None
first = True
#cdx_lines = args[0]
#cdx_loader = args[1]
# List of already failed w/arcs
failed_files = []
response = None
# Iterate over the cdx until find one that works
# The cdx should already be sorted in
# closest-to-timestamp order (from the cdx server)
for cdx in cdx_lines:
# optimize: can detect if redirect is needed just from the cdx,
# no need to load w/arc data if requiring exact match
if first:
redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response:
return redir_response
first = False
# NOTE(review): the `try:` that pairs with the `except` below and the
# remaining arguments of this call are not visible in this view.
response = self.cached_replay_capture(wbrequest,
except (CaptureException, ArchiveLoadFailed) as ce:
#import traceback
# remember the failure and move on to the next capture
last_e = ce
if response:
return response
if not last_e:
# can only get here if cdx_lines is empty somehow
# should be filtered out before hand, but if not
msg = 'No Captures found for: ' + wbrequest.wb_url.url
last_e = NotFoundException(msg)
raise last_e
def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
# Replays a capture, going through the shared range cache when range
# support is enabled and the request carries a range.
def get_capture():
# NOTE(review): remaining arguments of this call are not visible.
return self.replay_capture(wbrequest,
# range handling disabled: replay directly
if not self.enable_range_cache:
return get_capture()
range_info = wbrequest.extract_range()
# not a range request: replay directly
if not range_info:
return get_capture()
# NOTE(review): the method called on range_cache and most of its arguments
# are not visible; the visible argument uses the cdx 'digest', falling
# back to 'urlkey'.
range_status, range_iter = (range_cache.
cdx.get('digest', cdx['urlkey']),
# NOTE(review): remaining arguments of this call are not visible.
response = self.response_class(range_status,
return response
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
# Loads one capture, rejects self-redirects, rewrites its content and
# wraps the result in self.response_class.
# NOTE(review): several calls below are cut off in this view (flagged
# inline) -- read against the full file.
# NOTE(review): remaining arguments of this call are not visible.
(status_headers, stream) = (self.content_loader(cdx,
# check and reject self-redirect
self._reject_self_redirect(wbrequest, cdx, status_headers)
# check if redir is needed
redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response:
return redir_response
#length = status_headers.get_header('content-length')
#stream = LimitReader.wrap_stream(stream, length)
# one more check for referrer-based self-redirect
# TODO: evaluate this, as refreshing in browser may sometimes cause
# referrer to be set to the same page, incorrectly skipping a capture
# self._reject_referrer_self_redirect(wbrequest)
urlrewriter = wbrequest.urlrewriter
# if using url rewriter, use original url for rewriting purposes
if wbrequest and wbrequest.wb_url:
wbrequest.wb_url.url = cdx['url']
if wbrequest.options['is_ajax']:
wbrequest.urlrewriter.rewrite_opts['is_ajax'] = True
head_insert_func = None
if self.head_insert_view:
# NOTE(review): the method call on head_insert_view is cut off here.
head_insert_func = (self.head_insert_view.
# NOTE(review): the rewriter call is cut off here.
result = (self.content_rewriter.
(status_headers, response_iter, is_rewritten) = result
# buffer response if buffering enabled
if self.buffer_response:
content_len = status_headers.get_header('content-length')
# NOTE(review): presumably a try/except around the int() conversion; the
# `try:` / `except` lines are not visible in this view.
content_len = int(content_len)
content_len = 0
if content_len <= 0:
max_size = self.buffer_max_size
# NOTE(review): remaining arguments of this call are not visible.
response_iter = self.buffered_response(status_headers,
# Set Content-Location if not exact capture
if not self.redir_to_exact:
mod = wbrequest.options.get('replay_mod', wbrequest.wb_url.mod)
# NOTE(review): the rewriter call building canon_url is cut off here.
canon_url = (wbrequest.urlrewriter.
status_headers.headers.append(('Content-Location', canon_url))
if wbrequest.wb_url.mod == 'vi_':
status_headers.headers.append(('access-control-allow-origin', '*'))
# NOTE(review): remaining arguments of this call are not visible.
response = self.response_class(status_headers,
# notify reporter callback, if any
if self._reporter:
self._reporter(wbrequest, cdx, response)
return response
# Buffer rewrite iterator and return a response from a string
def buffered_response(self, status_headers, iterator, max_size):
# Reads from `iterator` into memory up to max_size bytes (a max_size <= 0
# disables the limit check). Returns [content] when everything was read,
# otherwise the buffered content chained with the rest of the iterator.
out = BytesIO()
size = 0
read_all = True
for buff in iterator:
buff = bytes(buff)
size += len(buff)
# NOTE(review): the write into `out` and the loop exit are not visible in
# this view -- confirm against the full file.
if max_size > 0 and size > max_size:
read_all = False
content = out.getvalue()
if read_all:
content_length_str = str(len(content))
# remove existing content length
# NOTE(review): the header update that uses content_length_str is not
# visible in this view.
return [content]
return chain(iter([content]), iterator)
def _redirect_if_needed(self, wbrequest, cdx):
# Decides whether the request must be redirected to the exact capture url.
# Returns None when no redirect applies, otherwise a redirect response
# built with self.response_class.
# redirects disabled by config
if not self.redir_to_exact:
return None
if wbrequest.options['is_proxy']:
return None
# set by the range cache while it fetches a full response
if wbrequest.custom_params.get('noredir'):
return None
is_timegate = (wbrequest.options.get('is_timegate', False))
if not is_timegate:
is_timegate = wbrequest.wb_url.is_latest_replay()
# redirect when the capture timestamp differs from the requested one
redir_needed = is_timegate or (cdx['timestamp'] != wbrequest.wb_url.timestamp)
if not redir_needed:
return None
# range requests are served through the range cache, never redirected
if self.enable_range_cache and wbrequest.extract_range():
return None
#if is_timegate:
# timestamp = timestamp_now()
timestamp = cdx['timestamp']
# NOTE(review): the rewriter call building new_url is cut off here.
new_url = (wbrequest.urlrewriter.
if wbrequest.method == 'POST':
# FF shows a confirm dialog, so can't use 307 effectively
# was: statusline = '307 Same-Method Internal Redirect'
return None
elif is_timegate:
statusline = '302 Found'
# NOTE(review): presumably an `else` branch starts here; no `else:` is
# visible in this view.
# clear cdx line to indicate internal redirect
statusline = '302 Internal Redirect'
cdx = None
status_headers = StatusAndHeaders(statusline,
[('Location', new_url)])
# NOTE(review): remaining arguments of this call are not visible.
return self.response_class(status_headers,
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
# NOTE(review): the two lines below read like docstring text whose quote
# delimiters are not visible in this view.
Check if response is a 3xx redirect to the same url
If so, reject this capture to avoid causing redirect loop
# NOTE(review): the bodies of the `startswith('3')` and `not location_url`
# checks (presumably early returns) are not visible in this view.
if not status_headers.statusline.startswith('3'):
# skip all 304s
if (status_headers.statusline.startswith('304') and
not wbrequest.wb_url.is_identity):
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
# comparison below is done on lower-cased urls
request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location')
if not location_url:
location_url = location_url.lower()
# a path-only Location is prefixed with the host of the capture url
if location_url.startswith('/'):
host = urlsplit(cdx['url']).netloc
location_url = host + location_url
# NOTE(review): the right-hand side of this comparison is not visible.
if (ReplayView.strip_scheme_www(request_url) ==
raise CaptureException('Self Redirect: ' + str(cdx))
# TODO: reevaluate this, as it may reject valid refreshes of a page
def _reject_referrer_self_redirect(self, wbrequest): # pragma: no cover
# NOTE(review): the five lines below read like docstring text whose quote
# delimiters are not visible in this view.
Perform final check for referrer based self-redirect.
This method should be called after verifying that
the request timestamp == capture timestamp
If referrer is same as current url,
reject this response and try another capture.
# NOTE(review): the body of this check (presumably an early return) is
# not visible in this view.
if not wbrequest.referrer:
# build full url even if using relative-rewriting
request_url = (wbrequest.host_prefix +
wbrequest.rel_prefix + str(wbrequest.wb_url))
# NOTE(review): the right-hand side of this comparison and the rest of
# the exception message are not visible in this view.
if (ReplayView.strip_scheme_www(request_url) ==
raise CaptureException('Self Redirect via Referrer: ' +
def strip_scheme_www(url):
# Returns group 2 of ReplayView.STRIP_SCHEME_WWW matched against url, i.e.
# what is left after the optional scheme / www prefix.
# NOTE(review): there is no `self` parameter and callers use
# ReplayView.strip_scheme_www(...), so a decorator line is presumably
# missing from this view. The doctest lines below have lost their expected
# values and quote delimiters.
>>> ReplayView.strip_scheme_www('https://example.com') ==\
>>> ReplayView.strip_scheme_www('https://example.com') ==\
>>> ReplayView.strip_scheme_www('https://example.com') ==\
>>> ReplayView.strip_scheme_www('https://example.com') ==\
>>> ReplayView.strip_scheme_www('about://example.com') ==\
>>> ReplayView.strip_scheme_www('http://') ==\
>>> ReplayView.strip_scheme_www('#!@?') ==\
m = ReplayView.STRIP_SCHEME_WWW.match(url)
match = m.group(2)
return match
# Script entry point.
# NOTE(review): only the doctest import is visible under this guard; the
# call that runs the doctests is presumably cut off in this view.
if __name__ == "__main__":
import doctest

View File

@ -1,20 +0,0 @@
>>> format_ts('20141226101000')
'Fri, Dec 26 2014 10:10:00'
>>> format_ts('20141226101000', '%s')
>>> is_wb_handler(DebugEchoHandler())
from pywb.webapp.views import format_ts, is_wb_handler
from pywb.webapp.handlers import DebugEchoHandler
# Script entry point for this doctest module.
# NOTE(review): only the doctest import is visible under this guard; the
# call that runs the doctests is presumably cut off in this view.
if __name__ == "__main__":
import doctest

View File

@ -1,222 +0,0 @@
from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT
from six.moves.urllib.parse import urlsplit
import logging
import json
import os
from jinja2 import Environment
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
class template_filter(object):
    """
    Decorator that registers a function as a jinja2 filter.

    The optional argument names the filter; without it the function's own
    ``__name__`` is used.
    """
    def __init__(self, param=None):
        self.name = param

    def __call__(self, func):
        filter_name = self.name or func.__name__
        FILTERS[filter_name] = func
        return func
# Filters
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
    """Format a timestamp string.

    ``'%s'`` is special-cased and handed to ``timestamp_to_sec``; any other
    format is applied with ``strftime`` to the result of
    ``timestamp_to_datetime``.
    """
    if format_ != '%s':
        return timestamp_to_datetime(value).strftime(format_)
    return timestamp_to_sec(value)
def get_urlsplit(url):
    """Return ``urlsplit(url)``."""
    return urlsplit(url)
def is_wb_handler(obj):
    """Tell whether *obj* has a ``handler`` whose class is named ``WBHandler``."""
    return (hasattr(obj, 'handler') and
            obj.handler.__class__.__name__ == "WBHandler")
def tojson(obj):
    """Serialize *obj* to a JSON string using ``json.dumps`` defaults."""
    encoded = json.dumps(obj)
    return encoded
class FileOnlyPackageLoader(PackageLoader):
    """PackageLoader that looks templates up by file name only."""
    def get_source(self, env, template):
        # drop any directory component before delegating to the base loader
        basename = os.path.split(template)[1]
        return super(FileOnlyPackageLoader, self).get_source(env, basename)
class RelEnvironment(Environment):
    """Environment whose join_path() resolves templates relative to the parent."""
    def join_path(self, template, parent):
        parent_dir = os.path.dirname(parent)
        return os.path.join(parent_dir, template)
class J2TemplateView(object):
    # class-wide jinja2 environment, filled in by init_shared_env()
    shared_jinja_env = None

    def __init__(self, filename):
        """Attach the shared environment and remember the template file name."""
        self.jinja_env = self.init_shared_env()
        self.template_file = filename
def init_shared_env(paths=['templates', '.', '/'],
# NOTE(review): the rest of the parameter list is cut off in this view; the
# body uses `packages` and `overlay_env`, which presumably come from it.
# There is no `self`, so a decorator line may also be missing.
# Returns the shared jinja2 environment, creating it on first use.
# once created, the same environment is returned and arguments are ignored
if J2TemplateView.shared_jinja_env:
return J2TemplateView.shared_jinja_env
loaders = J2TemplateView._add_loaders(paths, packages)
loader = ChoiceLoader(loaders)
if overlay_env:
jinja_env = overlay_env.overlay(loader=loader, trim_blocks=True)
# NOTE(review): presumably the `else` branch; no `else:` is visible -- verify.
jinja_env = RelEnvironment(loader=loader, trim_blocks=True)
# NOTE(review): filter registration, if any, is not visible in this view.
J2TemplateView.shared_jinja_env = jinja_env
return jinja_env
def _add_loaders(paths, packages):
# Builds the list of template loaders for the given paths and packages and
# returns it.
# NOTE(review): both loop bodies are missing from this view, so which
# loader class is used for paths and for packages cannot be read off here.
loaders = []
# add loaders for paths
for path in paths:
# add loaders for all specified packages
for package in packages:
return loaders
def render_to_string(self, **kwargs):
# Renders self.template_file with kwargs as the template context and
# returns the rendered text.
template = self.jinja_env.get_template(self.template_file)
wbrequest = kwargs.get('wbrequest')
if wbrequest:
# optional extra template params carried in the WSGI environ
params = wbrequest.env.get('pywb.template_params')
# NOTE(review): the statement under `if params:` is not visible in this
# view (presumably the params are merged into kwargs) -- confirm.
if params:
template_result = template.render(**kwargs)
return template_result
def render_response(self, **kwargs):
# Renders the template and wraps the text in a WbResponse.
# 'status' and 'content_type' may be passed in kwargs; they default to
# '200 OK' and 'text/html; charset=utf-8'.
template_result = self.render_to_string(**kwargs)
status = kwargs.get('status', '200 OK')
content_type = kwargs.get('content_type', 'text/html; charset=utf-8')
# NOTE(review): remaining arguments of this call are not visible.
return WbResponse.text_response(template_result,
def init_view(config, key, view_class=J2TemplateView):
    """Create the view configured under *key*, or ``None`` if unset/empty."""
    filename = config.get(key)
    if filename:
        logging.debug('Adding {0}: {1}'.format(key, filename))
        return view_class(filename)
    return None
# Template view for the head-insert template (see init_from_config below).
class HeadInsertView(J2TemplateView):
def create_insert_func(self, wbrequest,
# NOTE(review): the rest of the parameter list is cut off in this view.
# Returns a make_head_insert(rule, cdx) callable for this request, or None
# for ajax requests.
if wbrequest.options['is_ajax']:
return None
url = wbrequest.wb_url.get_url()
top_url = wbrequest.wb_prefix
top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod)
include_wombat = not wbrequest.wb_url.is_banner_only
def make_head_insert(rule, cdx):
# the cdx entry is updated in place with the request url
cdx['url'] = url
# NOTE(review): remaining arguments of this call are not visible.
return (self.render_to_string(wbrequest=wbrequest,
return make_head_insert
def init_from_config(config):
# NOTE(review): there is no `self` and callers use
# HeadInsertView.init_from_config(config), so a decorator line is
# presumably missing from this view.
# Returns config['head_insert_view'] when set; otherwise builds a view
# from 'head_insert_html' (default 'templates/head_insert.html').
view = config.get('head_insert_view')
if not view:
html = config.get('head_insert_html', 'templates/head_insert.html')
if html:
banner_html = config.get('banner_html', 'banner.html')
view = HeadInsertView(html)
logging.debug('Adding HeadInsert: {0}, Banner {1}'.
format(html, banner_html))
# banner template name is attached to the view as an attribute
view.banner_html = banner_html
return view
# query views
# Template view that renders a listing of cdx captures.
class J2HtmlCapturesView(J2TemplateView):
def render_response(self, wbrequest, cdx_lines, **kwargs):
def format_cdx_lines():
# lazily rewrites each cdx entry in place: the original url is kept
# under '_orig_url' and 'url' becomes wb_url.get_url(url=...)
for cdx in cdx_lines:
cdx['_orig_url'] = cdx['url']
cdx['url'] = wbrequest.wb_url.get_url(url=cdx['url'])
yield cdx
# NOTE(review): remaining arguments of this call are not visible.
return J2TemplateView.render_response(self,
class MementoTimemapView(object):
def render_response(self, wbrequest, cdx_lines, **kwargs):
memento_lines = make_timemap(wbrequest, cdx_lines)
return WbResponse.text_stream(memento_lines,