mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-15 00:03:28 +01:00)

remove obsolete code and tests!
disable test_auto_colls for now until fully supported in new system

This commit is contained in:
parent 24c968640d
commit 52dc46fe6a
@@ -1,28 +0,0 @@

### pywb.cdx package

This package contains the CDX processing suite of the pywb wayback tool suite.

The CDX Server loads, filters, and transforms CDX from multiple sources in response
to a given query.

#### Sample App

A very simple reference WSGI app is included.

Run `python -m pywb.cdx.wsgi_cdxserver` to start the app; use a keyboard interrupt to stop it.

The default [config.yaml](config.yaml) points to the sample data directory
and uses port 8080.

The domain-specific [rules.yaml](rules.yaml) rules are also loaded.

#### CDX Server API Reference

The goal is to provide compatibility with this feature set and more:
https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server

TODO
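For reference, a minimal sketch of querying the sample app once it is running. The endpoint path and query parameters here are assumptions modeled on the wayback-cdx-server API linked above; only the port (8080) comes from the default config.

```python
# Hypothetical query against the sample CDX server app (endpoint path assumed).
from six.moves.urllib.parse import urlencode
from six.moves.urllib.request import urlopen

params = urlencode({'url': 'http://example.com/', 'limit': '10'})
response = urlopen('http://localhost:8080/cdx?' + params)

for line in response:
    # each line is one CDX record (urlkey, timestamp, original URL, ...)
    print(line.decode('utf-8').rstrip())
```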
@@ -1,185 +0,0 @@

import yaml
import re
import logging
import pkg_resources

from six.moves.urllib.parse import urlsplit

from pywb.utils.dsrules import BaseRule, RuleSet

from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from pywb.utils.loaders import to_native_str


#=================================================================
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
    canon = None
    fuzzy = None

    # Load Canonicalizer Rules
    rules = RuleSet(CDXDomainSpecificRule, 'canonicalize',
                    ds_rules_file=ds_rules_file)

    if not surt_ordered:
        for rule in rules.rules:
            rule.unsurt()

    if rules:
        canon = CustomUrlCanonicalizer(rules, surt_ordered)

    # Load Fuzzy Lookup Rules
    rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup',
                    ds_rules_file=ds_rules_file)

    if not surt_ordered:
        for rule in rules.rules:
            rule.unsurt()

    if rules:
        fuzzy = FuzzyQuery(rules)

    logging.debug('CustomCanonicalizer? ' + str(bool(canon)))
    logging.debug('FuzzyMatcher? ' + str(bool(fuzzy)))
    return (canon, fuzzy)


#=================================================================
class CustomUrlCanonicalizer(UrlCanonicalizer):
    def __init__(self, rules, surt_ordered=True):
        super(CustomUrlCanonicalizer, self).__init__(surt_ordered)
        self.rules = rules

    def __call__(self, url):
        urlkey = super(CustomUrlCanonicalizer, self).__call__(url)

        for rule in self.rules.iter_matching(urlkey):
            m = rule.regex.match(urlkey)
            if not m:
                continue

            if rule.replace:
                return m.expand(rule.replace)

        return urlkey


#=================================================================
class FuzzyQuery(object):
    def __init__(self, rules):
        self.rules = rules

    def __call__(self, query):
        matched_rule = None

        urlkey = to_native_str(query.key, 'utf-8')
        url = query.url
        filter_ = query.filters
        output = query.output

        for rule in self.rules.iter_matching(urlkey):
            m = rule.regex.search(urlkey)
            if not m:
                continue

            matched_rule = rule

            # build a filter expression from each captured regex group
            groups = m.groups()
            for g in groups:
                for f in matched_rule.filter:
                    filter_.append(f.format(g))

            break

        if not matched_rule:
            return None

        repl = '?'
        if matched_rule.replace:
            repl = matched_rule.replace

        # truncate the url after the replace token (default: the query string)
        inx = url.find(repl)
        if inx > 0:
            url = url[:inx + len(repl)]

        if matched_rule.match_type == 'domain':
            host = urlsplit(url).netloc
            # remove the subdomain
            url = host.split('.', 1)[1]

        params = query.params
        params.update({'url': url,
                       'matchType': matched_rule.match_type,
                       'filter': filter_})

        if 'reverse' in params:
            del params['reverse']

        if 'closest' in params:
            del params['closest']

        if 'end_key' in params:
            del params['end_key']

        return params


#=================================================================
class CDXDomainSpecificRule(BaseRule):
    DEFAULT_FILTER = ['~urlkey:{0}']
    DEFAULT_MATCH_TYPE = 'prefix'

    def __init__(self, name, config):
        super(CDXDomainSpecificRule, self).__init__(name, config)

        if not isinstance(config, dict):
            self.regex = self.make_regex(config)
            self.replace = None
            self.filter = self.DEFAULT_FILTER
            self.match_type = self.DEFAULT_MATCH_TYPE
        else:
            self.regex = self.make_regex(config.get('match'))
            self.replace = config.get('replace')
            self.filter = config.get('filter', self.DEFAULT_FILTER)
            self.match_type = config.get('type', self.DEFAULT_MATCH_TYPE)

    def unsurt(self):
        """
        urlkey is assumed to be in surt format by default
        In the case of non-surt format, this method is called
        to desurt any urls
        """
        self.url_prefix = list(map(unsurt, self.url_prefix))
        if self.regex:
            self.regex = re.compile(unsurt(self.regex.pattern))

        if self.replace:
            self.replace = unsurt(self.replace)

    @staticmethod
    def make_regex(config):
        # just query args
        if isinstance(config, list):
            string = CDXDomainSpecificRule.make_query_match_regex(config)

        # split out base and args
        elif isinstance(config, dict):
            string = config.get('regex', '')
            string += CDXDomainSpecificRule.make_query_match_regex(
                config.get('args', []))

        # else assume string
        else:
            string = str(config)

        return re.compile(string)

    @staticmethod
    def make_query_match_regex(params_list):
        params_list.sort()

        def conv(value):
            return '[?&]({0}=[^&]+)'.format(re.escape(value))

        params_list = list(map(conv, params_list))
        final_str = '.*'.join(params_list)
        return final_str
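As a usage note, the fuzzy-match regex builder above can be exercised directly; this short sketch (the arg names are illustrative) mirrors the doctests further down in this commit:

```python
# Sorted query args are each wrapped in a capture group and joined with '.*',
# then appended to the optional base 'regex' from the dict config.
pattern = CDXDomainSpecificRule.make_regex({'regex': 'com,example,.*\\)/',
                                            'args': ['page', 'id']})
print(pattern.pattern)
# com,example,.*\)/[?&](id=[^&]+).*[?&](page=[^&]+)
```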
@@ -1,230 +0,0 @@

from pywb.utils.canonicalize import UrlCanonicalizer
from pywb.utils.wbexception import NotFoundException

from pywb.cdx.cdxops import cdx_load
from pywb.cdx.cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from pywb.cdx.zipnum import ZipNumCluster
from pywb.cdx.cdxobject import CDXObject, CDXException
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules

from pywb.utils.loaders import is_http

from itertools import chain
import logging
import os


#=================================================================
class BaseCDXServer(object):
    def __init__(self, **kwargs):
        ds_rules_file = kwargs.get('ds_rules_file')
        surt_ordered = kwargs.get('surt_ordered', True)

        # load from domain-specific rules
        if ds_rules_file:
            self.url_canon, self.fuzzy_query = (
                load_domain_specific_cdx_rules(ds_rules_file, surt_ordered))
        # or custom passed-in canonicalizer
        else:
            self.url_canon = kwargs.get('url_canon')
            self.fuzzy_query = kwargs.get('fuzzy_query')

        # set default canonicalizer if none set thus far
        if not self.url_canon:
            self.url_canon = UrlCanonicalizer(surt_ordered)

    def _check_cdx_iter(self, cdx_iter, query):
        """ Check cdx iter semantics
        If `cdx_iter` is empty (no matches), check if fuzzy matching
        is allowed, and try it -- otherwise,
        throw :exc:`~pywb.utils.wbexception.NotFoundException`
        """

        cdx_iter = self.peek_iter(cdx_iter)

        if cdx_iter:
            return cdx_iter

        # check if fuzzy matching is allowed and ensure that the
        # original query was an exact match
        if (self.fuzzy_query and
                query.allow_fuzzy and
                query.is_exact):

            fuzzy_query_params = self.fuzzy_query(query)
            if fuzzy_query_params:
                return self.load_cdx(**fuzzy_query_params)

        msg = 'No Captures found for: ' + query.url
        if not query.is_exact:
            msg += ' (' + query.match_type + ' query)'

        raise NotFoundException(msg, url=query.url)

    #def _calc_search_keys(self, query):
    #    return calc_search_range(url=query.url,
    #                             match_type=query.match_type,
    #                             url_canon=self.url_canon)

    def load_cdx(self, **params):
        params['_url_canon'] = self.url_canon
        query = CDXQuery(params)

        #key, end_key = self._calc_search_keys(query)
        #query.set_key(key, end_key)

        cdx_iter = self._load_cdx_query(query)

        return self._check_cdx_iter(cdx_iter, query)

    def _load_cdx_query(self, query):  # pragma: no cover
        raise NotImplementedError('Implement in subclass')

    @staticmethod
    def peek_iter(iterable):
        try:
            first = next(iterable)
        except StopIteration:
            return None

        return chain([first], iterable)


#=================================================================
class CDXServer(BaseCDXServer):
    """
    Top-level cdx server object which maintains a list of cdx sources,
    responds to queries and dispatches to the cdx ops for processing
    """

    def __init__(self, paths, **kwargs):
        super(CDXServer, self).__init__(**kwargs)
        # TODO: we could save the config in a member so that other
        # methods can use it; it's bad for add_cdx_source to take a
        # config argument.
        self._create_cdx_sources(paths, kwargs.get('config'))

    def _load_cdx_query(self, query):
        """
        load CDX for query parameters ``params``.
        The ``key`` (or ``url``) parameter specifies the URL to query;
        the ``matchType`` parameter specifies the matching method for ``key``
        (default ``exact``).
        Other parameters are passed down to :func:`cdx_load`.
        Raises :exc:`~pywb.utils.wbexception.NotFoundException`
        if no captures are found.

        :param query: query parameters
        :type query: :class:`~pywb.cdx.query.CDXQuery`
        :rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject`
        """
        return cdx_load(self.sources, query)

    def _create_cdx_sources(self, paths, config):
        """
        build CDXSource instances for each path in ``paths``.

        :param paths: list of sources or a single source.
        Each source may be either a string or a CDXSource instance; values
        of any other type are silently ignored.
        :param config: config object passed to :meth:`add_cdx_source`.
        """
        self.sources = []

        if paths is not None:
            if not isinstance(paths, (list, tuple)):
                paths = [paths]

            for path in paths:
                self.add_cdx_source(path, config)

        if len(self.sources) == 0:
            logging.warn('No CDX Sources configured from paths=%s', paths)

    def _add_cdx_source(self, source):
        if source is None:
            return

        logging.debug('Adding CDX Source: %s', source)
        self.sources.append(source)

    def add_cdx_source(self, source, config):
        if isinstance(source, CDXSource):
            self._add_cdx_source(source)

        elif isinstance(source, str):
            if os.path.isdir(source):
                for fn in os.listdir(source):
                    self._add_cdx_source(self._create_cdx_source(
                        os.path.join(source, fn), config))
            else:
                self._add_cdx_source(self._create_cdx_source(
                    source, config))

    def _create_cdx_source(self, filename, config):
        if is_http(filename):
            return RemoteCDXSource(filename)

        if filename.startswith('redis://'):
            return RedisCDXSource(filename, config)

        if filename.endswith(('.cdx', '.cdxj')):
            return CDXFile(filename)

        if filename.endswith(('.summary', '.idx')):
            return ZipNumCluster(filename, config)

        # no warning for .loc or .gz (zipnum)
        if not filename.endswith(('.loc', '.gz')):
            logging.warn('skipping unrecognized URI: %s', filename)

        return None


#=================================================================
class RemoteCDXServer(BaseCDXServer):
    """
    A special cdx server that uses a single
    :class:`~pywb.cdx.cdxsource.RemoteCDXSource`.
    It simply proxies the query params to the remote source
    and performs no local processing/filtering
    """
    def __init__(self, source, **kwargs):
        super(RemoteCDXServer, self).__init__(**kwargs)

        if isinstance(source, RemoteCDXSource):
            self.source = source
        elif isinstance(source, str) and is_http(source):
            self.source = RemoteCDXSource(source, remote_processing=True)
        else:
            raise Exception('Invalid remote cdx source: ' + str(source))

    def _load_cdx_query(self, query):
        return cdx_load([self.source], query, process=False)


#=================================================================
def create_cdx_server(config, ds_rules_file=None, server_cls=None):
    if hasattr(config, 'get'):
        paths = config.get('index_paths')
        surt_ordered = config.get('surt_ordered', True)
        pass_config = config
    else:
        paths = config
        surt_ordered = True
        pass_config = None

    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

    if not server_cls:
        if ((isinstance(paths, str) and is_http(paths)) or
                isinstance(paths, RemoteCDXSource)):
            server_cls = RemoteCDXServer
        else:
            server_cls = CDXServer

    return server_cls(paths,
                      config=pass_config,
                      surt_ordered=surt_ordered,
                      ds_rules_file=ds_rules_file)
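A minimal sketch of wiring this together; the directory path is an illustrative assumption. create_cdx_server() picks RemoteCDXServer for an HTTP path or RemoteCDXSource, and CDXServer otherwise:

```python
# Hypothetical local setup: the directory is scanned and each .cdx/.cdxj/.idx
# file becomes a CDXSource (see CDXServer.add_cdx_source above).
server = create_cdx_server('./sample_archive/cdx/')

for cdx in server.load_cdx(url='http://example.com/', limit=5):
    print(cdx)
```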
@@ -1,150 +0,0 @@

from pywb.utils.binsearch import iter_range

from pywb.utils.wbexception import AccessException, NotFoundException
from pywb.utils.wbexception import BadRequestException, WbException

from pywb.cdx.query import CDXQuery

from six.moves.urllib.request import urlopen, Request
from six.moves.urllib.error import HTTPError
from six.moves import map


#=================================================================
class CDXSource(object):
    """
    Represents any cdx index source
    """
    def load_cdx(self, query):  # pragma: no cover
        raise NotImplementedError('Implement in subclass')


#=================================================================
class CDXFile(CDXSource):
    """
    Represents a local plain-text .cdx file
    """
    def __init__(self, filename):
        self.filename = filename

    def load_cdx(self, query):
        return self._do_load_file(self.filename, query)

    @staticmethod
    def _do_load_file(filename, query):
        with open(filename, 'rb') as source:
            gen = iter_range(source, query.key,
                             query.end_key)
            for line in gen:
                yield line

    def __str__(self):
        return 'CDX File - ' + self.filename


#=================================================================
class RemoteCDXSource(CDXSource):
    """
    Represents a remote cdx server, to which requests will be proxied.

    Only the ``url`` and ``match_type`` params are proxied at this time;
    the stream is passed through all other filters locally.
    """
    def __init__(self, filename, cookie=None, remote_processing=False):
        self.remote_url = filename
        self.cookie = cookie
        self.remote_processing = remote_processing

    def load_cdx(self, query):
        if self.remote_processing:
            remote_query = query
        else:
            # Only send url and matchType to the remote
            remote_query = CDXQuery(dict(url=query.url,
                                         matchType=query.match_type))

        urlparams = remote_query.urlencode()

        try:
            request = Request(self.remote_url + '?' + urlparams)

            if self.cookie:
                request.add_header('Cookie', self.cookie)

            response = urlopen(request)

        except HTTPError as e:
            if e.code == 403:
                raise AccessException('Access Denied')
            elif e.code == 404:
                # return an empty list for consistency with other cdx sources;
                # will be converted to a 404 if there is no other retry
                return []
            elif e.code == 400:
                raise BadRequestException()
            else:
                raise WbException('Invalid response from remote cdx server')

        return iter(response)

    def __str__(self):
        if self.remote_processing:
            return 'Remote CDX Server: ' + self.remote_url
        else:
            return 'Remote CDX Source: ' + self.remote_url


#=================================================================
class RedisCDXSource(CDXSource):
    DEFAULT_KEY_PREFIX = b'c:'

    def __init__(self, redis_url, config=None):
        import redis

        parts = redis_url.split('/')
        if len(parts) > 4:
            self.cdx_key = parts[4].encode('utf-8')
            redis_url = 'redis://' + parts[2] + '/' + parts[3]
        else:
            self.cdx_key = None

        self.redis_url = redis_url
        self.redis = redis.StrictRedis.from_url(redis_url)

        self.key_prefix = self.DEFAULT_KEY_PREFIX

    def load_cdx(self, query):
        """
        Load cdx from the redis cache, from an ordered list.

        If cdx_key is set, treat it as a single cdx file stored in one
        sorted set and load it using zrangebylex (supports all match types).

        Otherwise, assume one key per url and load all entries for that key
        (only exact match supported).
        """

        if self.cdx_key:
            return self.load_sorted_range(query, self.cdx_key)
        else:
            return self.load_single_key(query.key)

    def load_sorted_range(self, query, cdx_key):
        cdx_list = self.redis.zrangebylex(cdx_key,
                                          b'[' + query.key,
                                          b'(' + query.end_key)

        return iter(cdx_list)

    def load_single_key(self, key):
        # ensure only the url/surt is part of the key
        key = key.split(b' ')[0]
        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)

        # the key is not part of the list, so prepend it to each line
        key += b' '
        cdx_list = list(map(lambda x: key + x, cdx_list))
        return cdx_list

    def __str__(self):
        return 'Redis - ' + self.redis_url
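The two Redis URL forms parsed above imply different storage layouts. This sketch shows how __init__ distinguishes them; the hostnames and key names are illustrative, and it assumes redis-py is installed (its connections are lazy, so no server is needed just to construct the sources):

```python
# redis://host:port/db      -> one sorted set per urlkey; exact match only
per_url = RedisCDXSource('redis://redis.example.com:6379/0')
assert per_url.cdx_key is None

# redis://host:port/db/key  -> the whole cdx in one sorted set, queried
#                              with zrangebylex (all match types supported)
single = RedisCDXSource('redis://redis.example.com:6379/0/mycdx')
assert single.cdx_key == b'mycdx'
assert single.redis_url == 'redis://redis.example.com:6379/0'
```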
@@ -1,40 +0,0 @@

r"""
Load Rules

>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
'example,example,test)/path/index.html?id=value'


# Fuzzy Query Args Builder
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'

>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'


# Fuzzy Match Query + Args

# list
>>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'

# dict
>>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'

# string
>>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'

"""


from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@@ -1,228 +0,0 @@

#=================================================================
"""
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz


# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz


# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz

>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz

# From & To
>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], from_ts='2013', to='2013')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz

>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], from_ts='2014')
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://example.com/

# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this

# No matching -- limit=1
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this

# Filter cdx (default: regex)
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz

# Filter Alt field name
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz

# Filter -- no field specified, match regex on entire line
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz

# Filter -- no such field, no matches
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css

# Filter exact -- (* prefix)
>>> cdx_ops_test(url = 'http://example.com*', sources = [test_cdx_dir], filter = '=urlkey:com,example)/?example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz

# Filter exact invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = ['!=urlkey:com,example)/?example=1', '!=urlkey:com,example)/?example=2', '!=urlkey:com,example)/?example=3'])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

# Filter contains
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz

# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

# Collapse by timestamp
# unresolved revisits; a different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz

# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz

# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625

# In case of both reverse and closest, closest takes precedence;
# 'reverse closest' is not supported at this time.
# If it is added, this test will reflect the change.
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True)
20140126200826
20140126200816
20140126200805

>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -


>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -

# equal distance: prefer the earlier capture
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz

>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706

>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654


# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz

>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -

"""

#=================================================================
from pywb.cdx.cdxserver import CDXServer
import os
import sys
import six

from pywb import get_test_dir

test_cdx_dir = get_test_dir() + 'cdx/'


def cdx_ops_test_data(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    kwparams['url'] = url
    if 'output' not in kwparams:
        kwparams['output'] = 'cdxobject'

    server = CDXServer(sources)
    results = server.load_cdx(**kwparams)
    return list(results)


def cdx_ops_test(*args, **kwargs):
    results = cdx_ops_test_data(*args, **kwargs)

    fields = kwargs.get('fields')
    if fields:
        fields = fields.split(',')

    for x in results:
        if not isinstance(x, str):
            line = x.to_text(fields).replace('\t', ' ')
        else:
            line = x

        sys.stdout.write(line)


def test_cdxj_resolve_revisit():
    # Resolve Revisit -- cdxj minimal -- output also json
    results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
    assert(len(results) == 2)
    assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})

    assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"})


def test_cdxj_resolve_revisit_2():
    # Resolve Revisit -- cdxj minimal, no digest -- output also json
    results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True)
    assert(len(results) == 2)
    assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})

    assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@@ -1,117 +0,0 @@

import yaml
from pywb.cdx.cdxserver import create_cdx_server, CDXServer, RemoteCDXServer
from pywb.cdx.cdxsource import CDXFile, RemoteCDXSource, RedisCDXSource
from pywb.cdx.zipnum import ZipNumCluster

from pywb import get_test_dir

yaml_config = r"""
test_1:
    index_paths:
        # local cdx paths
        - {0}cdx/example.cdx

        # simple remote cdx source, assumes no filtering
        - http://cdxserver.example.com/cdx

        # customized remote cdx server
        - !!python/object:pywb.cdx.cdxsource.RemoteCDXSource {{
            remote_url: 'http://cdxserver.example.com/cdx',
            cookie: custom_token=value,
            remote_processing: true,
          }}

        # example redis cdx source
        - redis://redis.example.com:6379/0

        - {0}zipcdx/zipnum-sample.idx

test_2:
    index_paths: http://cdxserver.example.com/cdx

test_3: http://cdxserver.example.com/cdx

test_4: !!python/object:pywb.cdx.cdxsource.RemoteCDXSource {{
            remote_url: 'http://cdxserver.example.com/cdx',
            cookie: custom_token=value,
            remote_processing: true,
        }}

test_5: {0}cdx/example.cdx

test_6:
    index_paths: invalid://abc


""".format(get_test_dir())

def test_cdxserver_config():
    config = yaml.load(yaml_config)
    cdxserver = create_cdx_server(config.get('test_1'))
    assert(isinstance(cdxserver, CDXServer))
    sources = cdxserver.sources
    assert len(sources) == 5

    assert type(sources[0]) == CDXFile
    assert sources[0].filename.endswith('example.cdx')

    # remote source with no remote processing
    assert type(sources[1]) == RemoteCDXSource
    assert sources[1].remote_url == 'http://cdxserver.example.com/cdx'
    assert sources[1].remote_processing == False

    # remote cdx server with processing
    assert type(sources[2]) == RemoteCDXSource
    assert sources[2].remote_url == 'http://cdxserver.example.com/cdx'
    assert sources[2].remote_processing == True

    # redis source
    assert type(sources[3]) == RedisCDXSource
    assert sources[3].redis_url == 'redis://redis.example.com:6379/0'

    assert type(sources[4]) == ZipNumCluster
    assert sources[4].summary.endswith('zipnum-sample.idx')
    assert sources[4].loc_resolver.loc_filename.endswith('zipnum-sample.loc')


def assert_remote_cdxserver(config_name):
    config = yaml.load(yaml_config)
    cdxserver = create_cdx_server(config.get(config_name))
    assert(isinstance(cdxserver, RemoteCDXServer))

    source = cdxserver.source

    # remote cdx server with remote processing
    assert type(source) == RemoteCDXSource
    assert source.remote_url == 'http://cdxserver.example.com/cdx'
    assert source.remote_processing == True


def test_remote_index_path():
    assert_remote_cdxserver('test_2')

def test_no_index_path_remote():
    assert_remote_cdxserver('test_3')

def test_explicit_remote_source():
    assert_remote_cdxserver('test_4')


def test_single_cdx():
    config = yaml.load(yaml_config)
    cdxserver = create_cdx_server(config.get('test_5'))
    assert(isinstance(cdxserver, CDXServer))
    sources = cdxserver.sources
    assert len(sources) == 1

    assert type(sources[0]) == CDXFile
    assert sources[0].filename.endswith('example.cdx')

def test_invalid_config():
    config = yaml.load(yaml_config)
    cdxserver = create_cdx_server(config.get('test_6'))
    assert(isinstance(cdxserver, CDXServer))
    sources = cdxserver.sources
    assert len(sources) == 0
@@ -1,78 +0,0 @@

"""
>>> redis_cdx(redis_cdx_server, 'http://example.com')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

>>> redis_cdx(redis_cdx_server_key, 'http://example.com')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

"""

from fakeredis import FakeStrictRedis
from mock import patch

from warcio.timeutils import timestamp_to_sec
from pywb.cdx.cdxsource import RedisCDXSource
from pywb.cdx.cdxserver import CDXServer

from pywb import get_test_dir

import sys
import os

test_cdx_dir = os.path.join(get_test_dir(), 'cdx/')

def load_cdx_into_redis(source, filename, key=None):
    # load a cdx file into the mock redis
    with open(test_cdx_dir + filename, 'rb') as fh:
        for line in fh:
            zadd_cdx(source, line, key)

def zadd_cdx(source, cdx, key):
    if key:
        source.redis.zadd(key, 0, cdx)
        return

    parts = cdx.split(b' ', 2)

    key = parts[0]
    timestamp = parts[1]
    rest = timestamp + b' ' + parts[2]

    score = timestamp_to_sec(timestamp.decode('utf-8'))
    source.redis.zadd(source.key_prefix + key, score, rest)


@patch('redis.StrictRedis', FakeStrictRedis)
def init_redis_server():
    source = RedisCDXSource('redis://127.0.0.1:6379/0')

    for f in os.listdir(test_cdx_dir):
        if f.endswith('.cdx'):
            load_cdx_into_redis(source, f)

    return CDXServer([source])

@patch('redis.StrictRedis', FakeStrictRedis)
def init_redis_server_key_file():
    source = RedisCDXSource('redis://127.0.0.1:6379/0/key')

    for f in os.listdir(test_cdx_dir):
        if f.endswith('.cdx'):
            load_cdx_into_redis(source, f, source.cdx_key)

    return CDXServer([source])


def redis_cdx(cdx_server, url, **params):
    cdx_iter = cdx_server.load_cdx(url=url, **params)
    for cdx in cdx_iter:
        sys.stdout.write(cdx)

redis_cdx_server = init_redis_server()
redis_cdx_server_key = init_redis_server_key_file()
@@ -1,243 +0,0 @@

"""
>>> zip_ops_test(url='http://iana.org')
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz

# test idx index (tabs replaced with 4 spaces)
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True)
org,iana)/dnssec 20140126201307 zipnum 8517 373 35
org,iana)/domains/int 20140126201239 zipnum 8890 355 36
org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37


>>> zip_ops_test(url='http://iana.org/domains/*')
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

# first page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
com,example)/ 20140127171200 zipnum 0 275 1
org,iana)/ 20140127171238 zipnum 275 328 2
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4


# first page -- simplified query
>>> zip_ops_test(url='*.iana.org/path_part_ignored/', showPagedIndex=True, pageSize=4)
com,example)/ 20140127171200 zipnum 0 275 1
org,iana)/ 20140127171238 zipnum 275 328 2
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4

# next page + json
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1)
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1150, "length": 235, "lineno": 5}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1385, "length": 307, "lineno": 6}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8}

# last page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9)
org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37
org,iana)/time-zones 20140126200737 zipnum 9631 166 38

# last page cdx
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9)
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz

# last page reverse -- not yet supported
#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9)
#org,iana)/time-zones 20140126200737 zipnum 9623 145 38
#org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37


# last page reverse CDX
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9)
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

# last url prefix
>>> zip_ops_test(url='http://iana.org/time-zones*')
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz

# last url prefix w/ slash
>>> zip_ops_test(url='http://iana.org/time-zones/*')
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz

# last url exact
>>> zip_ops_test(url='http://iana.org/time-zones/Y')
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz

# invalid page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
CDXException: Page 10 invalid: First Page is 0, Last Page is 9


>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.aaa/

>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.aaa/ (domain query)

# list the last index line, as we don't know if there are any captures at the end
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
org,iana)/time-zones 20140126200737 zipnum 9631 166 38

# read cdx to find no captures
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.zz/ (domain query)

# Invalid .idx files or missing loc

>>> zip_test_err(url='http://example.com/', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
Exception: No Locations Found for: foo


>>> zip_test_err(url='http://example.zz/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
Exception: No Locations Found for: foo2

"""

from test_cdxops import cdx_ops_test, cdx_ops_test_data
from pywb import get_test_dir
from pywb.cdx.cdxserver import CDXServer


import shutil
import tempfile
import os
import json

import pytest


test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'

def zip_ops_test_data(url, **kwargs):
    sources = test_zipnum
    return json.loads(cdx_ops_test_data(url, sources, **kwargs)[0])

def zip_ops_test(url, **kwargs):
    sources = test_zipnum
    cdx_ops_test(url, sources, **kwargs)

def zip_test_err(url, **kwargs):
    sources = get_test_dir() + 'zipcdx/zipnum-bad.idx'
    cdx_ops_test(url, sources, **kwargs)


def test_zip_prefix_load():

    tmpdir = tempfile.mkdtemp()
    try:
        shutil.copy(test_zipnum, tmpdir)
        shutil.copy(get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz',
                    os.path.join(tmpdir, 'zipnum'))

        config = {}
        config['shard_index_loc'] = dict(match='(.*)',
                                         replace=r'\1')
        server = CDXServer(os.path.join(tmpdir, 'zipnum-sample.idx'),
                           config=config)

        # Test Page Count
        results = server.load_cdx(url='iana.org/',
                                  matchType='domain',
                                  showNumPages=True)

        results = list(results)
        assert len(results) == 1, results
        assert json.loads(results[0]) == {"blocks": 38, "pages": 4, "pageSize": 10}

        # Test simple query
        results = server.load_cdx(url='iana.org/')
        results = list(results)
        assert len(results) == 3, results
        assert '20140126200624' in results[0]
        assert '20140127171238' in results[1]
        assert 'warc/revisit' in results[2]

    finally:
        shutil.rmtree(tmpdir)


def test_blocks_def_page_size():
    # Pages -- default page size
    res = zip_ops_test_data(url='http://iana.org/domains/example', matchType='exact', showNumPages=True)
    assert(res == {"blocks": 1, "pages": 1, "pageSize": 10})

def test_blocks_def_size_2():
    res = zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', showNumPages=True)
    assert(res == {"blocks": 38, "pages": 4, "pageSize": 10})

def test_blocks_set_page_size():
    # set page size
    res = zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True)
    assert(res == {"blocks": 38, "pages": 10, "pageSize": 4})

def test_blocks_alt_q():
    # set page size -- alt domain query
    res = zip_ops_test_data(url='*.iana.org', pageSize='4', showNumPages=True)
    assert(res == {"blocks": 38, "pages": 10, "pageSize": 4})

def test_blocks_secondary_match():
    # page size for non-existent url, but with a secondary index match
    res = zip_ops_test_data(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True)
    assert(res == {"blocks": 0, "pages": 0, "pageSize": 4})

def test_blocks_no_match():
    # page size for non-existent url, no secondary index match
    res = zip_ops_test_data(url='*.foo.bar', showNumPages=True)
    assert(res == {"blocks": 0, "pages": 0, "pageSize": 10})

def test_blocks_zero_pages():
    # read cdx to find 0 pages
    res = zip_ops_test_data(url='http://aaa.zz/', matchType='domain', showNumPages=True)
    assert(res == {"blocks": 0, "pages": 0, "pageSize": 10})


# Errors

def test_err_file_not_found():
    with pytest.raises(IOError):
        zip_test_err(url='http://iana.org/x', matchType='exact')


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@ -1,353 +0,0 @@
import os
import collections
import itertools
import logging
from io import BytesIO
import datetime
import json
import six

from six.moves import map

from pywb.cdx.cdxsource import CDXSource
from pywb.cdx.cdxobject import IDXObject, CDXException

from pywb.utils.loaders import BlockLoader, read_last_line
from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search


#=================================================================
class ZipBlocks:
    def __init__(self, part, offset, length, count):
        self.part = part
        self.offset = offset
        self.length = length
        self.count = count


#=================================================================
#TODO: see if these could be combined with warc path resolvers

class LocMapResolver(object):
    """ Lookup shards based on a file mapping
    shard name to one or more paths. The entries are
    tab delimited.
    """
    def __init__(self, loc_summary, loc_filename):
        # initial loc map
        self.loc_map = {}
        self.loc_mtime = 0
        if not loc_filename:
            splits = os.path.splitext(loc_summary)
            loc_filename = splits[0] + '.loc'
        self.loc_filename = loc_filename

        self.load_loc()

    def load_loc(self):
        # check modified time of current file before loading
        new_mtime = os.path.getmtime(self.loc_filename)
        if (new_mtime == self.loc_mtime):
            return

        # update loc file mtime
        self.loc_mtime = new_mtime

        local_dir = os.path.dirname(self.loc_filename)

        def res_path(pathname):
            if '://' not in pathname:
                pathname = os.path.join(local_dir, pathname)
            return pathname

        logging.debug('Loading loc from: ' + self.loc_filename)
        with open(self.loc_filename, 'r') as fh:
            for line in fh:
                parts = line.rstrip().split('\t')

                paths = [res_path(pathname) for pathname in parts[1:]]
                self.loc_map[parts[0]] = paths

    def __call__(self, part, query):
        return self.loc_map[part]
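
    # Hedged example (annotation, not part of the original file): a .loc file
    # maps each shard name to one or more tab-delimited paths, one shard per
    # line, e.g.
    #
    #   zipnum-sample	/data/zipnum-sample.cdx.gz	http://mirror.example.com/zipnum-sample.cdx.gz
    #
    # Relative paths are resolved against the directory of the .loc file by
    # res_path() above; LocMapResolver('zipnum-sample.idx', None) looks for
    # 'zipnum-sample.loc' next to the summary file.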


#=================================================================
class LocPrefixResolver(object):
    """ Use a prefix lookup, where the prefix can either be a fixed
    string or can be a regex replacement of the index summary path
    """
    def __init__(self, loc_summary, loc_config):
        import re
        loc_match = loc_config.get('match', '().*')
        loc_replace = loc_config['replace']
        loc_summary = os.path.dirname(loc_summary) + '/'
        self.prefix = re.sub(loc_match, loc_replace, loc_summary)

    def load_loc(self):
        pass

    def __call__(self, part, query):
        return [self.prefix + part]
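
    # Hedged example (annotation, not in the original): with the
    # 'shard_index_loc' config used in the tests above, match='(.*)' and
    # replace=r'\1' leave the summary directory unchanged, so a part named
    # 'zipnum' resolves to '<summary_dir>/zipnum'. A different replace string
    # could instead point every shard at, say, a remote prefix.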


#=================================================================
class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10 # in minutes
    DEFAULT_MAX_BLOCKS = 10

    def __init__(self, summary, config=None):
        self.max_blocks = self.DEFAULT_MAX_BLOCKS

        self.loc_resolver = None

        loc = None
        cookie_maker = None
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('shard_index_loc')
            cookie_maker = config.get('cookie_maker')

            self.max_blocks = config.get('max_blocks', self.max_blocks)

            reload_ival = config.get('reload_interval', reload_ival)

        if isinstance(loc, dict):
            self.loc_resolver = LocPrefixResolver(summary, loc)
        else:
            self.loc_resolver = LocMapResolver(summary, loc)

        self.summary = summary

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

#    @staticmethod
#    def reload_timed(timestamp, val, delta, func):
#        now = datetime.datetime.now()
#        if now - timestamp >= delta:
#            func()
#            return now
#        return None
#
#    def reload_loc(self):
#        reload_time = self.reload_timed(self.loc_update_time,
#                                        self.loc_map,
#                                        self.reload_interval,
#                                        self.load_loc)
#
#        if reload_time:
#            self.loc_update_time = reload_time

    def load_cdx(self, query):
        self.loc_resolver.load_loc()
        return self._do_load_cdx(self.summary, query)

    def _do_load_cdx(self, filename, query):
        reader = open(filename, 'rb')

        idx_iter = self.compute_page_range(reader, query)

        if query.secondary_index_only or query.page_count:
            return idx_iter

        blocks = self.idx_to_cdx(idx_iter, query)

        def gen_cdx():
            for blk in blocks:
                for cdx in blk:
                    yield cdx

        return gen_cdx()


    def _page_info(self, pages, pagesize, blocks):
        info = dict(pages=pages,
                    pageSize=pagesize,
                    blocks=blocks)
        return json.dumps(info) + '\n'

    def compute_page_range(self, reader, query):
        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks
        else:
            pagesize = int(pagesize)

        last_line = None

        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = six.next(end_iter)
        except StopIteration:
            last_line = read_last_line(reader)
            end_line = last_line

        # Get Start
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        try:
            first_line = six.next(first_iter)
        except StopIteration:
            if end_line == last_line and query.key >= last_line:
                first_line = last_line
            else:
                reader.close()
                if query.page_count:
                    yield self._page_info(0, pagesize, 0)
                    return
                else:
                    raise

        first = IDXObject(first_line)

        end = IDXObject(end_line)

        try:
            blocks = end['lineno'] - first['lineno']
            total_pages = int(blocks / pagesize) + 1
        except:
            blocks = -1
            total_pages = 1

        if query.page_count:
            # same line, so actually need to look at cdx
            # to determine if it exists
            if blocks == 0:
                try:
                    block_cdx_iter = self.idx_to_cdx([first_line], query)
                    block = six.next(block_cdx_iter)
                    cdx = six.next(block)
                except StopIteration:
                    total_pages = 0
                    blocks = -1

            yield self._page_info(total_pages, pagesize, blocks + 1)
            reader.close()
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            reader.close()
            raise CDXException(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = startline + pagesize - 1
        if blocks >= 0:
            endline = min(endline, blocks)

        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx

        reader.close()
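
    # Worked example (added annotation, not in the original source): for the
    # domain query spanning 38 index blocks with the default pagesize of 10,
    # blocks = 38 and total_pages = int(38 / 10) + 1 = 4, matching the
    # {"blocks": 38, "pages": 4, "pageSize": 10} page-count output asserted in
    # the tests above. With pageSize=4, total_pages = int(38 / 4) + 1 = 10.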


    def search_by_line_num(self, reader, line): # pragma: no cover
        def line_cmp(line1, line2):
            line1_no = int(line1.rsplit(b'\t', 1)[-1])
            line2_no = int(line2.rsplit(b'\t', 1)[-1])
            return cmp(line1_no, line2_no)

        line_iter = search(reader, line, compare_func=line_cmp)
        yield six.next(line_iter)

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part'] and
                blocks.offset + blocks.length == idx['offset'] and
                blocks.count < self.max_blocks):

                blocks.length += idx['length']
                blocks.count += 1
                ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
                                   idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        try:
            locations = self.loc_resolver(blocks.part, query)
        except:
            raise Exception('No Locations Found for: ' + blocks.part)

        for location in locations:
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            six.reraise(Exception, last_exc, last_traceback)
            #raise last_exc
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        """ Load one or more blocks of compressed cdx lines, return
        a line iterator which decompresses and returns one line at a time,
        bounded by query.key and query.end_key
        """

        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            for line in BytesIO(buff):
                yield line

        iter_ = itertools.chain(*map(decompress_block, ranges))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_)
        return iter_

    def __str__(self):
        return 'ZipNum Cluster: {0}, {1}'.format(self.summary,
                                                 self.loc_resolver)
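
# Added illustration, not part of the original file: each entry in 'ranges'
# is an independently gzipped member within the shard, so a single block can
# also be read directly with zlib given the offset/length from an idx line
# (e.g. 'zipnum 9631 166' in the tests above). The name below is hypothetical.
import zlib

def _read_one_block_example(path, offset, length):
    with open(path, 'rb') as fh:
        fh.seek(offset)
        data = fh.read(length)
    # 16 + zlib.MAX_WBITS selects gzip framing, like gzip_decompressor() above
    return zlib.decompress(data, 16 + zlib.MAX_WBITS).splitlines()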
@ -1,245 +0,0 @@
from six.moves.urllib.parse import urlsplit, urlunsplit, quote

import re

from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.framework.wbrequestresponse import WbRequest, WbResponse


#=================================================================
# ArchivalRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRouter(object):
    def __init__(self, routes, **kwargs):
        self.routes = routes

        # optional port setting may be ignored by wsgi container
        self.port = kwargs.get('port')

        self.fallback = ReferRedirect()

        self.abs_path = kwargs.get('abs_path')

        self.home_view = kwargs.get('home_view')
        self.error_view = kwargs.get('error_view')
        self.info_view = kwargs.get('info_view')

        config = kwargs.get('config', {})
        self.urlrewriter_class = config.get('urlrewriter_class', UrlRewriter)

        self.enable_coll_info = config.get('enable_coll_info', False)

    def __call__(self, env):
        request_uri = self.ensure_rel_uri_set(env)

        for route in self.routes:
            matcher, coll = route.is_handling(request_uri)
            if matcher:
                wbrequest = self.parse_request(route, env, matcher,
                                               coll, request_uri,
                                               use_abs_prefix=self.abs_path)

                return route.handler(wbrequest)

        # Default Home Page
        if request_uri in ['/', '/index.html', '/index.htm']:
            return self.render_home_page(env)

        if self.enable_coll_info and request_uri in ['/collinfo.json']:
            params = env.get('pywb.template_params', {})
            host = WbRequest.make_host_prefix(env)
            return self.info_view.render_response(env=env, host=host, routes=self.routes,
                                                  content_type='application/json',
                                                  **params)

        return self.fallback(env, self) if self.fallback else None

    def parse_request(self, route, env, matcher, coll, request_uri,
                      use_abs_prefix=False):
        matched_str = matcher.group(0)
        rel_prefix = env.get('SCRIPT_NAME', '') + '/'

        if matched_str:
            rel_prefix += matched_str + '/'
            # remove the '/' + rel_prefix part of uri
            wb_url_str = request_uri[len(matched_str) + 2:]
        else:
            # the request_uri is the wb_url, since no coll
            wb_url_str = request_uri[1:]

        wbrequest = route.request_class(env,
                                        request_uri=request_uri,
                                        wb_url_str=wb_url_str,
                                        rel_prefix=rel_prefix,
                                        coll=coll,
                                        use_abs_prefix=use_abs_prefix,
                                        wburl_class=route.handler.get_wburl_type(),
                                        urlrewriter_class=self.urlrewriter_class,
                                        cookie_scope=route.cookie_scope,
                                        rewrite_opts=route.rewrite_opts,
                                        user_metadata=route.user_metadata)

        # Allow for applying of additional filters
        route.apply_filters(wbrequest, matcher)

        return wbrequest

    def render_home_page(self, env):
        if self.home_view:
            params = env.get('pywb.template_params', {})
            return self.home_view.render_response(env=env, routes=self.routes, **params)
        else:
            return None

    #=================================================================
    # adapted from wsgiref.request_uri, but doesn't include domain name
    # and allows all characters which are allowed in the path segment
    # according to: http://tools.ietf.org/html/rfc3986#section-3.3
    # explained here:
    # http://stackoverflow.com/questions/4669692/
    # valid-characters-for-directory-part-of-a-url-for-short-links

    @staticmethod
    def ensure_rel_uri_set(env):
        """ Return the full requested path, including the query string
        """
        if 'REL_REQUEST_URI' in env:
            return env['REL_REQUEST_URI']

        if not env.get('SCRIPT_NAME') and env.get('REQUEST_URI'):
            env['REL_REQUEST_URI'] = env['REQUEST_URI']
            return env['REL_REQUEST_URI']

        url = quote(env.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
        query = env.get('QUERY_STRING')
        if query:
            url += '?' + query

        env['REL_REQUEST_URI'] = url
        return url


#=================================================================
# Route by matching regex (or fixed prefix)
# of request uri (excluding first '/')
#=================================================================
class Route(object):
    # match up to next / or ? or end
    SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'

    def __init__(self, regex, handler, config=None,
                 request_class=WbRequest,
                 lookahead=SLASH_QUERY_LOOKAHEAD):

        config = config or {}
        self.path = regex
        if regex:
            self.regex = re.compile(regex + lookahead)
        else:
            self.regex = re.compile('')

        self.handler = handler
        self.request_class = request_class

        # collection id from regex group (default 0)
        self.coll_group = int(config.get('coll_group', 0))
        self.cookie_scope = config.get('cookie_scope')
        self.rewrite_opts = config.get('rewrite_opts', {})
        self.user_metadata = config.get('metadata', {})
        self._custom_init(config)

    def is_handling(self, request_uri):
        matcher = self.regex.match(request_uri[1:])
        if not matcher:
            return None, None

        coll = matcher.group(self.coll_group)
        return matcher, coll

    def apply_filters(self, wbrequest, matcher):
        for filter in self.filters:
            last_grp = len(matcher.groups())
            filter_str = filter.format(matcher.group(last_grp))
            wbrequest.query_filter.append(filter_str)

    def _custom_init(self, config):
        self.filters = config.get('filters', [])
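
    # Hedged example (annotation, not in the original): the lookahead keeps a
    # route from swallowing a longer collection name with the same prefix:
    #
    #   route = Route('web', WbUrlHandler())
    #   route.is_handling('/web/2013/http://example.com')    # -> (match, 'web')
    #   route.is_handling('/webarchive/http://example.com')  # -> (None, None)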


#=================================================================
# ReferRedirect -- redirect urls that have 'fallen through'
# based on the referrer settings
#=================================================================
class ReferRedirect:
    def __call__(self, env, the_router):
        referrer = env.get('HTTP_REFERER')

        routes = the_router.routes

        # ensure there is a referrer
        if referrer is None:
            return None

        # get referrer path name
        ref_split = urlsplit(referrer)

        # require that referrer starts with current Host, if any
        curr_host = env.get('HTTP_HOST')
        if curr_host and curr_host != ref_split.netloc:
            return None

        path = ref_split.path

        app_path = env.get('SCRIPT_NAME', '')

        if app_path:
            # must start with current app name, if not root
            if not path.startswith(app_path):
                return None

            path = path[len(app_path):]

        ref_route = None
        ref_request = None

        for route in routes:
            matcher, coll = route.is_handling(path)
            if matcher:
                ref_request = the_router.parse_request(route, env,
                                                       matcher, coll, path)
                ref_route = route
                break

        # must have matched one of the routes with a urlrewriter
        if not ref_request or not ref_request.urlrewriter:
            return None

        rewriter = ref_request.urlrewriter

        rel_request_uri = env['REL_REQUEST_URI']

        timestamp_path = '/' + rewriter.wburl.timestamp + '/'

        # check if timestamp is already part of the path
        if rel_request_uri.startswith(timestamp_path):
            # remove timestamp but leave / to make host relative url
            # 2013/path.html -> /path.html
            rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]

        rewritten_url = rewriter.rewrite(rel_request_uri)

        # if post, can't redirect as that would lose the post data
        # (can't use 307 because FF will show confirmation warning)
        if ref_request.method == 'POST':
            new_wb_url = WbUrl(rewritten_url[len(rewriter.prefix):])
            ref_request.wb_url.url = new_wb_url.url
            return ref_route.handler(ref_request)

        final_url = urlunsplit((ref_split.scheme,
                                ref_split.netloc,
                                rewritten_url,
                                '',
                                ''))

        return WbResponse.redir_response(final_url, status='302 Temp Redirect')
@ -1,23 +0,0 @@
from pywb.rewrite.wburl import WbUrl


#=================================================================
class BaseHandler(object):
    """
    Represents a base handler class that handles any request
    """
    def __call__(self, wbrequest):  # pragma: no cover
        raise NotImplementedError('Need to implement in derived class')

    def get_wburl_type(self):
        return None


#=================================================================
class WbUrlHandler(BaseHandler):
    """
    Represents a handler which assumes the request contains a WbUrl
    Ensure that the WbUrl is parsed in the request
    """
    def get_wburl_type(self):
        return WbUrl
@ -1,62 +0,0 @@
try:  # pragma: no cover
    import uwsgi
    uwsgi_cache = True
except ImportError:
    uwsgi_cache = False


from redis import StrictRedis
from pywb.utils.loaders import to_native_str


#=================================================================
class UwsgiCache(object):  # pragma: no cover
    def __setitem__(self, item, value):
        uwsgi.cache_update(item, value)

    def __getitem__(self, item):
        return uwsgi.cache_get(item)

    def __contains__(self, item):
        return uwsgi.cache_exists(item)

    def __delitem__(self, item):
        uwsgi.cache_del(item)


#=================================================================
class DefaultCache(dict):
    def __getitem__(self, item):
        return self.get(item)


#=================================================================
class RedisCache(object):
    def __init__(self, redis_url):
        # must be of the form redis://host:port/db/key
        redis_url, key = redis_url.rsplit('/', 1)
        self.redis = StrictRedis.from_url(redis_url)
        self.key = key

    def __setitem__(self, item, value):
        self.redis.hset(self.key, item, value)

    def __getitem__(self, item):
        return to_native_str(self.redis.hget(self.key, item), 'utf-8')

    def __contains__(self, item):
        return self.redis.hexists(self.key, item)

    def __delitem__(self, item):
        self.redis.hdel(self.key, item)


#=================================================================
def create_cache(redis_url_key=None):
    if redis_url_key:
        return RedisCache(redis_url_key)

    if uwsgi_cache:  # pragma: no cover
        return UwsgiCache()
    else:
        return DefaultCache()
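
# Added usage sketch, not in the original: the rsplit above means a value of
# the form redis://host:port/db/key connects to the given db and stores all
# entries in a single hash named by the final path segment. The url and key
# below are hypothetical.
#
#   cache = create_cache('redis://localhost:6379/0/proxy_sesh')
#   cache['some-id:c'] = 'web'     # HSET proxy_sesh some-id:c web
#   assert 'some-id:c' in cache    # HEXISTS proxy_sesh some-id:c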
@ -1,231 +0,0 @@
from pywb.utils.wbexception import BadRequestException
from warcio.timeutils import http_date_to_timestamp
from warcio.timeutils import timestamp_to_http_date

from pywb.framework.wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.wburl import WbUrl

import six
LINK_FORMAT = 'application/link-format'


#=================================================================
class MementoReqMixin(object):
    def _parse_extra(self):
        if not self.wb_url:
            return

        if self.wb_url.type != self.wb_url.LATEST_REPLAY:
            return

        self.options['is_timegate'] = True

        accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME')
        if not accept_datetime:
            return

        try:
            timestamp = http_date_to_timestamp(accept_datetime)
        except Exception:
            raise BadRequestException('Invalid Accept-Datetime: ' +
                                      accept_datetime)

        # note: this changes from LATEST_REPLAY -> REPLAY
        self.wb_url.set_replay_timestamp(timestamp)


#=================================================================
class MementoRequest(MementoReqMixin, WbRequest):
    pass


#=================================================================
class MementoRespMixin(object):
    def _init_derived(self, params):
        wbrequest = params.get('wbrequest')
        is_redirect = params.get('memento_is_redir', False)
        cdx = params.get('cdx')

        if not wbrequest or not wbrequest.wb_url:
            return

        mod = wbrequest.options.get('replay_mod', '')

        #is_top_frame = wbrequest.wb_url.is_top_frame
        is_top_frame = wbrequest.options.get('is_top_frame', False)

        is_timegate = (wbrequest.options.get('is_timegate', False) and
                       not is_top_frame)

        if is_timegate:
            self.status_headers.replace_header('Vary', 'accept-datetime')

        # Determine if memento:
        is_memento = False
        is_original = False

        # if no cdx included, not a memento, unless top-frame special
        if not cdx:
            # special case: include the headers except Memento-Datetime
            # since this is really an intermediate resource
            if is_top_frame:
                is_memento = True

        # otherwise, if in proxy mode, then always a memento
        elif wbrequest.options['is_proxy']:
            is_memento = True
            is_original = True

        # otherwise only if timestamp replay (and not a timegate)
        #elif not is_timegate:
        #    is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY)
        elif not is_redirect:
            is_memento = (wbrequest.wb_url.is_replay())

        link = []
        req_url = wbrequest.wb_url.url

        if is_memento or is_timegate:
            url = req_url
            if cdx:
                ts = cdx['timestamp']
                url = cdx['url']
            # for top frame
            elif wbrequest.wb_url.timestamp:
                ts = wbrequest.wb_url.timestamp
            else:
                ts = None

            if ts:
                http_date = timestamp_to_http_date(ts)

                if is_memento:
                    self.status_headers.replace_header('Memento-Datetime',
                                                       http_date)

                canon_link = wbrequest.urlrewriter.get_new_url(mod=mod,
                                                               timestamp=ts,
                                                               url=url)

                # set in replay_views -- Must set content location
                #if is_memento and is_timegate:
                #    self.status_headers.headers.append(('Content-Location',
                #                                        canon_link))

                # don't set memento link for very long urls...
                if len(canon_link) < 512:
                    link.append(self.make_memento_link(canon_link,
                                                       'memento',
                                                       http_date))

            if is_original and is_timegate:
                link.append(self.make_link(req_url, 'original timegate'))
            else:
                link.append(self.make_link(req_url, 'original'))

            # for now, include timemap only in non-proxy mode
            if not wbrequest.options['is_proxy'] and (is_memento or is_timegate):
                link.append(self.make_timemap_link(wbrequest))

        if is_memento and not is_timegate:
            timegate = wbrequest.urlrewriter.get_new_url(mod=mod, timestamp='')
            link.append(self.make_link(timegate, 'timegate'))

        link = ', '.join(link)

        self.status_headers.replace_header('Link', link)

    def make_link(self, url, type):
        return '<{0}>; rel="{1}"'.format(url, type)

    def make_memento_link(self, url, type_, dt):
        return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type_, dt)

    def make_timemap_link(self, wbrequest):
        format_ = '<{0}>; rel="timemap"; type="{1}"'

        url = wbrequest.urlrewriter.get_new_url(mod='timemap',
                                                timestamp='',
                                                type=wbrequest.wb_url.QUERY)

        return format_.format(url, LINK_FORMAT)
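
    # Hedged illustration (annotation, not in the original): the assembled
    # Link header for a replayed capture might look like this, with a
    # hypothetical '/web/' prefix and timestamp:
    #
    #   Link: </web/20140126200624/http://example.com/>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT", <http://example.com/>; rel="original", </web/timemap/*/http://example.com/>; rel="timemap"; type="application/link-format"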


#=================================================================
class MementoResponse(MementoRespMixin, WbResponse):
    pass


#=================================================================
def make_timemap_memento_link(cdx, prefix, datetime=None,
                              rel='memento', end=',\n', mod=''):

    memento = '<{0}>; rel="{1}"; datetime="{2}"' + end

    string = WbUrl.to_wburl_str(url=cdx['url'],
                                mod=mod,
                                timestamp=cdx['timestamp'],
                                type=WbUrl.REPLAY)

    url = prefix + string

    if not datetime:
        datetime = timestamp_to_http_date(cdx['timestamp'])

    return memento.format(url, rel, datetime)


#=================================================================
def make_timemap(wbrequest, cdx_lines):
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url
    mod = wbrequest.options.get('replay_mod', '')

    # get first memento as it'll be used for 'from' field
    try:
        first_cdx = six.next(cdx_lines)
        from_date = timestamp_to_http_date(first_cdx['timestamp'])
    except StopIteration:
        first_cdx = None

    if first_cdx:
        # timemap link
        timemap = ('<{0}>; rel="self"; ' +
                   'type="application/link-format"; from="{1}",\n')
        yield timemap.format(prefix + wbrequest.wb_url.to_str(),
                             from_date)

    # original link
    original = '<{0}>; rel="original",\n'
    yield original.format(url)

    # timegate link
    timegate = '<{0}>; rel="timegate",\n'
    timegate_url = WbUrl.to_wburl_str(url=url,
                                      mod=mod,
                                      type=WbUrl.LATEST_REPLAY)

    yield timegate.format(prefix + timegate_url)

    if not first_cdx:
        # terminating timemap link, no from
        timemap = ('<{0}>; rel="self"; type="application/link-format"')
        yield timemap.format(prefix + wbrequest.wb_url.to_str())
        return

    # first memento link
    yield make_timemap_memento_link(first_cdx, prefix,
                                    datetime=from_date, mod=mod)

    prev_cdx = None

    for cdx in cdx_lines:
        if prev_cdx:
            yield make_timemap_memento_link(prev_cdx, prefix, mod=mod)

        prev_cdx = cdx

    # last memento link, if any
    if prev_cdx:
        yield make_timemap_memento_link(prev_cdx, prefix, end='', mod=mod)
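
# Hedged sample output (annotation, not in the original): for a hypothetical
# '/web/' prefix and two captures of http://example.com, the generated
# timemap would read roughly:
#
#   </web/timemap/*/http://example.com/>; rel="self"; type="application/link-format"; from="Sun, 26 Jan 2014 20:06:24 GMT",
#   <http://example.com/>; rel="original",
#   </web/http://example.com/>; rel="timegate",
#   </web/20140126200624/http://example.com/>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT",
#   </web/20140127171238/http://example.com/>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:38 GMT"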
@ -1,463 +0,0 @@
from __future__ import absolute_import

from pywb.framework.wbrequestresponse import WbResponse, WbRequest
from pywb.framework.archivalrouter import ArchivalRouter

from six.moves.urllib.parse import urlsplit
from six import iteritems
import base64

import socket
import ssl

from io import BytesIO

from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.utils.wbexception import BadRequestException

from warcio.bufferedreaders import BufferedReader
from warcio.utils import to_native_str

from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver

from tempfile import SpooledTemporaryFile


#=================================================================
class ProxyArchivalRouter(ArchivalRouter):
    """
    A router which combines support for both archival and proxy modes.
    First, the request is treated as a proxy request using ProxyRouter.
    Second, if not handled by the proxy router, it is treated as a regular
    archival mode request.
    """
    def __init__(self, routes, **kwargs):
        super(ProxyArchivalRouter, self).__init__(routes, **kwargs)
        self.proxy = ProxyRouter(routes, **kwargs)

    def __call__(self, env):
        response = self.proxy(env)
        if response:
            return response

        response = super(ProxyArchivalRouter, self).__call__(env)
        if response:
            return response


#=================================================================
class ProxyRouter(object):
    """
    A router which supports http proxy mode requests
    Handles requests of the form: GET http://example.com

    The router returns the latest capture by default.
    However, if Memento protocol support is enabled,
    the Memento Accept-Datetime header can be used
    to select a specific capture.
    See: http://www.mementoweb.org/guide/rfc/#Pattern1.3
    for more details.
    """

    BLOCK_SIZE = 4096
    DEF_MAGIC_NAME = 'pywb.proxy'
    BUFF_RESPONSE_MEM_SIZE = 1024*1024

    CERT_DL_PEM = '/pywb-ca.pem'
    CERT_DL_P12 = '/pywb-ca.p12'

    CA_ROOT_FILE = './ca/pywb-ca.pem'
    CA_ROOT_NAME = 'pywb https proxy replay CA'
    CA_CERTS_DIR = './ca/certs/'

    EXTRA_HEADERS = {'cache-control': 'no-cache',
                     'connection': 'close',
                     'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'}

    def __init__(self, routes, **kwargs):
        self.error_view = kwargs.get('error_view')

        proxy_options = kwargs.get('config', {})
        if proxy_options:
            proxy_options = proxy_options.get('proxy_options', {})

        self.magic_name = proxy_options.get('magic_name')
        if not self.magic_name:
            self.magic_name = self.DEF_MAGIC_NAME
            proxy_options['magic_name'] = self.magic_name

        self.extra_headers = proxy_options.get('extra_headers')
        if not self.extra_headers:
            self.extra_headers = self.EXTRA_HEADERS
            proxy_options['extra_headers'] = self.extra_headers

        res_type = proxy_options.get('cookie_resolver', True)
        if res_type == 'auth' or not res_type:
            self.resolver = ProxyAuthResolver(routes, proxy_options)
        elif res_type == 'ip':
            self.resolver = IPCacheResolver(routes, proxy_options)
        #elif res_type == True or res_type == 'cookie':
        #    self.resolver = CookieResolver(routes, proxy_options)
        else:
            self.resolver = CookieResolver(routes, proxy_options)

        self.use_banner = proxy_options.get('use_banner', True)
        self.use_wombat = proxy_options.get('use_client_rewrite', True)

        self.proxy_cert_dl_view = proxy_options.get('proxy_cert_download_view')

        if not proxy_options.get('enable_https_proxy'):
            self.ca = None
            return

        try:
            from certauth.certauth import CertificateAuthority
        except ImportError:  #pragma: no cover
            print('HTTPS proxy is not available as the "certauth" module ' +
                  'is not installed')
            print('Please install via "pip install certauth" ' +
                  'to enable HTTPS support')
            self.ca = None
            return

        # HTTPS Only Options
        ca_file = proxy_options.get('root_ca_file', self.CA_ROOT_FILE)

        # attempt to create the root_ca_file if it doesn't exist
        # (generally recommended to create this separately)
        ca_name = proxy_options.get('root_ca_name', self.CA_ROOT_NAME)

        certs_dir = proxy_options.get('certs_dir', self.CA_CERTS_DIR)
        self.ca = CertificateAuthority(ca_file=ca_file,
                                       certs_dir=certs_dir,
                                       ca_name=ca_name)

        self.use_wildcard = proxy_options.get('use_wildcard_certs', True)

    def __call__(self, env):
        is_https = (env['REQUEST_METHOD'] == 'CONNECT')
        ArchivalRouter.ensure_rel_uri_set(env)

        # for non-https requests, check non-proxy urls
        if not is_https:
            url = env['REL_REQUEST_URI']

            if not url.startswith(('http://', 'https://')):
                return None

            env['pywb.proxy_scheme'] = 'http'

        route = None
        coll = None
        matcher = None
        response = None
        ts = None

        # check resolver, for pre connect resolve
        if self.resolver.pre_connect:
            route, coll, matcher, ts, response = self.resolver.resolve(env)
            if response:
                return response

        # do connect, then get updated url
        if is_https:
            response = self.handle_connect(env)
            if response:
                return response

            url = env['REL_REQUEST_URI']
        else:
            parts = urlsplit(env['REL_REQUEST_URI'])
            hostport = parts.netloc.split(':', 1)
            env['pywb.proxy_host'] = hostport[0]
            env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''
            env['pywb.proxy_req_uri'] = parts.path
            if parts.query:
                env['pywb.proxy_req_uri'] += '?' + parts.query
                env['pywb.proxy_query'] = parts.query

        if self.resolver.supports_switching:
            env['pywb_proxy_magic'] = self.magic_name

        # route (static) and other resources to archival replay
        if env['pywb.proxy_host'] == self.magic_name:
            env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']

            # special case for proxy install
            response = self.handle_cert_install(env)
            if response:
                return response

            return None

        # check resolver, post connect
        if not self.resolver.pre_connect:
            route, coll, matcher, ts, response = self.resolver.resolve(env)
            if response:
                return response

        rel_prefix = ''

        custom_prefix = env.get('HTTP_PYWB_REWRITE_PREFIX', '')
        if custom_prefix:
            host_prefix = custom_prefix
            urlrewriter_class = UrlRewriter
            abs_prefix = True
            # always rewrite to absolute here
            rewrite_opts = dict(no_match_rel=True)
        else:
            host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
            urlrewriter_class = SchemeOnlyUrlRewriter
            abs_prefix = False
            rewrite_opts = {}

        # special case for proxy calendar
        if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
            url = env['pywb.proxy_req_uri'][1:]
            rel_prefix = '/'

        if ts is not None:
            url = ts + '/' + url

        wbrequest = route.request_class(env,
                                        request_uri=url,
                                        wb_url_str=url,
                                        coll=coll,
                                        host_prefix=host_prefix,
                                        rel_prefix=rel_prefix,
                                        wburl_class=route.handler.get_wburl_type(),
                                        urlrewriter_class=urlrewriter_class,
                                        use_abs_prefix=abs_prefix,
                                        rewrite_opts=rewrite_opts,
                                        is_proxy=True)

        if matcher:
            route.apply_filters(wbrequest, matcher)

        # full rewrite and banner
        if self.use_wombat and self.use_banner:
            wbrequest.wb_url.mod = ''
        elif self.use_banner:
            # banner only, no rewrite
            wbrequest.wb_url.mod = 'bn_'
        else:
            # unaltered, no rewrite or banner
            wbrequest.wb_url.mod = 'uo_'

        response = route.handler(wbrequest)
        if not response:
            return None

        # add extra headers for replay responses
        if wbrequest.wb_url and wbrequest.wb_url.is_replay():
            for name, value in iteritems(self.extra_headers):
                response.status_headers.replace_header(name, value)

        # check for content-length
        res = response.status_headers.get_header('content-length')
        try:
            if int(res) > 0:
                return response
        except:
            pass

        # need to either chunk or buffer to get content-length
        if env.get('SERVER_PROTOCOL') == 'HTTP/1.1':
            response.status_headers.remove_header('content-length')
            response.status_headers.headers.append(('Transfer-Encoding', 'chunked'))
            response.body = self._chunk_encode(response.body)
        else:
            response.body = self._buffer_response(response.status_headers,
                                                  response.body)

        return response

    @staticmethod
    def _chunk_encode(orig_iter):
        for chunk in orig_iter:
            if not len(chunk):
                continue
            chunk_len = b'%X\r\n' % len(chunk)
            yield chunk_len
            yield chunk
            yield b'\r\n'

        yield b'0\r\n\r\n'
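
    # Worked example (annotation, not in the original): for a body iterator
    # yielding b'hello' and b'!', _chunk_encode produces
    # b'5\r\nhello\r\n' + b'1\r\n!\r\n' + b'0\r\n\r\n',
    # i.e. each chunk is prefixed with its hex length and the stream ends
    # with a zero-length chunk, per HTTP/1.1 chunked transfer encoding.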

    @staticmethod
    def _buffer_response(status_headers, iterator):
        out = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE)
        size = 0

        for buff in iterator:
            size += len(buff)
            out.write(buff)

        content_length_str = str(size)
        # replace any existing content-length with the buffered size
        status_headers.replace_header('Content-Length',
                                      content_length_str)

        out.seek(0)
        return RewriteContent.stream_to_gen(out)

    def get_request_socket(self, env):
        if not self.ca:
            return None

        sock = None

        if env.get('uwsgi.version'):  # pragma: no cover
            try:
                import uwsgi
                fd = uwsgi.connection_fd()
                conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
                try:
                    sock = socket.socket(_sock=conn)
                except:
                    sock = conn
            except Exception as e:
                pass
        elif env.get('gunicorn.socket'):  # pragma: no cover
            sock = env['gunicorn.socket']

        if not sock:
            # attempt to find socket from wsgi.input
            input_ = env.get('wsgi.input')
            if input_:
                if hasattr(input_, '_sock'):  # pragma: no cover
                    raw = input_._sock
                    sock = socket.socket(_sock=raw)  # pragma: no cover
                elif hasattr(input_, 'raw'):
                    sock = input_.raw._sock

        return sock

    def handle_connect(self, env):
        sock = self.get_request_socket(env)
        if not sock:
            return WbResponse.text_response('HTTPS Proxy Not Supported',
                                            '405 HTTPS Proxy Not Supported')

        sock.send(b'HTTP/1.0 200 Connection Established\r\n')
        sock.send(b'Proxy-Connection: close\r\n')
        sock.send(b'Server: pywb proxy\r\n')
        sock.send(b'\r\n')

        hostname, port = env['REL_REQUEST_URI'].split(':')

        if not self.use_wildcard:
            certfile = self.ca.cert_for_host(hostname)
        else:
            certfile = self.ca.get_wildcard_cert(hostname)

        try:
            ssl_sock = ssl.wrap_socket(sock,
                                       server_side=True,
                                       certfile=certfile,
                                       #ciphers="ALL",
                                       suppress_ragged_eofs=False,
                                       ssl_version=ssl.PROTOCOL_SSLv23
                                       )
            env['pywb.proxy_ssl_sock'] = ssl_sock

            buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)

            statusline = to_native_str(buffreader.readline().rstrip())

        except Exception as se:
            raise BadRequestException(se.message)

        statusparts = statusline.split(' ')

        if len(statusparts) < 3:
            raise BadRequestException('Invalid Proxy Request: ' + statusline)

        env['REQUEST_METHOD'] = statusparts[0]
        env['REL_REQUEST_URI'] = ('https://' +
                                  env['REL_REQUEST_URI'].replace(':443', '') +
                                  statusparts[1])

        env['SERVER_PROTOCOL'] = statusparts[2].strip()

        env['pywb.proxy_scheme'] = 'https'

        env['pywb.proxy_host'] = hostname
        env['pywb.proxy_port'] = port
        env['pywb.proxy_req_uri'] = statusparts[1]

        queryparts = env['REL_REQUEST_URI'].split('?', 1)
        env['PATH_INFO'] = queryparts[0]
        env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
        env['pywb.proxy_query'] = env['QUERY_STRING']

        while True:
            line = to_native_str(buffreader.readline())
            if line:
                line = line.rstrip()

            if not line:
                break

            parts = line.split(':', 1)
            if len(parts) < 2:
                continue

            name = parts[0].strip()
            value = parts[1].strip()

            name = name.replace('-', '_').upper()

            if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = 'HTTP_' + name

            env[name] = value

        env['wsgi.input'] = buffreader
        #remain = buffreader.rem_length()
        #if remain > 0:
        #remainder = buffreader.read()
        #env['wsgi.input'] = BufferedReader(BytesIO(remainder))
        #remainder = buffreader.read(self.BLOCK_SIZE)
        #env['wsgi.input'] = BufferedReader(ssl_sock,
        #                                   block_size=self.BLOCK_SIZE,
        #                                   starting_data=remainder)

    def handle_cert_install(self, env):
        if env['pywb.proxy_req_uri'] in ('/', '/index.html', '/index.htm'):
            available = (self.ca is not None)

            if self.proxy_cert_dl_view:
                return (self.proxy_cert_dl_view.
                        render_response(available=available,
                                        pem_path=self.CERT_DL_PEM,
                                        p12_path=self.CERT_DL_P12))

        elif env['pywb.proxy_req_uri'] == self.CERT_DL_PEM:
            if not self.ca:
                return None

            buff = b''
            with open(self.ca.ca_file, 'rb') as fh:
                buff = fh.read()

            content_type = 'application/x-x509-ca-cert'
            headers = [('Content-Length', str(len(buff)))]

            return WbResponse.bin_stream([buff],
                                         content_type=content_type,
                                         headers=headers)

        elif env['pywb.proxy_req_uri'] == self.CERT_DL_P12:
            if not self.ca:
                return None

            buff = self.ca.get_root_PKCS12()

            content_type = 'application/x-pkcs12'
            headers = [('Content-Length', str(len(buff)))]

            return WbResponse.bin_stream([buff],
                                         content_type=content_type,
                                         headers=headers)
@ -1,374 +0,0 @@
from pywb.framework.wbrequestresponse import WbResponse
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.wbexception import WbException
from pywb.rewrite.wburl import WbUrl

from pywb.framework.cache import create_cache
from pywb.framework.basehandlers import WbUrlHandler

from six.moves.urllib.parse import parse_qs, urlsplit
import six

from warcio.statusandheaders import StatusAndHeaders
from warcio.utils import to_native_str

import base64
import os
import json


#=================================================================
class BaseCollResolver(object):
    def __init__(self, routes, config):
        self.routes = routes
        self.use_default_coll = config.get('use_default_coll')

    @property
    def pre_connect(self):
        return False

    def resolve(self, env):
        route = None
        coll = None
        matcher = None
        ts = None

        proxy_coll, ts = self.get_proxy_coll_ts(env)

        # invalid parsing
        if proxy_coll == '':
            return None, None, None, None, self.select_coll_response(env, proxy_coll)

        if proxy_coll is None and isinstance(self.use_default_coll, str):
            proxy_coll = self.use_default_coll

        if proxy_coll:
            path = '/' + proxy_coll + '/'

            for r in self.routes:
                matcher, c = r.is_handling(path)
                if matcher:
                    route = r
                    coll = c
                    break

            # if no match, return coll selection response
            if not route:
                return None, None, None, None, self.select_coll_response(env, proxy_coll)

        # if 'use_default_coll', find first WbUrl-handling collection
        elif self.use_default_coll:
            raise Exception('use_default_coll: true no longer supported, please specify collection name')
            #for route in self.routes:
            #    if isinstance(route.handler, WbUrlHandler):
            #        return route, route.path, matcher, ts, None

        # otherwise, return the appropriate coll selection response
        else:
            return None, None, None, None, self.select_coll_response(env, proxy_coll)

        return route, coll, matcher, ts, None


#=================================================================
class ProxyAuthResolver(BaseCollResolver):
    DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'

    def __init__(self, routes, config):
        super(ProxyAuthResolver, self).__init__(routes, config)
        self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)

    @property
    def pre_connect(self):
        return True

    @property
    def supports_switching(self):
        return False

    def get_proxy_coll_ts(self, env):
        proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')

        if not proxy_auth:
            return None, None

        proxy_coll = self.read_basic_auth_coll(proxy_auth)
        return proxy_coll, None

    def select_coll_response(self, env, default_coll=None):
        proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)

        headers = [('Content-Type', 'text/plain'),
                   ('Proxy-Authenticate', proxy_msg)]

        status_headers = StatusAndHeaders('407 Proxy Authentication', headers)

        value = self.auth_msg

        return WbResponse(status_headers, value=[value.encode('utf-8')])

    @staticmethod
    def read_basic_auth_coll(value):
        parts = value.split(' ')
        if parts[0].lower() != 'basic':
            return ''

        if len(parts) != 2:
            return ''

        user_pass = base64.b64decode(parts[1].encode('utf-8'))
        return to_native_str(user_pass.split(b':')[0])
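
    # Hedged example (annotation, not in the original): the collection name is
    # taken from the username half of HTTP Basic proxy credentials. A header of
    #
    #   Proxy-Authorization: Basic d2ViOmlnbm9yZWQ=
    #
    # decodes to 'web:ignored', so read_basic_auth_coll() returns 'web'; the
    # password part is discarded.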


#=================================================================
class IPCacheResolver(BaseCollResolver):
    def __init__(self, routes, config):
        super(IPCacheResolver, self).__init__(routes, config)
        self.cache = create_cache(config.get('redis_cache_key'))
        self.magic_name = config['magic_name']

    @property
    def supports_switching(self):
        return False

    def _get_ip(self, env):
        ip = env['REMOTE_ADDR']
        qs = env.get('pywb.proxy_query')
        if qs:
            res = parse_qs(qs)

            if 'ip' in res:
                ip = res['ip'][0]

        return ip

    def select_coll_response(self, env, default_coll=None):
        raise WbException('Invalid Proxy Collection Specified: ' + str(default_coll))

    def get_proxy_coll_ts(self, env):
        ip = env['REMOTE_ADDR']
        qs = env.get('pywb.proxy_query')

        if qs:
            res = parse_qs(qs)

            if 'ip' in res:
                ip = res['ip'][0]

            if 'delete' in res:
                del self.cache[ip + ':c']
                del self.cache[ip + ':t']
            else:
                if 'coll' in res:
                    self.cache[ip + ':c'] = res['coll'][0]

                if 'ts' in res:
                    self.cache[ip + ':t'] = res['ts'][0]

        coll = self.cache[ip + ':c']
        ts = self.cache[ip + ':t']
        return coll, ts

    def resolve(self, env):
        server_name = env['pywb.proxy_host']

        if self.magic_name in server_name:
            response = self.handle_magic_page(env)
            if response:
                return None, None, None, None, response

        return super(IPCacheResolver, self).resolve(env)

    def handle_magic_page(self, env):
        coll, ts = self.get_proxy_coll_ts(env)
        ip = self._get_ip(env)
        res = json.dumps({'ip': ip, 'coll': coll, 'ts': ts})
        return WbResponse.text_response(res, content_type='application/json')


#=================================================================
class CookieResolver(BaseCollResolver):
    SESH_COOKIE_NAME = '__pywb_proxy_sesh'

    def __init__(self, routes, config):
        super(CookieResolver, self).__init__(routes, config)
        self.magic_name = config['magic_name']
        self.sethost_prefix = '-sethost.' + self.magic_name + '.'
        self.set_prefix = '-set.' + self.magic_name

        self.cookie_name = config.get('cookie_name', self.SESH_COOKIE_NAME)
        self.proxy_select_view = config.get('proxy_select_view')

        self.extra_headers = config.get('extra_headers')

        self.cache = create_cache()

    @property
    def supports_switching(self):
        return True

    def get_proxy_coll_ts(self, env):
        coll, ts, sesh_id = self.get_coll(env)
        return coll, ts

    def select_coll_response(self, env, default_coll=None):
        return self.make_magic_response('auto',
                                        env['REL_REQUEST_URI'],
                                        env)

    def resolve(self, env):
        server_name = env['pywb.proxy_host']

        if ('.' + self.magic_name) in server_name:
            response = self.handle_magic_page(env)
            if response:
                return None, None, None, None, response

        return super(CookieResolver, self).resolve(env)

    def handle_magic_page(self, env):
        request_url = env['REL_REQUEST_URI']
        parts = urlsplit(request_url)
        server_name = env['pywb.proxy_host']

        path_url = parts.path[1:]
        if parts.query:
            path_url += '?' + parts.query

        if server_name.startswith('auto'):
            coll, ts, sesh_id = self.get_coll(env)

            if coll:
                return self.make_sethost_cookie_response(sesh_id,
                                                         path_url,
                                                         env)
            else:
                return self.make_magic_response('select', path_url, env)

        elif server_name.startswith('query.'):
            wb_url = WbUrl(path_url)

            # only dealing with specific timestamp setting
            if wb_url.is_query():
                return None

            coll, ts, sesh_id = self.get_coll(env)
            if not coll:
                return self.make_magic_response('select', path_url, env)

            self.set_ts(sesh_id, wb_url.timestamp)
            return self.make_redir_response(wb_url.url)

        elif server_name.endswith(self.set_prefix):
            old_sesh_id = extract_client_cookie(env, self.cookie_name)
            sesh_id = self.create_renew_sesh_id(old_sesh_id)

            if sesh_id != old_sesh_id:
                headers = self.make_cookie_headers(sesh_id, self.magic_name)
            else:
                headers = None

            coll = server_name[:-len(self.set_prefix)]

            # set sesh value
            self.set_coll(sesh_id, coll)

            return self.make_sethost_cookie_response(sesh_id, path_url, env,
                                                     headers=headers)

        elif self.sethost_prefix in server_name:
            inx = server_name.find(self.sethost_prefix)
            sesh_id = server_name[:inx]

            domain = server_name[inx + len(self.sethost_prefix):]

            headers = self.make_cookie_headers(sesh_id, domain)

            full_url = env['pywb.proxy_scheme'] + '://' + domain
            full_url += '/' + path_url
            return self.make_redir_response(full_url, headers=headers)

        elif 'select.' in server_name:
            coll, ts, sesh_id = self.get_coll(env)

            route_temp = '-set.' + self.magic_name + '/' + path_url

            return (self.proxy_select_view.
                    render_response(routes=self.routes,
                                    route_temp=route_temp,
                                    coll=coll,
                                    url=path_url))
        #else:
        #    msg = 'Invalid Magic Path: ' + url
        #    print msg
        #    return WbResponse.text_response(msg, status='404 Not Found')
|
||||
def make_cookie_headers(self, sesh_id, domain):
|
||||
cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
|
||||
cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain)
|
||||
headers = [('Set-Cookie', cookie_val)]
|
||||
return headers
|
||||
|
||||
def make_sethost_cookie_response(self, sesh_id, path_url,
|
||||
env, headers=None):
|
||||
if '://' not in path_url:
|
||||
path_url = 'http://' + path_url
|
||||
|
||||
path_parts = urlsplit(path_url)
|
||||
|
||||
new_url = path_parts.path[1:]
|
||||
if path_parts.query:
|
||||
new_url += '?' + path_parts.query
|
||||
|
||||
return self.make_magic_response(sesh_id + '-sethost', new_url, env,
|
||||
suffix=path_parts.netloc,
|
||||
headers=headers)
|
||||
|
||||
def make_magic_response(self, prefix, url, env,
|
||||
suffix=None, headers=None):
|
||||
full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
|
||||
full_url += self.magic_name
|
||||
if suffix:
|
||||
full_url += '.' + suffix
|
||||
full_url += '/' + url
|
||||
return self.make_redir_response(full_url, headers=headers)
|
||||
|
||||
def set_coll(self, sesh_id, coll):
|
||||
self.cache[sesh_id + ':c'] = coll
|
||||
|
||||
def set_ts(self, sesh_id, ts):
|
||||
if ts:
|
||||
self.cache[sesh_id + ':t'] = ts
|
||||
# this ensures that omitting timestamp will reset to latest
|
||||
# capture by deleting the cache entry
|
||||
else:
|
||||
del self.cache[sesh_id + ':t']
|
||||
|
||||
def get_coll(self, env):
|
||||
sesh_id = extract_client_cookie(env, self.cookie_name)
|
||||
|
||||
coll = None
|
||||
ts = None
|
||||
if sesh_id:
|
||||
coll = self.cache[sesh_id + ':c']
|
||||
ts = self.cache[sesh_id + ':t']
|
||||
|
||||
return coll, ts, sesh_id
|
||||
|
||||
def create_renew_sesh_id(self, sesh_id, force=False):
|
||||
#if sesh_id in self.cache and not force:
|
||||
if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
|
||||
return sesh_id
|
||||
|
||||
sesh_id = base64.b32encode(os.urandom(5)).lower()
|
||||
return to_native_str(sesh_id)
|
||||
|
||||
def make_redir_response(self, url, headers=None):
|
||||
if not headers:
|
||||
headers = []
|
||||
|
||||
if self.extra_headers:
|
||||
for name, value in six.iteritems(self.extra_headers):
|
||||
headers.append((name, value))
|
||||
|
||||
return WbResponse.redir_response(url, headers=headers)
|
@ -1,135 +0,0 @@
"""
# Test WbRequest parsed via a Route
# route with relative path, print resulting wbrequest
>>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''})
{'coll': 'web',
 'request_uri': '/web/test.example.com',
 'wb_prefix': '/web/',
 'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')}

# route with absolute path, running at script /my_pywb, print resulting wbrequest
>>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'coll': 'web',
 'request_uri': '/web/2013im_/test.example.com',
 'wb_prefix': 'https://localhost:8081/my_pywb/web/',
 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}

# route with no collection
>>> _test_route_req(Route('', BaseHandler()), {'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'})
{'coll': '',
 'request_uri': 'http://example.com',
 'wb_prefix': '/pywb/',
 'wb_url': None}

# not matching route -- skipped
>>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''})

# Test Refer Redirects
>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'

>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

>>> _test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

# Custom collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/other.html'

# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'

# Wrong Host
>>> _test_redir('http://example.com:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False

# Right Host
>>> _test_redir('http://example.com:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html')
'http://example.com:8080/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME + timestamp
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME, bad match
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False

# With no collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
'http://localhost:8080/2013/http://example.com/other.html'

# With SCRIPT_NAME but no collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
'http://localhost:8080/pywb-access/http://example.com/other.html'

>>> _test_redir('http://localhost:8080/', '/some/example/other.html', 'http://localhost:8080/user/coll/http://example.com/path/page.html', '/user/coll', coll='')
'http://localhost:8080/user/coll/http://example.com/some/example/other.html'

## Test ensure_rel_uri_set

# Simple test:
>>> ArchivalRouter.ensure_rel_uri_set({'PATH_INFO': '/pywb/example.com'})
'/pywb/example.com'

# Test all unencoded special chars and double-quote
# (double-quote must be encoded but not single quote)
>>> ArchivalRouter.ensure_rel_uri_set({'PATH_INFO': "/pywb/example.com/0~!+$&'()*+,;=:\\\""})
"/pywb/example.com/0~!+$&'()*+,;=:%22"

"""

from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler

import pprint

from six.moves.urllib.parse import urlsplit


def _test_route_req(route, env, abs_path=False):
    matcher, coll = route.is_handling(env['REL_REQUEST_URI'])
    if not matcher:
        return

    the_router = ArchivalRouter([route], abs_path=abs_path)
    req = the_router.parse_request(route, env, matcher, coll, env['REL_REQUEST_URI'], abs_path)

    varlist = vars(req)
    the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll'))
    pprint.pprint(the_dict)


def _test_redir(match_host, request_uri, referrer, script_name='', coll='coll'):
    env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}

    env['HTTP_HOST'] = urlsplit(match_host).netloc

    routes = [Route(coll, WbUrlHandler())]

    the_router = ArchivalRouter(routes)

    redir = ReferRedirect()
    #req = WbRequest.from_uri(request_uri, env)
    rep = redir(env, the_router)
    if not rep:
        return False

    return rep.status_headers.get_header('Location')


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@ -1,178 +1,6 @@
"""
# WbRequest Tests
# =================
#>>> get_req_from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}

#>>> get_req_from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}

#>>> get_req_from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}

# ajax
#>>> get_req_from_uri('', {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'})
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}

#>>> get_req_from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}

# Abs path
#>>> get_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}

# No Scheme, default to http (shouldn't happen per WSGI standard)
#>>> get_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'http://localhost:8080/2010/', 'request_uri': '/2010/example.com'}

# Referrer extraction
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url
'http://blah.example.com/'

# incorrect referer
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://other.example.com/web/2011/blah.example.com/'}).extract_referrer_wburl_str()

# no referer
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}).extract_referrer_wburl_str()

# range requests
>>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='bytes=10-100')).extract_range()
('http://example.com', 10, 100, True)

>>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='bytes=0-')).extract_range()
('http://example.com', 0, '', True)

>>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=0-65535').extract_range()
('http://www.googlevideo.com/videoplayback?id=123', 0, 65535, False)

>>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=100-200').extract_range()
('http://www.googlevideo.com/videoplayback?id=123', 100, 200, False)

# invalid range requests
>>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='10-20')).extract_range()

>>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='A-5')).extract_range()

>>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=100-').extract_range()

"""


from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.framework.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders

from pywb.framework.wbrequestresponse import WbRequest, WbResponse


def get_req_from_uri(request_uri, env={}, use_abs_prefix=False):
    response = req_from_uri(request_uri, env, use_abs_prefix)
    varlist = vars(response)
    the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll'))
    #print(the_dict)
    return the_dict

def req_from_uri(request_uri, env={}, use_abs_prefix=False):
    if not request_uri:
        request_uri = env.get('REL_REQUEST_URI')

    parts = request_uri.split('/', 2)

    # Has coll prefix
    if len(parts) == 3:
        rel_prefix = '/' + parts[1] + '/'
        wb_url_str = parts[2]
        coll = parts[1]
    # No Coll Prefix
    elif len(parts) == 2:
        rel_prefix = '/'
        wb_url_str = parts[1]
        coll = ''
    else:
        rel_prefix = '/'
        wb_url_str = parts[0]
        coll = ''

    return WbRequest(env,
                     request_uri=request_uri,
                     rel_prefix=rel_prefix,
                     wb_url_str=wb_url_str,
                     coll=coll,
                     wburl_class=WbUrl,
                     urlrewriter_class=UrlRewriter,
                     use_abs_prefix=use_abs_prefix)


def test_req_1():
    res = get_req_from_uri('/save/_embed/example.com/?a=b')

    assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b')")
    assert(res['coll'] == 'save')
    assert(res['wb_prefix'] == '/save/')
    assert(res['request_uri'] == '/save/_embed/example.com/?a=b')

def test_req_2():
    res = get_req_from_uri('/2345/20101024101112im_/example.com/?b=c')

    assert(repr(res['wb_url']) == "('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c')")
    assert(res['coll'] == '2345')
    assert(res['wb_prefix'] == '/2345/')
    assert(res['request_uri'] == '/2345/20101024101112im_/example.com/?b=c')

def test_req_3():
    res = get_req_from_uri('/2010/example.com')

    assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')")
    assert(res['coll'] == '2010')
    assert(res['wb_prefix'] == '/2010/')
    assert(res['request_uri'] == '/2010/example.com')


def test_req_4():
    # ajax
    res = get_req_from_uri('', {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'})

    assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')")
    assert(res['coll'] == '2010')
    assert(res['wb_prefix'] == '/2010/')
    assert(res['request_uri'] == '/2010/example.com')


def test_req_5():
    res = get_req_from_uri('../example.com')

    assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')")
    assert(res['coll'] == '')
    assert(res['wb_prefix'] == '/')
    assert(res['request_uri'] == '../example.com')


def test_req_6():
    # Abs path
    res = get_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)

    assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')")
    assert(res['coll'] == '2010')
    assert(res['wb_prefix'] == 'https://localhost:8080/2010/')
    assert(res['request_uri'] == '/2010/example.com')


def test_req_7():
    # No Scheme, default to http (shouldn't happen per WSGI standard)
    res = get_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)

    assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')")
    assert(res['coll'] == '2010')
    assert(res['wb_prefix'] == 'http://localhost:8080/2010/')
    assert(res['request_uri'] == '/2010/example.com')


# Response tests

def test_resp_1():
    resp = vars(WbResponse.text_response('Test'))
@ -1,57 +0,0 @@
from pywb.framework.wsgi_wrappers import init_app

from pywb.utils.wbexception import AccessException

import webtest

class TestOkApp:
    def __call__(self, env):
        def response(env, start_response):
            start_response('200 OK', [])
            return [b'Test']
        return response

class TestErrApp:
    def __call__(self, env):
        raise Exception('Test Unexpected Error')

class TestCustomErrApp:
    def __call__(self, env):
        raise AccessException('Forbidden Test')


def initer(app_class):
    def init(config=None):
        return app_class()
    return init

def test_ok_app():
    the_app = init_app(initer(TestOkApp), load_yaml=False)

    testapp = webtest.TestApp(the_app)
    resp = testapp.get('/')

    assert resp.status_int == 200
    assert b'Test' in resp.body, resp.body

def test_err_app():
    the_app = init_app(initer(TestErrApp), load_yaml=False)

    testapp = webtest.TestApp(the_app)
    resp = testapp.get('/abc', expect_errors=True)

    assert resp.status_int == 500
    assert b'500 Internal Server Error Error: Test Unexpected Error' in resp.body

def test_custom_err_app():
    the_app = init_app(initer(TestCustomErrApp), load_yaml=False)

    testapp = webtest.TestApp(the_app)
    resp = testapp.get('/abc', expect_errors=True)

    assert resp.status_int == 403
    assert b'403 Access Denied Error: Forbidden Test' in resp.body
@ -1,204 +1,8 @@
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import extract_post_query, append_post_query

from io import BytesIO
import pprint
import re
import json


#=================================================================
class WbRequest(object):
    """
    Represents the main pywb request object.

    Contains various info from the wsgi env and adds additional info
    about the request, such as coll, relative prefix,
    host prefix, absolute prefix.

    If wburl and url rewriter classes are specified, the class
    also contains the url rewriter.

    """
    @staticmethod
    def make_host_prefix(env):
        try:
            host = env.get('HTTP_HOST')
            if not host:
                host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']

            return env.get('wsgi.url_scheme', 'http') + '://' + host
        except KeyError:
            return ''

    def __init__(self, env,
                 request_uri=None,
                 rel_prefix='',
                 wb_url_str='/',
                 coll='',
                 host_prefix='',
                 use_abs_prefix=False,
                 wburl_class=None,
                 urlrewriter_class=None,
                 is_proxy=False,
                 cookie_scope=None,
                 rewrite_opts={},
                 user_metadata={},
                 ):

        self.env = env

        if request_uri:
            self.request_uri = request_uri
        else:
            self.request_uri = env.get('REL_REQUEST_URI')

        self.method = self.env.get('REQUEST_METHOD')

        self.coll = coll

        self.final_mod = ''

        if not host_prefix:
            host_prefix = self.make_host_prefix(env)

        self.host_prefix = host_prefix
        self.rel_prefix = rel_prefix

        if use_abs_prefix:
            self.wb_prefix = host_prefix + rel_prefix
        else:
            self.wb_prefix = rel_prefix

        if not wb_url_str:
            wb_url_str = '/'

        self.wb_url_str = wb_url_str

        # wb_url present and not root page
        if wb_url_str != '/' and wburl_class:
            self.wb_url = wburl_class(wb_url_str)
            self.urlrewriter = urlrewriter_class(self.wb_url,
                                                 self.wb_prefix,
                                                 host_prefix + rel_prefix,
                                                 rel_prefix,
                                                 env.get('SCRIPT_NAME', '/'),
                                                 cookie_scope,
                                                 rewrite_opts)

            self.urlrewriter.deprefix_url()
        # no wb_url, just store blank wb_url
        else:
            self.wb_url = None
            self.urlrewriter = None

        self.referrer = env.get('HTTP_REFERER')

        self.options = dict()
        self.options['is_ajax'] = self._is_ajax()
        self.options['is_proxy'] = is_proxy or env.get('pywb_proxy_magic')

        self.query_filter = []
        self.custom_params = {}
        self.user_metadata = user_metadata
        self.rewrite_opts = rewrite_opts

        # PERF
        env['X_PERF'] = {}

        if env.get('HTTP_X_PYWB_NOREDIRECT'):
            self.custom_params['noredir'] = True

        self._parse_extra()

    def _is_ajax(self):
        value = self.env.get('HTTP_X_REQUESTED_WITH')
        value = value or self.env.get('HTTP_X_PYWB_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        return False

    RANGE_ARG_RX = re.compile(r'.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')

    RANGE_HEADER = re.compile(r'bytes=(\d+)-(\d+)?')

    def extract_range(self):
        url = self.wb_url.url
        use_206 = False
        start = None
        end = None

        range_h = self.env.get('HTTP_RANGE')

        if range_h:
            m = self.RANGE_HEADER.match(range_h)
            if m:
                start = m.group(1)
                end = m.group(2)
                use_206 = True

        else:
            m = self.RANGE_ARG_RX.match(url)
            if m:
                start = m.group(2)
                end = m.group(3)
                url = url[:m.start(1)] + url[m.end(1):]
                use_206 = False

        if not start:
            return None

        start = int(start)
        self.custom_params['noredir'] = True

        if end:
            end = int(end)
        else:
            end = ''

        result = (url, start, end, use_206)
        return result

    def __repr__(self):
        varlist = vars(self)
        varstr = pprint.pformat(varlist)
        return varstr

    def _parse_extra(self):
        pass

    def extract_referrer_wburl_str(self):
        if not self.referrer:
            return None

        if not self.referrer.startswith(self.host_prefix + self.rel_prefix):
            return None

        wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):]
        return wburl_str

    def normalize_post_query(self):
        if self.method != 'POST':
            return

        if not self.wb_url:
            return

        mime = self.env.get('CONTENT_TYPE', '')
        length = self.env.get('CONTENT_LENGTH')
        stream = self.env['wsgi.input']

        buffered_stream = BytesIO()

        post_query = extract_post_query('POST', mime, length, stream,
                                        buffered_stream=buffered_stream,
                                        environ=self.env)

        if post_query:
            self.env['wsgi.input'] = buffered_stream
            self.wb_url.url = append_post_query(self.wb_url.url, post_query)


#=================================================================
class WbResponse(object):
@ -1,188 +0,0 @@
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import load_yaml_config
from warcio.utils import to_native_str

from pywb.framework.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders


import os
import logging


DEFAULT_PORT = 8080


#=================================================================
class WSGIApp(object):
    def __init__(self, wb_router, fallback_app=None):
        self.wb_router = wb_router
        self.fallback_app = fallback_app

    # Top-level wsgi application
    def __call__(self, env, start_response):
        if env['REQUEST_METHOD'] == 'CONNECT':
            return self.handle_connect(env, start_response)
        else:
            return self.handle_methods(env, start_response)

    def handle_connect(self, env, start_response):
        def ssl_start_response(statusline, headers):
            ssl_sock = env.get('pywb.proxy_ssl_sock')
            if not ssl_sock:
                start_response(statusline, headers)
                return

            env['pywb.proxy_statusline'] = statusline

            status_line = 'HTTP/1.1 ' + statusline + '\r\n'
            ssl_sock.write(status_line.encode('iso-8859-1'))

            for name, value in headers:
                line = name + ': ' + value + '\r\n'
                ssl_sock.write(line.encode('iso-8859-1'))

        resp_iter = self.handle_methods(env, ssl_start_response)

        ssl_sock = env.get('pywb.proxy_ssl_sock')
        if not ssl_sock:
            return resp_iter

        ssl_sock.write(b'\r\n')

        for obj in resp_iter:
            if obj:
                ssl_sock.write(obj)
        ssl_sock.close()

        start_response(env['pywb.proxy_statusline'], [])

        return []

    def handle_methods(self, env, start_response):
        wb_router = self.wb_router
        response = None

        try:
            response = wb_router(env)

            if not response:
                if self.fallback_app:
                    return self.fallback_app(env, start_response)
                else:
                    msg = 'No handler for "{0}".'.format(env['REL_REQUEST_URI'])
                    raise NotFoundException(msg)

        except WbException as e:
            response = self.handle_exception(env, e, False)

        except Exception as e:
            response = self.handle_exception(env, e, True)

        return response(env, start_response)

    def handle_exception(self, env, exc, print_trace):
        error_view = None

        if hasattr(self.wb_router, 'error_view'):
            error_view = self.wb_router.error_view

        if hasattr(exc, 'status'):
            status = exc.status()
        else:
            status = '500 Internal Server Error'

        if hasattr(exc, 'url'):
            err_url = exc.url
        else:
            err_url = None

        err_msg = None
        if len(exc.args):
            err_msg = exc.args[0]

        if print_trace:
            import traceback
            err_details = traceback.format_exc()
            print(err_details)
        else:
            logging.info(err_msg)
            err_details = None

        if error_view:
            if err_url and isinstance(err_url, str):
                err_url = to_native_str(err_url, 'utf-8')
            if err_msg and isinstance(err_msg, str):
                err_msg = to_native_str(err_msg, 'utf-8')

            return error_view.render_response(exc_type=type(exc).__name__,
                                              err_msg=err_msg,
                                              err_details=err_details,
                                              status=status,
                                              env=env,
                                              err_url=err_url)
        else:
            msg = status + ' Error: '
            if err_msg:
                msg += err_msg

            #msg = msg.encode('utf-8', 'ignore')
            return WbResponse.text_response(msg,
                                            status=status)

#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'


#=================================================================
def init_app(init_func, load_yaml=True, config_file=None, config=None):
    try:
        config = config or {}
        if load_yaml:
            # env setting overrides all others
            env_config = os.environ.get('PYWB_CONFIG_FILE')
            if env_config:
                config_file = env_config

            if not config_file:
                config_file = DEFAULT_CONFIG_FILE

            if os.path.isfile(config_file):
                config = load_yaml_config(config_file)

        wb_router = init_func(config)
    except:
        msg = '*** pywb app init FAILED config from "%s"!\n'
        logging.exception(msg, init_func.__name__)
        raise
    else:
        msg = '*** pywb app inited with config from "%s"!\n'
        logging.debug(msg, init_func.__name__)

    return WSGIApp(wb_router)


#=================================================================
def start_wsgi_ref_server(the_app, name, port):  # pragma: no cover
    from wsgiref.simple_server import make_server, WSGIServer
    from six.moves.socketserver import ThreadingMixIn

    # disable is_hop_by_hop restrictions
    import wsgiref.handlers
    wsgiref.handlers.is_hop_by_hop = lambda x: False

    if port is None:
        port = DEFAULT_PORT

    logging.info('Starting %s on port %s', name, port)

    class ThreadingWSGIServer(ThreadingMixIn, WSGIServer):
        pass

    try:
        httpd = make_server('', port, the_app, ThreadingWSGIServer)
        httpd.serve_forever()
    except KeyboardInterrupt as ex:
        pass
    finally:
        logging.info('Stopping %s', name)
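A minimal sketch of wiring these helpers together, mirroring the initer() pattern used in the tests above (the tiny router below is illustrative, not part of pywb):

from pywb.framework.wsgi_wrappers import init_app, start_wsgi_ref_server
from pywb.framework.wbrequestresponse import WbResponse

def tiny_router_init(config=None):
    # init_app() calls this factory with the loaded config dict
    def router(env):
        # returning None here would instead raise NotFoundException
        return WbResponse.text_response('Hello ' + env.get('REL_REQUEST_URI', '/'))
    return router

application = init_app(tiny_router_init, load_yaml=False)
#start_wsgi_ref_server(application, 'demo app', port=None)  # serves on port 8080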
@ -1,85 +0,0 @@
from pywb.utils.wbexception import AccessException


#=================================================================
def make_perms_cdx_filter(perms_policy, wbrequest):
    """
    Called internally to convert a perms_policy and a request
    to a filter which can be applied on the cdx
    """
    perms_checker = perms_policy(wbrequest)
    if not perms_checker:
        return None

    return _create_cdx_perms_filter(perms_checker)


#=================================================================
def _create_cdx_perms_filter(perms_checker):
    """
    Return a function which will filter the cdx given
    a Perms object.
    :param perms_checker: a Perms object which implements the
        allow_url_lookup() and access_check_capture() methods
    """

    def perms_filter_op(cdx_iter, query):
        """
        filter out those cdx records that user doesn't have access to,
        by consulting :param perms_checker:.
        :param cdx_iter: cdx record source iterable
        :param query: request parameters (CDXQuery)
        :param perms_checker: object implementing permission checker
        """
        if not perms_checker.allow_url_lookup(query.key):
            if query.is_exact:
                raise AccessException('Excluded')

        for cdx in cdx_iter:
            cdx = perms_checker.access_check_capture(cdx)
            if cdx:
                yield cdx

    return perms_filter_op


#=================================================================
def allow_all_perms_policy(wbrequest):
    """
    Perms policy which always returns a default Perms object
    which allows everything.

    The perms object is created per request and may store request
    state, if necessary.

    The same perms object may be called with multiple queries
    (such as for each cdx line) per request.
    """
    return Perms()


#=================================================================
class Perms(object):
    """
    A base perms checker which allows everything
    """

    def allow_url_lookup(self, key):
        """
        Return true/false if urlkey (canonicalized url)
        should be allowed.

        Default: allow all
        """
        return True

    def access_check_capture(self, cdx):
        """
        Allow/deny specified cdx capture (dict) to be included
        in the result.
        Return None to reject, or modify the cdx to exclude
        any fields that need to be restricted.

        Default: allow cdx line without modifications
        """
        return cdx
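As an illustration of those extension points, a hypothetical policy that hides a single domain might look like this (the SURT prefix below is made up):

class BlockDomainPerms(Perms):
    BLOCKED_PREFIX = 'com,example,blocked)/'  # hypothetical SURT-ordered key prefix

    def allow_url_lookup(self, key):
        # callers may pass the urlkey as bytes; normalize for the comparison
        if isinstance(key, bytes):
            key = key.decode('utf-8')
        return not key.startswith(self.BLOCKED_PREFIX)

    def access_check_capture(self, cdx):
        # restrict a field rather than rejecting the capture outright
        cdx.pop('filename', None)
        return cdx


def block_domain_perms_policy(wbrequest):
    # drop-in replacement for allow_all_perms_policy above
    return BlockDomainPerms()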
@ -1,67 +0,0 @@
from pywb.utils.canonicalize import UrlCanonicalizer
from pywb.utils.wbexception import NotFoundException

from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.wbrequestresponse import WbResponse

BLOCK = '["block"]'
ALLOW = '["allow"]'
RESPONSE_TYPE = 'application/json'

NOT_FOUND = 'Please specify a url to check for access'


#=================================================================
class PermsHandler(WbUrlHandler):

    def __init__(self, perms_policy, url_canon):
        self.perms_policy = perms_policy
        self.url_canon = url_canon

    def __call__(self, wbrequest):
        perms_checker = self.perms_policy(wbrequest)

        if wbrequest.wb_url:
            return self.check_single_url(wbrequest, perms_checker)

        # elif wbrequest.env['REQUEST_METHOD'] == 'POST':
        #     return self.check_bulk(wbrequest, perms_checker)

        else:
            raise NotFoundException(NOT_FOUND)

    def check_single_url(self, wbrequest, perms_checker):
        urlkey = self.url_canon(wbrequest.wb_url.url)
        urlkey = urlkey.encode('utf-8')

        if not perms_checker.allow_url_lookup(urlkey):
            response_text = BLOCK
        else:
            response_text = ALLOW

        #TODO: other types of checking
        return WbResponse.text_response(response_text,
                                        content_type=RESPONSE_TYPE)

    #TODO
    # def check_bulk_urls(self, wbrequest, perms_checker):
    #     pass
    #


#=================================================================
def create_perms_checker_app(config):
    """
    Create permissions checker standalone app
    Running under the '/check-access' route
    """
    port = config.get('port')

    perms_policy = config.get('perms_policy')

    canonicalizer = UrlCanonicalizer(config.get('surt_ordered', True))

    handler = PermsHandler(perms_policy, canonicalizer)
    routes = [Route('check-access', handler)]

    return ArchivalRouter(routes, port=port)
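A sketch of standing the checker up with the allow-all policy from the perms module above (the config keys mirror what create_perms_checker_app reads):

config = {'port': 8081,
          'perms_policy': allow_all_perms_policy,  # from the perms module above
          'surt_ordered': True}

router = create_perms_checker_app(config)
# a request to /check-access/http://example.com/ then yields '["allow"]' as JSON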
@ -1,99 +0,0 @@
from gevent.monkey import patch_all; patch_all()

import requests

from pywb.framework.archivalrouter import Route

from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from warcio.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize
from warcio.timeutils import http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject

from io import BytesIO

from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest

from six.moves.urllib.parse import quote


# ============================================================================
class PlatformRoute(Route):
    def apply_filters(self, wbrequest, matcher):
        wbrequest.matchdict = matcher.groupdict()


# ============================================================================
class PlatformHandler(RewriteHandler):
    def __init__(self, config):
        super(PlatformHandler, self).__init__(config)
        self.upstream_url = config.get('upstream_url')
        self.loader = ArcWarcRecordLoader()

        framed = config.get('framed_replay')
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

    def render_content(self, wbrequest):
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url

        urlkey = canonicalize(wbrequest.wb_url.url)
        url = wbrequest.wb_url.url

        inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        headers = {'Content-Length': len(req_data),
                   'Content-Type': 'application/request'}

        if wbrequest.wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wbrequest.wb_url.timestamp

        upstream_url = self.upstream_url.format(url=quote(url),
                                                closest=closest,
                                                #coll=wbrequest.coll,
                                                **wbrequest.matchdict)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True,
                          allow_redirects=False)

        r.raise_for_status()

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
                                                       record.http_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        status_headers, gen, is_rw = result
        return self._make_response(wbrequest, *result)


if __name__ == "__main__":
    from gevent.wsgi import WSGIServer
    from pywb.apps.wayback import application

    server = WSGIServer(('', 8090), application)
    server.serve_forever()
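Note that upstream_url is a format template filled in by render_content(); a plausible config (the template value below is illustrative, not taken from the source) might be:

config = {
    # {url} and {closest} are supplied by render_content(); any other named
    # fields must come from the route's regex groups via wbrequest.matchdict
    'upstream_url': 'http://localhost:8080/{coll}/resource?url={url}&closest={closest}',
    'framed_replay': True,
}
handler = PlatformHandler(config)  # routed via PlatformRoute to populate matchdict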
@ -1,32 +0,0 @@
### pywb.warc

This is the WARC/ARC record loading component of the pywb wayback tool suite.
The package provides the following facilities:

* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers

* Resolve 'revisit' records from a provided index to find a full record with headers and payload content

* Load WARC/ARC records either locally or via http using http 1.1 range requests

A construction sketch follows the Tests section below.

When loading archived content, the format type (WARC vs ARC) is detected and
compressed ARCs/WARCs are decompressed automatically.
No assumption is made about format based on filename, content type
or other external parameters other than the content itself.

### Tests

This package includes a test suite for loading a variety of WARC and ARC records.

Tests so far:

* Compressed WARC, ARC Records
* Uncompressed ARC Records
* Compressed WARC created by wget 1.14
* Same Url revisit record resolving


TODO:

* Different url revisit record resolving
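For reference, the loading pipeline described above is assembled elsewhere in this commit (see WBHandler._init_replay_view below); in isolation it looks roughly like this, with an illustrative archive path:

from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper

paths = ['/archives/warcs/']  # illustrative archive_paths value
record_loader = BlockArcWarcRecordLoader()
resolving_loader = ResolvingLoader(PathResolverMapper()(paths),
                                   record_loader=record_loader)
# the resolving loader maps relative filenames to full paths and resolves
# 'revisit' records for each cdx line before handing off to a replay view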
@ -1,62 +0,0 @@
from pywb.cdx.cdxserver import create_cdx_server

from pywb.utils.wbexception import NotFoundException
from pywb.framework.basehandlers import BaseHandler
from pywb.framework.wbrequestresponse import WbResponse

from pywb.webapp.query_handler import QueryHandler

from six.moves.urllib.parse import parse_qs
import json
import six


#=================================================================
class CDXAPIHandler(BaseHandler):
    """
    Handler which passes wsgi request to cdx server and
    returns a text-based cdx api
    """
    def __init__(self, index_handler):
        self.index_handler = index_handler

    def __call__(self, wbrequest):
        params = self.extract_params_from_wsgi_env(wbrequest.env)

        try:
            cdx_iter = self.index_handler.load_cdx(wbrequest, params)
        except NotFoundException:
            msg = 'No Captures found for: ' + params.get('url')
            if params.get('output') == 'json':
                msg = json.dumps(dict(error=msg))
                content_type = 'application/json'
            else:
                content_type = 'text/plain'

            return WbResponse.text_response(msg, content_type=content_type,
                                            status='404 Not Found')

        return WbResponse.text_stream(cdx_iter,
                                      content_type='text/plain')

    @staticmethod
    def extract_params_from_wsgi_env(env):
        """ utility function to extract params and create a CDXQuery
        from a WSGI environment dictionary
        """
        params = parse_qs(env['QUERY_STRING'])

        # parse_qs produces arrays for single values
        # cdx processing expects singleton params for all params,
        # except filters, so convert here
        # use first value of the list
        for name, val in six.iteritems(params):
            if name != 'filter':
                params[name] = val[0]

        if 'output' not in params:
            params['output'] = 'text'
        elif params['output'] not in ('text', 'json'):
            params['output'] = 'text'

        return params
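A quick illustration of the flattening described in the comments above (the query values are made up; the behavior is plain parse_qs plus the loop):

env = {'QUERY_STRING': 'url=example.com&filter=status:200&filter=mime:text/html'}
params = CDXAPIHandler.extract_params_from_wsgi_env(env)

assert params['url'] == 'example.com'                        # flattened
assert params['filter'] == ['status:200', 'mime:text/html']  # kept as a list
assert params['output'] == 'text'                            # defaulted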
@ -1,195 +1,14 @@
import pkgutil
import mimetypes
import time
import logging

from datetime import datetime

from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_timestamp

from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import LocalFileLoader

from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse

from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper

from pywb.webapp.views import J2TemplateView, init_view
from pywb.webapp.replay_views import ReplayView
from pywb.framework.memento import MementoResponse


#=================================================================
class SearchPageWbUrlHandler(WbUrlHandler):
    """
    Loads a default search page html template to be shown when
    the wb_url is empty
    """
    def __init__(self, config):
        self.search_view = init_view(config, 'search_html')

        self.is_frame_mode = config.get('framed_replay', False)
        self.frame_mod = 'tf_'
        self.replay_mod = ''

        self.response_class = WbResponse

        if self.is_frame_mode:
            #html = config.get('frame_insert_html', 'templates/frame_insert.html')
            #self.search_view = J2TemplateView(html, config.get('jinja_env'))
            self.frame_insert_view = init_view(config, 'frame_insert_html')
            assert(self.frame_insert_view)

            self.banner_html = config.get('banner_html', 'banner.html')

            if config.get('enable_memento', False):
                self.response_class = MementoResponse

            if self.is_frame_mode == 'inverse':
                self.frame_mod = ''
                self.replay_mod = 'mp_'

        else:
            self.frame_insert_view = None
            self.banner_html = None

    def render_search_page(self, wbrequest, **kwargs):
        return self.search_view.render_response(wbrequest=wbrequest,
                                                prefix=wbrequest.wb_prefix,
                                                **kwargs)

    def __call__(self, wbrequest):
        # root search page
        if wbrequest.wb_url_str == '/':
            return self.render_search_page(wbrequest)

        wbrequest.options['replay_mod'] = self.replay_mod
        wbrequest.options['frame_mod'] = self.frame_mod

        # render top level frame if in frame mode
        # (not supported in proxy mode)
        if (self.is_frame_mode and wbrequest.wb_url and
            not wbrequest.wb_url.is_query() and
            not wbrequest.options['is_proxy']):

            if wbrequest.wb_url.mod == self.frame_mod:
                wbrequest.options['is_top_frame'] = True
                return self.get_top_frame_response(wbrequest)
            else:
                wbrequest.options['is_framed'] = True
                wbrequest.final_mod = self.frame_mod
        else:
            wbrequest.options['is_framed'] = False

        try:
            return self.handle_request(wbrequest)
        except NotFoundException as nfe:
            return self.handle_not_found(wbrequest, nfe)

    def get_top_frame_params(self, wbrequest, mod):
        embed_url = wbrequest.wb_url.to_str(mod=mod)

        if wbrequest.wb_url.timestamp:
            timestamp = wbrequest.wb_url.timestamp
        else:
            timestamp = datetime_to_timestamp(datetime.utcnow())

        params = dict(embed_url=embed_url,
                      wbrequest=wbrequest,
                      timestamp=timestamp,
                      url=wbrequest.wb_url.get_url(),
                      banner_html=self.banner_html)

        return params

    def get_top_frame_response(self, wbrequest):
        params = self.get_top_frame_params(wbrequest, mod=self.replay_mod)

        headers = [('Content-Type', 'text/html')]
        status_headers = StatusAndHeaders('200 OK', headers)

        template_result = self.frame_insert_view.render_to_string(**params)
        body = template_result.encode('utf-8')

        return self.response_class(status_headers, [body], wbrequest=wbrequest)


#=================================================================
# Standard WB Handler
#=================================================================
class WBHandler(SearchPageWbUrlHandler):
    def __init__(self, query_handler, config=None):
        super(WBHandler, self).__init__(config)

        self.index_reader = query_handler
        self.not_found_view = init_view(config, 'not_found_html')

        self.replay = self._init_replay_view(config)

        self.fallback_handler = None
        self.fallback_name = config.get('fallback')

    def _init_replay_view(self, config):
        cookie_maker = config.get('cookie_maker')
        record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker)

        paths = config.get('archive_paths')

        resolving_loader = ResolvingLoader(PathResolverMapper()(paths),
                                           record_loader=record_loader)

        return ReplayView(resolving_loader, config)

    def resolve_refs(self, handler_dict):
        if self.fallback_name:
            self.fallback_handler = handler_dict.get(self.fallback_name)
            logging.debug('Fallback Handler: ' + self.fallback_name)

    def handle_request(self, wbrequest):
        cdx_lines, output = self.index_reader.load_for_request(wbrequest)

        if output != 'text' and wbrequest.wb_url.is_replay():
            return self.handle_replay(wbrequest, cdx_lines)
        else:
            return self.handle_query(wbrequest, cdx_lines, output)

    def handle_query(self, wbrequest, cdx_lines, output):
        return self.index_reader.make_cdx_response(wbrequest,
                                                   cdx_lines,
                                                   output)

    def handle_replay(self, wbrequest, cdx_lines):
        cdx_callback = self.index_reader.cdx_load_callback(wbrequest)

        return self.replay.render_content(wbrequest,
                                          cdx_lines,
                                          cdx_callback)

    def handle_not_found(self, wbrequest, nfe):
        # check fallback: only for replay queries and not for identity
        if (self.fallback_handler and
            not wbrequest.wb_url.is_query() and
            not wbrequest.wb_url.is_identity):
            return self.fallback_handler(wbrequest)

        # if capture query, just return capture page
        if wbrequest.wb_url.is_query():
            output = self.index_reader.get_output_type(wbrequest.wb_url)
            return self.index_reader.make_cdx_response(wbrequest, iter([]), output)
        else:
            return self.not_found_view.render_response(status='404 Not Found',
                                                       wbrequest=wbrequest,
                                                       url=wbrequest.wb_url.url)


#=================================================================
# Static Content Handler
#=================================================================
class StaticHandler(BaseHandler):
class StaticHandler(object):
    def __init__(self, static_path):
        mimetypes.init()

@ -234,15 +53,3 @@ class StaticHandler(BaseHandler):
            wbrequest.wb_url_str)


#=================================================================
# Debug Handlers
#=================================================================
class DebugEchoEnvHandler(BaseHandler):  # pragma: no cover
    def __call__(self, wbrequest):
        return WbResponse.text_response(str(wbrequest.env))


#=================================================================
class DebugEchoHandler(BaseHandler):  # pragma: no cover
    def __call__(self, wbrequest):
        return WbResponse.text_response(str(wbrequest))
@ -1,241 +0,0 @@
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.cache import create_cache

from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl

from pywb.webapp.handlers import StaticHandler, SearchPageWbUrlHandler
from pywb.webapp.views import HeadInsertView

from pywb.utils.wbexception import LiveResourceException

import json
import hashlib


#=================================================================
class RewriteHandler(SearchPageWbUrlHandler):

    LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60'

    YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json'

    def __init__(self, config):
        super(RewriteHandler, self).__init__(config)

        proxyhostport = config.get('proxyhostport')

        live_rewriter_cls = config.get('live_rewriter_cls', LiveRewriter)

        self.live_fetcher = live_rewriter_cls(is_framed_replay=self.is_frame_mode,
                                              proxies=proxyhostport)

        self.recording = self.live_fetcher.is_recording()

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE)

        self.verify = config.get('verify_ssl', True)

        self.ydl = None

        self._cache = None

    def handle_request(self, wbrequest):
        if wbrequest.wb_url.is_query():
            type_ = wbrequest.wb_url.LATEST_REPLAY
            url = wbrequest.urlrewriter.get_new_url(type=type_, timestamp='')
            return WbResponse.redir_response(url)

        if wbrequest.options['is_ajax']:
            wbrequest.urlrewriter.rewrite_opts['is_ajax'] = True

        try:
            return self.render_content(wbrequest)

        except Exception as exc:
            import traceback
            err_details = traceback.format_exc()
            print(err_details)

            url = wbrequest.wb_url.url
            msg = 'Could not load the url from the live web: ' + url
            raise LiveResourceException(msg=msg, url=url)

    def _live_request_headers(self, wbrequest):
        return {}

    def _skip_recording(self, wbrequest):
        return False

    def render_content(self, wbrequest):
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        req_headers = self._live_request_headers(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url

        skip_recording = self._skip_recording(wbrequest)

        use_206 = False
        url = None
        rangeres = None

        readd_range = False
        cache_key = None

        if self.recording and not skip_recording:
            rangeres = wbrequest.extract_range()

            if rangeres:
                url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wbrequest.wb_url.url = url
                    del wbrequest.env['HTTP_RANGE']
                    readd_range = True
                else:
                    # disables proxy
                    skip_recording = True

                # sets cache_key only if not already cached
                cache_key = self._get_cache_key('r:', url)

        result = self.live_fetcher.fetch_request(wbrequest.wb_url.url,
                                                 wbrequest.urlrewriter,
                                                 head_insert_func=head_insert_func,
                                                 req_headers=req_headers,
                                                 env=wbrequest.env,
                                                 skip_recording=skip_recording,
                                                 verify=self.verify)

        wbresponse = self._make_response(wbrequest, *result)

        if readd_range:
            content_length = (wbresponse.status_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                wbresponse.status_headers.add_range(0, content_length,
                                                    content_length)
            except (ValueError, TypeError):
                pass

        if self.recording and cache_key:
            self._add_rec_ping(cache_key, url, wbrequest, wbresponse)

        if rangeres:
            referrer = wbrequest.env.get('REL_REFERER')

            # also ping video info
            if referrer:
                try:
                    resp = self._get_video_info(wbrequest,
                                                info_url=referrer,
                                                video_url=url)
                except:
                    print('Error getting video info')

        return wbresponse

    def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
        # if cookie set, pass recorded timestamp info via cookie
        # so that client side may be able to access it
        # used by framed mode to update frame banner
        if self.live_cookie:
            cdx = wbrequest.env.get('pywb.cdx')
            if cdx:
                value = self.live_cookie.format(cdx['timestamp'])
                status_headers.headers.append(('Set-Cookie', value))

        return WbResponse(status_headers, gen)

    def _get_cache_key(self, prefix, url):
        if not self._cache:
            self._cache = create_cache()

        key = self.create_cache_key(prefix, url)

        if key in self._cache:
            return None

        return key

    @staticmethod
    def create_cache_key(prefix, url):
        hash_ = hashlib.md5()
        hash_.update(url.encode('utf-8'))
        key = hash_.hexdigest()
        key = prefix + key
        return key

    def _add_rec_ping(self, key, url, wbrequest, wbresponse):
        def do_ping():
            headers = self._live_request_headers(wbrequest)
            headers['Connection'] = 'close'

            try:
                # mark as pinged
                self._cache[key] = '1'

                self.live_fetcher.fetch_async(url, headers)

            except:
                del self._cache[key]
                raise

        def wrap_buff_gen(gen):
            for x in gen:
                yield x

            try:
                do_ping()
            except:
                pass

        #do_ping()
        wbresponse.body = wrap_buff_gen(wbresponse.body)
        return wbresponse

    def _get_video_info(self, wbrequest, info_url=None, video_url=None):
        if not video_url:
            video_url = wbrequest.wb_url.url

        if not info_url:
            info_url = wbrequest.wb_url.url

        cache_key = None
        if self.recording:
            cache_key = self._get_cache_key('v:', video_url)

        info = self.live_fetcher.get_video_info(video_url)
        if info is None:  #pragma: no cover
            msg = ('youtube-dl is not installed, pip install youtube-dl to ' +
                   'enable improved video proxy')

            return WbResponse.text_response(text=msg, status='404 Not Found')

        #if info and info.formats and len(info.formats) == 1:

        content_type = self.YT_DL_TYPE
        metadata = json.dumps(info)

        if (self.recording and cache_key):
            headers = self._live_request_headers(wbrequest)
            headers['Content-Type'] = content_type

            if info_url.startswith('https://'):
                info_url = info_url.replace('https', 'http', 1)

            response = self.live_fetcher.add_metadata(info_url, headers, metadata)

            self._cache[cache_key] = '1'

        return WbResponse.text_response(metadata, content_type=content_type)
|
@ -1,387 +0,0 @@
from pywb.utils.loaders import load_yaml_config

from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.proxy import ProxyArchivalRouter
from pywb.framework.wbrequestresponse import WbRequest
from pywb.framework.memento import MementoRequest
from pywb.framework.basehandlers import BaseHandler

from pywb.webapp.views import J2TemplateView
from pywb.webapp.views import J2HtmlCapturesView, init_view

from pywb.webapp.live_rewrite_handler import RewriteHandler

from pywb.webapp.query_handler import QueryHandler
from pywb.webapp.handlers import WBHandler
from pywb.webapp.handlers import StaticHandler
from pywb.webapp.handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.webapp.cdx_api_handler import CDXAPIHandler

from pywb import DEFAULT_CONFIG

import os
import logging
import six


#=================================================================
class DictChain(object):
    def __init__(self, *dicts):
        self.dicts = dicts

    def get(self, key, default_val=None):
        for d in self.dicts:
            val = d.get(key)
            if val is not None:
                return val
        return default_val

    def __contains__(self, key):
        return self.get(key) is not None

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.dicts[0][key] = value

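# Illustrative sketch (not part of the original module): DictChain returns the
# first non-None value for a key, searching the dicts in the order given;
# writes always go to the first dict:
#
# >>> chained = DictChain({'port': 8080}, {'port': 9090, 'debug': True})
# >>> chained.get('port')
# 8080
# >>> chained['debug']
# True
# >>> chained['port'] = 8888
# >>> chained.dicts[0]['port']
# 8888
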
#=================================================================
def create_wb_handler(query_handler, config):
    wb_handler_class = config.get('wb_handler_class', WBHandler)

    wb_handler = wb_handler_class(
        query_handler,
        config=config,
    )

    return wb_handler


#=================================================================
def create_live_handler(config):
    wb_handler_class = config.get('wb_handler_class', RewriteHandler)

    live_handler = wb_handler_class(config)

    return live_handler


#=================================================================
def init_route_config(value, config):
    if isinstance(value, (str, list)):
        value = dict(index_paths=value)

    route_config = DictChain(value, config)
    return route_config


#=================================================================
def init_collection(route_config):
    ds_rules_file = route_config.get('domain_specific_rules', None)

    html_view = init_view(route_config, 'query_html', J2HtmlCapturesView)

    server_cls = route_config.get('server_cls')

    query_handler = QueryHandler.init_from_config(route_config,
                                                  ds_rules_file,
                                                  html_view,
                                                  server_cls)

    return query_handler


#=================================================================
def add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler,
                        route_class=Route):
    # if bool, use the default '-cdx' suffix, else use the custom string
    # as the suffix
    if isinstance(cdx_api_suffix, bool):
        name += '-cdx'
    else:
        name += str(cdx_api_suffix)

    logging.debug('Adding CDX API Handler: ' + name)
    routes.append(route_class(name, CDXAPIHandler(query_handler)))

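# Illustrative sketch (not part of the original module): a bool True suffix
# yields the default '-cdx' access point, while a string suffix is appended
# as-is, so a collection named 'pywb' is exposed at 'pywb-cdx' or, e.g., with
# enable_cdx_api: '-index', at 'pywb-index'.
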
#=================================================================
def create_cdx_server_app(passed_config):
    """
    Create a cdx server api-only app
    For each collection, create a /<coll>-cdx access point
    which follows the cdx api
    """

    defaults = load_yaml_config(DEFAULT_CONFIG)

    config = DictChain(passed_config, defaults)

    collections = config.get('collections', {})

    static_routes = {}

    # collections based on file system
    if config.get('enable_auto_colls', True):
        colls_loader_cls = config.get('colls_loader_cls', DirectoryCollsLoader)
        dir_loader = colls_loader_cls(config, static_routes, collections)
        dir_loader()
        #collections.update(dir_loader())

    routes = []

    for name, value in six.iteritems(collections):
        route_config = init_route_config(value, config)
        query_handler = init_collection(route_config)

        cdx_api_suffix = route_config.get('enable_cdx_api', True)

        add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler)

    return ArchivalRouter(routes)

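# Illustrative usage sketch (not part of the original module; assumes the
# wsgi_wrappers helper from this era of pywb): the app is typically wrapped
# into a WSGI application, e.g.
#
#   from pywb.framework.wsgi_wrappers import init_app
#   application = init_app(create_cdx_server_app, load_yaml=True)
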
#=================================================================
class DirectoryCollsLoader(object):
    def __init__(self, config, static_routes, colls):
        self.config = config
        self.static_routes = static_routes
        self.colls = colls

    def __call__(self):
        colls = self.colls

        static_dir = self.config.get('paths')['static_path']
        static_shared_prefix = self.config.get('static_shared_prefix')

        if static_dir and static_shared_prefix and os.path.isdir(static_dir):
            static_dir = os.path.abspath(static_dir) + os.path.sep
            self.static_routes[static_shared_prefix] = static_dir

        root_dir = self.config.get('collections_root', '')
        if not root_dir or not os.path.isdir(root_dir):
            return colls

        for name in os.listdir(root_dir):
            full = os.path.join(root_dir, name)
            if not os.path.isdir(full):
                continue

            coll_config = self.load_coll_dir(full, name)
            if coll_config:
                # if already exists, override existing config with coll specific
                if name in colls:
                    colls[name].update(coll_config)
                else:
                    colls[name] = coll_config

        return colls

    def _norm_path(self, root_dir, path):
        result = os.path.normpath(os.path.join(root_dir, path))
        return result

    def _add_dir_if_exists(self, coll, root_dir, dir_key, required=False):
        curr_val = coll.get(dir_key)
        if curr_val:
            # add collection path only if relative path, and not a url
            if '://' not in curr_val and not os.path.isabs(curr_val):
                coll[dir_key] = self._norm_path(root_dir, curr_val) + os.path.sep
            return False

        thedir = self.config.get('paths')[dir_key]

        fulldir = os.path.join(root_dir, thedir)

        if os.path.isdir(fulldir):
            fulldir = os.path.abspath(fulldir) + os.path.sep
            coll[dir_key] = fulldir
            return True
        elif required:
            msg = 'Dir "{0}" does not exist for "{1}"'.format(fulldir, dir_key)
            raise Exception(msg)
        else:
            return False

    def load_yaml_file(self, root_dir, filename):
        filename = os.path.join(root_dir, filename)
        if os.path.isfile(filename):
            return load_yaml_config(filename)
        else:
            return {}

    def load_coll_dir(self, root_dir, name):
        # Load config.yaml
        coll_config = self.load_yaml_file(root_dir, 'config.yaml')

        # Load metadata.yaml
        metadata = self.load_yaml_file(root_dir, 'metadata.yaml')
        coll_config['metadata'] = metadata

        self._add_dir_if_exists(coll_config, root_dir, 'index_paths', True)

        # inherit these properties from base, in case archive_paths is shared
        shared_config = DictChain(coll_config, self.config)
        self._add_dir_if_exists(shared_config, root_dir, 'archive_paths', True)

        if self._add_dir_if_exists(coll_config, root_dir, 'static_path', False):
            self.static_routes['static/' + name] = coll_config['static_path']

        # Custom templates dir
        templates_dir = self.config.get('paths').get('templates_dir')
        if templates_dir:
            template_dir = os.path.join(root_dir, templates_dir)

        # Check all templates
        template_files = self.config.get('paths')['template_files']
        for tname, tfile in six.iteritems(template_files):
            if tname in coll_config:
                # Already set
                coll_config[tname] = self._norm_path(root_dir, coll_config[tname])

            # If templates override dir
            elif templates_dir:
                full = os.path.join(template_dir, tfile)
                if os.path.isfile(full):
                    coll_config[tname] = full

        return coll_config

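# Illustrative sketch (not part of the original module; directory names below
# are placeholders following the defaults in DEFAULT_CONFIG): the layout this
# loader expects under collections_root is roughly
#
#   collections/
#       my-coll/
#           config.yaml      (optional per-collection config)
#           metadata.yaml    (optional metadata)
#           indexes/         (index_paths)
#           archive/         (archive_paths)
#           static/          (optional static_path)
#           templates/       (optional templates_dir overrides)
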
#=================================================================
def create_wb_router(passed_config=None):
    passed_config = passed_config or {}

    defaults = load_yaml_config(DEFAULT_CONFIG)

    config = DictChain(passed_config, defaults)

    routes = []

    port = config.get('port')

    collections = config.get('collections', {})

    static_routes = config.get('static_routes', {})

    root_route = None

    # collections based on file system
    if config.get('enable_auto_colls', True):
        colls_loader_cls = config.get('colls_loader_cls', DirectoryCollsLoader)
        dir_loader = colls_loader_cls(config, static_routes, collections)
        dir_loader()
        #collections.update(dir_loader())

    if config.get('enable_memento', False):
        request_class = MementoRequest
    else:
        request_class = WbRequest

    # store live and replay handlers
    handler_dict = {}

    # setup template globals
    templates_dirs = config['templates_dirs']
    jinja_env = J2TemplateView.init_shared_env(paths=templates_dirs,
                                               packages=config['template_packages'])

    jinja_env.globals.update(config.get('template_globals', {}))

    for static_name, static_path in six.iteritems(static_routes):
        routes.append(Route(static_name, StaticHandler(static_path)))

    for name, value in six.iteritems(collections):
        if isinstance(value, BaseHandler):
            handler_dict[name] = value
            new_route = Route(name, value, config=config)
            if name != '':
                routes.append(new_route)
            else:
                root_route = new_route
            continue

        route_config = init_route_config(value, config)
        route_class = route_config.get('route_class', Route)

        if route_config.get('index_paths') == '$liveweb':
            live = create_live_handler(route_config)
            handler_dict[name] = live
            new_route = route_class(name, live, config=route_config)
            if name != '':
                routes.append(new_route)
            else:
                root_route = new_route
            continue

        query_handler = init_collection(route_config)

        wb_handler = create_wb_handler(
            query_handler=query_handler,
            config=route_config,
        )

        handler_dict[name] = wb_handler

        logging.debug('Adding Collection: ' + name)

        new_route = route_class(name, wb_handler,
                                config=route_config,
                                request_class=request_class)

        if name != '':
            routes.append(new_route)
        else:
            root_route = new_route

        # cdx query handler
        cdx_api_suffix = route_config.get('enable_cdx_api', False)

        if cdx_api_suffix:
            add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler,
                                route_class=route_class)

    if config.get('debug_echo_env', False):
        routes.append(Route('echo_env', DebugEchoEnvHandler()))

    if config.get('debug_echo_req', False):
        routes.append(Route('echo_req', DebugEchoHandler()))

    if root_route:
        routes.append(root_route)

    # resolve any cross handler references
    for route in routes:
        if hasattr(route.handler, 'resolve_refs'):
            route.handler.resolve_refs(handler_dict)

    # default to regular archival mode
    router = ArchivalRouter

    if config.get('enable_http_proxy', False):
        router = ProxyArchivalRouter

        view = init_view(config, 'proxy_select_html')

        if 'proxy_options' not in passed_config:
            passed_config['proxy_options'] = {}

        if view:
            passed_config['proxy_options']['proxy_select_view'] = view

        view = init_view(config, 'proxy_cert_download_html')

        if view:
            passed_config['proxy_options']['proxy_cert_download_view'] = view

    # Finally, create the wb router
    return router(
        routes,
        port=port,
        abs_path=config.get('absolute_paths', True),
        home_view=init_view(config, 'home_html'),
        error_view=init_view(config, 'error_html'),
        info_view=init_view(config, 'info_json'),
        config=config
    )
@ -1,172 +0,0 @@
from pywb.utils.dsrules import DEFAULT_RULES_FILE

from pywb.perms.perms_filter import make_perms_cdx_filter
from pywb.framework.wbrequestresponse import WbResponse
from pywb.cdx.cdxserver import create_cdx_server
from pywb.webapp.views import MementoTimemapView


#=================================================================
class QueryHandler(object):
    """
    Main interface for querying the index (currently only CDX) from a
    source server (currently a cdx server)

    Creates an appropriate query based on wbrequest type info and
    returns a view of the cdx results: either a raw cdx iter, an html view,
    etc...
    """

    def __init__(self, cdx_server, html_query_view=None, perms_policy=None):
        self.cdx_server = cdx_server
        self.perms_policy = perms_policy

        self.views = {}
        if html_query_view:
            self.views['html'] = html_query_view

        self.views['timemap'] = MementoTimemapView()

    @staticmethod
    def init_from_config(config,
                         ds_rules_file=DEFAULT_RULES_FILE,
                         html_view=None,
                         server_cls=None):

        perms_policy = None

        if hasattr(config, 'get'):
            perms_policy = config.get('perms_policy')
            server_cls = config.get('server_cls', server_cls)

        cdx_server = create_cdx_server(config, ds_rules_file, server_cls)

        return QueryHandler(cdx_server, html_view, perms_policy)

    def get_output_type(self, wb_url):
        # cdx server only supports text and cdxobject for now
        if wb_url.mod == 'cdx_':
            output = 'text'
        elif wb_url.mod == 'timemap':
            output = 'timemap'
        elif wb_url.is_query():
            output = 'html'
        else:
            output = 'cdxobject'

        return output

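    # Illustrative summary (not part of the original module) of the
    # modifier-to-output mapping above:
    #
    #   'cdx_' modifier    -> 'text'      (raw cdx lines)
    #   'timemap' modifier -> 'timemap'   (Memento TimeMap)
    #   other query urls   -> 'html'      (rendered captures page)
    #   replay urls        -> 'cdxobject' (CDXObject iter for replay)
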
    def load_for_request(self, wbrequest):
        wbrequest.normalize_post_query()

        wb_url = wbrequest.wb_url
        output = self.get_output_type(wb_url)

        # init standard params
        params = self.get_query_params(wb_url)

        params['allowFuzzy'] = True
        params['url'] = wb_url.url
        params['output'] = output

        params['filter'].append('!mimetype:-')

        # get metadata
        if wb_url.mod == 'vi_':
            # matching metadata explicitly with special scheme
            schema, rest = wb_url.url.split('://', 1)
            params['url'] = 'metadata://' + rest
            params['filter'].append('~original:metadata://')

        cdx_iter = self.load_cdx(wbrequest, params)
        return cdx_iter, output

    def load_cdx(self, wbrequest, params):
        if wbrequest:
            # add any custom filter from the request
            if wbrequest.query_filter:
                filters = params.get('filter')
                if filters:
                    filters.extend(wbrequest.query_filter)
                else:
                    params['filter'] = wbrequest.query_filter

            params['coll'] = wbrequest.coll
            if wbrequest.custom_params:
                params.update(wbrequest.custom_params)

        if self.perms_policy:
            perms_op = make_perms_cdx_filter(self.perms_policy, wbrequest)
            if perms_op:
                params['custom_ops'] = [perms_op]

        cdx_iter = self.cdx_server.load_cdx(**params)
        return cdx_iter

    def make_cdx_response(self, wbrequest, cdx_iter, output, **kwargs):
        # if not text, the iterator is assumed to be CDXObjects
        if output and output != 'text':
            view = self.views.get(output)
            if view:
                return view.render_response(wbrequest, cdx_iter, **kwargs)

        return WbResponse.text_stream(cdx_iter)

    def cdx_load_callback(self, wbrequest):
        def load_cdx(params):
            params['output'] = 'cdxobject'
            return self.load_cdx(wbrequest, params)

        return load_cdx

    def get_query_params(self,
                         wburl, limit=150000,
                         collapse_time=None,
                         replay_closest=100):

        #if wburl.type == wburl.URL_QUERY:
        #    raise NotImplementedError('Url Query Not Yet Supported')

        return {
            wburl.QUERY:
                {'collapseTime': collapse_time,
                 'filter': ['!statuscode:(500|502|504)'],
                 'from': wburl.timestamp,
                 'to': wburl.end_timestamp,
                 'limit': limit,
                 'matchType': 'exact',
                },

            wburl.URL_QUERY:
                {'collapse': 'urlkey',
                 'matchType': 'prefix',
                 'showGroupCount': True,
                 'showUniqCount': True,
                 'lastSkipTimestamp': True,
                 'limit': limit,
                 'fl': ('urlkey,original,timestamp,' +
                        'endtimestamp,groupcount,uniqcount'),
                 'filter': [],
                },

            wburl.REPLAY:
                {'sort': 'closest',
                 'filter': ['!statuscode:(500|502|504)'],
                 'limit': replay_closest,
                 'closest': wburl.timestamp,
                 'resolveRevisits': True,
                 'matchType': 'exact',
                },

            wburl.LATEST_REPLAY:
                {'sort': 'reverse',
                 # Not appropriate as a default:
                 # should be an option to configure status code filtering in general
                 # 'filter': ['statuscode:[23]..|-'],
                 'filter': [],
                 'limit': '1',
                 'resolveRevisits': True,
                 'matchType': 'exact',
                }

        }[wburl.type]
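    # Illustrative sketch (not part of the original module): for a replay url,
    # the params returned above select captures closest to the timestamp, e.g.
    #
    #   params = self.get_query_params(wburl)  # with wburl.type == wburl.REPLAY
    #   # -> {'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'],
    #   #     'limit': 100, 'closest': wburl.timestamp,
    #   #     'resolveRevisits': True, 'matchType': 'exact'}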
@ -1,92 +0,0 @@
from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader

from pywb.framework.cache import create_cache

from tempfile import NamedTemporaryFile, mkdtemp

import yaml
import os
from shutil import rmtree

import atexit


#=================================================================
class RangeCache(object):
    def __init__(self):
        self.cache = create_cache()
        self.temp_dir = None
        atexit.register(self.cleanup)

    def cleanup(self):
        if self.temp_dir:  # pragma: no cover
            print('Removing: ' + self.temp_dir)
            rmtree(self.temp_dir, True)
            self.temp_dir = None

    def handle_range(self, wbrequest, key, wbresponse_func,
                     url, start, end, use_206):
        # key must be set
        assert key
        if key not in self.cache:
            wbrequest.custom_params['noredir'] = True
            response = wbresponse_func()

            # only cache 200 responses
            if not response.status_headers.get_statuscode().startswith('200'):
                return response.status_headers, response.body

            if not self.temp_dir:
                self.temp_dir = mkdtemp(prefix='_pywbcache')
            else:
                pass
                #self._check_dir_size(self.temp_dir)

            with NamedTemporaryFile(delete=False, dir=self.temp_dir) as fh:
                for obj in response.body:
                    fh.write(obj)

            # header tuples round-trip through yaml as lists and are
            # converted back to tuples below
            spec = dict(name=fh.name,
                        headers=response.status_headers.headers)

            self.cache[key] = yaml.safe_dump(spec)
        else:
            spec = yaml.safe_load(self.cache[key])

        spec['headers'] = [tuple(x) for x in spec['headers']]

        filelen = os.path.getsize(spec['name'])

        maxlen = filelen - start

        if end:
            maxlen = min(maxlen, end - start + 1)

        def read_range():
            with open(spec['name'], 'rb') as fh:
                fh.seek(start)
                fh = LimitReader.wrap_stream(fh, maxlen)
                while True:
                    buf = fh.read()
                    if not buf:
                        break

                    yield buf

        status_headers = StatusAndHeaders('200 OK', spec['headers'])

        if use_206:
            status_headers.add_range(start, maxlen, filelen)

        status_headers.replace_header('Content-Length', str(maxlen))

        return status_headers, read_range()

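    # Illustrative sketch (not part of the original module) of the range math
    # above: for a 1000 byte cached file and a request for bytes 100-299,
    #
    #   start, end, filelen = 100, 299, 1000
    #   maxlen = min(filelen - start, end - start + 1)  # == 200 bytes served
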
#=================================================================
range_cache = RangeCache()
@ -1,392 +0,0 @@
import re
import logging

from io import BytesIO
from six.moves.urllib.parse import urlsplit
from itertools import chain

from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader
from warcio.timeutils import timestamp_now
from warcio.recordloader import ArchiveLoadFailed

from pywb.utils.wbexception import WbException, NotFoundException

from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse

from pywb.rewrite.rewrite_content import RewriteContent

from pywb.webapp.views import HeadInsertView

from pywb.webapp.rangecache import range_cache


#=================================================================
class CaptureException(WbException):
    """
    Raised to indicate an issue with a specific capture;
    will be caught and result in a retry, if possible,
    and in a 502 response if not.
    """
    def status(self):
        return '502 Bad Gateway'


#=================================================================
class ReplayView(object):
    STRIP_SCHEME_WWW = re.compile(r'^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$', re.MULTILINE)

    def __init__(self, content_loader, config):
        self.content_loader = content_loader

        framed = config.get('framed_replay')
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.buffer_response = config.get('buffer_response', True)
        self.buffer_max_size = config.get('buffer_max_size', 16384)

        self.redir_to_exact = config.get('redir_to_exact', True)

        memento = config.get('enable_memento', False)
        if memento:
            self.response_class = MementoResponse
        else:
            self.response_class = WbResponse

        self.enable_range_cache = config.get('enable_ranges', True)

        self._reporter = config.get('reporter')

    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        last_e = None
        first = True

        #cdx_lines = args[0]
        #cdx_loader = args[1]

        # List of already failed w/arcs
        failed_files = []

        response = None

        # Iterate over the cdx until one is found that works.
        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    first = False

                response = self.cached_replay_capture(wbrequest,
                                                      cdx,
                                                      cdx_loader,
                                                      failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                #import traceback
                #traceback.print_exc()
                logging.debug(ce)
                last_e = ce

            if response:
                return response

        if not last_e:
            # can only get here if cdx_lines is empty somehow;
            # should be filtered out beforehand, but if not
            msg = 'No Captures found for: ' + wbrequest.wb_url.url
            last_e = NotFoundException(msg)

        raise last_e

    def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        def get_capture():
            return self.replay_capture(wbrequest,
                                       cdx,
                                       cdx_loader,
                                       failed_files)

        if not self.enable_range_cache:
            return get_capture()

        range_info = wbrequest.extract_range()

        if not range_info:
            return get_capture()

        range_status, range_iter = (range_cache.
                                    handle_range(wbrequest,
                                                 cdx.get('digest', cdx['urlkey']),
                                                 get_capture,
                                                 *range_info))

        response = self.response_class(range_status,
                                       range_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)
        return response

    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        (status_headers, stream) = (self.content_loader(cdx,
                                                        failed_files,
                                                        cdx_loader,
                                                        wbrequest))

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        #length = status_headers.get_header('content-length')
        #stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using the url rewriter, use the original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx['url']

        if wbrequest.options['is_ajax']:
            wbrequest.urlrewriter.rewrite_opts['is_ajax'] = True

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wbrequest))

        result = (self.content_rewriter.
                  rewrite_content(urlrewriter,
                                  status_headers=status_headers,
                                  stream=stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=cdx['urlkey'],
                                  cdx=cdx,
                                  env=wbrequest.env))

        (status_headers, response_iter, is_rewritten) = result

        # buffer the response if buffering is enabled
        if self.buffer_response:
            content_len = status_headers.get_header('content-length')
            try:
                content_len = int(content_len)
            except (ValueError, TypeError):
                content_len = 0

            if content_len <= 0:
                max_size = self.buffer_max_size
                response_iter = self.buffered_response(status_headers,
                                                       response_iter,
                                                       max_size)

        # Set Content-Location if not an exact capture
        if not self.redir_to_exact:
            mod = wbrequest.options.get('replay_mod', wbrequest.wb_url.mod)
            canon_url = (wbrequest.urlrewriter.
                         get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'],
                                     mod=mod))

            status_headers.headers.append(('Content-Location', canon_url))

        if wbrequest.wb_url.mod == 'vi_':
            status_headers.headers.append(('access-control-allow-origin', '*'))

        response = self.response_class(status_headers,
                                       response_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify the reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response

    # Buffer the rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator, max_size):
        out = BytesIO()
        size = 0
        read_all = True

        try:
            for buff in iterator:
                buff = bytes(buff)
                size += len(buff)
                out.write(buff)
                if max_size > 0 and size > max_size:
                    read_all = False
                    break

        finally:
            content = out.getvalue()
            out.close()

        if read_all:
            content_length_str = str(len(content))

            # set the exact content length
            status_headers.replace_header('Content-Length',
                                          content_length_str)
            return [content]
        else:
            status_headers.remove_header('Content-Length')
            return chain(iter([content]), iterator)

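    # Illustrative sketch (not part of the original module): a body smaller
    # than max_size is fully buffered and Content-Length set exactly, while a
    # larger body streams on unbuffered after the first max_size bytes:
    #
    #   body = self.buffered_response(status_headers, iter([b'a' * 100]), 16384)
    #   # -> [b'aaa...'], with Content-Length replaced by '100'
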
    def _redirect_if_needed(self, wbrequest, cdx):
        if not self.redir_to_exact:
            return None

        if wbrequest.options['is_proxy']:
            return None

        if wbrequest.custom_params.get('noredir'):
            return None

        is_timegate = wbrequest.options.get('is_timegate', False)
        if not is_timegate:
            is_timegate = wbrequest.wb_url.is_latest_replay()

        redir_needed = is_timegate or (cdx['timestamp'] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        if self.enable_range_cache and wbrequest.extract_range():
            return None

        #if is_timegate:
        #    timestamp = timestamp_now()
        #else:
        timestamp = cdx['timestamp']

        new_url = (wbrequest.urlrewriter.
                   get_new_url(timestamp=timestamp,
                               url=cdx['url']))

        if wbrequest.method == 'POST':
            # FF shows a confirm dialog, so can't use 307 effectively
            # was: statusline = '307 Same-Method Internal Redirect'
            return None
        elif is_timegate:
            statusline = '302 Found'
        else:
            # clear cdx line to indicate internal redirect
            statusline = '302 Internal Redirect'
            cdx = None

        status_headers = StatusAndHeaders(statusline,
                                          [('Location', new_url)])

        return self.response_class(status_headers,
                                   wbrequest=wbrequest,
                                   cdx=cdx,
                                   memento_is_redir=True)

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if the response is a 3xx redirect to the same url.
        If so, reject this capture to avoid causing a redirect loop.
        """
        if not status_headers.statusline.startswith('3'):
            return

        # skip all 304s
        if (status_headers.statusline.startswith('304') and
            not wbrequest.wb_url.is_identity):

            raise CaptureException('Skipping 304 Modified: ' + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header('Location')
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('/'):
            host = urlsplit(cdx['url']).netloc
            location_url = host + location_url

        if (ReplayView.strip_scheme_www(request_url) ==
            ReplayView.strip_scheme_www(location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))

    # TODO: reevaluate this, as it may reject valid refreshes of a page
    def _reject_referrer_self_redirect(self, wbrequest):  # pragma: no cover
        """
        Perform a final check for a referrer-based self-redirect.
        This method should be called after verifying that
        the request timestamp == capture timestamp.

        If the referrer is the same as the current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return

        # build the full url even if using relative-rewriting
        request_url = (wbrequest.host_prefix +
                       wbrequest.rel_prefix + str(wbrequest.wb_url))

        if (ReplayView.strip_scheme_www(request_url) ==
            ReplayView.strip_scheme_www(wbrequest.referrer)):
            raise CaptureException('Self Redirect via Referrer: ' +
                                   str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme_www(url):
        """
        >>> ReplayView.strip_scheme_www('https://example.com') ==\
                ReplayView.strip_scheme_www('http://example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
                ReplayView.strip_scheme_www('http:/example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
                ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
                ReplayView.strip_scheme_www('http://www2.example.com')
        True

        >>> ReplayView.strip_scheme_www('about://example.com') ==\
                ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('http://') ==\
                ReplayView.strip_scheme_www('')
        True

        >>> ReplayView.strip_scheme_www('#!@?') ==\
                ReplayView.strip_scheme_www('#!@?')
        True
        """
        m = ReplayView.STRIP_SCHEME_WWW.match(url)
        match = m.group(2)
        return match


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@ -1,20 +0,0 @@
"""
>>> format_ts('20141226101000')
'Fri, Dec 26 2014 10:10:00'

>>> format_ts('20141226101000', '%s')
1419588600

>>> is_wb_handler(DebugEchoHandler())
False


"""

from pywb.webapp.views import format_ts, is_wb_handler
from pywb.webapp.handlers import DebugEchoHandler


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@ -1,222 +0,0 @@
from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT

from six.moves.urllib.parse import urlsplit

import logging
import json
import os

from jinja2 import Environment
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader


FILTERS = {}


#=================================================================
class template_filter(object):
    """
    Decorator for registering a function as a jinja2 filter
    If an optional argument is supplied, it is used as the filter name
    Otherwise, the func name is the filter name
    """
    def __init__(self, param=None):
        self.name = param

    def __call__(self, func):
        name = self.name
        if not name:
            name = func.__name__

        FILTERS[name] = func
        return func

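# Illustrative usage sketch (not part of the original module): registering a
# custom filter under an explicit name, then using it from a template:
#
#   @template_filter('shouty')
#   def make_shouty(value):
#       return value.upper()
#
# FILTERS now maps 'shouty' to the function, and any env built by
# J2TemplateView.init_shared_env() picks it up: {{ cdx.urlkey | shouty }}
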
#=================================================================
# Filters
@template_filter()
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
    if format_ == '%s':
        return timestamp_to_sec(value)
    else:
        value = timestamp_to_datetime(value)
        return value.strftime(format_)


@template_filter('urlsplit')
def get_urlsplit(url):
    split = urlsplit(url)
    return split


@template_filter()
def is_wb_handler(obj):
    if not hasattr(obj, 'handler'):
        return False

    return obj.handler.__class__.__name__ == "WBHandler"


@template_filter()
def tojson(obj):
    return json.dumps(obj)


#=================================================================
class FileOnlyPackageLoader(PackageLoader):
    def get_source(self, env, template):
        dir_, file_ = os.path.split(template)
        return super(FileOnlyPackageLoader, self).get_source(env, file_)


#=================================================================
class RelEnvironment(Environment):
    """Override join_path() to enable relative template paths."""
    def join_path(self, template, parent):
        return os.path.join(os.path.dirname(parent), template)

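# Illustrative sketch (not part of the original module): with join_path()
# overridden as above, a template at 'templates/search.html' containing
# {% extends "base.html" %} resolves to 'templates/base.html' instead of a
# root-level 'base.html'.
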
#=================================================================
class J2TemplateView(object):
    shared_jinja_env = None

    def __init__(self, filename):
        self.template_file = filename
        self.jinja_env = self.init_shared_env()

    @staticmethod
    def init_shared_env(paths=['templates', '.', '/'],
                        packages=['pywb'],
                        overlay_env=None):

        if J2TemplateView.shared_jinja_env:
            return J2TemplateView.shared_jinja_env

        loaders = J2TemplateView._add_loaders(paths, packages)
        loader = ChoiceLoader(loaders)

        if overlay_env:
            jinja_env = overlay_env.overlay(loader=loader, trim_blocks=True)
        else:
            jinja_env = RelEnvironment(loader=loader, trim_blocks=True)

        jinja_env.filters.update(FILTERS)
        J2TemplateView.shared_jinja_env = jinja_env
        return jinja_env

    @staticmethod
    def _add_loaders(paths, packages):
        loaders = []
        # add loaders for paths
        for path in paths:
            loaders.append(FileSystemLoader(path))

        # add loaders for all specified packages
        for package in packages:
            loaders.append(FileOnlyPackageLoader(package))

        return loaders

    def render_to_string(self, **kwargs):
        template = self.jinja_env.get_template(self.template_file)

        wbrequest = kwargs.get('wbrequest')
        if wbrequest:
            params = wbrequest.env.get('pywb.template_params')
            if params:
                kwargs.update(params)

        template_result = template.render(**kwargs)

        return template_result

    def render_response(self, **kwargs):
        template_result = self.render_to_string(**kwargs)
        status = kwargs.get('status', '200 OK')
        content_type = kwargs.get('content_type', 'text/html; charset=utf-8')
        return WbResponse.text_response(template_result,
                                        status=status,
                                        content_type=content_type)


#=================================================================
def init_view(config, key, view_class=J2TemplateView):
    filename = config.get(key)
    if not filename:
        return None

    logging.debug('Adding {0}: {1}'.format(key, filename))
    return view_class(filename)


#=================================================================
class HeadInsertView(J2TemplateView):
    def create_insert_func(self, wbrequest,
                           include_ts=True):

        if wbrequest.options['is_ajax']:
            return None

        url = wbrequest.wb_url.get_url()

        top_url = wbrequest.wb_prefix
        top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod)

        include_wombat = not wbrequest.wb_url.is_banner_only

        def make_head_insert(rule, cdx):
            cdx['url'] = url
            return (self.render_to_string(wbrequest=wbrequest,
                                          cdx=cdx,
                                          top_url=top_url,
                                          include_ts=include_ts,
                                          include_wombat=include_wombat,
                                          banner_html=self.banner_html,
                                          rule=rule))
        return make_head_insert

    @staticmethod
    def init_from_config(config):
        view = config.get('head_insert_view')
        if not view:
            html = config.get('head_insert_html', 'templates/head_insert.html')

            if html:
                banner_html = config.get('banner_html', 'banner.html')
                view = HeadInsertView(html)
                logging.debug('Adding HeadInsert: {0}, Banner {1}'.
                              format(html, banner_html))

                view.banner_html = banner_html

        return view


#=================================================================
# query views
#=================================================================
class J2HtmlCapturesView(J2TemplateView):
    def render_response(self, wbrequest, cdx_lines, **kwargs):
        def format_cdx_lines():
            for cdx in cdx_lines:
                cdx['_orig_url'] = cdx['url']
                cdx['url'] = wbrequest.wb_url.get_url(url=cdx['url'])
                yield cdx

        return J2TemplateView.render_response(self,
                                              cdx_lines=list(format_cdx_lines()),
                                              url=wbrequest.wb_url.get_url(),
                                              type=wbrequest.wb_url.type,
                                              prefix=wbrequest.wb_prefix,
                                              **kwargs)


#=================================================================
class MementoTimemapView(object):
    def render_response(self, wbrequest, cdx_lines, **kwargs):
        memento_lines = make_timemap(wbrequest, cdx_lines)

        return WbResponse.text_stream(memento_lines,
                                      content_type=LINK_FORMAT)