diff --git a/pywb/cdx/README.md b/pywb/cdx/README.md deleted file mode 100644 index 87d26116..00000000 --- a/pywb/cdx/README.md +++ /dev/null @@ -1,28 +0,0 @@ -### pywb.cdx package - -This package contains the CDX processing suite of the pywb wayback tool suite. - -The CDX Server loads, filters and transforms cdx from multiple sources in response -to a given query. - -#### Sample App - -A very simple reference WSGI app is included. - -Run: `python -m pywb.cdx.wsgi_cdxserver` to start the app, keyboard interrupt to stop. - -The default [config.yaml](config.yaml) points to the sample data directory -and uses port 8080. - -The domain specific [rules.yaml](rules.yaml) are also loaded. - -#### CDX Server API Reference - -Goal is to provide compatiblity with this feature set and more: -https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server - -TODO - - - - diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py deleted file mode 100644 index ef332ec4..00000000 --- a/pywb/cdx/cdxdomainspecific.py +++ /dev/null @@ -1,185 +0,0 @@ -import yaml -import re -import logging -import pkg_resources - -from six.moves.urllib.parse import urlsplit - -from pywb.utils.dsrules import BaseRule, RuleSet - -from pywb.utils.canonicalize import unsurt, UrlCanonicalizer -from pywb.utils.loaders import to_native_str - - -#================================================================= -def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): - canon = None - fuzzy = None - - # Load Canonicalizer Rules - rules = RuleSet(CDXDomainSpecificRule, 'canonicalize', - ds_rules_file=ds_rules_file) - - if not surt_ordered: - for rule in rules.rules: - rule.unsurt() - - if rules: - canon = CustomUrlCanonicalizer(rules, surt_ordered) - - # Load Fuzzy Lookup Rules - rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup', - ds_rules_file=ds_rules_file) - - if not surt_ordered: - for rule in rules.rules: - rule.unsurt() - - if rules: - fuzzy = FuzzyQuery(rules) - - logging.debug('CustomCanonilizer? ' + str(bool(canon))) - logging.debug('FuzzyMatcher? ' + str(bool(canon))) - return (canon, fuzzy) - - -#================================================================= -class CustomUrlCanonicalizer(UrlCanonicalizer): - def __init__(self, rules, surt_ordered=True): - super(CustomUrlCanonicalizer, self).__init__(surt_ordered) - self.rules = rules - - def __call__(self, url): - urlkey = super(CustomUrlCanonicalizer, self).__call__(url) - - for rule in self.rules.iter_matching(urlkey): - m = rule.regex.match(urlkey) - if not m: - continue - - if rule.replace: - return m.expand(rule.replace) - - return urlkey - - -#================================================================= -class FuzzyQuery(object): - def __init__(self, rules): - self.rules = rules - - def __call__(self, query): - matched_rule = None - - urlkey = to_native_str(query.key, 'utf-8') - url = query.url - filter_ = query.filters - output = query.output - - for rule in self.rules.iter_matching(urlkey): - m = rule.regex.search(urlkey) - if not m: - continue - - matched_rule = rule - - groups = m.groups() - for g in groups: - for f in matched_rule.filter: - filter_.append(f.format(g)) - - break - - if not matched_rule: - return None - - repl = '?' - if matched_rule.replace: - repl = matched_rule.replace - - inx = url.find(repl) - if inx > 0: - url = url[:inx + len(repl)] - - if matched_rule.match_type == 'domain': - host = urlsplit(url).netloc - # remove the subdomain - url = host.split('.', 1)[1] - - params = query.params - params.update({'url': url, - 'matchType': matched_rule.match_type, - 'filter': filter_}) - - if 'reverse' in params: - del params['reverse'] - - if 'closest' in params: - del params['closest'] - - if 'end_key' in params: - del params['end_key'] - - return params - - -#================================================================= -class CDXDomainSpecificRule(BaseRule): - DEFAULT_FILTER = ['~urlkey:{0}'] - DEFAULT_MATCH_TYPE = 'prefix' - - def __init__(self, name, config): - super(CDXDomainSpecificRule, self).__init__(name, config) - - if not isinstance(config, dict): - self.regex = self.make_regex(config) - self.replace = None - self.filter = self.DEFAULT_FILTER - self.match_type = self.DEFAULT_MATCH_TYPE - else: - self.regex = self.make_regex(config.get('match')) - self.replace = config.get('replace') - self.filter = config.get('filter', self.DEFAULT_FILTER) - self.match_type = config.get('type', self.DEFAULT_MATCH_TYPE) - - def unsurt(self): - """ - urlkey is assumed to be in surt format by default - In the case of non-surt format, this method is called - to desurt any urls - """ - self.url_prefix = list(map(unsurt, self.url_prefix)) - if self.regex: - self.regex = re.compile(unsurt(self.regex.pattern)) - - if self.replace: - self.replace = unsurt(self.replace) - - @staticmethod - def make_regex(config): - # just query args - if isinstance(config, list): - string = CDXDomainSpecificRule.make_query_match_regex(config) - - # split out base and args - elif isinstance(config, dict): - string = config.get('regex', '') - string += CDXDomainSpecificRule.make_query_match_regex( - config.get('args', [])) - - # else assume string - else: - string = str(config) - - return re.compile(string) - - @staticmethod - def make_query_match_regex(params_list): - params_list.sort() - - def conv(value): - return '[?&]({0}=[^&]+)'.format(re.escape(value)) - - params_list = list(map(conv, params_list)) - final_str = '.*'.join(params_list) - return final_str diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py deleted file mode 100644 index f0869d0f..00000000 --- a/pywb/cdx/cdxserver.py +++ /dev/null @@ -1,230 +0,0 @@ -from pywb.utils.canonicalize import UrlCanonicalizer -from pywb.utils.wbexception import NotFoundException - -from pywb.cdx.cdxops import cdx_load -from pywb.cdx.cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource -from pywb.cdx.zipnum import ZipNumCluster -from pywb.cdx.cdxobject import CDXObject, CDXException -from pywb.cdx.query import CDXQuery -from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules - -from pywb.utils.loaders import is_http - -from itertools import chain -import logging -import os - - -#================================================================= -class BaseCDXServer(object): - def __init__(self, **kwargs): - ds_rules_file = kwargs.get('ds_rules_file') - surt_ordered = kwargs.get('surt_ordered', True) - - # load from domain-specific rules - if ds_rules_file: - self.url_canon, self.fuzzy_query = ( - load_domain_specific_cdx_rules(ds_rules_file, surt_ordered)) - # or custom passed in canonicalizer - else: - self.url_canon = kwargs.get('url_canon') - self.fuzzy_query = kwargs.get('fuzzy_query') - - # set default canonicalizer if none set thus far - if not self.url_canon: - self.url_canon = UrlCanonicalizer(surt_ordered) - - def _check_cdx_iter(self, cdx_iter, query): - """ Check cdx iter semantics - If `cdx_iter` is empty (no matches), check if fuzzy matching - is allowed, and try it -- otherwise, - throw :exc:`~pywb.utils.wbexception.NotFoundException` - """ - - cdx_iter = self.peek_iter(cdx_iter) - - if cdx_iter: - return cdx_iter - - # check if fuzzy is allowed and ensure that its an - # exact match - if (self.fuzzy_query and - query.allow_fuzzy and - query.is_exact): - - fuzzy_query_params = self.fuzzy_query(query) - if fuzzy_query_params: - return self.load_cdx(**fuzzy_query_params) - - msg = 'No Captures found for: ' + query.url - if not query.is_exact: - msg += ' (' + query.match_type + ' query)' - - raise NotFoundException(msg, url=query.url) - - #def _calc_search_keys(self, query): - # return calc_search_range(url=query.url, - # match_type=query.match_type, - # url_canon=self.url_canon) - - def load_cdx(self, **params): - params['_url_canon'] = self.url_canon - query = CDXQuery(params) - - #key, end_key = self._calc_search_keys(query) - #query.set_key(key, end_key) - - cdx_iter = self._load_cdx_query(query) - - return self._check_cdx_iter(cdx_iter, query) - - def _load_cdx_query(self, query): # pragma: no cover - raise NotImplementedError('Implement in subclass') - - @staticmethod - def peek_iter(iterable): - try: - first = next(iterable) - except StopIteration: - return None - - return chain([first], iterable) - - -#================================================================= -class CDXServer(BaseCDXServer): - """ - Top-level cdx server object which maintains a list of cdx sources, - responds to queries and dispatches to the cdx ops for processing - """ - - def __init__(self, paths, **kwargs): - super(CDXServer, self).__init__(**kwargs) - # TODO: we could save config in member, so that other - # methods can use it. it's bad for add_cdx_source to take - # config argument. - self._create_cdx_sources(paths, kwargs.get('config')) - - def _load_cdx_query(self, query): - """ - load CDX for query parameters ``params``. - ``key`` (or ``url``) parameter specifies URL to query, - ``matchType`` parameter specifies matching method for ``key`` - (default ``exact``). - other parameters are passed down to :func:`cdx_load`. - raises :exc:`~pywb.utils.wbexception.NotFoundException` - if no captures are found. - - :param query: query parameters - :type query: :class:`~pywb.cdx.query.CDXQuery` - :rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject` - """ - return cdx_load(self.sources, query) - - def _create_cdx_sources(self, paths, config): - """ - build CDXSource instances for each of path in ``paths``. - - :param paths: list of sources or single source. - each source may be either string or CDXSource instance. value - of any other types will be silently ignored. - :param config: config object passed to :method:`add_cdx_source`. - """ - self.sources = [] - - if paths is not None: - if not isinstance(paths, (list, tuple)): - paths = [paths] - - for path in paths: - self.add_cdx_source(path, config) - - if len(self.sources) == 0: - logging.warn('No CDX Sources configured from paths=%s', paths) - - def _add_cdx_source(self, source): - if source is None: - return - - logging.debug('Adding CDX Source: %s', source) - self.sources.append(source) - - def add_cdx_source(self, source, config): - if isinstance(source, CDXSource): - self._add_cdx_source(source) - - elif isinstance(source, str): - if os.path.isdir(source): - for fn in os.listdir(source): - self._add_cdx_source(self._create_cdx_source( - os.path.join(source, fn), config)) - else: - self._add_cdx_source(self._create_cdx_source( - source, config)) - - def _create_cdx_source(self, filename, config): - if is_http(filename): - return RemoteCDXSource(filename) - - if filename.startswith('redis://'): - return RedisCDXSource(filename, config) - - if filename.endswith(('.cdx', '.cdxj')): - return CDXFile(filename) - - if filename.endswith(('.summary', '.idx')): - return ZipNumCluster(filename, config) - - # no warning for .loc or .gz (zipnum) - if not filename.endswith(('.loc', '.gz')): - logging.warn('skipping unrecognized URI: %s', filename) - - return None - - -#================================================================= -class RemoteCDXServer(BaseCDXServer): - """ - A special cdx server that uses a single - :class:`~pywb.cdx.cdxsource.RemoteCDXSource`. - It simply proxies the query params to the remote source - and performs no local processing/filtering - """ - def __init__(self, source, **kwargs): - super(RemoteCDXServer, self).__init__(**kwargs) - - if isinstance(source, RemoteCDXSource): - self.source = source - elif (isinstance(source, str) and is_http(source)): - self.source = RemoteCDXSource(source, remote_processing=True) - else: - raise Exception('Invalid remote cdx source: ' + str(source)) - - def _load_cdx_query(self, query): - return cdx_load([self.source], query, process=False) - - -#================================================================= -def create_cdx_server(config, ds_rules_file=None, server_cls=None): - if hasattr(config, 'get'): - paths = config.get('index_paths') - surt_ordered = config.get('surt_ordered', True) - pass_config = config - else: - paths = config - surt_ordered = True - pass_config = None - - logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) - - if not server_cls: - if ((isinstance(paths, str) and is_http(paths)) or - isinstance(paths, RemoteCDXSource)): - server_cls = RemoteCDXServer - else: - server_cls = CDXServer - - return server_cls(paths, - config=pass_config, - surt_ordered=surt_ordered, - ds_rules_file=ds_rules_file) diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py deleted file mode 100644 index 19b547e3..00000000 --- a/pywb/cdx/cdxsource.py +++ /dev/null @@ -1,150 +0,0 @@ -from pywb.utils.binsearch import iter_range - -from pywb.utils.wbexception import AccessException, NotFoundException -from pywb.utils.wbexception import BadRequestException, WbException - -from pywb.cdx.query import CDXQuery - -from six.moves.urllib.request import urlopen, Request -from six.moves.urllib.error import HTTPError -from six.moves import map - - -#================================================================= -class CDXSource(object): - """ - Represents any cdx index source - """ - def load_cdx(self, query): # pragma: no cover - raise NotImplementedError('Implement in subclass') - - -#================================================================= -class CDXFile(CDXSource): - """ - Represents a local plain-text .cdx file - """ - def __init__(self, filename): - self.filename = filename - - def load_cdx(self, query): - return self._do_load_file(self.filename, query) - - @staticmethod - def _do_load_file(filename, query): - with open(filename, 'rb') as source: - gen = iter_range(source, query.key, - query.end_key) - for line in gen: - yield line - - def __str__(self): - return 'CDX File - ' + self.filename - - -#================================================================= -class RemoteCDXSource(CDXSource): - """ - Represents a remote cdx server, to which requests will be proxied. - - Only ``url`` and ``match_type`` params are proxied at this time, - the stream is passed through all other filters locally. - """ - def __init__(self, filename, cookie=None, remote_processing=False): - self.remote_url = filename - self.cookie = cookie - self.remote_processing = remote_processing - - def load_cdx(self, query): - if self.remote_processing: - remote_query = query - else: - # Only send url and matchType to remote - remote_query = CDXQuery(dict(url=query.url, - matchType=query.match_type)) - - urlparams = remote_query.urlencode() - - try: - request = Request(self.remote_url + '?' + urlparams) - - if self.cookie: - request.add_header('Cookie', self.cookie) - - response = urlopen(request) - - except HTTPError as e: - if e.code == 403: - raise AccessException('Access Denied') - elif e.code == 404: - # return empty list for consistency with other cdx sources - # will be converted to 404 if no other retry - return [] - elif e.code == 400: - raise BadRequestException() - else: - raise WbException('Invalid response from remote cdx server') - - return iter(response) - - def __str__(self): - if self.remote_processing: - return 'Remote CDX Server: ' + self.remote_url - else: - return 'Remote CDX Source: ' + self.remote_url - - -#================================================================= -class RedisCDXSource(CDXSource): - DEFAULT_KEY_PREFIX = b'c:' - - def __init__(self, redis_url, config=None): - import redis - - parts = redis_url.split('/') - if len(parts) > 4: - self.cdx_key = parts[4].encode('utf-8') - redis_url = 'redis://' + parts[2] + '/' + parts[3] - else: - self.cdx_key = None - - self.redis_url = redis_url - self.redis = redis.StrictRedis.from_url(redis_url) - - self.key_prefix = self.DEFAULT_KEY_PREFIX - - def load_cdx(self, query): - """ - Load cdx from redis cache, from an ordered list - - If cdx_key is set, treat it as cdx file and load use - zrangebylex! (Supports all match types!) - - Otherwise, assume a key per-url and load all entries for that key. - (Only exact match supported) - """ - - if self.cdx_key: - return self.load_sorted_range(query, self.cdx_key) - else: - return self.load_single_key(query.key) - - def load_sorted_range(self, query, cdx_key): - cdx_list = self.redis.zrangebylex(cdx_key, - b'[' + query.key, - b'(' + query.end_key) - - return iter(cdx_list) - - def load_single_key(self, key): - # ensure only url/surt is part of key - key = key.split(b' ')[0] - cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1) - - # key is not part of list, so prepend to each line - key += b' ' - cdx_list = list(map(lambda x: key + x, cdx_list)) - return cdx_list - - def __str__(self): - return 'Redis - ' + self.redis_url diff --git a/pywb/cdx/test/test_cdxdomainspecific.py b/pywb/cdx/test/test_cdxdomainspecific.py deleted file mode 100644 index 906a3103..00000000 --- a/pywb/cdx/test/test_cdxdomainspecific.py +++ /dev/null @@ -1,40 +0,0 @@ -r""" -Load Rules - ->>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True) ->>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d') -'example,example,test)/path/index.html?id=value' - - -# Fuzzy Query Args Builder ->>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc']) -'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' - ->>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()']) -'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)' - - -# Fuzzy Match Query + Args - -# list ->>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern -'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' - -# dict ->>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern -'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' - -# string ->>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern -'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' - -""" - - -from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule -from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/cdx/test/test_cdxops.py b/pywb/cdx/test/test_cdxops.py deleted file mode 100644 index 8c550ece..00000000 --- a/pywb/cdx/test/test_cdxops.py +++ /dev/null @@ -1,228 +0,0 @@ -#================================================================= -""" -# Merge Sort Multipe CDX Sources ->>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) -org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz -org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz -org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz - - -# Limit CDX Stream ->>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3) -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz - - -# Reverse CDX Stream ->>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3) -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz - ->>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1) -org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz - -# From & To ->>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], from_ts='2013', to='2013') -com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz - ->>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], from_ts='2014') -com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz -com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz - ->>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012') # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -NotFoundException: No Captures found for: http://example.com/ - -# No matching results ->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -NotFoundException: No Captures found for: http://iana.org/dont_have_this - -# No matching -- limit=1 ->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -NotFoundException: No Captures found for: http://iana.org/dont_have_this - -# Filter cdx (default: regex) ->>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html']) -org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz -org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz -org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz -org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz -org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz -org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz -org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz -org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz -org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz - ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200') -org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - -# Filter Alt field name ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200') -org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - -# Filter -- no field specified, match regex on entire line ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625') -org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - -# Filter -- no such field, no matches ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200') # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css - -# Filter exact -- (* prefix) ->>> cdx_ops_test(url = 'http://example.com*', sources = [test_cdx_dir], filter = '=urlkey:com,example)/?example=1') -com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz -com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz - -# Filter exact invert ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = ['!=urlkey:com,example)/?example=1', '!=urlkey:com,example)/?example=2', '!=urlkey:com,example)/?example=3']) -com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz -com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz -com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz - -# Filter contains ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1') -com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz -com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz - -# Filter contains invert ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=') -com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz -com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz -com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz - -# Collapse by timestamp -# unresolved revisits, different statuscode results in an extra repeat ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) -org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz -org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz -org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz - -# resolved revisits ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True) -org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - -org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz - -# Sort by closest timestamp + field select output ->>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10) -20140126200826 -20140126200816 -20140126200805 -20140126200912 -20140126200738 -20140126200930 -20140126200718 -20140126200706 -20140126200654 -20140126200625 - -# In case of both reverse and closest, closest takes precedence -# 'reverse closest' not supported at this time -# if it is, this test will reflect the change ->>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True) -20140126200826 -20140126200816 -20140126200805 - ->>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) -org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - -org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - - - ->>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True) -org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - -org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - - -# equal dist prefer earlier ->>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2) -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz - ->>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp') -20140126200654 -20140126200706 - ->>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp') -20140126200706 -20140126200654 - - -# Resolve Revisits ->>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True) -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - - -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz - ->>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True) -org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - -org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - - -""" - -#================================================================= -from pywb.cdx.cdxserver import CDXServer -import os -import sys -import six - -from pywb import get_test_dir - -test_cdx_dir = get_test_dir() + 'cdx/' - - -def cdx_ops_test_data(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): - kwparams['url'] = url - if not 'output' in kwparams: - kwparams['output'] = 'cdxobject' - - server = CDXServer(sources) - results = server.load_cdx(**kwparams) - return list(results) - - -def cdx_ops_test(*args, **kwargs): - results = cdx_ops_test_data(*args, **kwargs) - - fields = kwargs.get('fields') - if fields: - fields = fields.split(',') - - for x in results: - if not isinstance(x, str): - l = x.to_text(fields).replace('\t', ' ') - else: - l = x - - sys.stdout.write(l) - - - -def test_cdxj_resolve_revisit(): - # Resolve Revisit -- cdxj minimal -- output also json - results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True) - assert(len(results) == 2) - assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}) - - assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}) - - - -def test_cdxj_resolve_revisit_2(): - # Resolve Revisit -- cdxj minimal -- output also json - results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True) - assert(len(results) == 2) - assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}) - - assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}) - - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/cdx/test/test_cdxserver_config.py b/pywb/cdx/test/test_cdxserver_config.py deleted file mode 100644 index 5c458270..00000000 --- a/pywb/cdx/test/test_cdxserver_config.py +++ /dev/null @@ -1,117 +0,0 @@ -import yaml -from pywb.cdx.cdxserver import create_cdx_server, CDXServer, RemoteCDXServer -from pywb.cdx.cdxsource import CDXFile, RemoteCDXSource, RedisCDXSource -from pywb.cdx.zipnum import ZipNumCluster - -from pywb import get_test_dir - -yaml_config = r""" -test_1: - index_paths: - # local cdx paths - - {0}cdx/example.cdx - - # simple remote cdx source, assumes no filtering - - http://cdxserver.example.com/cdx - - # customized remote cdx server - - !!python/object:pywb.cdx.cdxsource.RemoteCDXSource {{ - remote_url: 'http://cdxserver.example.com/cdx', - cookie: custom_token=value, - remote_processing: true, - }} - - # example redis cdx source - - redis://redis.example.com:6379/0 - - - {0}zipcdx/zipnum-sample.idx - -test_2: - index_paths: http://cdxserver.example.com/cdx - -test_3: http://cdxserver.example.com/cdx - -test_4: !!python/object:pywb.cdx.cdxsource.RemoteCDXSource {{ - remote_url: 'http://cdxserver.example.com/cdx', - cookie: custom_token=value, - remote_processing: true, - }} - -test_5: {0}cdx/example.cdx - -test_6: - index_paths: invalid://abc - - -""".format(get_test_dir()) - -def test_cdxserver_config(): - config = yaml.load(yaml_config) - cdxserver = create_cdx_server(config.get('test_1')) - assert(isinstance(cdxserver, CDXServer)) - sources = cdxserver.sources - assert len(sources) == 5 - - assert type(sources[0]) == CDXFile - assert sources[0].filename.endswith('example.cdx') - - # remote source with no remote processing - assert type(sources[1]) == RemoteCDXSource - assert sources[1].remote_url == 'http://cdxserver.example.com/cdx' - assert sources[1].remote_processing == False - - # remote cdx server with processing - assert type(sources[2]) == RemoteCDXSource - assert sources[2].remote_url == 'http://cdxserver.example.com/cdx' - assert sources[2].remote_processing == True - - # redis source - assert type(sources[3]) == RedisCDXSource - assert sources[3].redis_url == 'redis://redis.example.com:6379/0' - - assert type(sources[4]) == ZipNumCluster - assert sources[4].summary.endswith('zipnum-sample.idx') - assert sources[4].loc_resolver.loc_filename.endswith('zipnum-sample.loc') - - -def assert_remote_cdxserver(config_name): - config = yaml.load(yaml_config) - cdxserver = create_cdx_server(config.get(config_name)) - assert(isinstance(cdxserver, RemoteCDXServer)) - - source = cdxserver.source - - # remote cdx server with remote processing - assert type(source) == RemoteCDXSource - assert source.remote_url == 'http://cdxserver.example.com/cdx' - assert source.remote_processing == True - - -def test_remote_index_path(): - assert_remote_cdxserver('test_2') - -def test_no_index_path_remote(): - assert_remote_cdxserver('test_3') - -def test_explicit_remote_source(): - assert_remote_cdxserver('test_4') - - -def test_single_cdx(): - config = yaml.load(yaml_config) - cdxserver = create_cdx_server(config.get('test_5')) - assert(isinstance(cdxserver, CDXServer)) - sources = cdxserver.sources - assert len(sources) == 1 - - assert type(sources[0]) == CDXFile - assert sources[0].filename.endswith('example.cdx') - -def test_invalid_config(): - config = yaml.load(yaml_config) - cdxserver = create_cdx_server(config.get('test_6')) - assert(isinstance(cdxserver, CDXServer)) - sources = cdxserver.sources - assert len(sources) == 0 - - diff --git a/pywb/cdx/test/test_redis_source.py b/pywb/cdx/test/test_redis_source.py deleted file mode 100644 index 1fa65209..00000000 --- a/pywb/cdx/test/test_redis_source.py +++ /dev/null @@ -1,78 +0,0 @@ -""" ->>> redis_cdx(redis_cdx_server, 'http://example.com') -com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz -com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz -com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz - ->>> redis_cdx(redis_cdx_server_key, 'http://example.com') -com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz -com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz -com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz - -""" - -from fakeredis import FakeStrictRedis -from mock import patch - -from warcio.timeutils import timestamp_to_sec -from pywb.cdx.cdxsource import RedisCDXSource -from pywb.cdx.cdxserver import CDXServer - -from pywb import get_test_dir - -import sys -import os - -test_cdx_dir = os.path.join(get_test_dir(), 'cdx/') - -def load_cdx_into_redis(source, filename, key=None): - # load a cdx into mock redis - with open(test_cdx_dir + filename, 'rb') as fh: - for line in fh: - zadd_cdx(source, line, key) - -def zadd_cdx(source, cdx, key): - if key: - source.redis.zadd(key, 0, cdx) - return - - parts = cdx.split(b' ', 2) - - key = parts[0] - timestamp = parts[1] - rest = timestamp + b' ' + parts[2] - - score = timestamp_to_sec(timestamp.decode('utf-8')) - source.redis.zadd(source.key_prefix + key, score, rest) - - - -@patch('redis.StrictRedis', FakeStrictRedis) -def init_redis_server(): - source = RedisCDXSource('redis://127.0.0.1:6379/0') - - for f in os.listdir(test_cdx_dir): - if f.endswith('.cdx'): - load_cdx_into_redis(source, f) - - return CDXServer([source]) - -@patch('redis.StrictRedis', FakeStrictRedis) -def init_redis_server_key_file(): - source = RedisCDXSource('redis://127.0.0.1:6379/0/key') - - for f in os.listdir(test_cdx_dir): - if f.endswith('.cdx'): - load_cdx_into_redis(source, f, source.cdx_key) - - return CDXServer([source]) - - -def redis_cdx(cdx_server, url, **params): - cdx_iter = cdx_server.load_cdx(url=url, **params) - for cdx in cdx_iter: - sys.stdout.write(cdx) - -redis_cdx_server = init_redis_server() -redis_cdx_server_key = init_redis_server_key_file() - diff --git a/pywb/cdx/test/test_zipnum.py b/pywb/cdx/test/test_zipnum.py deleted file mode 100644 index ec450cfc..00000000 --- a/pywb/cdx/test/test_zipnum.py +++ /dev/null @@ -1,243 +0,0 @@ -""" ->>> zip_ops_test(url='http://iana.org') -org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz -org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz -org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz - -# test idx index (tabs replacad with 4 spaces) ->>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True) -org,iana)/dnssec 20140126201307 zipnum 8517 373 35 -org,iana)/domains/int 20140126201239 zipnum 8890 355 36 -org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37 - - ->>> zip_ops_test(url='http://iana.org/domains/*') -org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz -org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz -org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz -org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz -org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz -org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz -org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz -org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz -org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz - -# first page ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0) -com,example)/ 20140127171200 zipnum 0 275 1 -org,iana)/ 20140127171238 zipnum 275 328 2 -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4 - - -# first page -- simplified query ->>> zip_ops_test(url='*.iana.org/path_part_ignored/', showPagedIndex=True, pageSize=4) -com,example)/ 20140127171200 zipnum 0 275 1 -org,iana)/ 20140127171238 zipnum 275 328 2 -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4 - -# next page + json ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1) -{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1150, "length": 235, "lineno": 5} -{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1385, "length": 307, "lineno": 6} -{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7} -{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8} - -# last page ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9) -org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37 -org,iana)/time-zones 20140126200737 zipnum 9631 166 38 - -# last page cdx ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9) -org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz -org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz -org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz -org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz -org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz -org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz -org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz -org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz - -# last page reverse -- not yet supported -#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9) -#org,iana)/time-zones 20140126200737 zipnum 9623 145 38 -#org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 - - -# last page reverse CDX ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9) -org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz -org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz -org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz -org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz -org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz -org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz -org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz -org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz - -# last url prefix ->>> zip_ops_test(url='http://iana.org/time-zones*') -org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz -org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz -org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz - -# last url prefix w/ slash ->>> zip_ops_test(url='http://iana.org/time-zones/*') -org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz -org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz - -# last url exact ->>> zip_ops_test(url='http://iana.org/time-zones/Y') -org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz - -# invalid page ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -CDXException: Page 10 invalid: First Page is 0, Last Page is 9 - - ->>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -NotFoundException: No Captures found for: http://aaa.aaa/ - ->>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -NotFoundException: No Captures found for: http://aaa.aaa/ (domain query) - -# list last index line, as we don't know if there are any captures at end ->>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True) -org,iana)/time-zones 20140126200737 zipnum 9631 166 38 - -# read cdx to find no captures ->>> zip_ops_test(url='http://aaa.zz/', matchType='domain') # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -NotFoundException: No Captures found for: http://aaa.zz/ (domain query) - -# Invalid .idx filesor or missing loc - ->>> zip_test_err(url='http://example.com/', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -Exception: No Locations Found for: foo - - ->>> zip_test_err(url='http://example.zz/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): -Exception: No Locations Found for: foo2 - -""" - -from test_cdxops import cdx_ops_test, cdx_ops_test_data -from pywb import get_test_dir -from pywb.cdx.cdxserver import CDXServer - - -import shutil -import tempfile -import os -import json - -import pytest - - -test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx' - -def zip_ops_test_data(url, **kwargs): - sources = test_zipnum - return json.loads(cdx_ops_test_data(url, sources, **kwargs)[0]) - -def zip_ops_test(url, **kwargs): - sources = test_zipnum - cdx_ops_test(url, sources, **kwargs) - -def zip_test_err(url, **kwargs): - sources = get_test_dir() + 'zipcdx/zipnum-bad.idx' - cdx_ops_test(url, sources, **kwargs) - - -def test_zip_prefix_load(): - - tmpdir = tempfile.mkdtemp() - try: - shutil.copy(test_zipnum, tmpdir) - shutil.copy(get_test_dir() + 'zipcdx/zipnum-sample.cdx.gz', - os.path.join(tmpdir, 'zipnum')) - - config={} - config['shard_index_loc'] = dict(match='(.*)', - replace=r'\1') - server = CDXServer(os.path.join(tmpdir, 'zipnum-sample.idx'), - config=config) - - - # Test Page Count - results = server.load_cdx(url='iana.org/', - matchType='domain', - showNumPages=True) - - results = list(results) - assert len(results) == 1, results - assert json.loads(results[0]) == {"blocks": 38, "pages": 4, "pageSize": 10} - - - # Test simple query - results = server.load_cdx(url='iana.org/') - results = list(results) - assert len(results) ==3, results - assert '20140126200624' in results[0] - assert '20140127171238' in results[1] - assert 'warc/revisit' in results[2] - - finally: - shutil.rmtree(tmpdir) - - - -def test_blocks_def_page_size(): - # Pages -- default page size - res = zip_ops_test_data(url='http://iana.org/domains/example', matchType='exact', showNumPages=True) - assert(res == {"blocks": 1, "pages": 1, "pageSize": 10}) - -def test_blocks_def_size_2(): - res = zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', showNumPages=True) - assert(res == {"blocks": 38, "pages": 4, "pageSize": 10}) - -def test_blocks_set_page_size(): - # set page size - res = zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True) - assert(res == {"blocks": 38, "pages": 10, "pageSize": 4}) - -def test_blocks_alt_q(): - # set page size -- alt domain query - res = zip_ops_test_data(url='*.iana.org', pageSize='4', showNumPages=True) - assert(res == {"blocks": 38, "pages": 10, "pageSize": 4}) - -def test_blocks_secondary_match(): - # page size for non-existent, but secondary index match - res = zip_ops_test_data(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True) - assert(res == {"blocks": 0, "pages": 0, "pageSize": 4}) - -def test_blocks_no_match(): - # page size for non-existent, no secondary index match - res = zip_ops_test_data(url='*.foo.bar', showNumPages=True) - assert(res == {"blocks": 0, "pages": 0, "pageSize": 10}) - -def test_blocks_zero_pages(): - # read cdx to find 0 pages - res = zip_ops_test_data(url='http://aaa.zz/', matchType='domain', showNumPages=True) - assert(res == {"blocks": 0, "pages": 0, "pageSize": 10}) - - -# Errors - -def test_err_file_not_found(): - with pytest.raises(IOError): - zip_test_err(url='http://iana.org/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL - - - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py deleted file mode 100644 index a368083d..00000000 --- a/pywb/cdx/zipnum.py +++ /dev/null @@ -1,353 +0,0 @@ -import os -import collections -import itertools -import logging -from io import BytesIO -import datetime -import json -import six - -from six.moves import map - -from pywb.cdx.cdxsource import CDXSource -from pywb.cdx.cdxobject import IDXObject, CDXException - -from pywb.utils.loaders import BlockLoader, read_last_line -from warcio.bufferedreaders import gzip_decompressor -from pywb.utils.binsearch import iter_range, linearsearch, search - - -#================================================================= -class ZipBlocks: - def __init__(self, part, offset, length, count): - self.part = part - self.offset = offset - self.length = length - self.count = count - - -#================================================================= -#TODO: see if these could be combined with warc path resolvers - -class LocMapResolver(object): - """ Lookup shards based on a file mapping - shard name to one or more paths. The entries are - tab delimited. - """ - def __init__(self, loc_summary, loc_filename): - # initial loc map - self.loc_map = {} - self.loc_mtime = 0 - if not loc_filename: - splits = os.path.splitext(loc_summary) - loc_filename = splits[0] + '.loc' - self.loc_filename = loc_filename - - self.load_loc() - - def load_loc(self): - # check modified time of current file before loading - new_mtime = os.path.getmtime(self.loc_filename) - if (new_mtime == self.loc_mtime): - return - - # update loc file mtime - self.loc_mtime = new_mtime - - local_dir = os.path.dirname(self.loc_filename) - - def res_path(pathname): - if '://' not in pathname: - pathname = os.path.join(local_dir, pathname) - return pathname - - logging.debug('Loading loc from: ' + self.loc_filename) - with open(self.loc_filename, 'r') as fh: - for line in fh: - parts = line.rstrip().split('\t') - - paths = [res_path(pathname) for pathname in parts[1:]] - self.loc_map[parts[0]] = paths - - def __call__(self, part, query): - return self.loc_map[part] - - -#================================================================= -class LocPrefixResolver(object): - """ Use a prefix lookup, where the prefix can either be a fixed - string or can be a regex replacement of the index summary path - """ - def __init__(self, loc_summary, loc_config): - import re - loc_match = loc_config.get('match', '().*') - loc_replace = loc_config['replace'] - loc_summary = os.path.dirname(loc_summary) + '/' - self.prefix = re.sub(loc_match, loc_replace, loc_summary) - - def load_loc(self): - pass - - def __call__(self, part, query): - return [self.prefix + part] - - -#================================================================= -class ZipNumCluster(CDXSource): - DEFAULT_RELOAD_INTERVAL = 10 # in minutes - DEFAULT_MAX_BLOCKS = 10 - - def __init__(self, summary, config=None): - self.max_blocks = self.DEFAULT_MAX_BLOCKS - - self.loc_resolver = None - - loc = None - cookie_maker = None - reload_ival = self.DEFAULT_RELOAD_INTERVAL - - if config: - loc = config.get('shard_index_loc') - cookie_maker = config.get('cookie_maker') - - self.max_blocks = config.get('max_blocks', self.max_blocks) - - reload_ival = config.get('reload_interval', reload_ival) - - - if isinstance(loc, dict): - self.loc_resolver = LocPrefixResolver(summary, loc) - else: - self.loc_resolver = LocMapResolver(summary, loc) - - self.summary = summary - - # reload interval - self.loc_update_time = datetime.datetime.now() - self.reload_interval = datetime.timedelta(minutes=reload_ival) - - self.blk_loader = BlockLoader(cookie_maker=cookie_maker) - -# @staticmethod -# def reload_timed(timestamp, val, delta, func): -# now = datetime.datetime.now() -# if now - timestamp >= delta: -# func() -# return now -# return None -# -# def reload_loc(self): -# reload_time = self.reload_timed(self.loc_update_time, -# self.loc_map, -# self.reload_interval, -# self.load_loc) -# -# if reload_time: -# self.loc_update_time = reload_time - - def load_cdx(self, query): - self.loc_resolver.load_loc() - return self._do_load_cdx(self.summary, query) - - def _do_load_cdx(self, filename, query): - reader = open(filename, 'rb') - - idx_iter = self.compute_page_range(reader, query) - - if query.secondary_index_only or query.page_count: - return idx_iter - - blocks = self.idx_to_cdx(idx_iter, query) - - def gen_cdx(): - for blk in blocks: - for cdx in blk: - yield cdx - - return gen_cdx() - - - def _page_info(self, pages, pagesize, blocks): - info = dict(pages=pages, - pageSize=pagesize, - blocks=blocks) - return json.dumps(info) + '\n' - - def compute_page_range(self, reader, query): - pagesize = query.page_size - if not pagesize: - pagesize = self.max_blocks - else: - pagesize = int(pagesize) - - last_line = None - - # Get End - end_iter = search(reader, query.end_key, prev_size=1) - - try: - end_line = six.next(end_iter) - except StopIteration: - last_line = read_last_line(reader) - end_line = last_line - - # Get Start - first_iter = iter_range(reader, - query.key, - query.end_key, - prev_size=1) - - try: - first_line = six.next(first_iter) - except StopIteration: - if end_line == last_line and query.key >= last_line: - first_line = last_line - else: - reader.close() - if query.page_count: - yield self._page_info(0, pagesize, 0) - return - else: - raise - - first = IDXObject(first_line) - - end = IDXObject(end_line) - - try: - blocks = end['lineno'] - first['lineno'] - total_pages = int(blocks / pagesize) + 1 - except: - blocks = -1 - total_pages = 1 - - if query.page_count: - # same line, so actually need to look at cdx - # to determine if it exists - if blocks == 0: - try: - block_cdx_iter = self.idx_to_cdx([first_line], query) - block = six.next(block_cdx_iter) - cdx = six.next(block) - except StopIteration: - total_pages = 0 - blocks = -1 - - yield self._page_info(total_pages, pagesize, blocks + 1) - reader.close() - return - - curr_page = query.page - if curr_page >= total_pages or curr_page < 0: - msg = 'Page {0} invalid: First Page is 0, Last Page is {1}' - reader.close() - raise CDXException(msg.format(curr_page, total_pages - 1)) - - startline = curr_page * pagesize - endline = startline + pagesize - 1 - if blocks >= 0: - endline = min(endline, blocks) - - if curr_page == 0: - yield first_line - else: - startline -= 1 - - idxiter = itertools.islice(first_iter, startline, endline) - for idx in idxiter: - yield idx - - reader.close() - - - def search_by_line_num(self, reader, line): # pragma: no cover - def line_cmp(line1, line2): - line1_no = int(line1.rsplit(b'\t', 1)[-1]) - line2_no = int(line2.rsplit(b'\t', 1)[-1]) - return cmp(line1_no, line2_no) - - line_iter = search(reader, line, compare_func=line_cmp) - yield six.next(line_iter) - - def idx_to_cdx(self, idx_iter, query): - blocks = None - ranges = [] - - for idx in idx_iter: - idx = IDXObject(idx) - - if (blocks and blocks.part == idx['part'] and - blocks.offset + blocks.length == idx['offset'] and - blocks.count < self.max_blocks): - - blocks.length += idx['length'] - blocks.count += 1 - ranges.append(idx['length']) - - else: - if blocks: - yield self.block_to_cdx_iter(blocks, ranges, query) - - blocks = ZipBlocks(idx['part'], - idx['offset'], - idx['length'], - 1) - - ranges = [blocks.length] - - if blocks: - yield self.block_to_cdx_iter(blocks, ranges, query) - - def block_to_cdx_iter(self, blocks, ranges, query): - last_exc = None - last_traceback = None - - try: - locations = self.loc_resolver(blocks.part, query) - except: - raise Exception('No Locations Found for: ' + blocks.part) - - for location in self.loc_resolver(blocks.part, query): - try: - return self.load_blocks(location, blocks, ranges, query) - except Exception as exc: - last_exc = exc - import sys - last_traceback = sys.exc_info()[2] - - if last_exc: - six.reraise(Exception, last_exc, last_traceback) - #raise last_exc - else: - raise Exception('No Locations Found for: ' + blocks.part) - - def load_blocks(self, location, blocks, ranges, query): - """ Load one or more blocks of compressed cdx lines, return - a line iterator which decompresses and returns one line at a time, - bounded by query.key and query.end_key - """ - - if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG): - msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' - logging.debug(msg.format(b=blocks, loc=location)) - - reader = self.blk_loader.load(location, blocks.offset, blocks.length) - - def decompress_block(range_): - decomp = gzip_decompressor() - buff = decomp.decompress(reader.read(range_)) - for line in BytesIO(buff): - yield line - - iter_ = itertools.chain(*map(decompress_block, ranges)) - - # start bound - iter_ = linearsearch(iter_, query.key) - - # end bound - iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_) - return iter_ - - def __str__(self): - return 'ZipNum Cluster: {0}, {1}'.format(self.summary, - self.loc_resolver) diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py deleted file mode 100644 index 866a2b8d..00000000 --- a/pywb/framework/archivalrouter.py +++ /dev/null @@ -1,245 +0,0 @@ -from six.moves.urllib.parse import urlsplit, urlunsplit, quote - -import re - -from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.rewrite.wburl import WbUrl -from pywb.framework.wbrequestresponse import WbRequest, WbResponse - - -#================================================================= -# ArchivalRouter -- route WB requests in archival mode -#================================================================= -class ArchivalRouter(object): - def __init__(self, routes, **kwargs): - self.routes = routes - - # optional port setting may be ignored by wsgi container - self.port = kwargs.get('port') - - self.fallback = ReferRedirect() - - self.abs_path = kwargs.get('abs_path') - - self.home_view = kwargs.get('home_view') - self.error_view = kwargs.get('error_view') - self.info_view = kwargs.get('info_view') - - config = kwargs.get('config', {}) - self.urlrewriter_class = config.get('urlrewriter_class', UrlRewriter) - - self.enable_coll_info = config.get('enable_coll_info', False) - - def __call__(self, env): - request_uri = self.ensure_rel_uri_set(env) - - for route in self.routes: - matcher, coll = route.is_handling(request_uri) - if matcher: - wbrequest = self.parse_request(route, env, matcher, - coll, request_uri, - use_abs_prefix=self.abs_path) - - return route.handler(wbrequest) - - # Default Home Page - if request_uri in ['/', '/index.html', '/index.htm']: - return self.render_home_page(env) - - if self.enable_coll_info and request_uri in ['/collinfo.json']: - params = env.get('pywb.template_params', {}) - host = WbRequest.make_host_prefix(env) - return self.info_view.render_response(env=env, host=host, routes=self.routes, - content_type='application/json', - **params) - - return self.fallback(env, self) if self.fallback else None - - def parse_request(self, route, env, matcher, coll, request_uri, - use_abs_prefix=False): - matched_str = matcher.group(0) - rel_prefix = env.get('SCRIPT_NAME', '') + '/' - - if matched_str: - rel_prefix += matched_str + '/' - # remove the '/' + rel_prefix part of uri - wb_url_str = request_uri[len(matched_str) + 2:] - else: - # the request_uri is the wb_url, since no coll - wb_url_str = request_uri[1:] - - wbrequest = route.request_class(env, - request_uri=request_uri, - wb_url_str=wb_url_str, - rel_prefix=rel_prefix, - coll=coll, - use_abs_prefix=use_abs_prefix, - wburl_class=route.handler.get_wburl_type(), - urlrewriter_class=self.urlrewriter_class, - cookie_scope=route.cookie_scope, - rewrite_opts=route.rewrite_opts, - user_metadata=route.user_metadata) - - # Allow for applying of additional filters - route.apply_filters(wbrequest, matcher) - - return wbrequest - - def render_home_page(self, env): - if self.home_view: - params = env.get('pywb.template_params', {}) - return self.home_view.render_response(env=env, routes=self.routes, **params) - else: - return None - - #================================================================= - # adapted from wsgiref.request_uri, but doesn't include domain name - # and allows all characters which are allowed in the path segment - # according to: http://tools.ietf.org/html/rfc3986#section-3.3 - # explained here: - # http://stackoverflow.com/questions/4669692/ - # valid-characters-for-directory-part-of-a-url-for-short-links - - @staticmethod - def ensure_rel_uri_set(env): - """ Return the full requested path, including the query string - """ - if 'REL_REQUEST_URI' in env: - return env['REL_REQUEST_URI'] - - if not env.get('SCRIPT_NAME') and env.get('REQUEST_URI'): - env['REL_REQUEST_URI'] = env['REQUEST_URI'] - return env['REL_REQUEST_URI'] - - url = quote(env.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@') - query = env.get('QUERY_STRING') - if query: - url += '?' + query - - env['REL_REQUEST_URI'] = url - return url - - -#================================================================= -# Route by matching regex (or fixed prefix) -# of request uri (excluding first '/') -#================================================================= -class Route(object): - # match upto next / or ? or end - SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)' - - def __init__(self, regex, handler, config=None, - request_class=WbRequest, - lookahead=SLASH_QUERY_LOOKAHEAD): - - config = config or {} - self.path = regex - if regex: - self.regex = re.compile(regex + lookahead) - else: - self.regex = re.compile('') - - self.handler = handler - self.request_class = request_class - - # collection id from regex group (default 0) - self.coll_group = int(config.get('coll_group', 0)) - self.cookie_scope = config.get('cookie_scope') - self.rewrite_opts = config.get('rewrite_opts', {}) - self.user_metadata = config.get('metadata', {}) - self._custom_init(config) - - def is_handling(self, request_uri): - matcher = self.regex.match(request_uri[1:]) - if not matcher: - return None, None - - coll = matcher.group(self.coll_group) - return matcher, coll - - def apply_filters(self, wbrequest, matcher): - for filter in self.filters: - last_grp = len(matcher.groups()) - filter_str = filter.format(matcher.group(last_grp)) - wbrequest.query_filter.append(filter_str) - - def _custom_init(self, config): - self.filters = config.get('filters', []) - - -#================================================================= -# ReferRedirect -- redirect urls that have 'fallen through' -# based on the referrer settings -#================================================================= -class ReferRedirect: - def __call__(self, env, the_router): - referrer = env.get('HTTP_REFERER') - - routes = the_router.routes - - # ensure there is a referrer - if referrer is None: - return None - - # get referrer path name - ref_split = urlsplit(referrer) - - # require that referrer starts with current Host, if any - curr_host = env.get('HTTP_HOST') - if curr_host and curr_host != ref_split.netloc: - return None - - path = ref_split.path - - app_path = env.get('SCRIPT_NAME', '') - - if app_path: - # must start with current app name, if not root - if not path.startswith(app_path): - return None - - path = path[len(app_path):] - - ref_route = None - ref_request = None - - for route in routes: - matcher, coll = route.is_handling(path) - if matcher: - ref_request = the_router.parse_request(route, env, - matcher, coll, path) - ref_route = route - break - - # must have matched one of the routes with a urlrewriter - if not ref_request or not ref_request.urlrewriter: - return None - - rewriter = ref_request.urlrewriter - - rel_request_uri = env['REL_REQUEST_URI'] - - timestamp_path = '/' + rewriter.wburl.timestamp + '/' - - # check if timestamp is already part of the path - if rel_request_uri.startswith(timestamp_path): - # remove timestamp but leave / to make host relative url - # 2013/path.html -> /path.html - rel_request_uri = rel_request_uri[len(timestamp_path) - 1:] - - rewritten_url = rewriter.rewrite(rel_request_uri) - - # if post, can't redirect as that would lost the post data - # (can't use 307 because FF will show confirmation warning) - if ref_request.method == 'POST': - new_wb_url = WbUrl(rewritten_url[len(rewriter.prefix):]) - ref_request.wb_url.url = new_wb_url.url - return ref_route.handler(ref_request) - - final_url = urlunsplit((ref_split.scheme, - ref_split.netloc, - rewritten_url, - '', - '')) - - return WbResponse.redir_response(final_url, status='302 Temp Redirect') diff --git a/pywb/framework/basehandlers.py b/pywb/framework/basehandlers.py deleted file mode 100644 index db8508d8..00000000 --- a/pywb/framework/basehandlers.py +++ /dev/null @@ -1,23 +0,0 @@ -from pywb.rewrite.wburl import WbUrl - - -#================================================================= -class BaseHandler(object): - """ - Represents a base handler class that handles any request - """ - def __call__(self, wbrequest): # pragma: no cover - raise NotImplementedError('Need to implement in derived class') - - def get_wburl_type(self): - return None - - -#================================================================= -class WbUrlHandler(BaseHandler): - """ - Represents a handler which assumes the request contains a WbUrl - Ensure that the WbUrl is parsed in the request - """ - def get_wburl_type(self): - return WbUrl diff --git a/pywb/framework/cache.py b/pywb/framework/cache.py deleted file mode 100644 index 3c97ba5b..00000000 --- a/pywb/framework/cache.py +++ /dev/null @@ -1,62 +0,0 @@ -try: # pragma: no cover - import uwsgi - uwsgi_cache = True -except ImportError: - uwsgi_cache = False - - -from redis import StrictRedis -from pywb.utils.loaders import to_native_str - - -#================================================================= -class UwsgiCache(object): # pragma: no cover - def __setitem__(self, item, value): - uwsgi.cache_update(item, value) - - def __getitem__(self, item): - return uwsgi.cache_get(item) - - def __contains__(self, item): - return uwsgi.cache_exists(item) - - def __delitem__(self, item): - uwsgi.cache_del(item) - - -#================================================================= -class DefaultCache(dict): - def __getitem__(self, item): - return self.get(item) - - -#================================================================= -class RedisCache(object): - def __init__(self, redis_url): - # must be of the form redis://host:port/db/key - redis_url, key = redis_url.rsplit('/', 1) - self.redis = StrictRedis.from_url(redis_url) - self.key = key - - def __setitem__(self, item, value): - self.redis.hset(self.key, item, value) - - def __getitem__(self, item): - return to_native_str(self.redis.hget(self.key, item), 'utf-8') - - def __contains__(self, item): - return self.redis.hexists(self.key, item) - - def __delitem__(self, item): - self.redis.hdel(self.key, item) - - -#================================================================= -def create_cache(redis_url_key=None): - if redis_url_key: - return RedisCache(redis_url_key) - - if uwsgi_cache: # pragma: no cover - return UwsgiCache() - else: - return DefaultCache() diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py deleted file mode 100644 index 9f6fbe87..00000000 --- a/pywb/framework/memento.py +++ /dev/null @@ -1,231 +0,0 @@ -from pywb.utils.wbexception import BadRequestException -from warcio.timeutils import http_date_to_timestamp -from warcio.timeutils import timestamp_to_http_date - -from pywb.framework.wbrequestresponse import WbRequest, WbResponse -from pywb.rewrite.wburl import WbUrl - -import six -LINK_FORMAT = 'application/link-format' - - -#================================================================= -class MementoReqMixin(object): - def _parse_extra(self): - if not self.wb_url: - return - - if self.wb_url.type != self.wb_url.LATEST_REPLAY: - return - - self.options['is_timegate'] = True - - accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME') - if not accept_datetime: - return - - try: - timestamp = http_date_to_timestamp(accept_datetime) - except Exception: - raise BadRequestException('Invalid Accept-Datetime: ' + - accept_datetime) - - # note: this changes from LATEST_REPLAY -> REPLAY - self.wb_url.set_replay_timestamp(timestamp) - - -#================================================================= -class MementoRequest(MementoReqMixin, WbRequest): - pass - - -#================================================================= -class MementoRespMixin(object): - def _init_derived(self, params): - wbrequest = params.get('wbrequest') - is_redirect = params.get('memento_is_redir', False) - cdx = params.get('cdx') - - if not wbrequest or not wbrequest.wb_url: - return - - mod = wbrequest.options.get('replay_mod', '') - - #is_top_frame = wbrequest.wb_url.is_top_frame - is_top_frame = wbrequest.options.get('is_top_frame', False) - - is_timegate = (wbrequest.options.get('is_timegate', False) and - not is_top_frame) - - if is_timegate: - self.status_headers.replace_header('Vary', 'accept-datetime') - - # Determine if memento: - is_memento = False - is_original = False - - # if no cdx included, not a memento, unless top-frame special - if not cdx: - # special case: include the headers but except Memento-Datetime - # since this is really an intermediate resource - if is_top_frame: - is_memento = True - - # otherwise, if in proxy mode, then always a memento - elif wbrequest.options['is_proxy']: - is_memento = True - is_original = True - - # otherwise only if timestamp replay (and not a timegate) - #elif not is_timegate: - # is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY) - elif not is_redirect: - is_memento = (wbrequest.wb_url.is_replay()) - - link = [] - req_url = wbrequest.wb_url.url - - if is_memento or is_timegate: - url = req_url - if cdx: - ts = cdx['timestamp'] - url = cdx['url'] - # for top frame - elif wbrequest.wb_url.timestamp: - ts = wbrequest.wb_url.timestamp - else: - ts = None - - if ts: - http_date = timestamp_to_http_date(ts) - - if is_memento: - self.status_headers.replace_header('Memento-Datetime', - http_date) - - canon_link = wbrequest.urlrewriter.get_new_url(mod=mod, - timestamp=ts, - url=url) - - # set in replay_views -- Must set content location - #if is_memento and is_timegate: - # self.status_headers.headers.append(('Content-Location', - # canon_link)) - - # don't set memento link for very long urls... - if len(canon_link) < 512: - link.append(self.make_memento_link(canon_link, - 'memento', - http_date)) - - if is_original and is_timegate: - link.append(self.make_link(req_url, 'original timegate')) - else: - link.append(self.make_link(req_url, 'original')) - - # for now, include timemap only in non-proxy mode - if not wbrequest.options['is_proxy'] and (is_memento or is_timegate): - link.append(self.make_timemap_link(wbrequest)) - - if is_memento and not is_timegate: - timegate = wbrequest.urlrewriter.get_new_url(mod=mod, timestamp='') - link.append(self.make_link(timegate, 'timegate')) - - link = ', '.join(link) - - self.status_headers.replace_header('Link', link) - - def make_link(self, url, type): - return '<{0}>; rel="{1}"'.format(url, type) - - def make_memento_link(self, url, type_, dt): - return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type_, dt) - - def make_timemap_link(self, wbrequest): - format_ = '<{0}>; rel="timemap"; type="{1}"' - - url = wbrequest.urlrewriter.get_new_url(mod='timemap', - timestamp='', - type=wbrequest.wb_url.QUERY) - - return format_.format(url, LINK_FORMAT) - - -#================================================================= -class MementoResponse(MementoRespMixin, WbResponse): - pass - - -#================================================================= -def make_timemap_memento_link(cdx, prefix, datetime=None, - rel='memento', end=',\n', mod=''): - - memento = '<{0}>; rel="{1}"; datetime="{2}"' + end - - string = WbUrl.to_wburl_str(url=cdx['url'], - mod=mod, - timestamp=cdx['timestamp'], - type=WbUrl.REPLAY) - - url = prefix + string - - if not datetime: - datetime = timestamp_to_http_date(cdx['timestamp']) - - return memento.format(url, rel, datetime) - - -#================================================================= -def make_timemap(wbrequest, cdx_lines): - prefix = wbrequest.wb_prefix - url = wbrequest.wb_url.url - mod = wbrequest.options.get('replay_mod', '') - - # get first memento as it'll be used for 'from' field - try: - first_cdx = six.next(cdx_lines) - from_date = timestamp_to_http_date(first_cdx['timestamp']) - except StopIteration: - first_cdx = None - - - if first_cdx: - # timemap link - timemap = ('<{0}>; rel="self"; ' + - 'type="application/link-format"; from="{1}",\n') - yield timemap.format(prefix + wbrequest.wb_url.to_str(), - from_date) - - # original link - original = '<{0}>; rel="original",\n' - yield original.format(url) - - # timegate link - timegate = '<{0}>; rel="timegate",\n' - timegate_url= WbUrl.to_wburl_str(url=url, - mod=mod, - type=WbUrl.LATEST_REPLAY) - - yield timegate.format(prefix + timegate_url) - - if not first_cdx: - # terminating timemap link, no from - timemap = ('<{0}>; rel="self"; type="application/link-format"') - yield timemap.format(prefix + wbrequest.wb_url.to_str()) - return - - # first memento link - yield make_timemap_memento_link(first_cdx, prefix, - datetime=from_date, mod=mod) - - prev_cdx = None - - for cdx in cdx_lines: - if prev_cdx: - yield make_timemap_memento_link(prev_cdx, prefix, mod=mod) - - prev_cdx = cdx - - # last memento link, if any - if prev_cdx: - yield make_timemap_memento_link(prev_cdx, prefix, end='', mod=mod) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py deleted file mode 100644 index 9f5a1f26..00000000 --- a/pywb/framework/proxy.py +++ /dev/null @@ -1,463 +0,0 @@ -from __future__ import absolute_import - -from pywb.framework.wbrequestresponse import WbResponse, WbRequest -from pywb.framework.archivalrouter import ArchivalRouter - -from six.moves.urllib.parse import urlsplit -from six import iteritems -import base64 - -import socket -import ssl - -from io import BytesIO - -from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter -from pywb.rewrite.rewrite_content import RewriteContent -from pywb.utils.wbexception import BadRequestException - -from warcio.bufferedreaders import BufferedReader -from warcio.utils import to_native_str - -from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver - -from tempfile import SpooledTemporaryFile - - -#================================================================= -class ProxyArchivalRouter(ArchivalRouter): - """ - A router which combines both archival and proxy modes support - First, request is treated as a proxy request using ProxyRouter - Second, if not handled by the router, it is treated as a regular - archival mode request. - """ - def __init__(self, routes, **kwargs): - super(ProxyArchivalRouter, self).__init__(routes, **kwargs) - self.proxy = ProxyRouter(routes, **kwargs) - - def __call__(self, env): - response = self.proxy(env) - if response: - return response - - response = super(ProxyArchivalRouter, self).__call__(env) - if response: - return response - - -#================================================================= -class ProxyRouter(object): - """ - A router which supports http proxy mode requests - Handles requests of the form: GET http://example.com - - The router returns latest capture by default. - However, if Memento protocol support is enabled, - the memento Accept-Datetime header can be used - to select specific capture. - See: http://www.mementoweb.org/guide/rfc/#Pattern1.3 - for more details. - """ - - BLOCK_SIZE = 4096 - DEF_MAGIC_NAME = 'pywb.proxy' - BUFF_RESPONSE_MEM_SIZE = 1024*1024 - - CERT_DL_PEM = '/pywb-ca.pem' - CERT_DL_P12 = '/pywb-ca.p12' - - CA_ROOT_FILE = './ca/pywb-ca.pem' - CA_ROOT_NAME = 'pywb https proxy replay CA' - CA_CERTS_DIR = './ca/certs/' - - EXTRA_HEADERS = {'cache-control': 'no-cache', - 'connection': 'close', - 'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'} - - def __init__(self, routes, **kwargs): - self.error_view = kwargs.get('error_view') - - proxy_options = kwargs.get('config', {}) - if proxy_options: - proxy_options = proxy_options.get('proxy_options', {}) - - self.magic_name = proxy_options.get('magic_name') - if not self.magic_name: - self.magic_name = self.DEF_MAGIC_NAME - proxy_options['magic_name'] = self.magic_name - - self.extra_headers = proxy_options.get('extra_headers') - if not self.extra_headers: - self.extra_headers = self.EXTRA_HEADERS - proxy_options['extra_headers'] = self.extra_headers - - res_type = proxy_options.get('cookie_resolver', True) - if res_type == 'auth' or not res_type: - self.resolver = ProxyAuthResolver(routes, proxy_options) - elif res_type == 'ip': - self.resolver = IPCacheResolver(routes, proxy_options) - #elif res_type == True or res_type == 'cookie': - # self.resolver = CookieResolver(routes, proxy_options) - else: - self.resolver = CookieResolver(routes, proxy_options) - - self.use_banner = proxy_options.get('use_banner', True) - self.use_wombat = proxy_options.get('use_client_rewrite', True) - - self.proxy_cert_dl_view = proxy_options.get('proxy_cert_download_view') - - if not proxy_options.get('enable_https_proxy'): - self.ca = None - return - - try: - from certauth.certauth import CertificateAuthority - except ImportError: #pragma: no cover - print('HTTPS proxy is not available as the "certauth" module ' + - 'is not installed') - print('Please install via "pip install certauth" ' + - 'to enable HTTPS support') - self.ca = None - return - - # HTTPS Only Options - ca_file = proxy_options.get('root_ca_file', self.CA_ROOT_FILE) - - # attempt to create the root_ca_file if doesn't exist - # (generally recommended to create this seperately) - ca_name = proxy_options.get('root_ca_name', self.CA_ROOT_NAME) - - certs_dir = proxy_options.get('certs_dir', self.CA_CERTS_DIR) - self.ca = CertificateAuthority(ca_file=ca_file, - certs_dir=certs_dir, - ca_name=ca_name) - - self.use_wildcard = proxy_options.get('use_wildcard_certs', True) - - def __call__(self, env): - is_https = (env['REQUEST_METHOD'] == 'CONNECT') - ArchivalRouter.ensure_rel_uri_set(env) - - # for non-https requests, check non-proxy urls - if not is_https: - url = env['REL_REQUEST_URI'] - - if not url.startswith(('http://', 'https://')): - return None - - env['pywb.proxy_scheme'] = 'http' - - route = None - coll = None - matcher = None - response = None - ts = None - - # check resolver, for pre connect resolve - if self.resolver.pre_connect: - route, coll, matcher, ts, response = self.resolver.resolve(env) - if response: - return response - - # do connect, then get updated url - if is_https: - response = self.handle_connect(env) - if response: - return response - - url = env['REL_REQUEST_URI'] - else: - parts = urlsplit(env['REL_REQUEST_URI']) - hostport = parts.netloc.split(':', 1) - env['pywb.proxy_host'] = hostport[0] - env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else '' - env['pywb.proxy_req_uri'] = parts.path - if parts.query: - env['pywb.proxy_req_uri'] += '?' + parts.query - env['pywb.proxy_query'] = parts.query - - if self.resolver.supports_switching: - env['pywb_proxy_magic'] = self.magic_name - - # route (static) and other resources to archival replay - if env['pywb.proxy_host'] == self.magic_name: - env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri'] - - # special case for proxy install - response = self.handle_cert_install(env) - if response: - return response - - return None - - # check resolver, post connect - if not self.resolver.pre_connect: - route, coll, matcher, ts, response = self.resolver.resolve(env) - if response: - return response - - rel_prefix = '' - - custom_prefix = env.get('HTTP_PYWB_REWRITE_PREFIX', '') - if custom_prefix: - host_prefix = custom_prefix - urlrewriter_class = UrlRewriter - abs_prefix = True - # always rewrite to absolute here - rewrite_opts = dict(no_match_rel=True) - else: - host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name - urlrewriter_class = SchemeOnlyUrlRewriter - abs_prefix = False - rewrite_opts = {} - - # special case for proxy calendar - if (env['pywb.proxy_host'] == 'query.' + self.magic_name): - url = env['pywb.proxy_req_uri'][1:] - rel_prefix = '/' - - if ts is not None: - url = ts + '/' + url - - wbrequest = route.request_class(env, - request_uri=url, - wb_url_str=url, - coll=coll, - host_prefix=host_prefix, - rel_prefix=rel_prefix, - wburl_class=route.handler.get_wburl_type(), - urlrewriter_class=urlrewriter_class, - use_abs_prefix=abs_prefix, - rewrite_opts=rewrite_opts, - is_proxy=True) - - if matcher: - route.apply_filters(wbrequest, matcher) - - # full rewrite and banner - if self.use_wombat and self.use_banner: - wbrequest.wb_url.mod = '' - elif self.use_banner: - # banner only, no rewrite - wbrequest.wb_url.mod = 'bn_' - else: - # unaltered, no rewrite or banner - wbrequest.wb_url.mod = 'uo_' - - response = route.handler(wbrequest) - if not response: - return None - - # add extra headers for replay responses - if wbrequest.wb_url and wbrequest.wb_url.is_replay(): - for name, value in iteritems(self.extra_headers): - response.status_headers.replace_header(name, value) - - # check for content-length - res = response.status_headers.get_header('content-length') - try: - if int(res) > 0: - return response - except: - pass - - # need to either chunk or buffer to get content-length - if env.get('SERVER_PROTOCOL') == 'HTTP/1.1': - response.status_headers.remove_header('content-length') - response.status_headers.headers.append(('Transfer-Encoding', 'chunked')) - response.body = self._chunk_encode(response.body) - else: - response.body = self._buffer_response(response.status_headers, - response.body) - - return response - - @staticmethod - def _chunk_encode(orig_iter): - for chunk in orig_iter: - if not len(chunk): - continue - chunk_len = b'%X\r\n' % len(chunk) - yield chunk_len - yield chunk - yield b'\r\n' - - yield b'0\r\n\r\n' - - @staticmethod - def _buffer_response(status_headers, iterator): - out = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE) - size = 0 - - for buff in iterator: - size += len(buff) - out.write(buff) - - content_length_str = str(size) - # remove existing content length - status_headers.replace_header('Content-Length', - content_length_str) - - out.seek(0) - return RewriteContent.stream_to_gen(out) - - def get_request_socket(self, env): - if not self.ca: - return None - - sock = None - - if env.get('uwsgi.version'): # pragma: no cover - try: - import uwsgi - fd = uwsgi.connection_fd() - conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) - try: - sock = socket.socket(_sock=conn) - except: - sock = conn - except Exception as e: - pass - elif env.get('gunicorn.socket'): # pragma: no cover - sock = env['gunicorn.socket'] - - if not sock: - # attempt to find socket from wsgi.input - input_ = env.get('wsgi.input') - if input_: - if hasattr(input_, '_sock'): # pragma: no cover - raw = input_._sock - sock = socket.socket(_sock=raw) # pragma: no cover - elif hasattr(input_, 'raw'): - sock = input_.raw._sock - - return sock - - def handle_connect(self, env): - sock = self.get_request_socket(env) - if not sock: - return WbResponse.text_response('HTTPS Proxy Not Supported', - '405 HTTPS Proxy Not Supported') - - sock.send(b'HTTP/1.0 200 Connection Established\r\n') - sock.send(b'Proxy-Connection: close\r\n') - sock.send(b'Server: pywb proxy\r\n') - sock.send(b'\r\n') - - hostname, port = env['REL_REQUEST_URI'].split(':') - - if not self.use_wildcard: - certfile = self.ca.cert_for_host(hostname) - else: - certfile = self.ca.get_wildcard_cert(hostname) - - try: - ssl_sock = ssl.wrap_socket(sock, - server_side=True, - certfile=certfile, - #ciphers="ALL", - suppress_ragged_eofs=False, - ssl_version=ssl.PROTOCOL_SSLv23 - ) - env['pywb.proxy_ssl_sock'] = ssl_sock - - buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE) - - statusline = to_native_str(buffreader.readline().rstrip()) - - except Exception as se: - raise BadRequestException(se.message) - - statusparts = statusline.split(' ') - - if len(statusparts) < 3: - raise BadRequestException('Invalid Proxy Request: ' + statusline) - - env['REQUEST_METHOD'] = statusparts[0] - env['REL_REQUEST_URI'] = ('https://' + - env['REL_REQUEST_URI'].replace(':443', '') + - statusparts[1]) - - env['SERVER_PROTOCOL'] = statusparts[2].strip() - - env['pywb.proxy_scheme'] = 'https' - - env['pywb.proxy_host'] = hostname - env['pywb.proxy_port'] = port - env['pywb.proxy_req_uri'] = statusparts[1] - - queryparts = env['REL_REQUEST_URI'].split('?', 1) - env['PATH_INFO'] = queryparts[0] - env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' - env['pywb.proxy_query'] = env['QUERY_STRING'] - - while True: - line = to_native_str(buffreader.readline()) - if line: - line = line.rstrip() - - if not line: - break - - parts = line.split(':', 1) - if len(parts) < 2: - continue - - name = parts[0].strip() - value = parts[1].strip() - - name = name.replace('-', '_').upper() - - if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'): - name = 'HTTP_' + name - - env[name] = value - - env['wsgi.input'] = buffreader - #remain = buffreader.rem_length() - #if remain > 0: - #remainder = buffreader.read() - #env['wsgi.input'] = BufferedReader(BytesIO(remainder)) - #remainder = buffreader.read(self.BLOCK_SIZE) - #env['wsgi.input'] = BufferedReader(ssl_sock, - # block_size=self.BLOCK_SIZE, - # starting_data=remainder) - - def handle_cert_install(self, env): - if env['pywb.proxy_req_uri'] in ('/', '/index.html', '/index.html'): - available = (self.ca is not None) - - if self.proxy_cert_dl_view: - return (self.proxy_cert_dl_view. - render_response(available=available, - pem_path=self.CERT_DL_PEM, - p12_path=self.CERT_DL_P12)) - - elif env['pywb.proxy_req_uri'] == self.CERT_DL_PEM: - if not self.ca: - return None - - buff = b'' - with open(self.ca.ca_file, 'rb') as fh: - buff = fh.read() - - content_type = 'application/x-x509-ca-cert' - headers = [('Content-Length', str(len(buff)))] - - return WbResponse.bin_stream([buff], - content_type=content_type, - headers=headers) - - elif env['pywb.proxy_req_uri'] == self.CERT_DL_P12: - if not self.ca: - return None - - buff = self.ca.get_root_PKCS12() - - content_type = 'application/x-pkcs12' - headers = [('Content-Length', str(len(buff)))] - - return WbResponse.bin_stream([buff], - content_type=content_type, - headers=headers) diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py deleted file mode 100644 index e4b34cb5..00000000 --- a/pywb/framework/proxy_resolvers.py +++ /dev/null @@ -1,374 +0,0 @@ -from pywb.framework.wbrequestresponse import WbResponse -from pywb.utils.loaders import extract_client_cookie -from pywb.utils.wbexception import WbException -from pywb.rewrite.wburl import WbUrl - -from pywb.framework.cache import create_cache -from pywb.framework.basehandlers import WbUrlHandler - -from six.moves.urllib.parse import parse_qs, urlsplit -import six - -from warcio.statusandheaders import StatusAndHeaders -from warcio.utils import to_native_str - -import base64 -import os -import json - - -#================================================================= -class BaseCollResolver(object): - def __init__(self, routes, config): - self.routes = routes - self.use_default_coll = config.get('use_default_coll') - - @property - def pre_connect(self): - return False - - def resolve(self, env): - route = None - coll = None - matcher = None - ts = None - - proxy_coll, ts = self.get_proxy_coll_ts(env) - - # invalid parsing - if proxy_coll == '': - return None, None, None, None, self.select_coll_response(env, proxy_coll) - - if proxy_coll is None and isinstance(self.use_default_coll, str): - proxy_coll = self.use_default_coll - - if proxy_coll: - path = '/' + proxy_coll + '/' - - for r in self.routes: - matcher, c = r.is_handling(path) - if matcher: - route = r - coll = c - break - - # if no match, return coll selection response - if not route: - return None, None, None, None, self.select_coll_response(env, proxy_coll) - - # if 'use_default_coll', find first WbUrl-handling collection - elif self.use_default_coll: - raise Exception('use_default_coll: true no longer supported, please specify collection name') - #for route in self.routes: - # if isinstance(route.handler, WbUrlHandler): - # return route, route.path, matcher, ts, None - - # otherwise, return the appropriate coll selection response - else: - return None, None, None, None, self.select_coll_response(env, proxy_coll) - - return route, coll, matcher, ts, None - - -#================================================================= -class ProxyAuthResolver(BaseCollResolver): - DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode' - - def __init__(self, routes, config): - super(ProxyAuthResolver, self).__init__(routes, config) - self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG) - - @property - def pre_connect(self): - return True - - @property - def supports_switching(self): - return False - - def get_proxy_coll_ts(self, env): - proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') - - if not proxy_auth: - return None, None - - proxy_coll = self.read_basic_auth_coll(proxy_auth) - return proxy_coll, None - - def select_coll_response(self, env, default_coll=None): - proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) - - headers = [('Content-Type', 'text/plain'), - ('Proxy-Authenticate', proxy_msg)] - - status_headers = StatusAndHeaders('407 Proxy Authentication', headers) - - value = self.auth_msg - - return WbResponse(status_headers, value=[value.encode('utf-8')]) - - @staticmethod - def read_basic_auth_coll(value): - parts = value.split(' ') - if parts[0].lower() != 'basic': - return '' - - if len(parts) != 2: - return '' - - user_pass = base64.b64decode(parts[1].encode('utf-8')) - return to_native_str(user_pass.split(b':')[0]) - - -#================================================================= -class IPCacheResolver(BaseCollResolver): - def __init__(self, routes, config): - super(IPCacheResolver, self).__init__(routes, config) - self.cache = create_cache(config.get('redis_cache_key')) - self.magic_name = config['magic_name'] - - @property - def supports_switching(self): - return False - - def _get_ip(self, env): - ip = env['REMOTE_ADDR'] - qs = env.get('pywb.proxy_query') - if qs: - res = parse_qs(qs) - - if 'ip' in res: - ip = res['ip'][0] - - return ip - - def select_coll_response(self, env, default_coll=None): - raise WbException('Invalid Proxy Collection Specified: ' + str(default_coll)) - - def get_proxy_coll_ts(self, env): - ip = env['REMOTE_ADDR'] - qs = env.get('pywb.proxy_query') - - if qs: - res = parse_qs(qs) - - if 'ip' in res: - ip = res['ip'][0] - - if 'delete' in res: - del self.cache[ip + ':c'] - del self.cache[ip + ':t'] - else: - if 'coll' in res: - self.cache[ip + ':c'] = res['coll'][0] - - if 'ts' in res: - self.cache[ip + ':t'] = res['ts'][0] - - coll = self.cache[ip + ':c'] - ts = self.cache[ip + ':t'] - return coll, ts - - def resolve(self, env): - server_name = env['pywb.proxy_host'] - - if self.magic_name in server_name: - response = self.handle_magic_page(env) - if response: - return None, None, None, None, response - - return super(IPCacheResolver, self).resolve(env) - - def handle_magic_page(self, env): - coll, ts = self.get_proxy_coll_ts(env) - ip = self._get_ip(env) - res = json.dumps({'ip': ip, 'coll': coll, 'ts': ts}) - return WbResponse.text_response(res, content_type='application/json') - - -#================================================================= -class CookieResolver(BaseCollResolver): - SESH_COOKIE_NAME = '__pywb_proxy_sesh' - - def __init__(self, routes, config): - super(CookieResolver, self).__init__(routes, config) - self.magic_name = config['magic_name'] - self.sethost_prefix = '-sethost.' + self.magic_name + '.' - self.set_prefix = '-set.' + self.magic_name - - self.cookie_name = config.get('cookie_name', self.SESH_COOKIE_NAME) - self.proxy_select_view = config.get('proxy_select_view') - - self.extra_headers = config.get('extra_headers') - - self.cache = create_cache() - - @property - def supports_switching(self): - return True - - def get_proxy_coll_ts(self, env): - coll, ts, sesh_id = self.get_coll(env) - return coll, ts - - def select_coll_response(self, env, default_coll=None): - return self.make_magic_response('auto', - env['REL_REQUEST_URI'], - env) - - def resolve(self, env): - server_name = env['pywb.proxy_host'] - - if ('.' + self.magic_name) in server_name: - response = self.handle_magic_page(env) - if response: - return None, None, None, None, response - - return super(CookieResolver, self).resolve(env) - - def handle_magic_page(self, env): - request_url = env['REL_REQUEST_URI'] - parts = urlsplit(request_url) - server_name = env['pywb.proxy_host'] - - path_url = parts.path[1:] - if parts.query: - path_url += '?' + parts.query - - if server_name.startswith('auto'): - coll, ts, sesh_id = self.get_coll(env) - - if coll: - return self.make_sethost_cookie_response(sesh_id, - path_url, - env) - else: - return self.make_magic_response('select', path_url, env) - - elif server_name.startswith('query.'): - wb_url = WbUrl(path_url) - - # only dealing with specific timestamp setting - if wb_url.is_query(): - return None - - coll, ts, sesh_id = self.get_coll(env) - if not coll: - return self.make_magic_response('select', path_url, env) - - self.set_ts(sesh_id, wb_url.timestamp) - return self.make_redir_response(wb_url.url) - - elif server_name.endswith(self.set_prefix): - old_sesh_id = extract_client_cookie(env, self.cookie_name) - sesh_id = self.create_renew_sesh_id(old_sesh_id) - - if sesh_id != old_sesh_id: - headers = self.make_cookie_headers(sesh_id, self.magic_name) - else: - headers = None - - coll = server_name[:-len(self.set_prefix)] - - # set sesh value - self.set_coll(sesh_id, coll) - - return self.make_sethost_cookie_response(sesh_id, path_url, env, - headers=headers) - - elif self.sethost_prefix in server_name: - inx = server_name.find(self.sethost_prefix) - sesh_id = server_name[:inx] - - domain = server_name[inx + len(self.sethost_prefix):] - - headers = self.make_cookie_headers(sesh_id, domain) - - full_url = env['pywb.proxy_scheme'] + '://' + domain - full_url += '/' + path_url - return self.make_redir_response(full_url, headers=headers) - - elif 'select.' in server_name: - coll, ts, sesh_id = self.get_coll(env) - - route_temp = '-set.' + self.magic_name + '/' + path_url - - return (self.proxy_select_view. - render_response(routes=self.routes, - route_temp=route_temp, - coll=coll, - url=path_url)) - #else: - # msg = 'Invalid Magic Path: ' + url - # print msg - # return WbResponse.text_response(msg, status='404 Not Found') - - def make_cookie_headers(self, sesh_id, domain): - cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly' - cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain) - headers = [('Set-Cookie', cookie_val)] - return headers - - def make_sethost_cookie_response(self, sesh_id, path_url, - env, headers=None): - if '://' not in path_url: - path_url = 'http://' + path_url - - path_parts = urlsplit(path_url) - - new_url = path_parts.path[1:] - if path_parts.query: - new_url += '?' + path_parts.query - - return self.make_magic_response(sesh_id + '-sethost', new_url, env, - suffix=path_parts.netloc, - headers=headers) - - def make_magic_response(self, prefix, url, env, - suffix=None, headers=None): - full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.' - full_url += self.magic_name - if suffix: - full_url += '.' + suffix - full_url += '/' + url - return self.make_redir_response(full_url, headers=headers) - - def set_coll(self, sesh_id, coll): - self.cache[sesh_id + ':c'] = coll - - def set_ts(self, sesh_id, ts): - if ts: - self.cache[sesh_id + ':t'] = ts - # this ensures that omitting timestamp will reset to latest - # capture by deleting the cache entry - else: - del self.cache[sesh_id + ':t'] - - def get_coll(self, env): - sesh_id = extract_client_cookie(env, self.cookie_name) - - coll = None - ts = None - if sesh_id: - coll = self.cache[sesh_id + ':c'] - ts = self.cache[sesh_id + ':t'] - - return coll, ts, sesh_id - - def create_renew_sesh_id(self, sesh_id, force=False): - #if sesh_id in self.cache and not force: - if sesh_id and ((sesh_id + ':c') in self.cache) and not force: - return sesh_id - - sesh_id = base64.b32encode(os.urandom(5)).lower() - return to_native_str(sesh_id) - - def make_redir_response(self, url, headers=None): - if not headers: - headers = [] - - if self.extra_headers: - for name, value in six.iteritems(self.extra_headers): - headers.append((name, value)) - - return WbResponse.redir_response(url, headers=headers) diff --git a/pywb/framework/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py deleted file mode 100644 index 2bdb79a9..00000000 --- a/pywb/framework/test/test_archivalrouter.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -# Test WbRequest parsed via a Route -# route with relative path, print resulting wbrequest ->>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}) -{'coll': 'web', - 'request_uri': '/web/test.example.com', - 'wb_prefix': '/web/', - 'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')} - - -# route with absolute path, running at script /my_pywb, print resultingwbrequest ->>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True) -{'coll': 'web', - 'request_uri': '/web/2013im_/test.example.com', - 'wb_prefix': 'https://localhost:8081/my_pywb/web/', - 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')} - -# route with no collection ->>> _test_route_req(Route('', BaseHandler()), {'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}) -{'coll': '', - 'request_uri': 'http://example.com', - 'wb_prefix': '/pywb/', - 'wb_url': None} - -# not matching route -- skipped ->>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}) - -# Test Refer Redirects ->>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') -'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html' - ->>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') -'http://localhost:8080/coll/20131010/http://example.com/other.html' - ->>> _test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') -'http://localhost:8080/coll/20131010/http://example.com/other.html' - -# Custom collection ->>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123') -'http://localhost:8080/complex/123/20131010/http://example.com/other.html' - -# With timestamp included ->>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') -'http://localhost:8080/coll/20131010/http://example.com/other.html' - -# With timestamp included ->>> _test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html') -'http://localhost:8080/coll/20131010/http://example.com/path/other.html' - -# Wrong Host ->>> _test_redir('http://example.com:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') -False - -# Right Host ->>> _test_redir('http://example.com:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html') -'http://example.com:8080/coll/20131010/http://example.com/other.html' - -# With custom SCRIPT_NAME ->>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra') -'http://localhost:8080/extra/coll/20131010/http://example.com/other.html' - -# With custom SCRIPT_NAME + timestamp ->>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra') -'http://localhost:8080/extra/coll/20131010/http://example.com/other.html' - -# With custom SCRIPT_NAME, bad match ->>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr') -False - -# With no collection ->>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='') -'http://localhost:8080/2013/http://example.com/other.html' - -# With SCRIPT_NAME but no collection ->>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='') -'http://localhost:8080/pywb-access/http://example.com/other.html' - - ->>> _test_redir('http://localhost:8080/', '/some/example/other.html', 'http://localhost:8080/user/coll/http://example.com/path/page.html', '/user/coll', coll='') -'http://localhost:8080/user/coll/http://example.com/some/example/other.html' - -## Test ensure_rel_uri_set - -# Simple test: ->>> ArchivalRouter.ensure_rel_uri_set({'PATH_INFO': '/pywb/example.com'}) -'/pywb/example.com' - -# Test all unecoded special chars and double-quote -# (double-quote must be encoded but not single quote) ->>> ArchivalRouter.ensure_rel_uri_set({'PATH_INFO': "/pywb/example.com/0~!+$&'()*+,;=:\\\""}) -"/pywb/example.com/0~!+$&'()*+,;=:%22" - -""" - -from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter -from pywb.framework.basehandlers import BaseHandler, WbUrlHandler - -import pprint - -from six.moves.urllib.parse import urlsplit - -def _test_route_req(route, env, abs_path=False): - matcher, coll = route.is_handling(env['REL_REQUEST_URI']) - if not matcher: - return - - the_router = ArchivalRouter([route], abs_path=abs_path) - req = the_router.parse_request(route, env, matcher, coll, env['REL_REQUEST_URI'], abs_path) - - varlist = vars(req) - the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')) - pprint.pprint(the_dict) - - -def _test_redir(match_host, request_uri, referrer, script_name='', coll='coll'): - env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name} - - env['HTTP_HOST'] = urlsplit(match_host).netloc - - routes = [Route(coll, WbUrlHandler())] - - the_router = ArchivalRouter(routes) - - redir = ReferRedirect() - #req = WbRequest.from_uri(request_uri, env) - rep = redir(env, the_router) - if not rep: - return False - - return rep.status_headers.get_header('Location') - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index 587dcc9e..4937f8f1 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -1,178 +1,6 @@ -""" -# WbRequest Tests -# ================= -#>>> get_req_from_uri('/save/_embed/example.com/?a=b') -{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'} - -#>>> get_req_from_uri('/2345/20101024101112im_/example.com/?b=c') -{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'} - -#>>> get_req_from_uri('/2010/example.com') -{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} - -# ajax -#>>> get_req_from_uri('', {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'}) -{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} - -#>>> get_req_from_uri('../example.com') -{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'} - -# Abs path -#>>> get_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) -{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'} - -# No Scheme, default to http (shouldn't happen per WSGI standard) -#>>> get_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) -{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'http://localhost:8080/2010/', 'request_uri': '/2010/example.com'} - -# Referrer extraction ->>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url -'http://blah.example.com/' - -# incorrect referer ->>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://other.example.com/web/2011/blah.example.com/'}).extract_referrer_wburl_str() - - -# no referer ->>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}).extract_referrer_wburl_str() - -# range requests ->>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='bytes=10-100')).extract_range() -('http://example.com', 10, 100, True) - ->>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='bytes=0-')).extract_range() -('http://example.com', 0, '', True) - ->>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=0-65535').extract_range() -('http://www.googlevideo.com/videoplayback?id=123', 0, 65535, False) - ->>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=100-200').extract_range() -('http://www.googlevideo.com/videoplayback?id=123', 100, 200, False) - -# invalid range requests ->>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='10-20')).extract_range() - ->>> req_from_uri('/web/2014/example.com', dict(HTTP_RANGE='A-5')).extract_range() - ->>> req_from_uri('/web/www.googlevideo.com/videoplayback?id=123&range=100-').extract_range() - -""" - - -from pywb.rewrite.wburl import WbUrl -from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.framework.wbrequestresponse import WbResponse from warcio.statusandheaders import StatusAndHeaders -from pywb.framework.wbrequestresponse import WbRequest, WbResponse - - -def get_req_from_uri(request_uri, env={}, use_abs_prefix=False): - response = req_from_uri(request_uri, env, use_abs_prefix) - varlist = vars(response) - the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')) - #print(the_dict) - return the_dict - -def req_from_uri(request_uri, env={}, use_abs_prefix=False): - if not request_uri: - request_uri = env.get('REL_REQUEST_URI') - - parts = request_uri.split('/', 2) - - # Has coll prefix - if len(parts) == 3: - rel_prefix = '/' + parts[1] + '/' - wb_url_str = parts[2] - coll = parts[1] - # No Coll Prefix - elif len(parts) == 2: - rel_prefix = '/' - wb_url_str = parts[1] - coll = '' - else: - rel_prefix = '/' - wb_url_str = parts[0] - coll = '' - - return WbRequest(env, - request_uri=request_uri, - rel_prefix=rel_prefix, - wb_url_str=wb_url_str, - coll=coll, - wburl_class=WbUrl, - urlrewriter_class=UrlRewriter, - use_abs_prefix=use_abs_prefix) - - -def test_req_1(): - res = get_req_from_uri('/save/_embed/example.com/?a=b') - - assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b')") - assert(res['coll'] == 'save') - assert(res['wb_prefix'] == '/save/') - assert(res['request_uri'] == '/save/_embed/example.com/?a=b') - -def test_req_2(): - res = get_req_from_uri('/2345/20101024101112im_/example.com/?b=c') - - assert(repr(res['wb_url']) == "('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c')") - assert(res['coll'] == '2345') - assert(res['wb_prefix'] == '/2345/') - assert(res['request_uri'] == '/2345/20101024101112im_/example.com/?b=c') - -def test_req_3(): - res = get_req_from_uri('/2010/example.com') - - assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')") - assert(res['coll'] == '2010') - assert(res['wb_prefix'] == '/2010/') - assert(res['request_uri'] == '/2010/example.com') - - -def test_req_4(): - # ajax - res = get_req_from_uri('', {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'}) - - assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')") - assert(res['coll'] == '2010') - assert(res['wb_prefix'] == '/2010/') - assert(res['request_uri'] == '/2010/example.com') - - -def test_req_5(): - res = get_req_from_uri('../example.com') - - assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')") - assert(res['coll'] == '') - assert(res['wb_prefix'] == '/') - assert(res['request_uri'] == '../example.com') - - - -def test_req_6(): - # Abs path - res = get_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) - - assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')") - assert(res['coll'] == '2010') - assert(res['wb_prefix'] == 'https://localhost:8080/2010/') - assert(res['request_uri'] == '/2010/example.com') - - -def test_req_7(): - # No Scheme, default to http (shouldn't happen per WSGI standard) - res = get_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) - - assert(repr(res['wb_url']) == "('latest_replay', '', '', 'http://example.com', 'http://example.com')") - assert(res['coll'] == '2010') - assert(res['wb_prefix'] == 'http://localhost:8080/2010/') - assert(res['request_uri'] == '/2010/example.com') - - - - - -#Response tests def test_resp_1(): resp = vars(WbResponse.text_response('Test')) diff --git a/pywb/framework/test/test_wsgi_wrapper.py b/pywb/framework/test/test_wsgi_wrapper.py deleted file mode 100644 index 18bde0fd..00000000 --- a/pywb/framework/test/test_wsgi_wrapper.py +++ /dev/null @@ -1,57 +0,0 @@ -from pywb.framework.wsgi_wrappers import init_app - -from pywb.utils.wbexception import AccessException - -import webtest - -class TestOkApp: - def __call__(self, env): - def response(env, start_response): - start_response('200 OK', []) - return [b'Test'] - return response - -class TestErrApp: - def __call__(self, env): - raise Exception('Test Unexpected Error') - -class TestCustomErrApp: - def __call__(self, env): - raise AccessException('Forbidden Test') - - -def initer(app_class): - def init(config=None): - return app_class() - return init - -def test_ok_app(): - the_app = init_app(initer(TestOkApp), load_yaml=False) - - testapp = webtest.TestApp(the_app) - resp = testapp.get('/') - - assert resp.status_int == 200 - assert b'Test' in resp.body, resp.body - -def test_err_app(): - the_app = init_app(initer(TestErrApp), load_yaml=False) - - testapp = webtest.TestApp(the_app) - resp = testapp.get('/abc', expect_errors=True) - - assert resp.status_int == 500 - assert b'500 Internal Server Error Error: Test Unexpected Error' in resp.body - -def test_custom_err_app(): - the_app = init_app(initer(TestCustomErrApp), load_yaml=False) - - testapp = webtest.TestApp(the_app) - resp = testapp.get('/abc', expect_errors=True) - - assert resp.status_int == 403 - assert b'403 Access Denied Error: Forbidden Test' in resp.body - - - - diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 1cbf171d..b2d0db25 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -1,204 +1,8 @@ from warcio.statusandheaders import StatusAndHeaders -from pywb.utils.loaders import extract_post_query, append_post_query -from io import BytesIO -import pprint -import re import json -#================================================================= -class WbRequest(object): - """ - Represents the main pywb request object. - - Contains various info from wsgi env, add additional info - about the request, such as coll, relative prefix, - host prefix, absolute prefix. - - If a wburl and url rewriter classes are specified, the class - also contains the url rewriter. - - """ - @staticmethod - def make_host_prefix(env): - try: - host = env.get('HTTP_HOST') - if not host: - host = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] - - return env.get('wsgi.url_scheme', 'http') + '://' + host - except KeyError: - return '' - - def __init__(self, env, - request_uri=None, - rel_prefix='', - wb_url_str='/', - coll='', - host_prefix='', - use_abs_prefix=False, - wburl_class=None, - urlrewriter_class=None, - is_proxy=False, - cookie_scope=None, - rewrite_opts={}, - user_metadata={}, - ): - - self.env = env - - if request_uri: - self.request_uri = request_uri - else: - self.request_uri = env.get('REL_REQUEST_URI') - - self.method = self.env.get('REQUEST_METHOD') - - self.coll = coll - - self.final_mod = '' - - if not host_prefix: - host_prefix = self.make_host_prefix(env) - - self.host_prefix = host_prefix - self.rel_prefix = rel_prefix - - if use_abs_prefix: - self.wb_prefix = host_prefix + rel_prefix - else: - self.wb_prefix = rel_prefix - - if not wb_url_str: - wb_url_str = '/' - - self.wb_url_str = wb_url_str - - # wb_url present and not root page - if wb_url_str != '/' and wburl_class: - self.wb_url = wburl_class(wb_url_str) - self.urlrewriter = urlrewriter_class(self.wb_url, - self.wb_prefix, - host_prefix + rel_prefix, - rel_prefix, - env.get('SCRIPT_NAME', '/'), - cookie_scope, - rewrite_opts) - - self.urlrewriter.deprefix_url() - # no wb_url, just store blank wb_url - else: - self.wb_url = None - self.urlrewriter = None - - self.referrer = env.get('HTTP_REFERER') - - self.options = dict() - self.options['is_ajax'] = self._is_ajax() - self.options['is_proxy'] = is_proxy or env.get('pywb_proxy_magic') - - self.query_filter = [] - self.custom_params = {} - self.user_metadata = user_metadata - self.rewrite_opts = rewrite_opts - - # PERF - env['X_PERF'] = {} - - if env.get('HTTP_X_PYWB_NOREDIRECT'): - self.custom_params['noredir'] = True - - self._parse_extra() - - def _is_ajax(self): - value = self.env.get('HTTP_X_REQUESTED_WITH') - value = value or self.env.get('HTTP_X_PYWB_REQUESTED_WITH') - if value and value.lower() == 'xmlhttprequest': - return True - - return False - - RANGE_ARG_RX = re.compile('.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))') - - RANGE_HEADER = re.compile('bytes=(\d+)-(\d+)?') - - def extract_range(self): - url = self.wb_url.url - use_206 = False - start = None - end = None - - range_h = self.env.get('HTTP_RANGE') - - if range_h: - m = self.RANGE_HEADER.match(range_h) - if m: - start = m.group(1) - end = m.group(2) - use_206 = True - - else: - m = self.RANGE_ARG_RX.match(url) - if m: - start = m.group(2) - end = m.group(3) - url = url[:m.start(1)] + url[m.end(1):] - use_206 = False - - if not start: - return None - - start = int(start) - self.custom_params['noredir'] = True - - if end: - end = int(end) - else: - end = '' - - result = (url, start, end, use_206) - return result - - def __repr__(self): - varlist = vars(self) - varstr = pprint.pformat(varlist) - return varstr - - def _parse_extra(self): - pass - - def extract_referrer_wburl_str(self): - if not self.referrer: - return None - - if not self.referrer.startswith(self.host_prefix + self.rel_prefix): - return None - - wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):] - return wburl_str - - def normalize_post_query(self): - if self.method != 'POST': - return - - if not self.wb_url: - return - - mime = self.env.get('CONTENT_TYPE', '') - length = self.env.get('CONTENT_LENGTH') - stream = self.env['wsgi.input'] - - buffered_stream = BytesIO() - - post_query = extract_post_query('POST', mime, length, stream, - buffered_stream=buffered_stream, - environ=self.env) - - if post_query: - self.env['wsgi.input'] = buffered_stream - self.wb_url.url = append_post_query(self.wb_url.url, post_query) - #================================================================= class WbResponse(object): diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py deleted file mode 100644 index 4e077977..00000000 --- a/pywb/framework/wsgi_wrappers.py +++ /dev/null @@ -1,188 +0,0 @@ -from pywb.utils.wbexception import WbException, NotFoundException -from pywb.utils.loaders import load_yaml_config -from pywb.utils.loaders import load_yaml_config -from warcio.utils import to_native_str - -from pywb.framework.wbrequestresponse import WbResponse -from warcio.statusandheaders import StatusAndHeaders - - -import os -import logging - - -DEFAULT_PORT = 8080 - - -#================================================================= -class WSGIApp(object): - def __init__(self, wb_router, fallback_app=None): - self.wb_router = wb_router - self.fallback_app = fallback_app - - # Top-level wsgi application - def __call__(self, env, start_response): - if env['REQUEST_METHOD'] == 'CONNECT': - return self.handle_connect(env, start_response) - else: - return self.handle_methods(env, start_response) - - def handle_connect(self, env, start_response): - def ssl_start_response(statusline, headers): - ssl_sock = env.get('pywb.proxy_ssl_sock') - if not ssl_sock: - start_response(statusline, headers) - return - - env['pywb.proxy_statusline'] = statusline - - status_line = 'HTTP/1.1 ' + statusline + '\r\n' - ssl_sock.write(status_line.encode('iso-8859-1')) - - for name, value in headers: - line = name + ': ' + value + '\r\n' - ssl_sock.write(line.encode('iso-8859-1')) - - resp_iter = self.handle_methods(env, ssl_start_response) - - ssl_sock = env.get('pywb.proxy_ssl_sock') - if not ssl_sock: - return resp_iter - - ssl_sock.write(b'\r\n') - - for obj in resp_iter: - if obj: - ssl_sock.write(obj) - ssl_sock.close() - - start_response(env['pywb.proxy_statusline'], []) - - return [] - - def handle_methods(self, env, start_response): - wb_router = self.wb_router - response = None - - try: - response = wb_router(env) - - if not response: - if self.fallback_app: - return self.fallback_app(env, start_response) - else: - msg = 'No handler for "{0}".'.format(env['REL_REQUEST_URI']) - raise NotFoundException(msg) - - except WbException as e: - response = self.handle_exception(env, e, False) - - except Exception as e: - response = self.handle_exception(env, e, True) - - return response(env, start_response) - - def handle_exception(self, env, exc, print_trace): - error_view = None - - if hasattr(self.wb_router, 'error_view'): - error_view = self.wb_router.error_view - - if hasattr(exc, 'status'): - status = exc.status() - else: - status = '500 Internal Server Error' - - if hasattr(exc, 'url'): - err_url = exc.url - else: - err_url = None - - if len(exc.args): - err_msg = exc.args[0] - - if print_trace: - import traceback - err_details = traceback.format_exc() - print(err_details) - else: - logging.info(err_msg) - err_details = None - - if error_view: - if err_url and isinstance(err_url, str): - err_url = to_native_str(err_url, 'utf-8') - if err_msg and isinstance(err_msg, str): - err_msg = to_native_str(err_msg, 'utf-8') - - return error_view.render_response(exc_type=type(exc).__name__, - err_msg=err_msg, - err_details=err_details, - status=status, - env=env, - err_url=err_url) - else: - msg = status + ' Error: ' - if err_msg: - msg += err_msg - - #msg = msg.encode('utf-8', 'ignore') - return WbResponse.text_response(msg, - status=status) - -#================================================================= -DEFAULT_CONFIG_FILE = 'config.yaml' - - -#================================================================= -def init_app(init_func, load_yaml=True, config_file=None, config=None): - try: - config = config or {} - if load_yaml: - # env setting overrides all others - env_config = os.environ.get('PYWB_CONFIG_FILE') - if env_config: - config_file = env_config - - if not config_file: - config_file = DEFAULT_CONFIG_FILE - - if os.path.isfile(config_file): - config = load_yaml_config(config_file) - - wb_router = init_func(config) - except: - msg = '*** pywb app init FAILED config from "%s"!\n' - logging.exception(msg, init_func.__name__) - raise - else: - msg = '*** pywb app inited with config from "%s"!\n' - logging.debug(msg, init_func.__name__) - - return WSGIApp(wb_router) - - -#================================================================= -def start_wsgi_ref_server(the_app, name, port): # pragma: no cover - from wsgiref.simple_server import make_server, WSGIServer - from six.moves.socketserver import ThreadingMixIn - - # disable is_hop_by_hop restrictions - import wsgiref.handlers - wsgiref.handlers.is_hop_by_hop = lambda x: False - - if port is None: - port = DEFAULT_PORT - - logging.info('Starting %s on port %s', name, port) - - class ThreadingWSGIServer(ThreadingMixIn, WSGIServer): - pass - - try: - httpd = make_server('', port, the_app, ThreadingWSGIServer) - httpd.serve_forever() - except KeyboardInterrupt as ex: - pass - finally: - logging.info('Stopping %s', name) diff --git a/pywb/perms/__init__.py b/pywb/perms/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pywb/perms/perms_filter.py b/pywb/perms/perms_filter.py deleted file mode 100644 index 26996e39..00000000 --- a/pywb/perms/perms_filter.py +++ /dev/null @@ -1,85 +0,0 @@ -from pywb.utils.wbexception import AccessException - - -#================================================================= -def make_perms_cdx_filter(perms_policy, wbrequest): - """ - Called internally to convert a perms_policy and a request - to a filter which can be applied on the cdx - """ - perms_checker = perms_policy(wbrequest) - if not perms_checker: - return None - - return _create_cdx_perms_filter(perms_checker) - - -#================================================================= -def _create_cdx_perms_filter(perms_checker): - """ - Return a function which will filter the cdx given - a Perms object. - :param perms_checker: a Perms object which implements the - allow_url_lookup() and access_check_capture() methods - """ - - def perms_filter_op(cdx_iter, query): - """ - filter out those cdx records that user doesn't have access to, - by consulting :param perms_checker:. - :param cdx_iter: cdx record source iterable - :param query: request parameters (CDXQuery) - :param perms_checker: object implementing permission checker - """ - if not perms_checker.allow_url_lookup(query.key): - if query.is_exact: - raise AccessException('Excluded') - - for cdx in cdx_iter: - cdx = perms_checker.access_check_capture(cdx) - if cdx: - yield cdx - - return perms_filter_op - - -#================================================================ -def allow_all_perms_policy(wbrequest): - """ - Perms policy which always returns a default Perms object - which allows everything. - - The perms object is created per request and may store request - state, if necessary. - - The same perms object may be called with multiple queries - (such as for each cdx line) per request. - """ - return Perms() - - -#================================================================= -class Perms(object): - """ - A base perms checker which allows everything - """ - - def allow_url_lookup(self, key): - """ - Return true/false if urlkey (canonicalized url) - should be allowed. - - Default: allow all - """ - return True - - def access_check_capture(self, cdx): - """ - Allow/deny specified cdx capture (dict) to be included - in the result. - Return None to reject, or modify the cdx to exclude - any fields that need to be restricted. - - Default: allow cdx line without modifications - """ - return cdx diff --git a/pywb/perms/perms_handler.py b/pywb/perms/perms_handler.py deleted file mode 100644 index 7e0baf52..00000000 --- a/pywb/perms/perms_handler.py +++ /dev/null @@ -1,67 +0,0 @@ -from pywb.utils.canonicalize import UrlCanonicalizer -from pywb.utils.wbexception import NotFoundException - -from pywb.framework.basehandlers import WbUrlHandler -from pywb.framework.archivalrouter import ArchivalRouter, Route -from pywb.framework.wbrequestresponse import WbResponse - -BLOCK = '["block"]' -ALLOW = '["allow"]' -RESPONSE_TYPE = 'application/json' - -NOT_FOUND = 'Please specify a url to check for access' - - -#================================================================= -class PermsHandler(WbUrlHandler): - - def __init__(self, perms_policy, url_canon): - self.perms_policy = perms_policy - self.url_canon = url_canon - - def __call__(self, wbrequest): - perms_checker = self.perms_policy(wbrequest) - - if wbrequest.wb_url: - return self.check_single_url(wbrequest, perms_checker) - -# elif wbrequest.env['REQUEST_METHOD'] == 'POST': -# return self.check_bulk(wbrequest, perms_checker) - - else: - raise NotFoundException(NOT_FOUND) - - def check_single_url(self, wbrequest, perms_checker): - urlkey = self.url_canon(wbrequest.wb_url.url) - urlkey = urlkey.encode('utf-8') - - if not perms_checker.allow_url_lookup(urlkey): - response_text = BLOCK - else: - response_text = ALLOW - - #TODO: other types of checking - return WbResponse.text_response(response_text, - content_type=RESPONSE_TYPE) -#TODO -# def check_bulk_urls(self, wbrequest, perms_checker): -# pass -# - - -#================================================================= -def create_perms_checker_app(config): - """ - Create permissions checker standalone app - Running under the '/check-access' route - """ - port = config.get('port') - - perms_policy = config.get('perms_policy') - - canonicalizer = UrlCanonicalizer(config.get('surt_ordered', True)) - - handler = PermsHandler(perms_policy, canonicalizer) - routes = [Route('check-access', handler)] - - return ArchivalRouter(routes, port=port) diff --git a/pywb/urlrewrite/platformhandler.py b/pywb/urlrewrite/platformhandler.py deleted file mode 100644 index 827bfdf3..00000000 --- a/pywb/urlrewrite/platformhandler.py +++ /dev/null @@ -1,99 +0,0 @@ -from gevent.monkey import patch_all; patch_all() - -import requests - -from pywb.framework.archivalrouter import Route - -from pywb.rewrite.rewrite_content import RewriteContent -from pywb.rewrite.wburl import WbUrl -from warcio.recordloader import ArcWarcRecordLoader -from pywb.webapp.live_rewrite_handler import RewriteHandler -from pywb.utils.canonicalize import canonicalize -from warcio.timeutils import http_date_to_timestamp -from pywb.cdx.cdxobject import CDXObject - -from io import BytesIO - -from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest - -from six.moves.urllib.parse import quote - - -# ============================================================================ -class PlatformRoute(Route): - def apply_filters(self, wbrequest, matcher): - wbrequest.matchdict = matcher.groupdict() - - -# ============================================================================ -class PlatformHandler(RewriteHandler): - def __init__(self, config): - super(PlatformHandler, self).__init__(config) - self.upstream_url = config.get('upstream_url') - self.loader = ArcWarcRecordLoader() - - framed = config.get('framed_replay') - self.content_rewriter = RewriteContent(is_framed_replay=framed) - - def render_content(self, wbrequest): - if wbrequest.wb_url.mod == 'vi_': - return self._get_video_info(wbrequest) - - ref_wburl_str = wbrequest.extract_referrer_wburl_str() - if ref_wburl_str: - wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url - - urlkey = canonicalize(wbrequest.wb_url.url) - url = wbrequest.wb_url.url - - inputreq = RewriteInputRequest(wbrequest.env, urlkey, url, - self.content_rewriter) - - req_data = inputreq.reconstruct_request(url) - - headers = {'Content-Length': len(req_data), - 'Content-Type': 'application/request'} - - if wbrequest.wb_url.is_latest_replay(): - closest = 'now' - else: - closest = wbrequest.wb_url.timestamp - - upstream_url = self.upstream_url.format(url=quote(url), - closest=closest, - #coll=wbrequest.coll, - **wbrequest.matchdict) - - r = requests.post(upstream_url, - data=BytesIO(req_data), - headers=headers, - stream=True, - allow_redirects=False) - - r.raise_for_status() - - record = self.loader.parse_record_stream(r.raw) - - cdx = CDXObject() - cdx['urlkey'] = urlkey - cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime')) - cdx['url'] = url - - head_insert_func = self.head_insert_view.create_insert_func(wbrequest) - result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter, - record.http_headers, - record.stream, - head_insert_func, - urlkey, - cdx) - - status_headers, gen, is_rw = result - return self._make_response(wbrequest, *result) - - -if __name__ == "__main__": - from gevent.wsgi import WSGIServer - from pywb.apps.wayback import application - - server = WSGIServer(('', 8090), application) - server.serve_forever() diff --git a/pywb/warc/README.md b/pywb/warc/README.md deleted file mode 100644 index 91cc3036..00000000 --- a/pywb/warc/README.md +++ /dev/null @@ -1,32 +0,0 @@ -### pywb.warc - -This is the WARC/ARC record loading component of pywb wayback tool suite. -The package provides the following facilities: - -* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers - -* Resolve 'revisit' records from provided index to find a full record with headers and payload content - -* Load WARC/ARC records either locally or via http using http 1.1 range requests - - -When loading archived content, the format type (WARC vs ARC) and compressed ARCs/WARCs -are decompressed automatically. -No assumption is made about format based on filename, content type -or other external parameters other than the content itself. - -### Tests - -This package will includes a test suite for loading a variety of WARC and ARC records. - -Tests so far: - -* Compressed WARC, ARC Records -* Uncompressed ARC Records -* Compressed WARC created by wget 1.14 -* Same Url revisit record resolving - - -TODO: - -* Different url revisit record resolving diff --git a/pywb/webapp/__init__.py b/pywb/webapp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pywb/webapp/cdx_api_handler.py b/pywb/webapp/cdx_api_handler.py deleted file mode 100644 index 1ebd0018..00000000 --- a/pywb/webapp/cdx_api_handler.py +++ /dev/null @@ -1,62 +0,0 @@ -from pywb.cdx.cdxserver import create_cdx_server - -from pywb.utils.wbexception import NotFoundException -from pywb.framework.basehandlers import BaseHandler -from pywb.framework.wbrequestresponse import WbResponse - -from pywb.webapp.query_handler import QueryHandler - -from six.moves.urllib.parse import parse_qs -import json -import six - - -#================================================================= -class CDXAPIHandler(BaseHandler): - """ - Handler which passes wsgi request to cdx server and - returns a text-based cdx api - """ - def __init__(self, index_handler): - self.index_handler = index_handler - - def __call__(self, wbrequest): - params = self.extract_params_from_wsgi_env(wbrequest.env) - - try: - cdx_iter = self.index_handler.load_cdx(wbrequest, params) - except NotFoundException: - msg = 'No Captures found for: ' + params.get('url') - if params.get('output') == 'json': - msg = json.dumps(dict(error=msg)) - content_type='application/json' - else: - content_type='text/plain' - - return WbResponse.text_response(msg, content_type=content_type, - status='404 Not Found') - - return WbResponse.text_stream(cdx_iter, - content_type='text/plain') - - @staticmethod - def extract_params_from_wsgi_env(env): - """ utility function to extract params and create a CDXQuery - from a WSGI environment dictionary - """ - params = parse_qs(env['QUERY_STRING']) - - # parse_qs produces arrays for single values - # cdx processing expects singleton params for all params, - # except filters, so convert here - # use first value of the list - for name, val in six.iteritems(params): - if name != 'filter': - params[name] = val[0] - - if 'output' not in params: - params['output'] = 'text' - elif params['output'] not in ('text', 'json'): - params['output'] = 'text' - - return params diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 968dfd21..2f9029d2 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -1,195 +1,14 @@ -import pkgutil import mimetypes -import time -import logging -from datetime import datetime - -from warcio.statusandheaders import StatusAndHeaders -from warcio.timeutils import datetime_to_timestamp - -from pywb.utils.wbexception import NotFoundException from pywb.utils.loaders import LocalFileLoader -from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse -from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader -from pywb.warc.resolvingloader import ResolvingLoader -from pywb.warc.pathresolvers import PathResolverMapper - -from pywb.webapp.views import J2TemplateView, init_view -from pywb.webapp.replay_views import ReplayView -from pywb.framework.memento import MementoResponse - - -#================================================================= -class SearchPageWbUrlHandler(WbUrlHandler): - """ - Loads a default search page html template to be shown when - the wb_url is empty - """ - def __init__(self, config): - self.search_view = init_view(config, 'search_html') - - self.is_frame_mode = config.get('framed_replay', False) - self.frame_mod = 'tf_' - self.replay_mod = '' - - self.response_class = WbResponse - - if self.is_frame_mode: - #html = config.get('frame_insert_html', 'templates/frame_insert.html') - #self.search_view = J2TemplateView(html, config.get('jinja_env')) - self.frame_insert_view = init_view(config, 'frame_insert_html') - assert(self.frame_insert_view) - - self.banner_html = config.get('banner_html', 'banner.html') - - if config.get('enable_memento', False): - self.response_class = MementoResponse - - if self.is_frame_mode == 'inverse': - self.frame_mod = '' - self.replay_mod = 'mp_' - - else: - self.frame_insert_view = None - self.banner_html = None - - def render_search_page(self, wbrequest, **kwargs): - return self.search_view.render_response(wbrequest=wbrequest, - prefix=wbrequest.wb_prefix, - **kwargs) - - def __call__(self, wbrequest): - # root search page - if wbrequest.wb_url_str == '/': - return self.render_search_page(wbrequest) - - wbrequest.options['replay_mod'] = self.replay_mod - wbrequest.options['frame_mod'] = self.frame_mod - - # render top level frame if in frame mode - # (not supported in proxy mode) - if (self.is_frame_mode and wbrequest.wb_url and - not wbrequest.wb_url.is_query() and - not wbrequest.options['is_proxy']): - - if wbrequest.wb_url.mod == self.frame_mod: - wbrequest.options['is_top_frame'] = True - return self.get_top_frame_response(wbrequest) - else: - wbrequest.options['is_framed'] = True - wbrequest.final_mod = self.frame_mod - else: - wbrequest.options['is_framed'] = False - - try: - return self.handle_request(wbrequest) - except NotFoundException as nfe: - return self.handle_not_found(wbrequest, nfe) - - def get_top_frame_params(self, wbrequest, mod): - embed_url = wbrequest.wb_url.to_str(mod=mod) - - if wbrequest.wb_url.timestamp: - timestamp = wbrequest.wb_url.timestamp - else: - timestamp = datetime_to_timestamp(datetime.utcnow()) - - params = dict(embed_url=embed_url, - wbrequest=wbrequest, - timestamp=timestamp, - url=wbrequest.wb_url.get_url(), - banner_html=self.banner_html) - - return params - - def get_top_frame_response(self, wbrequest): - params = self.get_top_frame_params(wbrequest, mod=self.replay_mod) - - headers = [('Content-Type', 'text/html')] - status_headers = StatusAndHeaders('200 OK', headers) - - template_result = self.frame_insert_view.render_to_string(**params) - body = template_result.encode('utf-8') - - return self.response_class(status_headers, [body], wbrequest=wbrequest) - - -#================================================================= -# Standard WB Handler -#================================================================= -class WBHandler(SearchPageWbUrlHandler): - def __init__(self, query_handler, config=None): - super(WBHandler, self).__init__(config) - - self.index_reader = query_handler - self.not_found_view = init_view(config, 'not_found_html') - - self.replay = self._init_replay_view(config) - - self.fallback_handler = None - self.fallback_name = config.get('fallback') - - def _init_replay_view(self, config): - cookie_maker = config.get('cookie_maker') - record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker) - - paths = config.get('archive_paths') - - resolving_loader = ResolvingLoader(PathResolverMapper()(paths), - record_loader=record_loader) - - return ReplayView(resolving_loader, config) - - def resolve_refs(self, handler_dict): - if self.fallback_name: - self.fallback_handler = handler_dict.get(self.fallback_name) - logging.debug('Fallback Handler: ' + self.fallback_name) - - def handle_request(self, wbrequest): - cdx_lines, output = self.index_reader.load_for_request(wbrequest) - - if output != 'text' and wbrequest.wb_url.is_replay(): - return self.handle_replay(wbrequest, cdx_lines) - else: - return self.handle_query(wbrequest, cdx_lines, output) - - def handle_query(self, wbrequest, cdx_lines, output): - return self.index_reader.make_cdx_response(wbrequest, - cdx_lines, - output) - - def handle_replay(self, wbrequest, cdx_lines): - cdx_callback = self.index_reader.cdx_load_callback(wbrequest) - - return self.replay.render_content(wbrequest, - cdx_lines, - cdx_callback) - - def handle_not_found(self, wbrequest, nfe): - # check fallback: only for replay queries and not for identity - if (self.fallback_handler and - not wbrequest.wb_url.is_query() and - not wbrequest.wb_url.is_identity): - return self.fallback_handler(wbrequest) - - # if capture query, just return capture page - if wbrequest.wb_url.is_query(): - output = self.index_reader.get_output_type(wbrequest.wb_url) - return self.index_reader.make_cdx_response(wbrequest, iter([]), output) - else: - return self.not_found_view.render_response(status='404 Not Found', - wbrequest=wbrequest, - url=wbrequest.wb_url.url) - #================================================================= # Static Content Handler #================================================================= -class StaticHandler(BaseHandler): +class StaticHandler(object): def __init__(self, static_path): mimetypes.init() @@ -234,15 +53,3 @@ class StaticHandler(BaseHandler): wbrequest.wb_url_str) -#================================================================= -# Debug Handlers -#================================================================= -class DebugEchoEnvHandler(BaseHandler): # pragma: no cover - def __call__(self, wbrequest): - return WbResponse.text_response(str(wbrequest.env)) - - -#================================================================= -class DebugEchoHandler(BaseHandler): # pragma: no cover - def __call__(self, wbrequest): - return WbResponse.text_response(str(wbrequest)) diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py deleted file mode 100644 index 56143583..00000000 --- a/pywb/webapp/live_rewrite_handler.py +++ /dev/null @@ -1,241 +0,0 @@ -from pywb.framework.wbrequestresponse import WbResponse -from pywb.framework.cache import create_cache - -from pywb.rewrite.rewrite_live import LiveRewriter -from pywb.rewrite.wburl import WbUrl - -from pywb.webapp.handlers import StaticHandler, SearchPageWbUrlHandler -from pywb.webapp.views import HeadInsertView - -from pywb.utils.wbexception import LiveResourceException - -import json -import hashlib - - -#================================================================= -class RewriteHandler(SearchPageWbUrlHandler): - - LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60' - - YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json' - - def __init__(self, config): - super(RewriteHandler, self).__init__(config) - - proxyhostport = config.get('proxyhostport') - - live_rewriter_cls = config.get('live_rewriter_cls', LiveRewriter) - - self.live_fetcher = live_rewriter_cls(is_framed_replay=self.is_frame_mode, - proxies=proxyhostport) - - self.recording = self.live_fetcher.is_recording() - - self.head_insert_view = HeadInsertView.init_from_config(config) - - self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE) - - self.verify = config.get('verify_ssl', True) - - self.ydl = None - - self._cache = None - - def handle_request(self, wbrequest): - if wbrequest.wb_url.is_query(): - type_ = wbrequest.wb_url.LATEST_REPLAY - url = wbrequest.urlrewriter.get_new_url(type=type_, timestamp='') - return WbResponse.redir_response(url) - - if wbrequest.options['is_ajax']: - wbrequest.urlrewriter.rewrite_opts['is_ajax'] = True - - try: - return self.render_content(wbrequest) - - except Exception as exc: - import traceback - err_details = traceback.format_exc() - print(err_details) - - url = wbrequest.wb_url.url - msg = 'Could not load the url from the live web: ' + url - raise LiveResourceException(msg=msg, url=url) - - def _live_request_headers(self, wbrequest): - return {} - - def _skip_recording(self, wbrequest): - return False - - def render_content(self, wbrequest): - if wbrequest.wb_url.mod == 'vi_': - return self._get_video_info(wbrequest) - - head_insert_func = self.head_insert_view.create_insert_func(wbrequest) - req_headers = self._live_request_headers(wbrequest) - - ref_wburl_str = wbrequest.extract_referrer_wburl_str() - if ref_wburl_str: - wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - - skip_recording = self._skip_recording(wbrequest) - - use_206 = False - url = None - rangeres = None - - readd_range = False - cache_key = None - - if self.recording and not skip_recording: - rangeres = wbrequest.extract_range() - - if rangeres: - url, start, end, use_206 = rangeres - - # if bytes=0- Range request, - # simply remove the range and still proxy - if start == 0 and not end and use_206: - wbrequest.wb_url.url = url - del wbrequest.env['HTTP_RANGE'] - readd_range = True - else: - # disables proxy - skip_recording = True - - # sets cache_key only if not already cached - cache_key = self._get_cache_key('r:', url) - - result = self.live_fetcher.fetch_request(wbrequest.wb_url.url, - wbrequest.urlrewriter, - head_insert_func=head_insert_func, - req_headers=req_headers, - env=wbrequest.env, - skip_recording=skip_recording, - verify=self.verify) - - wbresponse = self._make_response(wbrequest, *result) - - if readd_range: - content_length = (wbresponse.status_headers. - get_header('Content-Length')) - try: - content_length = int(content_length) - wbresponse.status_headers.add_range(0, content_length, - content_length) - except (ValueError, TypeError): - pass - - if self.recording and cache_key: - self._add_rec_ping(cache_key, url, wbrequest, wbresponse) - - if rangeres: - referrer = wbrequest.env.get('REL_REFERER') - - # also ping video info - if referrer: - try: - resp = self._get_video_info(wbrequest, - info_url=referrer, - video_url=url) - except: - print('Error getting video info') - - return wbresponse - - def _make_response(self, wbrequest, status_headers, gen, is_rewritten): - # if cookie set, pass recorded timestamp info via cookie - # so that client side may be able to access it - # used by framed mode to update frame banner - if self.live_cookie: - cdx = wbrequest.env.get('pywb.cdx') - if cdx: - value = self.live_cookie.format(cdx['timestamp']) - status_headers.headers.append(('Set-Cookie', value)) - - return WbResponse(status_headers, gen) - - def _get_cache_key(self, prefix, url): - if not self._cache: - self._cache = create_cache() - - key = self.create_cache_key(prefix, url) - - if key in self._cache: - return None - - return key - - @staticmethod - def create_cache_key(prefix, url): - hash_ = hashlib.md5() - hash_.update(url.encode('utf-8')) - key = hash_.hexdigest() - key = prefix + key - return key - - def _add_rec_ping(self, key, url, wbrequest, wbresponse): - def do_ping(): - headers = self._live_request_headers(wbrequest) - headers['Connection'] = 'close' - - try: - # mark as pinged - self._cache[key] = '1' - - self.live_fetcher.fetch_async(url, headers) - - except: - del self._cache[key] - raise - - def wrap_buff_gen(gen): - for x in gen: - yield x - - try: - do_ping() - except: - pass - - #do_ping() - wbresponse.body = wrap_buff_gen(wbresponse.body) - return wbresponse - - def _get_video_info(self, wbrequest, info_url=None, video_url=None): - if not video_url: - video_url = wbrequest.wb_url.url - - if not info_url: - info_url = wbrequest.wb_url.url - - cache_key = None - if self.recording: - cache_key = self._get_cache_key('v:', video_url) - - info = self.live_fetcher.get_video_info(video_url) - if info is None: #pragma: no cover - msg = ('youtube-dl is not installed, pip install youtube-dl to ' + - 'enable improved video proxy') - - return WbResponse.text_response(text=msg, status='404 Not Found') - - #if info and info.formats and len(info.formats) == 1: - - content_type = self.YT_DL_TYPE - metadata = json.dumps(info) - - if (self.recording and cache_key): - headers = self._live_request_headers(wbrequest) - headers['Content-Type'] = content_type - - if info_url.startswith('https://'): - info_url = info_url.replace('https', 'http', 1) - - response = self.live_fetcher.add_metadata(info_url, headers, metadata) - - self._cache[cache_key] = '1' - - return WbResponse.text_response(metadata, content_type=content_type) diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py deleted file mode 100644 index 804653be..00000000 --- a/pywb/webapp/pywb_init.py +++ /dev/null @@ -1,387 +0,0 @@ -from pywb.utils.loaders import load_yaml_config - -from pywb.framework.archivalrouter import ArchivalRouter, Route -from pywb.framework.proxy import ProxyArchivalRouter -from pywb.framework.wbrequestresponse import WbRequest -from pywb.framework.memento import MementoRequest -from pywb.framework.basehandlers import BaseHandler - -from pywb.webapp.views import J2TemplateView -from pywb.webapp.views import J2HtmlCapturesView, init_view - -from pywb.webapp.live_rewrite_handler import RewriteHandler - -from pywb.webapp.query_handler import QueryHandler -from pywb.webapp.handlers import WBHandler -from pywb.webapp.handlers import StaticHandler -from pywb.webapp.handlers import DebugEchoHandler, DebugEchoEnvHandler -from pywb.webapp.cdx_api_handler import CDXAPIHandler - -from pywb import DEFAULT_CONFIG - -import os -import logging -import six - - -#================================================================= -class DictChain(object): - def __init__(self, *dicts): - self.dicts = dicts - - def get(self, key, default_val=None): - for d in self.dicts: - val = d.get(key) - if val is not None: - return val - return default_val - - def __contains__(self, key): - return self.get(key) is not None - - def __getitem__(self, key): - return self.get(key) - - def __setitem__(self, key, value): - self.dicts[0][key] = value - - -#================================================================= -def create_wb_handler(query_handler, config): - wb_handler_class = config.get('wb_handler_class', WBHandler) - - wb_handler = wb_handler_class( - query_handler, - config=config, - ) - - return wb_handler - - -#================================================================= -def create_live_handler(config): - wb_handler_class = config.get('wb_handler_class', RewriteHandler) - - live_handler = wb_handler_class(config) - - return live_handler - - -#================================================================= -def init_route_config(value, config): - if isinstance(value, str) or isinstance(value, list): - value = dict(index_paths=value) - - route_config = DictChain(value, config) - return route_config - - -#================================================================= -def init_collection(route_config): - ds_rules_file = route_config.get('domain_specific_rules', None) - - html_view = init_view(route_config, 'query_html', J2HtmlCapturesView) - - server_cls = route_config.get('server_cls') - - query_handler = QueryHandler.init_from_config(route_config, - ds_rules_file, - html_view, - server_cls) - - return query_handler - - -#================================================================= -def add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler, - route_class=Route): - # if bool, use -cdx suffix, else use custom string - # as the suffix - if isinstance(cdx_api_suffix, bool): - name += '-cdx' - else: - name += str(cdx_api_suffix) - - logging.debug('Adding CDX API Handler: ' + name) - routes.append(route_class(name, CDXAPIHandler(query_handler))) - - -#================================================================= -def create_cdx_server_app(passed_config): - """ - Create a cdx server api-only app - For each collection, create a /-cdx access point - which follows the cdx api - """ - - defaults = load_yaml_config(DEFAULT_CONFIG) - - config = DictChain(passed_config, defaults) - - collections = config.get('collections', {}) - - static_routes = {} - - # collections based on file system - if config.get('enable_auto_colls', True): - colls_loader_cls = config.get('colls_loader_cls', DirectoryCollsLoader) - dir_loader = colls_loader_cls(config, static_routes, collections) - dir_loader() - #collections.update(dir_loader()) - - routes = [] - - for name, value in six.iteritems(collections): - route_config = init_route_config(value, config) - query_handler = init_collection(route_config) - - cdx_api_suffix = route_config.get('enable_cdx_api', True) - - add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler) - - return ArchivalRouter(routes) - - -#================================================================= -class DirectoryCollsLoader(object): - def __init__(self, config, static_routes, colls): - self.config = config - self.static_routes = static_routes - self.colls = colls - - def __call__(self): - colls = self.colls - - static_dir = self.config.get('paths')['static_path'] - static_shared_prefix = self.config.get('static_shared_prefix') - - if static_dir and static_shared_prefix and os.path.isdir(static_dir): - static_dir = os.path.abspath(static_dir) + os.path.sep - self.static_routes[static_shared_prefix] = static_dir - - root_dir = self.config.get('collections_root', '') - if not root_dir or not os.path.isdir(root_dir): - return colls - - for name in os.listdir(root_dir): - full = os.path.join(root_dir, name) - if not os.path.isdir(full): - continue - - coll_config = self.load_coll_dir(full, name) - if coll_config: - # if already exists, override existing config with coll specific - if name in colls: - colls[name].update(coll_config) - else: - colls[name] = coll_config - - return colls - - def _norm_path(self, root_dir, path): - result = os.path.normpath(os.path.join(root_dir, path)) - return result - - def _add_dir_if_exists(self, coll, root_dir, dir_key, required=False): - curr_val = coll.get(dir_key) - if curr_val: - # add collection path only if relative path, and not a url - if '://' not in curr_val and not os.path.isabs(curr_val): - coll[dir_key] = self._norm_path(root_dir, curr_val) + os.path.sep - return False - - thedir = self.config.get('paths')[dir_key] - - fulldir = os.path.join(root_dir, thedir) - - if os.path.isdir(fulldir): - fulldir = os.path.abspath(fulldir) + os.path.sep - coll[dir_key] = fulldir - return True - elif required: - msg = 'Dir "{0}" does not exist for "{1}"'.format(fulldir, dir_key) - raise Exception(msg) - else: - return False - - def load_yaml_file(self, root_dir, filename): - filename = os.path.join(root_dir, filename) - if os.path.isfile(filename): - return load_yaml_config(filename) - else: - return {} - - def load_coll_dir(self, root_dir, name): - # Load config.yaml - coll_config = self.load_yaml_file(root_dir, 'config.yaml') - - # Load metadata.yaml - metadata = self.load_yaml_file(root_dir, 'metadata.yaml') - coll_config['metadata'] = metadata - - self._add_dir_if_exists(coll_config, root_dir, 'index_paths', True) - - # inherit these properties from base, in case archive_paths is shared - shared_config = DictChain(coll_config, self.config) - self._add_dir_if_exists(shared_config, root_dir, 'archive_paths', True) - - if self._add_dir_if_exists(coll_config, root_dir, 'static_path', False): - self.static_routes['static/' + name] = coll_config['static_path'] - - # Custom templates dir - templates_dir = self.config.get('paths').get('templates_dir') - if templates_dir: - template_dir = os.path.join(root_dir, templates_dir) - - # Check all templates - template_files = self.config.get('paths')['template_files'] - for tname, tfile in six.iteritems(template_files): - if tname in coll_config: - # Already set - coll_config[tname] = self._norm_path(root_dir, coll_config[tname]) - - # If templates override dir - elif templates_dir: - full = os.path.join(template_dir, tfile) - if os.path.isfile(full): - coll_config[tname] = full - - return coll_config - - -#================================================================= -def create_wb_router(passed_config=None): - passed_config = passed_config or {} - - defaults = load_yaml_config(DEFAULT_CONFIG) - - config = DictChain(passed_config, defaults) - - routes = [] - - port = config.get('port') - - collections = config.get('collections', {}) - - static_routes = config.get('static_routes', {}) - - root_route = None - - # collections based on file system - if config.get('enable_auto_colls', True): - colls_loader_cls = config.get('colls_loader_cls', DirectoryCollsLoader) - dir_loader = colls_loader_cls(config, static_routes, collections) - dir_loader() - #collections.update(dir_loader()) - - if config.get('enable_memento', False): - request_class = MementoRequest - else: - request_class = WbRequest - - # store live and replay handlers - handler_dict = {} - - # setup template globals - templates_dirs = config['templates_dirs'] - jinja_env = J2TemplateView.init_shared_env(paths=templates_dirs, - packages=config['template_packages']) - - jinja_env.globals.update(config.get('template_globals', {})) - - for static_name, static_path in six.iteritems(static_routes): - routes.append(Route(static_name, StaticHandler(static_path))) - - for name, value in six.iteritems(collections): - if isinstance(value, BaseHandler): - handler_dict[name] = value - new_route = Route(name, value, config=config) - if name != '': - routes.append(new_route) - else: - root_route = new_route - continue - - route_config = init_route_config(value, config) - route_class = route_config.get('route_class', Route) - - if route_config.get('index_paths') == '$liveweb': - live = create_live_handler(route_config) - handler_dict[name] = live - new_route = route_class(name, live, config=route_config) - if name != '': - routes.append(new_route) - else: - root_route = new_route - continue - - query_handler = init_collection(route_config) - - wb_handler = create_wb_handler( - query_handler=query_handler, - config=route_config, - ) - - handler_dict[name] = wb_handler - - logging.debug('Adding Collection: ' + name) - - new_route = route_class(name, wb_handler, - config=route_config, - request_class=request_class) - - if name != '': - routes.append(new_route) - else: - root_route = new_route - - # cdx query handler - cdx_api_suffix = route_config.get('enable_cdx_api', False) - - if cdx_api_suffix: - add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler, - route_class=route_class) - - if config.get('debug_echo_env', False): - routes.append(Route('echo_env', DebugEchoEnvHandler())) - - if config.get('debug_echo_req', False): - routes.append(Route('echo_req', DebugEchoHandler())) - - if root_route: - routes.append(root_route) - - # resolve any cross handler references - for route in routes: - if hasattr(route.handler, 'resolve_refs'): - route.handler.resolve_refs(handler_dict) - - # default to regular archival mode - router = ArchivalRouter - - if config.get('enable_http_proxy', False): - router = ProxyArchivalRouter - - view = init_view(config, 'proxy_select_html') - - if 'proxy_options' not in passed_config: - passed_config['proxy_options'] = {} - - if view: - passed_config['proxy_options']['proxy_select_view'] = view - - view = init_view(config, 'proxy_cert_download_html') - - if view: - passed_config['proxy_options']['proxy_cert_download_view'] = view - - # Finally, create wb router - return router( - routes, - port=port, - abs_path=config.get('absolute_paths', True), - home_view=init_view(config, 'home_html'), - error_view=init_view(config, 'error_html'), - info_view=init_view(config, 'info_json'), - config=config - ) diff --git a/pywb/webapp/query_handler.py b/pywb/webapp/query_handler.py deleted file mode 100644 index 69d3bc58..00000000 --- a/pywb/webapp/query_handler.py +++ /dev/null @@ -1,172 +0,0 @@ -from pywb.utils.dsrules import DEFAULT_RULES_FILE - -from pywb.perms.perms_filter import make_perms_cdx_filter -from pywb.framework.wbrequestresponse import WbResponse -from pywb.cdx.cdxserver import create_cdx_server -from pywb.webapp.views import MementoTimemapView - - -#================================================================= -class QueryHandler(object): - """ - Main interface for querying the index (currently only CDX) from a - source server (currently a cdx server) - - Creates an appropriate query based on wbrequest type info and outputs - a returns a view for the cdx, either a raw cdx iter, an html view, - etc... - """ - - def __init__(self, cdx_server, html_query_view=None, perms_policy=None): - self.cdx_server = cdx_server - self.perms_policy = perms_policy - - self.views = {} - if html_query_view: - self.views['html'] = html_query_view - - self.views['timemap'] = MementoTimemapView() - - @staticmethod - def init_from_config(config, - ds_rules_file=DEFAULT_RULES_FILE, - html_view=None, - server_cls=None): - - perms_policy = None - - if hasattr(config, 'get'): - perms_policy = config.get('perms_policy') - server_cls = config.get('server_cls', server_cls) - - cdx_server = create_cdx_server(config, ds_rules_file, server_cls) - - return QueryHandler(cdx_server, html_view, perms_policy) - - def get_output_type(self, wb_url): - # cdx server only supports text and cdxobject for now - if wb_url.mod == 'cdx_': - output = 'text' - elif wb_url.mod == 'timemap': - output = 'timemap' - elif wb_url.is_query(): - output = 'html' - else: - output = 'cdxobject' - - return output - - def load_for_request(self, wbrequest): - wbrequest.normalize_post_query() - - wb_url = wbrequest.wb_url - output = self.get_output_type(wb_url) - - # init standard params - params = self.get_query_params(wb_url) - - params['allowFuzzy'] = True - params['url'] = wb_url.url - params['output'] = output - - params['filter'].append('!mimetype:-') - - # get metadata - if wb_url.mod == 'vi_': - # matching metadata explicitly with special scheme - schema, rest = wb_url.url.split('://', 1) - params['url'] = 'metadata://' + rest - params['filter'].append('~original:metadata://') - - cdx_iter = self.load_cdx(wbrequest, params) - return cdx_iter, output - - def load_cdx(self, wbrequest, params): - if wbrequest: - # add any custom filter from the request - if wbrequest.query_filter: - filters = params.get('filter') - if filters: - filters.extend(wbrequest.query_filter) - else: - params['filter'] = wbrequest.query_filter - - params['coll'] = wbrequest.coll - if wbrequest.custom_params: - params.update(wbrequest.custom_params) - - if self.perms_policy: - perms_op = make_perms_cdx_filter(self.perms_policy, wbrequest) - if perms_op: - params['custom_ops'] = [perms_op] - - cdx_iter = self.cdx_server.load_cdx(**params) - return cdx_iter - - def make_cdx_response(self, wbrequest, cdx_iter, output, **kwargs): - # if not text, the iterator is assumed to be CDXObjects - if output and output != 'text': - view = self.views.get(output) - if view: - return view.render_response(wbrequest, cdx_iter, **kwargs) - - return WbResponse.text_stream(cdx_iter) - - def cdx_load_callback(self, wbrequest): - def load_cdx(params): - params['output'] = 'cdxobject' - return self.load_cdx(wbrequest, params) - - return load_cdx - - def get_query_params(self, - wburl, limit=150000, - collapse_time=None, - replay_closest=100): - - #if wburl.type == wburl.URL_QUERY: - # raise NotImplementedError('Url Query Not Yet Supported') - - return { - wburl.QUERY: - {'collapseTime': collapse_time, - 'filter': ['!statuscode:(500|502|504)'], - 'from': wburl.timestamp, - 'to': wburl.end_timestamp, - 'limit': limit, - 'matchType': 'exact', - }, - - wburl.URL_QUERY: - {'collapse': 'urlkey', - 'matchType': 'prefix', - 'showGroupCount': True, - 'showUniqCount': True, - 'lastSkipTimestamp': True, - 'limit': limit, - 'fl': ('urlkey,original,timestamp,' + - 'endtimestamp,groupcount,uniqcount'), - 'filter': [], - }, - - wburl.REPLAY: - {'sort': 'closest', - 'filter': ['!statuscode:(500|502|504)'], - 'limit': replay_closest, - 'closest': wburl.timestamp, - 'resolveRevisits': True, - 'matchType': 'exact', - }, - - wburl.LATEST_REPLAY: - {'sort': 'reverse', - # Not appropriate as default - # Should be an option to configure status code filtering in general - # 'filter': ['statuscode:[23]..|-'], - 'filter': [], - 'limit': '1', - 'resolveRevisits': True, - 'matchType': 'exact', - } - - }[wburl.type] diff --git a/pywb/webapp/rangecache.py b/pywb/webapp/rangecache.py deleted file mode 100644 index 9c024b54..00000000 --- a/pywb/webapp/rangecache.py +++ /dev/null @@ -1,92 +0,0 @@ -from warcio.statusandheaders import StatusAndHeaders -from warcio.limitreader import LimitReader - -from pywb.framework.cache import create_cache - -from tempfile import NamedTemporaryFile, mkdtemp - -import yaml -import os -from shutil import rmtree - -import atexit - - -#================================================================= -class RangeCache(object): - def __init__(self): - self.cache = create_cache() - self.temp_dir = None - atexit.register(self.cleanup) - - def cleanup(self): - if self.temp_dir: # pragma: no cover - print('Removing: ' + self.temp_dir) - rmtree(self.temp_dir, True) - self.temp_dir = None - - def handle_range(self, wbrequest, key, wbresponse_func, - url, start, end, use_206): - # key must be set - assert(key) - if key not in self.cache: - wbrequest.custom_params['noredir'] = True - response = wbresponse_func() - - # only cache 200 responses - if not response.status_headers.get_statuscode().startswith('200'): - return response.status_headers, response.body - - if not self.temp_dir: - self.temp_dir = mkdtemp(prefix='_pywbcache') - else: - pass - #self._check_dir_size(self.temp_dir) - - with NamedTemporaryFile(delete=False, dir=self.temp_dir) as fh: - for obj in response.body: - fh.write(obj) - - name = fh.name - - spec = dict(name=fh.name, - headers=response.status_headers.headers) - - self.cache[key] = yaml.dump(spec) - else: - spec = yaml.load(self.cache[key]) - - spec['headers'] = [tuple(x) for x in spec['headers']] - - filelen = os.path.getsize(spec['name']) - - maxlen = filelen - start - - if end: - maxlen = min(maxlen, end - start + 1) - - def read_range(): - with open(spec['name'], 'rb') as fh: - fh.seek(start) - fh = LimitReader.wrap_stream(fh, maxlen) - while True: - buf = fh.read() - if not buf: - break - - yield buf - - status_headers = StatusAndHeaders('200 OK', spec['headers']) - - if use_206: - StatusAndHeaders.add_range(status_headers, start, - maxlen, - filelen) - - status_headers.replace_header('Content-Length', str(maxlen)) - - return status_headers, read_range() - - -#================================================================= -range_cache = RangeCache() diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py deleted file mode 100644 index 7b6652f2..00000000 --- a/pywb/webapp/replay_views.py +++ /dev/null @@ -1,392 +0,0 @@ -import re -import logging - -from io import BytesIO -from six.moves.urllib.parse import urlsplit -from itertools import chain - -from warcio.statusandheaders import StatusAndHeaders -from warcio.limitreader import LimitReader -from warcio.timeutils import timestamp_now -from warcio.recordloader import ArchiveLoadFailed - -from pywb.utils.wbexception import WbException, NotFoundException - -from pywb.framework.wbrequestresponse import WbResponse -from pywb.framework.memento import MementoResponse - -from pywb.rewrite.rewrite_content import RewriteContent - -from pywb.webapp.views import HeadInsertView - -from pywb.webapp.rangecache import range_cache - - -#================================================================= -class CaptureException(WbException): - """ - raised to indicate an issue with a specific capture - and will be caught and result in a retry, if possible - if not, will result in a 502 - """ - def status(self): - return '502 Internal Server Error' - - -#================================================================= -class ReplayView(object): - STRIP_SCHEME_WWW = re.compile('^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$', re.MULTILINE) - - def __init__(self, content_loader, config): - self.content_loader = content_loader - - framed = config.get('framed_replay') - self.content_rewriter = RewriteContent(is_framed_replay=framed) - - self.head_insert_view = HeadInsertView.init_from_config(config) - - self.buffer_response = config.get('buffer_response', True) - self.buffer_max_size = config.get('buffer_max_size', 16384) - - self.redir_to_exact = config.get('redir_to_exact', True) - - memento = config.get('enable_memento', False) - if memento: - self.response_class = MementoResponse - else: - self.response_class = WbResponse - - self.enable_range_cache = config.get('enable_ranges', True) - - self._reporter = config.get('reporter') - - def render_content(self, wbrequest, cdx_lines, cdx_loader): - last_e = None - first = True - - #cdx_lines = args[0] - #cdx_loader = args[1] - - # List of already failed w/arcs - failed_files = [] - - response = None - - # Iterate over the cdx until find one that works - # The cdx should already be sorted in - # closest-to-timestamp order (from the cdx server) - for cdx in cdx_lines: - try: - # optimize: can detect if redirect is needed just from the cdx, - # no need to load w/arc data if requiring exact match - if first: - redir_response = self._redirect_if_needed(wbrequest, cdx) - if redir_response: - return redir_response - - first = False - - response = self.cached_replay_capture(wbrequest, - cdx, - cdx_loader, - failed_files) - - except (CaptureException, ArchiveLoadFailed) as ce: - #import traceback - #traceback.print_exc() - logging.debug(ce) - last_e = ce - pass - - if response: - return response - - if not last_e: - # can only get here if cdx_lines is empty somehow - # should be filtered out before hand, but if not - msg = 'No Captures found for: ' + wbrequest.wb_url.url - last_e = NotFoundException(msg) - - raise last_e - - def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): - def get_capture(): - return self.replay_capture(wbrequest, - cdx, - cdx_loader, - failed_files) - - if not self.enable_range_cache: - return get_capture() - - range_info = wbrequest.extract_range() - - if not range_info: - return get_capture() - - range_status, range_iter = (range_cache. - handle_range(wbrequest, - cdx.get('digest', cdx['urlkey']), - get_capture, - *range_info)) - - response = self.response_class(range_status, - range_iter, - wbrequest=wbrequest, - cdx=cdx) - return response - - def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): - (status_headers, stream) = (self.content_loader(cdx, - failed_files, - cdx_loader, - wbrequest)) - - # check and reject self-redirect - self._reject_self_redirect(wbrequest, cdx, status_headers) - - # check if redir is needed - redir_response = self._redirect_if_needed(wbrequest, cdx) - if redir_response: - return redir_response - - #length = status_headers.get_header('content-length') - #stream = LimitReader.wrap_stream(stream, length) - - # one more check for referrer-based self-redirect - # TODO: evaluate this, as refreshing in browser may sometimes cause - # referrer to be set to the same page, incorrectly skipping a capture - # self._reject_referrer_self_redirect(wbrequest) - - urlrewriter = wbrequest.urlrewriter - - # if using url rewriter, use original url for rewriting purposes - if wbrequest and wbrequest.wb_url: - wbrequest.wb_url.url = cdx['url'] - - if wbrequest.options['is_ajax']: - wbrequest.urlrewriter.rewrite_opts['is_ajax'] = True - - head_insert_func = None - if self.head_insert_view: - head_insert_func = (self.head_insert_view. - create_insert_func(wbrequest)) - - result = (self.content_rewriter. - rewrite_content(urlrewriter, - status_headers=status_headers, - stream=stream, - head_insert_func=head_insert_func, - urlkey=cdx['urlkey'], - cdx=cdx, - env=wbrequest.env)) - - (status_headers, response_iter, is_rewritten) = result - - # buffer response if buffering enabled - if self.buffer_response: - content_len = status_headers.get_header('content-length') - try: - content_len = int(content_len) - except: - content_len = 0 - - if content_len <= 0: - max_size = self.buffer_max_size - response_iter = self.buffered_response(status_headers, - response_iter, - max_size) - - # Set Content-Location if not exact capture - if not self.redir_to_exact: - mod = wbrequest.options.get('replay_mod', wbrequest.wb_url.mod) - canon_url = (wbrequest.urlrewriter. - get_new_url(timestamp=cdx['timestamp'], - url=cdx['url'], - mod=mod)) - - status_headers.headers.append(('Content-Location', canon_url)) - - if wbrequest.wb_url.mod == 'vi_': - status_headers.headers.append(('access-control-allow-origin', '*')) - - response = self.response_class(status_headers, - response_iter, - wbrequest=wbrequest, - cdx=cdx) - - # notify reporter callback, if any - if self._reporter: - self._reporter(wbrequest, cdx, response) - - return response - - # Buffer rewrite iterator and return a response from a string - def buffered_response(self, status_headers, iterator, max_size): - out = BytesIO() - size = 0 - read_all = True - - try: - for buff in iterator: - buff = bytes(buff) - size += len(buff) - out.write(buff) - if max_size > 0 and size > max_size: - read_all = False - break - - finally: - content = out.getvalue() - out.close() - - if read_all: - content_length_str = str(len(content)) - - # remove existing content length - status_headers.replace_header('Content-Length', - content_length_str) - return [content] - else: - status_headers.remove_header('Content-Length') - return chain(iter([content]), iterator) - - def _redirect_if_needed(self, wbrequest, cdx): - if not self.redir_to_exact: - return None - - if wbrequest.options['is_proxy']: - return None - - if wbrequest.custom_params.get('noredir'): - return None - - is_timegate = (wbrequest.options.get('is_timegate', False)) - if not is_timegate: - is_timegate = wbrequest.wb_url.is_latest_replay() - - redir_needed = is_timegate or (cdx['timestamp'] != wbrequest.wb_url.timestamp) - - if not redir_needed: - return None - - if self.enable_range_cache and wbrequest.extract_range(): - return None - - #if is_timegate: - # timestamp = timestamp_now() - #else: - timestamp = cdx['timestamp'] - - new_url = (wbrequest.urlrewriter. - get_new_url(timestamp=timestamp, - url=cdx['url'])) - - if wbrequest.method == 'POST': - # FF shows a confirm dialog, so can't use 307 effectively - # was: statusline = '307 Same-Method Internal Redirect' - return None - elif is_timegate: - statusline = '302 Found' - else: - # clear cdx line to indicate internal redirect - statusline = '302 Internal Redirect' - cdx = None - - status_headers = StatusAndHeaders(statusline, - [('Location', new_url)]) - - return self.response_class(status_headers, - wbrequest=wbrequest, - cdx=cdx, - memento_is_redir=True) - - def _reject_self_redirect(self, wbrequest, cdx, status_headers): - """ - Check if response is a 3xx redirect to the same url - If so, reject this capture to avoid causing redirect loop - """ - if not status_headers.statusline.startswith('3'): - return - - # skip all 304s - if (status_headers.statusline.startswith('304') and - not wbrequest.wb_url.is_identity): - - raise CaptureException('Skipping 304 Modified: ' + str(cdx)) - - request_url = wbrequest.wb_url.url.lower() - location_url = status_headers.get_header('Location') - if not location_url: - return - - location_url = location_url.lower() - if location_url.startswith('/'): - host = urlsplit(cdx['url']).netloc - location_url = host + location_url - - if (ReplayView.strip_scheme_www(request_url) == - ReplayView.strip_scheme_www(location_url)): - raise CaptureException('Self Redirect: ' + str(cdx)) - - # TODO: reevaluate this, as it may reject valid refreshes of a page - def _reject_referrer_self_redirect(self, wbrequest): # pragma: no cover - """ - Perform final check for referrer based self-redirect. - This method should be called after verifying that - the request timestamp == capture timestamp - - If referrer is same as current url, - reject this response and try another capture. - """ - if not wbrequest.referrer: - return - - # build full url even if using relative-rewriting - request_url = (wbrequest.host_prefix + - wbrequest.rel_prefix + str(wbrequest.wb_url)) - - if (ReplayView.strip_scheme_www(request_url) == - ReplayView.strip_scheme_www(wbrequest.referrer)): - raise CaptureException('Self Redirect via Referrer: ' + - str(wbrequest.wb_url)) - - @staticmethod - def strip_scheme_www(url): - """ - >>> ReplayView.strip_scheme_www('https://example.com') ==\ - ReplayView.strip_scheme_www('http://example.com') - True - - >>> ReplayView.strip_scheme_www('https://example.com') ==\ - ReplayView.strip_scheme_www('http:/example.com') - True - - >>> ReplayView.strip_scheme_www('https://example.com') ==\ - ReplayView.strip_scheme_www('example.com') - True - - >>> ReplayView.strip_scheme_www('https://example.com') ==\ - ReplayView.strip_scheme_www('http://www2.example.com') - True - - >>> ReplayView.strip_scheme_www('about://example.com') ==\ - ReplayView.strip_scheme_www('example.com') - True - - >>> ReplayView.strip_scheme_www('http://') ==\ - ReplayView.strip_scheme_www('') - True - - >>> ReplayView.strip_scheme_www('#!@?') ==\ - ReplayView.strip_scheme_www('#!@?') - True - """ - m = ReplayView.STRIP_SCHEME_WWW.match(url) - match = m.group(2) - return match - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/webapp/test/test_view_filters.py b/pywb/webapp/test/test_view_filters.py deleted file mode 100644 index f82ea6e7..00000000 --- a/pywb/webapp/test/test_view_filters.py +++ /dev/null @@ -1,20 +0,0 @@ -""" ->>> format_ts('20141226101000') -'Fri, Dec 26 2014 10:10:00' - ->>> format_ts('20141226101000', '%s') -1419588600 - ->>> is_wb_handler(DebugEchoHandler()) -False - - -""" - -from pywb.webapp.views import format_ts, is_wb_handler -from pywb.webapp.handlers import DebugEchoHandler - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py deleted file mode 100644 index 7ec27faa..00000000 --- a/pywb/webapp/views.py +++ /dev/null @@ -1,222 +0,0 @@ -from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec -from pywb.framework.wbrequestresponse import WbResponse -from pywb.framework.memento import make_timemap, LINK_FORMAT - -from six.moves.urllib.parse import urlsplit - -import logging -import json -import os - -from jinja2 import Environment -from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader - - -FILTERS = {} - - -#================================================================= -class template_filter(object): - """ - Decorator for registering a function as a jinja2 filter - If optional argument is supplied, it is used as the filter name - Otherwise, the func name is the filter name - """ - def __init__(self, param=None): - self.name = param - - def __call__(self, func): - name = self.name - if not name: - name = func.__name__ - - FILTERS[name] = func - return func - - -#================================================================= -# Filters -@template_filter() -def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): - if format_ == '%s': - return timestamp_to_sec(value) - else: - value = timestamp_to_datetime(value) - return value.strftime(format_) - - -@template_filter('urlsplit') -def get_urlsplit(url): - split = urlsplit(url) - return split - - -@template_filter() -def is_wb_handler(obj): - if not hasattr(obj, 'handler'): - return False - - return obj.handler.__class__.__name__ == "WBHandler" - - -@template_filter() -def tojson(obj): - return json.dumps(obj) - - -#================================================================= -class FileOnlyPackageLoader(PackageLoader): - def get_source(self, env, template): - dir_, file_ = os.path.split(template) - return super(FileOnlyPackageLoader, self).get_source(env, file_) - - -#================================================================= -class RelEnvironment(Environment): - """Override join_path() to enable relative template paths.""" - def join_path(self, template, parent): - return os.path.join(os.path.dirname(parent), template) - - -#================================================================= -class J2TemplateView(object): - shared_jinja_env = None - - def __init__(self, filename): - self.template_file = filename - self.jinja_env = self.init_shared_env() - - @staticmethod - def init_shared_env(paths=['templates', '.', '/'], - packages=['pywb'], - overlay_env=None): - - if J2TemplateView.shared_jinja_env: - return J2TemplateView.shared_jinja_env - - loaders = J2TemplateView._add_loaders(paths, packages) - loader = ChoiceLoader(loaders) - - if overlay_env: - jinja_env = overlay_env.overlay(loader=loader, trim_blocks=True) - else: - jinja_env = RelEnvironment(loader=loader, trim_blocks=True) - - jinja_env.filters.update(FILTERS) - J2TemplateView.shared_jinja_env = jinja_env - return jinja_env - - @staticmethod - def _add_loaders(paths, packages): - loaders = [] - # add loaders for paths - for path in paths: - loaders.append(FileSystemLoader(path)) - - # add loaders for all specified packages - for package in packages: - loaders.append(FileOnlyPackageLoader(package)) - - return loaders - - def render_to_string(self, **kwargs): - template = self.jinja_env.get_template(self.template_file) - - wbrequest = kwargs.get('wbrequest') - if wbrequest: - params = wbrequest.env.get('pywb.template_params') - if params: - kwargs.update(params) - - template_result = template.render(**kwargs) - - return template_result - - def render_response(self, **kwargs): - template_result = self.render_to_string(**kwargs) - status = kwargs.get('status', '200 OK') - content_type = kwargs.get('content_type', 'text/html; charset=utf-8') - return WbResponse.text_response(template_result, - status=status, - content_type=content_type) - - -#================================================================= -def init_view(config, key, view_class=J2TemplateView): - filename = config.get(key) - if not filename: - return None - - logging.debug('Adding {0}: {1}'.format(key, filename)) - return view_class(filename) - - -#================================================================= -class HeadInsertView(J2TemplateView): - def create_insert_func(self, wbrequest, - include_ts=True): - - if wbrequest.options['is_ajax']: - return None - - url = wbrequest.wb_url.get_url() - - top_url = wbrequest.wb_prefix - top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod) - - include_wombat = not wbrequest.wb_url.is_banner_only - - def make_head_insert(rule, cdx): - cdx['url'] = url - return (self.render_to_string(wbrequest=wbrequest, - cdx=cdx, - top_url=top_url, - include_ts=include_ts, - include_wombat=include_wombat, - banner_html=self.banner_html, - rule=rule)) - return make_head_insert - - @staticmethod - def init_from_config(config): - view = config.get('head_insert_view') - if not view: - html = config.get('head_insert_html', 'templates/head_insert.html') - - if html: - banner_html = config.get('banner_html', 'banner.html') - view = HeadInsertView(html) - logging.debug('Adding HeadInsert: {0}, Banner {1}'. - format(html, banner_html)) - - view.banner_html = banner_html - - return view - - -#================================================================= -# query views -#================================================================= -class J2HtmlCapturesView(J2TemplateView): - def render_response(self, wbrequest, cdx_lines, **kwargs): - def format_cdx_lines(): - for cdx in cdx_lines: - cdx['_orig_url'] = cdx['url'] - cdx['url'] = wbrequest.wb_url.get_url(url=cdx['url']) - yield cdx - - return J2TemplateView.render_response(self, - cdx_lines=list(format_cdx_lines()), - url=wbrequest.wb_url.get_url(), - type=wbrequest.wb_url.type, - prefix=wbrequest.wb_prefix, - **kwargs) - - -#================================================================= -class MementoTimemapView(object): - def render_response(self, wbrequest, cdx_lines, **kwargs): - memento_lines = make_timemap(wbrequest, cdx_lines) - - return WbResponse.text_stream(memento_lines, - content_type=LINK_FORMAT) diff --git a/tests/test_auto_colls.py b/tests_disabled/test_auto_colls.py similarity index 100% rename from tests/test_auto_colls.py rename to tests_disabled/test_auto_colls.py