"""
Standard url-canonicalization, surt and non-surt,
plus loading of domain-specific canonicalization and fuzzy-match rules
"""

import logging
import pkgutil


#=================================================================
class UrlCanonicalizer(object):
    """
    Callable that converts a url to a cdx lookup key.

    surt_ordered selects surt form ('com,example)/path') when True,
    or plain 'example.com/path' form when False.
    """
    def __init__(self, surt_ordered=True):
        self.surt_ordered = surt_ordered

    def __call__(self, url):
        return canonicalize(url, self.surt_ordered)


#=================================================================
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize url and convert to surt
    If not in surt ordered mode, convert back to url form
    as surt conversion is currently part of canonicalization

    Raises CDXException if the url can not be canonicalized

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'
    """
    # imported lazily so the pure-string helpers in this section
    # (e.g. unsurt) stay usable without these dependencies
    import surt
    from cdxobject import CDXException

    try:
        key = surt.surt(url)
    except Exception:
        # %s formatting (rather than '+') so that a non-string url
        # still results in a CDXException instead of a TypeError
        raise CDXException('Invalid Url: %s' % (url,))

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not surt_ordered:
        key = unsurt(key)

    return key


#=================================================================
def unsurt(surt):
    """
    Convert a surt key back to plain 'host/path' form.
    The host segments before the first ')/' are reversed and joined
    with '.'; a string without ')/' is returned unchanged.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
    """
    host_key, sep, rest = surt.partition(')/')
    if not sep:
        # May not be a valid surt
        return surt

    parts = host_key.split(',')
    parts.reverse()
    # drop the ')' but keep the '/' that starts the path
    return '.'.join(parts) + '/' + rest


if __name__ == "__main__":
    import doctest
    doctest.testmod()


#=================================================================
def load_domain_specific_cdx_rules(filename, surt_ordered):
    """
    Load domain-specific rules from the yaml file 'filename',
    read as package data via pkgutil.

    Returns a (canon, fuzzy) tuple:
      canon -- CustomUrlCanonicalizer built from 'canon_rules', or None
      fuzzy -- FuzzyQuery built from 'fuzzy_lookup_rules', or None
    """
    import yaml

    fh = pkgutil.get_data(__package__, filename)

    # safe_load: the rules file is plain data, no arbitrary object
    # construction is needed (and yaml.load requires an explicit Loader
    # in current PyYAML releases)
    config = yaml.safe_load(fh) if fh else None

    # an empty or missing rules file simply means 'no rules'
    if not config:
        config = {}

    # Load Canonicalizer Rules
    rules = StartsWithRule.load_rules(config.get('canon_rules'),
                                      surt_ordered)

    canon = CustomUrlCanonicalizer(rules, surt_ordered) if rules else None

    # Load Fuzzy Lookup Rules
    rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
                                      surt_ordered)

    fuzzy = FuzzyQuery(rules) if rules else None

    logging.debug('CANON: %s', canon)
    logging.debug('FUZZY: %s', fuzzy)
    return (canon, fuzzy)


#=================================================================
class CustomUrlCanonicalizer(UrlCanonicalizer):
    """
    UrlCanonicalizer which additionally applies rewrite rules to the
    canonicalized key. Each rule provides 'starts' (key prefixes),
    'regex' (compiled pattern) and 'replace' (expansion template).
    """
    def __init__(self, rules, surt_ordered=True):
        super(CustomUrlCanonicalizer, self).__init__(surt_ordered)
        # keep a real list: rules are iterated on every call
        self.rules = list(rules)

    def __call__(self, url):
        urlkey = super(CustomUrlCanonicalizer, self).__call__(url)

        for rule in self.rules:
            if not any(urlkey.startswith(x) for x in rule.starts):
                continue

            m = rule.regex.match(urlkey)
            if not m:
                continue

            # first matching rule with a replacement wins
            if rule.replace:
                return m.expand(rule.replace)

        return urlkey


#=================================================================
import re
from itertools import chain


class FuzzyQuery(object):
    """
    Callable which rewrites cdx query params into a broader prefix
    query when the url key matches one of the fuzzy lookup rules.
    """
    def __init__(self, rules):
        # keep a real list: rules are iterated on every call
        self.rules = list(rules)

    def __call__(self, params):
        """
        params must contain 'key' (canonicalized url key) and 'url'.

        Returns the same params dict, modified in place, if a rule
        matched, or None if no rule applies.
        """
        urlkey = params['key']
        url = params['url']

        for rule in self.rules:
            if not any(urlkey.startswith(x) for x in rule.starts):
                continue

            m = rule.regex.search(urlkey)
            if not m:
                continue

            # first matching rule wins
            # NOTE(review): this replaces any 'filter' already present
            # in params -- confirm that is intended by the callers
            if len(m.groups()) == 1:
                params['filter'] = '=urlkey:' + m.group(1)

            break
        else:
            # no rule matched
            return None

        # query by prefix on everything up to and including the '?'
        inx = url.find('?')
        if inx > 0:
            params['url'] = url[:inx + 1]

        params['matchType'] = 'prefix'
        # reset the key, it no longer corresponds to the modified url
        params['key'] = None
        return params


#=================================================================
class StartsWithRule(object):
    """
    A single rule read from the rules config:
      starts  -- list of key prefixes the rule applies to
      regex   -- compiled 'matches' pattern
      replace -- optional 'replace' template (None if not set)
    """
    def __init__(self, config, surt_ordered=True):
        starts = config.get('startswith')
        if not isinstance(starts, list):
            starts = [starts]

        self.starts = starts
        self.regex = re.compile(config.get('matches'))
        self.replace = config.get('replace')

    def unsurt(self):
        """ Convert prefixes, pattern and replacement to non-surt form
        """
        # imported lazily; local name intentionally matches the helper
        from canonicalize import unsurt

        # must convert to non-surt form
        self.starts = [unsurt(x) for x in self.starts]

        # unsurt() works on strings: convert the pattern source
        # and recompile instead of passing the compiled regex
        self.regex = re.compile(unsurt(self.regex.pattern))

        # 'replace' is optional (fuzzy rules have none)
        if self.replace is not None:
            self.replace = unsurt(self.replace)

    @staticmethod
    def load_rules(rules_config, surt_ordered=True):
        """
        Create a list of StartsWithRule from a list of rule configs,
        converted to non-surt form if surt_ordered is False.
        Returns an empty list if there is no config.
        """
        if not rules_config:
            return []

        # a real list (not a lazy map) as callers test it for
        # emptiness and iterate over it repeatedly
        rules = [StartsWithRule(config) for config in rules_config]

        if not surt_ordered:
            for rule in rules:
                rule.unsurt()

        return rules


#=================================================================
class CDXException(Exception):
    """ Base cdx server error, reported as a bad request
    """
    def status(self):
        return '400 Bad Request'


#=================================================================
class CaptureNotFoundException(CDXException):
    """ Raised when a cdx query produces no captures
    """
    def status(self):
        return '404 Not Found'


#=================================================================
class AccessException(CDXException):
    """ cdx error reported with a 403 status
    """
    def status(self):
        return '403 Access Denied'


#=================================================================
class BaseCDXServer(object):
    """
    Shared base for cdx servers: holds the url canonicalizer and the
    optional fuzzy query handler and checks query results.
    Subclasses must implement load_cdx(**params).
    """
    def __init__(self, url_canon=None, fuzzy_query=None):
        if not url_canon:
            # imported lazily, only needed for the default canonicalizer
            from canonicalize import UrlCanonicalizer
            url_canon = UrlCanonicalizer()

        self.url_canon = url_canon
        self.fuzzy_query = fuzzy_query

    def _check_cdx_iter(self, cdx_iter, params):
        """ Check cdx iter semantics
        If iter is empty (no matches), check if fuzzy matching
        is allowed, and try it -- otherwise,
        throw CaptureNotFoundException
        """

        cdx_iter = self.peek_iter(cdx_iter)

        if cdx_iter is not None:
            return cdx_iter

        # queries may be made by 'key' only, so 'url' is optional here
        url = params.get('url')
        target = url if url else params.get('key')

        # fuzzy matching requires the original url
        if url and self.fuzzy_query and params.get('allow_fuzzy'):
            if not params.get('key'):
                params['key'] = self.url_canon(url)

            fuzzy_params = self.fuzzy_query(params)
            if fuzzy_params:
                # retry once only, never fuzzy match the fuzzy query
                fuzzy_params['allow_fuzzy'] = False
                return self.load_cdx(**fuzzy_params)

        msg = 'No Captures found for: %s' % (target,)
        raise CaptureNotFoundException(msg)

    def load_cdx(self, **params):
        raise NotImplementedError('Implement in subclass')

    @staticmethod
    def peek_iter(iterable):
        """
        Return an iterator equivalent to iterable,
        or None if iterable yields no items
        """
        # accept any iterable, not only iterators
        iterator = iter(iterable)

        try:
            first = next(iterator)
        except StopIteration:
            return None

        return chain([first], iterator)
#=================================================================
class CDXServer(BaseCDXServer):
    """
    Top-level cdx server object which maintains a list of cdx sources,
    responds to queries and dispatches to the cdx ops for processing
    """

    def __init__(self, paths, url_canon=None, fuzzy_query=None):
        super(CDXServer, self).__init__(url_canon, fuzzy_query)
        self.sources = create_cdx_sources(paths)

    def load_cdx(self, **params):
        """
        Run a cdx query over all sources.

        Either 'key' or 'url' must be set in params; if 'key' is not
        set, it is computed by canonicalizing 'url'.

        Raises CDXException if neither is provided. An empty result is
        handled by _check_cdx_iter (fuzzy retry or not-found error).
        """
        # imported lazily to keep this class self-contained
        from cdxobject import CDXException
        from cdxops import cdx_load

        # if key not set, assume 'url' is set and needs canonicalization
        if not params.get('key'):
            url = params.get('url')

            # a missing, None or empty url can not be canonicalized:
            # report it explicitly as a bad request
            if not url:
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)

            params['key'] = self.url_canon(url)

        convert_old_style_params(params)

        cdx_iter = cdx_load(self.sources, params)

        return self._check_cdx_iter(cdx_iter, params)

    def __str__(self):
        return 'CDX server serving from ' + str(self.sources)
#================================================================= -class RemoteCDXServer(object): +class RemoteCDXServer(BaseCDXServer): """ A special cdx server that uses a single RemoteCDXSource It simply proxies the query params to the remote source and performs no local processing/filtering """ - def __init__(self, source): + def __init__(self, source, url_canon=None, fuzzy_query=None): + super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query) + if isinstance(source, RemoteCDXSource): self.source = source elif (isinstance(source, str) and @@ -87,18 +108,19 @@ class RemoteCDXServer(object): def load_cdx(self, **params): remote_iter = self.source.load_cdx(params) + # if need raw, convert to raw format here if params.get('output') == 'raw': - return (CDXObject(cdx) for cdx in remote_iter) - else: - return remote_iter + remote_iter = (CDXObject(cdx) for cdx in remote_iter) + + return self._check_cdx_iter(remote_iter, params) def __str__(self): return 'Remote CDX server serving from ' + str(self.sources[0]) #================================================================= -def create_cdx_server(config): +def create_cdx_server(config, ds_rules_file=None): if hasattr(config, 'get'): paths = config.get('index_paths') surt_ordered = config.get('surt_ordered', True) @@ -108,11 +130,22 @@ def create_cdx_server(config): logging.debug('CDX Surt-Ordered? 
' + str(surt_ordered)) + if ds_rules_file: + canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file, + surt_ordered) + else: + canon, fuzzy = None, None + + if not canon: + canon = UrlCanonicalizer(surt_ordered) + if (isinstance(paths, str) and any(paths.startswith(x) for x in ['http://', 'https://'])): - return RemoteCDXServer(paths) + server_cls = RemoteCDXServer else: - return CDXServer(paths) + server_cls = CDXServer + + return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy) #================================================================= @@ -170,13 +203,17 @@ def convert_old_style_params(params): """ Convert old-style CDX Server param semantics """ - collapse_time = params.get('collapseTime') - if collapse_time: - params['collapse_time'] = collapse_time + param = params.get('collapseTime') + if param: + params['collapse_time'] = param - resolve_revisits = params.get('resolveRevisits') - if resolve_revisits: - params['resolve_revisits'] = resolve_revisits + param = params.get('matchType') + if param: + params['match_type'] = param + + param = params.get('resolveRevisits') + if param: + params['resolve_revisits'] = param if params.get('sort') == 'reverse': params['reverse'] = True @@ -204,38 +241,3 @@ def extract_params_from_wsgi_env(env): params[name] = val[0] return params - - -#================================================================= -def unsurt(surt): - """ - # Simple surt - >>> unsurt('com,example)/') - 'example.com)/' - - # Broken surt - >>> unsurt('com,example)') - 'com,example)' - - # Long surt - >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\ -index.html?a=b?c=)/') - 'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/' - """ - - try: - index = surt.index(')/') - parts = surt[0:index].split(',') - parts.reverse() - host = '.'.join(parts) - host += surt[index:] - return host - - except ValueError: - # May not be a valid surt - return surt - - -if __name__ == "__main__": - import doctest - 
doctest.testmod() diff --git a/pywb/cdx/rules.yaml b/pywb/cdx/rules.yaml new file mode 100644 index 00000000..1da70582 --- /dev/null +++ b/pywb/cdx/rules.yaml @@ -0,0 +1,24 @@ + +fuzzy_lookup_rules: + - startswith: 'com,twitter)/i/profiles/show/' + matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' + + - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' + matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' + + - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] + matches: '([^/]+(?:\.css|\.js))' + + # matches all urls + - startswith: '' + matches: '[&?](?:_|uncache)=[\d]+[&]?' + +canon_rules: + - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' + matches: 'com,facebook\)/.*[?&]data=([^&]+).*' + replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' + + + + + diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index c1434228..fc96acb2 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -25,6 +25,8 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq # No matching results >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2) +Traceback (most recent call last): +CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this # Filter cdx (default: regex) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index aaf60705..493c1bbd 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -1,13 +1,7 @@ import urllib import urllib2 -from wbexceptions import NotFoundException - -from itertools import chain -from pprint import pprint - -from pywb.cdx.cdxserver import create_cdx_server, CDXException -from pywb.cdx.cdxobject import CDXObject +from pywb.cdx.cdxserver import create_cdx_server #================================================================= class 
IndexReader(object): @@ -18,8 +12,8 @@ class IndexReader(object): Creates an appropriate query based on wbrequest type info """ - def __init__(self, config): - self.cdx_server = create_cdx_server(config) + def __init__(self, config, ds_rules_file=None): + self.cdx_server = create_cdx_server(config, ds_rules_file) def load_for_request(self, wbrequest): wburl = wbrequest.wb_url @@ -29,19 +23,14 @@ class IndexReader(object): # add any custom filter from the request if wbrequest.query_filter: - params['filter'] = wbrequest.query_filter + params['filter'].extend(wbrequest.query_filter) if wbrequest.custom_params: params.update(wbrequest.custom_params) - params['url'] = wburl.url + params['allow_fuzzy'] = True - cdxlines = self.load_cdx(output='raw', **params) - - cdxlines = self.peek_iter(cdxlines) - - if cdxlines is None: - raise NotFoundException('No Captures found for: ' + wburl.url) + cdxlines = self.load_cdx(url=wburl.url, output='raw', **params) return cdxlines @@ -54,7 +43,7 @@ class IndexReader(object): return { wburl.QUERY: - {'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit}, + {'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit}, wburl.URL_QUERY: {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit, @@ -62,21 +51,12 @@ class IndexReader(object): }, wburl.REPLAY: - {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True}, + {'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True}, # BUG: resolveRevisits currently doesn't work for this type of query # This is not an issue in archival mode, as there is a redirect to the actual timestamp query # but may be an issue in proxy mode wburl.LATEST_REPLAY: - {'sort': 'reverse', 'filter': 'statuscode:[23]..', 
'limit': '1', 'resolveRevisits': True} + {'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True} }[wburl.type] - - @staticmethod - def peek_iter(iterable): - try: - first = next(iterable) - except StopIteration: - return None - - return chain([first], iterable) diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index a6d0500b..c4b40ee2 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -21,6 +21,8 @@ DEFAULTS = { 'error_html': 'ui/error.html', 'static_routes': {'static/default': 'static/'}, + + 'domain_specific_rules': 'rules.yaml', } class DictChain: @@ -30,7 +32,7 @@ class DictChain: def get(self, key, default_val=None): for d in self.dicts: val = d.get(key) - if val: + if val is not None: return val return default_val @@ -52,11 +54,13 @@ def pywb_config_manual(passed_config = {}): for name, value in collections.iteritems(): if isinstance(value, str): route_config = config - cdx_server = IndexReader(value) + cdx_config = value else: route_config = DictChain(value, config) - cdx_server = IndexReader(route_config) + cdx_config = route_config + ds_rules = route_config.get('domain_specific_rules', None) + cdx_server = IndexReader(cdx_config, ds_rules) wb_handler = config_utils.create_wb_handler( cdx_server = cdx_server, @@ -118,7 +122,8 @@ def pywb_config(config_file = None): if not config_file: config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE) - config = yaml.load(open(config_file)) + with open(config_file) as fh: + config = yaml.load(fh) return pywb_config_manual(config) diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index ae1383ff..9f904764 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -54,8 +54,7 @@ class RewriteContent: # ========================================================================= # special case -- need to ungzip the body if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')): - stream = 
BufferedReader(stream, 'gzip') - + stream = BufferedReader(stream, decomp_type='gzip') if rewritten_headers.charset: encoding = rewritten_headers.charset diff --git a/pywb/wbexceptions.py b/pywb/wbexceptions.py index 8796db76..afacc325 100644 --- a/pywb/wbexceptions.py +++ b/pywb/wbexceptions.py @@ -1,14 +1,15 @@ + class WbException(Exception): pass class NotFoundException(WbException): - def status(_): + def status(self): return '404 Not Found' # Exceptions that effect a specific capture and result in a retry class CaptureException(WbException): - def status(_): + def status(self): return '500 Internal Server Error' class InternalRedirect(WbException): diff --git a/test_config.yaml b/test_config.yaml index 38a15f37..04dfee37 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -93,3 +93,6 @@ enable_cdx_api: true # optional reporter callback func # if set, called with request and cdx object reporter_func: pywb.run-tests.print_reporter + +# custom rules for domain specific matching +#domain_specific_rules: rules.yaml diff --git a/tests/test_integration.py b/tests/test_integration.py index e639163b..59b4fc36 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -50,6 +50,13 @@ class TestWb: # 1 Capture (filtered) + header assert len(resp.html.find_all('tr')) == 2 + def test_calendar_query_fuzzy_match(self): + # fuzzy match removing _= according to standard rules.yaml + resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css?_=3141592653') + self._assert_basic_html(resp) + # 17 Captures + header + assert len(resp.html.find_all('tr')) == 18 + def test_cdx_query(self): resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/') self._assert_basic_text(resp)