cdx: add domain-specific rules at cdx layer for custom canonicalization!

and 'fuzzy' matching when not found; handled via cdxdomainspecific.py. BaseCDXServer contains a canonicalizer object and a fuzzy query object; the canonicalizer is abstracted to a separate class (in canonicalize.py). Clean up cdx-related exceptions. Default rules are read from cdx/rules.yaml; the filename is configurable via the 'domain_specific_rules' setting in config.yaml. Fix typo in pywb/rewrite.
2025-03-15 00:03:28 +01:00 · 2014-02-18 14:47:48 -08:00 · 2014-02-18 14:47:48 -08:00 · a09dec4b3e
commit a09dec4b3e
parent ab95524b7b
13 changed files with 375 additions and 131 deletions
--- a/config.yaml
+++ b/config.yaml
@ -92,4 +92,8 @@ static_routes:
 enable_http_proxy: true
 # enable cdx server api for querying cdx directly (experimental)
-#enable_cdx_api: false
+enable_cdx_api: true
 # custom rules for domain specific matching
 # set to false to disable
 #domain_specific_rules: rules.yaml
--- a/pywb/cdx/canonicalize.py
+++ b/pywb/cdx/canonicalize.py
@ -0,0 +1,74 @@
 """ Standard url-canonicalization, surt and non-surt
 """
 import surt
 from cdxobject import CDXException
 #=================================================================
 class UrlCanonicalizer(object):
    """
    Callable wrapper around canonicalize() which remembers
    whether keys should be produced in surt order
    """
    def __init__(self, surt_ordered=True):
        # stored once, applied on every call
        self.surt_ordered = surt_ordered
    def __call__(self, url):
        """ Return the canonicalized key for url
        """
        surt_mode = self.surt_ordered
        return canonicalize(url, surt_mode)
 #=================================================================
 def canonicalize(url, surt_ordered=True):
    """
    Produce the canonical lookup key for a url
    The url is always run through surt.surt() first; when surt_ordered
    is false, the resulting surt is converted back to url form
    Raises CDXException if surt.surt() fails on the url
    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'
    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'
    """
    try:
        surt_key = surt.surt(url)
    except Exception:
        raise CDXException('Invalid Url: ' + url)
    if surt_ordered:
        return surt_key
    # non-surt mode: undo the surt ordering of the canonicalized key
    return unsurt(surt_key)
 #=================================================================
 def unsurt(surt):
    """
    Convert a surt-form key back to plain host/path form
    The comma-separated host labels before the first ')/' are reversed
    and joined with '.', and the ')' itself is dropped
    A string without ')/' is returned unchanged
    # Simple surt
    >>> unsurt('com,example)/')
    'example.com/'
    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'
    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
 index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
    """
    host_part, marker, remainder = surt.partition(')/')
    if not marker:
        # May not be a valid surt
        return surt
    labels = host_part.split(',')
    # only the ')' is dropped, the '/' stays in front of the path
    return '.'.join(reversed(labels)) + '/' + remainder
 # When executed directly as a script, run the doctests embedded in
 # the docstrings of this module
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@ -0,0 +1,125 @@
 import yaml
 import re
 import logging
 import pkgutil
 from canonicalize import unsurt, UrlCanonicalizer
 #=================================================================
 def load_domain_specific_cdx_rules(filename, surt_ordered):
    """
    Load the canonicalization and fuzzy-lookup rules from a yaml file
    filename     -- rules file name, read with pkgutil.get_data()
                    relative to this package
    surt_ordered -- passed on to the rule loader; when false the rules
                    are converted to non-surt form
    Returns a (canon, fuzzy) tuple: a CustomUrlCanonicalizer built from
    'canon_rules' and a FuzzyQuery built from 'fuzzy_lookup_rules'
    Either element is None when its rule list is missing or empty
    """
    fh = pkgutil.get_data(__package__, filename)
    # NOTE(review): yaml.load() can construct arbitrary objects; that is
    # fine for a rules file shipped with the package, but consider
    # yaml.safe_load() if the file may ever come from an untrusted source
    # An empty rules file parses to None -- treat it as 'no rules'
    # instead of failing on config.get() below
    config = yaml.load(fh) or {}
    # Load Canonicalizer Rules
    rules = StartsWithRule.load_rules(config.get('canon_rules'),
                                      surt_ordered)
    if rules:
        canon = CustomUrlCanonicalizer(rules, surt_ordered)
    else:
        canon = None
    # Load Fuzzy Lookup Rules
    rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
                                      surt_ordered)
    if rules:
        fuzzy = FuzzyQuery(rules)
    else:
        fuzzy = None
    logging.debug('CANON: ' + str(canon))
    logging.debug('FUZZY: ' + str(fuzzy))
    return (canon, fuzzy)
 #=================================================================
 class CustomUrlCanonicalizer(UrlCanonicalizer):
    """
    Canonicalizer which post-processes the standard key with a list
    of domain-specific rewrite rules
    """
    def __init__(self, rules, surt_ordered=True):
        super(CustomUrlCanonicalizer, self).__init__(surt_ordered)
        self.rules = rules
    def __call__(self, url):
        """
        Canonicalize url with the base canonicalizer, then return the
        expansion of the first rule which has a matching prefix, a
        matching regex and a replace template
        If no such rule exists, the standard key is returned as-is
        """
        urlkey = super(CustomUrlCanonicalizer, self).__call__(url)
        for rule in self.rules:
            # prefix test first, regex only for rules that pass it
            if any(urlkey.startswith(prefix) for prefix in rule.starts):
                found = rule.regex.match(urlkey)
                if found and rule.replace:
                    return found.expand(rule.replace)
        return urlkey
 #=================================================================
 class FuzzyQuery:
    """
    Callable which turns the params of an exact query into a prefix
    query, driven by the first fuzzy rule matching the url key
    """
    def __init__(self, rules):
        self.rules = rules
    def __call__(self, params):
        """
        Rewrite params in place and return them, or return None
        (leaving params untouched) when no rule matches params['key']
        On a match: the url is cut right after its first '?', matchType
        becomes 'prefix' and key is reset to None; if the rule regex
        has exactly one group, its text becomes an '=urlkey:' filter
        """
        urlkey = params['key']
        url = params['url']
        for rule in self.rules:
            if not any(urlkey.startswith(prefix) for prefix in rule.starts):
                continue
            found = rule.regex.search(urlkey)
            if found:
                # first matching rule wins
                break
        else:
            # loop ran to completion: nothing matched
            return None
        if len(found.groups()) == 1:
            params['filter'] = '=urlkey:' + found.group(1)
        query_pos = url.find('?')
        if query_pos > 0:
            # keep everything up to and including the '?'
            params['url'] = url[:query_pos + 1]
        params['matchType'] = 'prefix'
        params['key'] = None
        return params
 #=================================================================
 class StartsWithRule:
    """
    A single domain-specific rule, built from a config mapping with:
    startswith -- url key prefix, or list of prefixes, to which the
                  rule applies (always stored as a list in .starts)
    matches    -- regex source, compiled into .regex
    replace    -- replacement template stored in .replace, None if
                  the config has no such entry
    """
    def __init__(self, config, surt_ordered=True):
        # surt_ordered is accepted but not used here; the conversion to
        # non-surt form is done by load_rules() via unsurt()
        self.starts = config.get('startswith')
        if not isinstance(self.starts, list):
            self.starts = [self.starts]
        self.regex = re.compile(config.get('matches'))
        self.replace = config.get('replace')
    def unsurt(self):
        """
        Convert this rule, in place, to non-surt form
        """
        # must convert to non-surt form
        # (inside a method the bare name unsurt is the module-level
        # function, not this method)
        self.starts = [unsurt(prefix) for prefix in self.starts]
        # the helper works on strings: convert the pattern source and
        # recompile, rather than passing the compiled regex object
        self.regex = re.compile(unsurt(self.regex.pattern))
        # replace is optional, leave a missing template alone
        if self.replace:
            self.replace = unsurt(self.replace)
    @staticmethod
    def load_rules(rules_config, surt_ordered=True):
        """
        Build a list of rules from a list of config mappings
        Returns [] when rules_config is empty or None; when surt_ordered
        is false every rule is converted to non-surt form
        """
        if not rules_config:
            return []
        # a real list, so the result can be iterated more than once
        rules = [StartsWithRule(entry) for entry in rules_config]
        if not surt_ordered:
            for rule in rules:
                rule.unsurt()
        return rules
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@ -2,6 +2,24 @@ from collections import OrderedDict
 import itertools
 #=================================================================
 class CDXException(Exception):
    """ Base class for cdx errors
    """
    def status(self):
        # status string for this error -- presumably used as the
        # http response status, confirm against the caller
        return '400 Bad Request'
 #=================================================================
 class CaptureNotFoundException(CDXException):
    """ CDXException variant which reports a not-found status
    """
    def status(self):
        return '404 Not Found'
 #=================================================================
 class AccessException(CDXException):
    """ CDXException variant which reports an access-denied status
    """
    def status(self):
        return '403 Access Denied'
 #=================================================================
 class CDXObject(OrderedDict):
    CDX_FORMATS = [
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -1,82 +1,103 @@
-import surt
+from canonicalize import UrlCanonicalizer
 from cdxops import cdx_load
-import itertools
+from cdxops import cdx_load
 from cdxsource import CDXSource, CDXFile, RemoteCDXSource
 from cdxobject import CDXObject, CaptureNotFoundException, CDXException
 from cdxdomainspecific import load_domain_specific_cdx_rules
 from itertools import chain
 import logging
 import os
 import urlparse
-from cdxsource import CDXSource, CDXFile, RemoteCDXSource
+
-from cdxobject import CDXObject
+#=================================================================
 class BaseCDXServer(object):
    """
    Common base for cdx servers: holds the url canonicalizer and the
    optional fuzzy query handler, and provides the empty-result
    check shared by subclasses
    """
    def __init__(self, url_canon=None, fuzzy_query=None):
        # fall back to the standard canonicalizer if none is given
        if not url_canon:
            url_canon = UrlCanonicalizer()
        self.url_canon = url_canon
        self.fuzzy_query = fuzzy_query
    def _check_cdx_iter(self, cdx_iter, params):
        """ Return cdx_iter if it yields at least one result
        Otherwise, if a fuzzy query handler is set and the params
        have 'allow_fuzzy', rewrite the params through the handler
        and query once more via load_cdx() with allow_fuzzy disabled
        If that is not possible, raise CaptureNotFoundException
        """
        peeked = self.peek_iter(cdx_iter)
        if peeked:
            return peeked
        url = params['url']
        fuzzy_allowed = self.fuzzy_query and params.get('allow_fuzzy')
        if fuzzy_allowed:
            # the fuzzy handler reads the key, compute it if missing
            if 'key' not in params:
                params['key'] = self.url_canon(url)
            fuzzy_params = self.fuzzy_query(params)
            if fuzzy_params:
                # retry only once, no fuzzy matching on the retry
                fuzzy_params['allow_fuzzy'] = False
                return self.load_cdx(**fuzzy_params)
        raise CaptureNotFoundException('No Captures found for: ' + url)
    def load_cdx(self, **params):
        """ Query entry point, to be provided by each subclass
        """
        raise NotImplementedError('Implement in subclass')
    @staticmethod
    def peek_iter(iterable):
        """ Return None if the iterator is exhausted, otherwise an
        equivalent iterator which still starts at the first item
        """
        nothing = object()
        first = next(iterable, nothing)
        if first is nothing:
            return None
        # put the consumed item back in front of the rest
        return chain([first], iterable)
 #=================================================================
-class CDXException(Exception):
+class CDXServer(BaseCDXServer):
    def status(self):
        return '400 Bad Request'
 #=================================================================
 class AccessException(CDXException):
    def status(self):
        return '403 Bad Request'
 #=================================================================
 class CDXServer(object):
    """
    Top-level cdx server object which maintains a list of cdx sources,
    responds to queries and dispatches to the cdx ops for processing
    """
-    def __init__(self, paths, surt_ordered=True):
+    def __init__(self, paths, url_canon=None, fuzzy_query=None):
        super(CDXServer, self).__init__(url_canon, fuzzy_query)
        self.sources = create_cdx_sources(paths)
        self.surt_ordered = surt_ordered
    def load_cdx(self, **params):
        # if key not set, assume 'url' is set and needs canonicalization
        if not params.get('key'):
-            params['key'] = self._canonicalize(params)
+            try:
                url = params['url']
            except KeyError:
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)
            params['key'] = self.url_canon(url)
        convert_old_style_params(params)
-        return cdx_load(self.sources, params)
+        cdx_iter = cdx_load(self.sources, params)
-    def _canonicalize(self, params):
+        return self._check_cdx_iter(cdx_iter, params)
        """
        Canonicalize url and convert to surt
        If no surt-mode, convert back to url form
        as surt conversion is currently part of canonicalization
        """
        try:
            url = params['url']
        except KeyError:
            msg = 'A url= param must be specified to query the cdx server'
            raise CDXException(msg)
        try:
            key = surt.surt(url)
        except Exception as e:
            raise CDXException('Invalid Url: ' + url)
        # if not surt, unsurt the surt to get canonicalized non-surt url
        if not self.surt_ordered:
            key = unsurt(key)
        return key
    def __str__(self):
        return 'CDX server serving from ' + str(self.sources)
 #=================================================================
-class RemoteCDXServer(object):
+class RemoteCDXServer(BaseCDXServer):
    """
    A special cdx server that uses a single RemoteCDXSource
    It simply proxies the query params to the remote source
    and performs no local processing/filtering
    """
-    def __init__(self, source):
+    def __init__(self, source, url_canon=None, fuzzy_query=None):
        super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query)
        if isinstance(source, RemoteCDXSource):
            self.source = source
        elif (isinstance(source, str) and
@ -87,18 +108,19 @@ class RemoteCDXServer(object):
    def load_cdx(self, **params):
        remote_iter = self.source.load_cdx(params)
        # if need raw, convert to raw format here
        if params.get('output') == 'raw':
-            return (CDXObject(cdx) for cdx in remote_iter)
+            remote_iter = (CDXObject(cdx) for cdx in remote_iter)
-        else:
+
-            return remote_iter
+        return self._check_cdx_iter(remote_iter, params)
    def __str__(self):
        # this server keeps its single remote source in self.source
        # (assigned in __init__); it has no self.sources list
        return 'Remote CDX server serving from ' + str(self.source)
 #=================================================================
-def create_cdx_server(config):
+def create_cdx_server(config, ds_rules_file=None):
    if hasattr(config, 'get'):
        paths = config.get('index_paths')
        surt_ordered = config.get('surt_ordered', True)
@ -108,11 +130,22 @@ def create_cdx_server(config):
    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
    if ds_rules_file:
        canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
                                                      surt_ordered)
    else:
        canon, fuzzy = None, None
    if not canon:
        canon = UrlCanonicalizer(surt_ordered)
    if (isinstance(paths, str) and
        any(paths.startswith(x) for x in ['http://', 'https://'])):
-        return RemoteCDXServer(paths)
+        server_cls = RemoteCDXServer
    else:
-        return CDXServer(paths)
+        server_cls = CDXServer
    return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy)
 #=================================================================
@ -170,13 +203,17 @@ def convert_old_style_params(params):
    """
    Convert old-style CDX Server param semantics
    """
-    collapse_time = params.get('collapseTime')
+    param = params.get('collapseTime')
-    if collapse_time:
+    if param:
-        params['collapse_time'] = collapse_time
+        params['collapse_time'] = param
-    resolve_revisits = params.get('resolveRevisits')
+    param = params.get('matchType')
-    if resolve_revisits:
+    if param:
-        params['resolve_revisits'] = resolve_revisits
+        params['match_type'] = param
    param = params.get('resolveRevisits')
    if param:
        params['resolve_revisits'] = param
    if params.get('sort') == 'reverse':
        params['reverse'] = True
@ -204,38 +241,3 @@ def extract_params_from_wsgi_env(env):
            params[name] = val[0]
    return params
 #=================================================================
 def unsurt(surt):
    """
    # Simple surt
    >>> unsurt('com,example)/')
    'example.com)/'
    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'
    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
 index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
    """
    try:
        index = surt.index(')/')
        parts = surt[0:index].split(',')
        parts.reverse()
        host = '.'.join(parts)
        host += surt[index:]
        return host
    except ValueError:
        # May not be a valid surt
        return surt
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/cdx/rules.yaml
+++ b/pywb/cdx/rules.yaml
@ -0,0 +1,24 @@
 fuzzy_lookup_rules:
    - startswith: 'com,twitter)/i/profiles/show/'
      matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
      matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
    - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
      matches: '([^/]+(?:\.css|\.js))'
    # matches all urls
    - startswith: ''
      matches: '[&?](?:_|uncache)=[\d]+[&]?'
 canon_rules:
    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
      matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
      replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@ -25,6 +25,8 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
 # No matching results
 >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
 Traceback (most recent call last):
 CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
 # Filter cdx (default: regex)
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@ -1,13 +1,7 @@
 import urllib
 import urllib2
-from wbexceptions import NotFoundException
+from pywb.cdx.cdxserver import create_cdx_server
 from itertools import chain
 from pprint import pprint
 from pywb.cdx.cdxserver import create_cdx_server, CDXException
 from pywb.cdx.cdxobject import CDXObject
 #=================================================================
 class IndexReader(object):
@ -18,8 +12,8 @@ class IndexReader(object):
    Creates an appropriate query based on wbrequest type info
    """
-    def __init__(self, config):
+    def __init__(self, config, ds_rules_file=None):
-        self.cdx_server = create_cdx_server(config)
+        self.cdx_server = create_cdx_server(config, ds_rules_file)
    def load_for_request(self, wbrequest):
        wburl = wbrequest.wb_url
@ -29,19 +23,14 @@ class IndexReader(object):
        # add any custom filter from the request
        if wbrequest.query_filter:
-            params['filter'] = wbrequest.query_filter
+            params['filter'].extend(wbrequest.query_filter)
        if wbrequest.custom_params:
            params.update(wbrequest.custom_params)
-        params['url'] = wburl.url
+        params['allow_fuzzy'] = True
-        cdxlines = self.load_cdx(output='raw', **params)
+        cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
        cdxlines = self.peek_iter(cdxlines)
        if cdxlines is None:
            raise NotFoundException('No Captures found for: ' + wburl.url)
        return cdxlines
@ -54,7 +43,7 @@ class IndexReader(object):
        return {
            wburl.QUERY:
-                {'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
+                {'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit},
            wburl.URL_QUERY:
                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
@ -62,21 +51,12 @@ class IndexReader(object):
                },
            wburl.REPLAY:
-                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
+                {'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
            # BUG: resolveRevisits currently doesn't work for this type of query
            # This is not an issue in archival mode, as there is a redirect to the actual timestamp query
            # but may be an issue in proxy mode
            wburl.LATEST_REPLAY:
-                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
+                {'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True}
        }[wburl.type]
    @staticmethod
    def peek_iter(iterable):
        try:
            first = next(iterable)
        except StopIteration:
            return None
        return chain([first], iterable)
--- a/pywb/pywb_init.py
+++ b/pywb/pywb_init.py
@ -21,6 +21,8 @@ DEFAULTS = {
    'error_html': 'ui/error.html',
    'static_routes': {'static/default': 'static/'},
    'domain_specific_rules': 'rules.yaml',
 }
 class DictChain:
@ -30,7 +32,7 @@ class DictChain:
    def get(self, key, default_val=None):
        for d in self.dicts:
            val = d.get(key)
-            if val:
+            if val is not None:
                return val
        return default_val
@ -52,11 +54,13 @@ def pywb_config_manual(passed_config = {}):
    for name, value in collections.iteritems():
        if isinstance(value, str):
            route_config = config
-            cdx_server = IndexReader(value)
+            cdx_config = value
        else:
            route_config = DictChain(value, config)
-            cdx_server = IndexReader(route_config)
+            cdx_config = route_config
        ds_rules = route_config.get('domain_specific_rules', None)
        cdx_server = IndexReader(cdx_config, ds_rules)
        wb_handler = config_utils.create_wb_handler(
            cdx_server = cdx_server,
@ -118,7 +122,8 @@ def pywb_config(config_file = None):
    if not config_file:
        config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)
-    config = yaml.load(open(config_file))
+    with open(config_file) as fh:
        config = yaml.load(fh)
    return pywb_config_manual(config)
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@ -54,8 +54,7 @@ class RewriteContent:
        # =========================================================================
        # special case -- need to ungzip the body
        if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
-            stream = BufferedReader(stream, 'gzip')
+            stream = BufferedReader(stream, decomp_type='gzip')
        if rewritten_headers.charset:
            encoding = rewritten_headers.charset
--- a/pywb/wbexceptions.py
+++ b/pywb/wbexceptions.py
@ -1,14 +1,15 @@
 class WbException(Exception):
    pass
 class NotFoundException(WbException):
-    def status(_):
+    def status(self):
        return '404 Not Found'
 # Exceptions that effect a specific capture and result in a retry
 class CaptureException(WbException):
-    def status(_):
+    def status(self):
        return '500 Internal Server Error'
 class InternalRedirect(WbException):
--- a/test_config.yaml
+++ b/test_config.yaml
@ -93,3 +93,6 @@ enable_cdx_api: true
 # optional reporter callback func
 # if set, called with request and cdx object
 reporter_func: pywb.run-tests.print_reporter
 # custom rules for domain specific matching
 #domain_specific_rules: rules.yaml
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -50,6 +50,13 @@ class TestWb:
        # 1 Capture (filtered) + header
        assert len(resp.html.find_all('tr')) == 2
    def test_calendar_query_fuzzy_match(self):
        """ Calendar query for a url which carries an extra '_=' query
        param should return the expected basic html page with
        18 table rows
        """
        # fuzzy match removing _= according to standard rules.yaml
        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css?_=3141592653')
        self._assert_basic_html(resp)
        # 17 Captures + header
        assert len(resp.html.find_all('tr')) == 18
    def test_cdx_query(self):
        resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
        self._assert_basic_text(resp)