cdx: add domain-specific rules at cdx layer for custom canonicalization!

and 'fuzzy' matching when not found, handled via cdxdomainspecific.py. BaseCDXServer contains a canonicalizer object and a fuzzy query. Canonicalizer abstracted to a separate class (in canonicalize.py). Clean up cdx-related exceptions. Default rules are read from cdx/rules.yaml; the filename is configurable via the 'domain_specific_rules' setting in config.yaml. Fix typo in pywb/rewrite.
2025-03-24 15:09:54 +01:00 · 2014-02-18 14:47:48 -08:00 · 2014-02-18 14:47:48 -08:00 · a09dec4b3e
commit a09dec4b3e
parent ab95524b7b
13 changed files with 375 additions and 131 deletions
--- a/config.yaml
+++ b/config.yaml
@ -92,4 +92,8 @@ static_routes:
 enable_http_proxy: true

 # enable cdx server api for querying cdx directly (experimental)
-#enable_cdx_api: false
+enable_cdx_api: true
+
+# custom rules for domain specific matching
+# set to false to disable
+#domain_specific_rules: rules.yaml
--- a/pywb/cdx/canonicalize.py
+++ b/pywb/cdx/canonicalize.py
@ -0,0 +1,74 @@
+""" Standard url-canonicalzation, surt and non-surt
+"""
+
+import surt
+from cdxobject import CDXException
+
+
+#=================================================================
+class UrlCanonicalizer(object):
+    def __init__(self, surt_ordered=True):
+        self.surt_ordered = surt_ordered
+
+    def __call__(self, url):
+        return canonicalize(url, self.surt_ordered)
+
+
+#=================================================================
+def canonicalize(url, surt_ordered=True):
+    """
+    Canonicalize url and convert to surt
+    If not in surt ordered mode, convert back to url form
+    as surt conversion is currently part of canonicalization
+
+    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
+    'com,example)/path/file.html'
+
+    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
+    'example.com/path/file.html'
+    """
+    try:
+        key = surt.surt(url)
+    except Exception as e:
+        raise CDXException('Invalid Url: ' + url)
+
+    # if not surt, unsurt the surt to get canonicalized non-surt url
+    if not surt_ordered:
+        key = unsurt(key)
+
+    return key
+
+
+#=================================================================
+def unsurt(surt):
+    """
+    # Simple surt
+    >>> unsurt('com,example)/')
+    'example.com/'
+
+    # Broken surt
+    >>> unsurt('com,example)')
+    'com,example)'
+
+    # Long surt
+    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
+index.html?a=b?c=)/')
+    'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
+    """
+
+    try:
+        index = surt.index(')/')
+        parts = surt[0:index].split(',')
+        parts.reverse()
+        host = '.'.join(parts)
+        host += surt[index + 1:]
+        return host
+
+    except ValueError:
+        # May not be a valid surt
+        return surt
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@ -0,0 +1,125 @@
+import yaml
+import re
+import logging
+import pkgutil
+
+from canonicalize import unsurt, UrlCanonicalizer
+
+
+#=================================================================
+def load_domain_specific_cdx_rules(filename, surt_ordered):
+    fh = pkgutil.get_data(__package__, filename)
+    config = yaml.load(fh)
+
+    # Load Canonicalizer Rules
+    rules = StartsWithRule.load_rules(config.get('canon_rules'),
+                                      surt_ordered)
+
+    if rules:
+        canon = CustomUrlCanonicalizer(rules, surt_ordered)
+    else:
+        canon = None
+
+    # Load Fuzzy Lookup Rules
+    rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
+                                      surt_ordered)
+
+    if rules:
+        fuzzy = FuzzyQuery(rules)
+    else:
+        fuzzy = None
+
+    logging.debug('CANON: ' + str(canon))
+    logging.debug('FUZZY: ' + str(fuzzy))
+    return (canon, fuzzy)
+
+
+#=================================================================
+class CustomUrlCanonicalizer(UrlCanonicalizer):
+    def __init__(self, rules, surt_ordered=True):
+        super(CustomUrlCanonicalizer, self).__init__(surt_ordered)
+        self.rules = rules
+
+    def __call__(self, url):
+        urlkey = super(CustomUrlCanonicalizer, self).__call__(url)
+
+        for rule in self.rules:
+            if not any(urlkey.startswith(x) for x in rule.starts):
+                continue
+
+            m = rule.regex.match(urlkey)
+            if not m:
+                continue
+
+            if rule.replace:
+                return m.expand(rule.replace)
+
+        return urlkey
+
+
+#=================================================================
+class FuzzyQuery:
+    def __init__(self, rules):
+        self.rules = rules
+
+    def __call__(self, params):
+        matched_rule = None
+
+        urlkey = params['key']
+        url = params['url']
+
+        for rule in self.rules:
+            if not any(urlkey.startswith(x) for x in rule.starts):
+                continue
+
+            m = rule.regex.search(urlkey)
+            if not m:
+                continue
+
+            matched_rule = rule
+
+            if len(m.groups()) == 1:
+                params['filter'] = '=urlkey:' + m.group(1)
+
+            break
+
+        if not matched_rule:
+            return None
+
+        inx = url.find('?')
+        if inx > 0:
+            params['url'] = url[:inx + 1]
+
+        params['matchType'] = 'prefix'
+        params['key'] = None
+        return params
+
+
+#=================================================================
+class StartsWithRule:
+    def __init__(self, config, surt_ordered=True):
+        self.starts = config.get('startswith')
+        if not isinstance(self.starts, list):
+            self.starts = [self.starts]
+
+        self.regex = re.compile(config.get('matches'))
+        self.replace = config.get('replace')
+
+    def unsurt(self):
+        # must convert to non-surt form
+        self.starts = map(unsurt, self.starts)
+        self.regex = unsurt(self.regex)
+        self.replace = unsurt(self.replace)
+
+    @staticmethod
+    def load_rules(rules_config, surt_ordered=True):
+        if not rules_config:
+            return []
+
+        rules = map(StartsWithRule, rules_config)
+
+        if not surt_ordered:
+            for rule in rules:
+                rule.unsurt()
+
+        return rules
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@ -2,6 +2,24 @@ from collections import OrderedDict
 import itertools


+#=================================================================
+class CDXException(Exception):
+    def status(self):
+        return '400 Bad Request'
+
+
+#=================================================================
+class CaptureNotFoundException(CDXException):
+    def status(self):
+        return '404 Not Found'
+
+
+#=================================================================
+class AccessException(CDXException):
+    def status(self):
+        return '403 Access Denied'
+
+
 #=================================================================
 class CDXObject(OrderedDict):
    CDX_FORMATS = [
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -1,82 +1,103 @@
-import surt
-from cdxops import cdx_load
+from canonicalize import UrlCanonicalizer

-import itertools
+from cdxops import cdx_load
+from cdxsource import CDXSource, CDXFile, RemoteCDXSource
+from cdxobject import CDXObject, CaptureNotFoundException, CDXException
+from cdxdomainspecific import load_domain_specific_cdx_rules
+
+from itertools import chain
 import logging
 import os
 import urlparse

-from cdxsource import CDXSource, CDXFile, RemoteCDXSource
-from cdxobject import CDXObject
+
+#=================================================================
+class BaseCDXServer(object):
+    def __init__(self, url_canon=None, fuzzy_query=None):
+        self.url_canon = url_canon if url_canon else UrlCanonicalizer()
+        self.fuzzy_query = fuzzy_query
+
+    def _check_cdx_iter(self, cdx_iter, params):
+        """ Check cdx iter semantics
+        If iter is empty (no matches), check if fuzzy matching
+        is allowed, and try it -- otherwise,
+        throw CaptureNotFoundException
+        """
+
+        cdx_iter = self.peek_iter(cdx_iter)
+
+        if cdx_iter:
+            return cdx_iter
+
+        url = params['url']
+
+        if self.fuzzy_query and params.get('allow_fuzzy'):
+            if not 'key' in params:
+                params['key'] = self.url_canon(url)
+
+            params = self.fuzzy_query(params)
+            if params:
+                params['allow_fuzzy'] = False
+                return self.load_cdx(**params)
+
+        msg = 'No Captures found for: ' + url
+        raise CaptureNotFoundException(msg)
+
+    def load_cdx(self, **params):
+        raise NotImplementedError('Implement in subclass')
+
+    @staticmethod
+    def peek_iter(iterable):
+        try:
+            first = next(iterable)
+        except StopIteration:
+            return None
+
+        return chain([first], iterable)


 #=================================================================
-class CDXException(Exception):
-    def status(self):
-        return '400 Bad Request'
-
-
-#=================================================================
-class AccessException(CDXException):
-    def status(self):
-        return '403 Bad Request'
-
-
-#=================================================================
-class CDXServer(object):
+class CDXServer(BaseCDXServer):
    """
    Top-level cdx server object which maintains a list of cdx sources,
    responds to queries and dispatches to the cdx ops for processing
    """

-    def __init__(self, paths, surt_ordered=True):
+    def __init__(self, paths, url_canon=None, fuzzy_query=None):
+        super(CDXServer, self).__init__(url_canon, fuzzy_query)
        self.sources = create_cdx_sources(paths)
-        self.surt_ordered = surt_ordered

    def load_cdx(self, **params):
        # if key not set, assume 'url' is set and needs canonicalization
        if not params.get('key'):
-            params['key'] = self._canonicalize(params)
-
-        convert_old_style_params(params)
-
-        return cdx_load(self.sources, params)
-
-    def _canonicalize(self, params):
-        """
-        Canonicalize url and convert to surt
-        If no surt-mode, convert back to url form
-        as surt conversion is currently part of canonicalization
-        """
            try:
                url = params['url']
            except KeyError:
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)

-        try:
-            key = surt.surt(url)
-        except Exception as e:
-            raise CDXException('Invalid Url: ' + url)
+            params['key'] = self.url_canon(url)

-        # if not surt, unsurt the surt to get canonicalized non-surt url
-        if not self.surt_ordered:
-            key = unsurt(key)
+        convert_old_style_params(params)

-        return key
+        cdx_iter = cdx_load(self.sources, params)
+
+        return self._check_cdx_iter(cdx_iter, params)

    def __str__(self):
        return 'CDX server serving from ' + str(self.sources)


 #=================================================================
-class RemoteCDXServer(object):
+class RemoteCDXServer(BaseCDXServer):
    """
    A special cdx server that uses a single RemoteCDXSource
    It simply proxies the query params to the remote source
    and performs no local processing/filtering
    """
-    def __init__(self, source):
+    def __init__(self, source, url_canon=None, fuzzy_query=None):
+        super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query)
+
        if isinstance(source, RemoteCDXSource):
            self.source = source
        elif (isinstance(source, str) and
@ -87,18 +108,19 @@ class RemoteCDXServer(object):

    def load_cdx(self, **params):
        remote_iter = self.source.load_cdx(params)
+
        # if need raw, convert to raw format here
        if params.get('output') == 'raw':
-            return (CDXObject(cdx) for cdx in remote_iter)
-        else:
-            return remote_iter
+            remote_iter = (CDXObject(cdx) for cdx in remote_iter)
+
+        return self._check_cdx_iter(remote_iter, params)

    def __str__(self):
        return 'Remote CDX server serving from ' + str(self.sources[0])


 #=================================================================
-def create_cdx_server(config):
+def create_cdx_server(config, ds_rules_file=None):
    if hasattr(config, 'get'):
        paths = config.get('index_paths')
        surt_ordered = config.get('surt_ordered', True)
@ -108,11 +130,22 @@ def create_cdx_server(config):

    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

+    if ds_rules_file:
+        canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
+                                                      surt_ordered)
+    else:
+        canon, fuzzy = None, None
+
+    if not canon:
+        canon = UrlCanonicalizer(surt_ordered)
+
    if (isinstance(paths, str) and
        any(paths.startswith(x) for x in ['http://', 'https://'])):
-        return RemoteCDXServer(paths)
+        server_cls = RemoteCDXServer
    else:
-        return CDXServer(paths)
+        server_cls = CDXServer
+
+    return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy)


 #=================================================================
@ -170,13 +203,17 @@ def convert_old_style_params(params):
    """
    Convert old-style CDX Server param semantics
    """
-    collapse_time = params.get('collapseTime')
-    if collapse_time:
-        params['collapse_time'] = collapse_time
+    param = params.get('collapseTime')
+    if param:
+        params['collapse_time'] = param

-    resolve_revisits = params.get('resolveRevisits')
-    if resolve_revisits:
-        params['resolve_revisits'] = resolve_revisits
+    param = params.get('matchType')
+    if param:
+        params['match_type'] = param
+
+    param = params.get('resolveRevisits')
+    if param:
+        params['resolve_revisits'] = param

    if params.get('sort') == 'reverse':
        params['reverse'] = True
@ -204,38 +241,3 @@ def extract_params_from_wsgi_env(env):
            params[name] = val[0]

    return params
-
-
-#=================================================================
-def unsurt(surt):
-    """
-    # Simple surt
-    >>> unsurt('com,example)/')
-    'example.com)/'
-
-    # Broken surt
-    >>> unsurt('com,example)')
-    'com,example)'
-
-    # Long surt
-    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
-index.html?a=b?c=)/')
-    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
-    """
-
-    try:
-        index = surt.index(')/')
-        parts = surt[0:index].split(',')
-        parts.reverse()
-        host = '.'.join(parts)
-        host += surt[index:]
-        return host
-
-    except ValueError:
-        # May not be a valid surt
-        return surt
-
-
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
--- a/pywb/cdx/rules.yaml
+++ b/pywb/cdx/rules.yaml
@ -0,0 +1,24 @@
+
+fuzzy_lookup_rules:
+    - startswith: 'com,twitter)/i/profiles/show/'
+      matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
+
+    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
+      matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
+
+    - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
+      matches: '([^/]+(?:\.css|\.js))'
+
+    # matches all urls
+    - startswith: ''
+      matches: '[&?](?:_|uncache)=[\d]+[&]?'
+
+canon_rules:
+    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
+      matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
+      replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
+
+
+
+
+
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@ -25,6 +25,8 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq

 # No matching results
 >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
+Traceback (most recent call last):
+CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this


 # Filter cdx (default: regex)
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@ -1,13 +1,7 @@
 import urllib
 import urllib2

-from wbexceptions import NotFoundException
-
-from itertools import chain
-from pprint import pprint
-
-from pywb.cdx.cdxserver import create_cdx_server, CDXException
-from pywb.cdx.cdxobject import CDXObject
+from pywb.cdx.cdxserver import create_cdx_server

 #=================================================================
 class IndexReader(object):
@ -18,8 +12,8 @@ class IndexReader(object):
    Creates an appropriate query based on wbrequest type info
    """

-    def __init__(self, config):
-        self.cdx_server = create_cdx_server(config)
+    def __init__(self, config, ds_rules_file=None):
+        self.cdx_server = create_cdx_server(config, ds_rules_file)

    def load_for_request(self, wbrequest):
        wburl = wbrequest.wb_url
@ -29,19 +23,14 @@ class IndexReader(object):

        # add any custom filter from the request
        if wbrequest.query_filter:
-            params['filter'] = wbrequest.query_filter
+            params['filter'].extend(wbrequest.query_filter)

        if wbrequest.custom_params:
            params.update(wbrequest.custom_params)

-        params['url'] = wburl.url
+        params['allow_fuzzy'] = True

-        cdxlines = self.load_cdx(output='raw', **params)
-
-        cdxlines = self.peek_iter(cdxlines)
-
-        if cdxlines is None:
-            raise NotFoundException('No Captures found for: ' + wburl.url)
+        cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)

        return cdxlines

@ -54,7 +43,7 @@ class IndexReader(object):

        return {
            wburl.QUERY:
-                {'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
+                {'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit},

            wburl.URL_QUERY:
                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
@ -62,21 +51,12 @@ class IndexReader(object):
                },

            wburl.REPLAY:
-                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
+                {'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},

            # BUG: resolveRevisits currently doesn't work for this type of query
            # This is not an issue in archival mode, as there is a redirect to the actual timestamp query
            # but may be an issue in proxy mode
            wburl.LATEST_REPLAY:
-                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
+                {'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True}

        }[wburl.type]
-
-    @staticmethod
-    def peek_iter(iterable):
-        try:
-            first = next(iterable)
-        except StopIteration:
-            return None
-
-        return chain([first], iterable)
--- a/pywb/pywb_init.py
+++ b/pywb/pywb_init.py
@ -21,6 +21,8 @@ DEFAULTS = {
    'error_html': 'ui/error.html',

    'static_routes': {'static/default': 'static/'},
+
+    'domain_specific_rules': 'rules.yaml',
 }

 class DictChain:
@ -30,7 +32,7 @@ class DictChain:
    def get(self, key, default_val=None):
        for d in self.dicts:
            val = d.get(key)
-            if val:
+            if val is not None:
                return val
        return default_val

@ -52,11 +54,13 @@ def pywb_config_manual(passed_config = {}):
    for name, value in collections.iteritems():
        if isinstance(value, str):
            route_config = config
-            cdx_server = IndexReader(value)
+            cdx_config = value
        else:
            route_config = DictChain(value, config)
-            cdx_server = IndexReader(route_config)
+            cdx_config = route_config

+        ds_rules = route_config.get('domain_specific_rules', None)
+        cdx_server = IndexReader(cdx_config, ds_rules)

        wb_handler = config_utils.create_wb_handler(
            cdx_server = cdx_server,
@ -118,7 +122,8 @@ def pywb_config(config_file = None):
    if not config_file:
        config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)

-    config = yaml.load(open(config_file))
+    with open(config_file) as fh:
+        config = yaml.load(fh)

    return pywb_config_manual(config)

--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@ -54,8 +54,7 @@ class RewriteContent:
        # =========================================================================
        # special case -- need to ungzip the body
        if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
-            stream = BufferedReader(stream, 'gzip')
-
+            stream = BufferedReader(stream, decomp_type='gzip')

        if rewritten_headers.charset:
            encoding = rewritten_headers.charset
--- a/pywb/wbexceptions.py
+++ b/pywb/wbexceptions.py
@ -1,14 +1,15 @@

+
 class WbException(Exception):
    pass

 class NotFoundException(WbException):
-    def status(_):
+    def status(self):
        return '404 Not Found'

 # Exceptions that affect a specific capture and result in a retry
 class CaptureException(WbException):
-    def status(_):
+    def status(self):
        return '500 Internal Server Error'

 class InternalRedirect(WbException):
--- a/test_config.yaml
+++ b/test_config.yaml
@ -93,3 +93,6 @@ enable_cdx_api: true
 # optional reporter callback func
 # if set, called with request and cdx object
 reporter_func: pywb.run-tests.print_reporter
+
+# custom rules for domain specific matching
+#domain_specific_rules: rules.yaml
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -50,6 +50,13 @@ class TestWb:
        # 1 Capture (filtered) + header
        assert len(resp.html.find_all('tr')) == 2

+    def test_calendar_query_fuzzy_match(self):
+        # fuzzy match removing _= according to standard rules.yaml
+        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css?_=3141592653')
+        self._assert_basic_html(resp)
+        # 17 Captures + header
+        assert len(resp.html.find_all('tr')) == 18
+
    def test_cdx_query(self):
        resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
        self._assert_basic_text(resp)