Merge branch 'master' into pkg-reorg

2025-03-15 00:03:28 +01:00 · 2014-02-24 21:33:11 -08:00 · 2014-02-24 21:33:11 -08:00 · 3cd7b6b209
commit 3cd7b6b209
parent 51d61a8738 d702b299ae
8 changed files with 249 additions and 47 deletions
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -1,7 +1,7 @@
 from canonicalize import UrlCanonicalizer, calc_search_range

 from cdxops import cdx_load
-from cdxsource import CDXSource, CDXFile, RemoteCDXSource
+from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
 from zipnum import ZipNumCluster
 from cdxobject import CDXObject, CaptureNotFoundException, CDXException
 from cdxdomainspecific import load_domain_specific_cdx_rules
@ -149,10 +149,12 @@ def create_cdx_server(config, ds_rules_file=None):
        paths = config.get('index_paths')
        surt_ordered = config.get('surt_ordered', True)
        perms_checker = config.get('perms_checker')
+        pass_config = config
    else:
        paths = config
        surt_ordered = True
        perms_checker = None
+        pass_config = None

    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

@ -162,6 +164,7 @@ def create_cdx_server(config, ds_rules_file=None):
        server_cls = CDXServer

    return server_cls(paths,
+                      config=pass_config,
                      surt_ordered=surt_ordered,
                      ds_rules=ds_rules_file,
                      perms_checker=perms_checker)
@ -206,6 +209,9 @@ def create_cdx_source(filename, config):
    if is_http(filename):
        return RemoteCDXSource(filename)

+    if filename.startswith('redis://'):
+        return RedisCDXSource(filename, config)
+
    if filename.endswith('.cdx'):
        return CDXFile(filename)

@ -213,9 +219,6 @@ def create_cdx_source(filename, config):
        return ZipNumCluster(filename, config)

    return None
-    #TODO: support zipnum
-   #elif filename.startswith('redis://')
-    #    return RedisCDXSource(filename)


 #=================================================================
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -3,7 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader

 import urllib
 import urllib2
-
+import itertools

 #=================================================================
 class CDXSource(object):
@ -80,3 +80,35 @@ class RemoteCDXSource(CDXSource):

    def __str__(self):
        return 'Remote CDX Server: ' + self.remote_url
+
+
+#=================================================================
+class RedisCDXSource(CDXSource):
+    """CDX source which reads cdx lines from redis, one ZRANGE lookup per url/surt key"""
+    DEFAULT_KEY_PREFIX = 'c:'
+
+    def __init__(self, redis_url, config=None):
+        import redis  # imported at construction time rather than at module level
+        self.redis = redis.StrictRedis.from_url(redis_url)
+        # key prefix: config['redis_key_prefix'] when given, else DEFAULT_KEY_PREFIX
+        self.key_prefix = self.DEFAULT_KEY_PREFIX
+        if config:
+            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
+
+    def load_cdx(self, params):
+        """
+        Return an iterator over the cdx lines stored in redis for params['key']
+
+        Currently, there is no support for range queries
+        Only 'exact' matchType is supported
+        """
+        key = params['key']
+
+        # keep only the url/surt part of the key (the text before the first space)
+        key = key.split(' ')[0]
+        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)  # 0..-1: all entries
+
+        # stored entries do not include the key, so prepend it (plus a space) to each
+        key += ' '
+        cdx_list = itertools.imap(lambda x: key + x, cdx_list)
+        return cdx_list
--- a/pywb/core/views.py
+++ b/pywb/core/views.py
@ -56,9 +56,9 @@ class J2TemplateView:

    # Filters
    @staticmethod
-    def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
-        value = timestamp_to_datetime(value)
-        return time.strftime(format, value)
+    def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
+        value = timeutils.timestamp_to_datetime(value)
+        return value.strftime(format_)

    @staticmethod
    def get_host(url):
--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@ -1,9 +1,5 @@
 #!/usr/bin/python

-import re
-import rfc3987
-
-# WbUrl : wb archival url representation for WB
 """
 WbUrl represents the standard wayback archival url format.
 A regular url is a subset of the WbUrl (latest replay).
@ -34,9 +30,38 @@ replay form:

 latest_replay: (no timestamp)
 http://example.com
+
+Additionally, the BaseWbUrl provides the base components
+(url, timestamp, end_timestamp, modifier, type) which
+can be used to provide a custom representation of the
+wayback url format.
+
 """

-class WbUrl:
+import re
+import rfc3987
+
+
+#=================================================================
+class BaseWbUrl(object):
+    """Base wayback url components (url, mod, timestamp, end_timestamp, type)"""
+    QUERY = 'query'
+    URL_QUERY = 'url_query'
+    REPLAY = 'replay'
+    LATEST_REPLAY = 'latest_replay'
+
+    def __init__(self, url='', mod='',
+                 timestamp='', end_timestamp='', type=None):
+        # NOTE(review): type is presumably one of the constants above -- confirm
+        self.url = url
+        self.timestamp = timestamp
+        self.end_timestamp = end_timestamp
+        self.mod = mod
+        self.type = type
+
+
+#=================================================================
+class WbUrl(BaseWbUrl):
    """
    # Replay Urls
    # ======================
@ -107,22 +132,14 @@ class WbUrl:
    QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
    REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')

-    QUERY = 'query'
-    URL_QUERY = 'url_query'
-    REPLAY = 'replay'
-    LATEST_REPLAY = 'latest_replay'
-
    DEFAULT_SCHEME = 'http://'
    # ======================


    def __init__(self, url):
+        super(WbUrl, self).__init__()
+
        self.original_url = url
-        self.type = None
-        self.url = ''
-        self.timestamp = ''
-        self.end_timestamp = ''
-        self.mod = ''

        if not any (f(url) for f in [self._init_query, self._init_replay]):
            raise Exception('Invalid WbUrl: ', url)
--- a/pywb/utils/statusandheaders.py
+++ b/pywb/utils/statusandheaders.py
@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
        """
        parse stream for status line and headers
        return a StatusAndHeaders object
+
+        support continuation headers starting with space or tab
        """
        statusline = stream.readline().rstrip()

        protocol_status = self.split_prefix(statusline, self.statuslist)

        if not protocol_status:
-            msg = 'Expected Status Line - Found: ' + statusline
+            msg = 'Expected Status Line starting with {0} - Found: {1}'
+            msg = msg.format(self.statuslist, statusline)
            raise StatusAndHeadersParserException(msg, statusline)

        headers = []

        line = stream.readline().rstrip()
-        while line and line != '\r\n':
+        while line:
            name, value = line.split(':', 1)
-            header = (name, value.strip())
+            name = name.rstrip(' \t')
+            value = value.lstrip()
+
+            next_line = stream.readline().rstrip()
+
+            # append continuation lines, if any
+            while next_line and next_line.startswith((' ', '\t')):
+                value += next_line
+                next_line = stream.readline().rstrip()
+
+            header = (name, value)
            headers.append(header)
-            line = stream.readline().rstrip()
+            line = next_line

        return StatusAndHeaders(statusline=protocol_status[1].strip(),
                                headers=headers,
--- a/pywb/utils/test/statusandheaders_test.py
+++ b/pywb/utils/test/statusandheaders_test.py
@ -0,0 +1,29 @@
+"""
+>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
+StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
+  ('Some', 'Value'),
+  ('Multi-Line', 'Value1    Also This')])
+
+>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
+Traceback (most recent call last):
+StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
+"""
+
+
+from pywb.utils.statusandheaders import StatusAndHeadersParser
+import StringIO
+
+
+status_headers_1 = "\
+HTTP/1.0 200 OK\r\n\
+Content-Type: ABC\r\n\
+Some: Value\r\n\
+Multi-Line: Value1\r\n\
+    Also This\r\n\
+\r\n\
+Body"
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
--- a/pywb/utils/timeutils.py
+++ b/pywb/utils/timeutils.py
@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')

 TIMESTAMP_14 = '%Y%m%d%H%M%S'

-PAD_STAMP_END = '29991231235959'
+#PAD_STAMP_END = '29991231235959'
+PAD_6 = '299912'


 def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
    return datetime_to_timestamp(iso_date_to_datetime(string))


-# default pad is end of range for compatibility
-def pad_timestamp(string, pad_str=PAD_STAMP_END):
+# pad to certain length (default 6)
+def _pad_timestamp(string, pad_str=PAD_6):
    """
-    >>> pad_timestamp('20')
-    '20991231235959'
+    >>> _pad_timestamp('20')
+    '209912'

-    >>> pad_timestamp('2014')
-    '20141231235959'
+    >>> _pad_timestamp('2014')
+    '201412'

-    >>> pad_timestamp('20141011')
-    '20141011235959'
+    >>> _pad_timestamp('20141011')
+    '20141011'

-    >>> pad_timestamp('201410110010')
-    '20141011001059'
+    >>> _pad_timestamp('201410110010')
+    '201410110010'
     """

    str_len = len(string)
    pad_len = len(pad_str)

-    return string if str_len >= pad_len else string + pad_str[str_len:]
+    if str_len < pad_len:
+        string = string + pad_str[str_len:]
+
+    return string


 def timestamp_to_datetime(string):
    """
-    >>> timestamp_to_datetime('20131226095010')
-    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
-tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
+    # >14-digit -- rest ignored
+    >>> timestamp_to_datetime('2014122609501011')
+    datetime.datetime(2014, 12, 26, 9, 50, 10)

+    # 14-digit
+    >>> timestamp_to_datetime('20141226095010')
+    datetime.datetime(2014, 12, 26, 9, 50, 10)
+
+    # 13-digit padding
+    >>> timestamp_to_datetime('2014122609501')
+    datetime.datetime(2014, 12, 26, 9, 50, 59)
+
+    # 12-digit padding
+    >>> timestamp_to_datetime('201412260950')
+    datetime.datetime(2014, 12, 26, 9, 50, 59)
+
+    # 11-digit padding
+    >>> timestamp_to_datetime('20141226095')
+    datetime.datetime(2014, 12, 26, 9, 59, 59)
+
+    # 10-digit padding
+    >>> timestamp_to_datetime('2014122609')
+    datetime.datetime(2014, 12, 26, 9, 59, 59)
+
+    # 9-digit padding
+    >>> timestamp_to_datetime('201412260')
+    datetime.datetime(2014, 12, 26, 23, 59, 59)
+
+    # 8-digit padding
+    >>> timestamp_to_datetime('20141226')
+    datetime.datetime(2014, 12, 26, 23, 59, 59)
+
+    # 7-digit padding
+    >>> timestamp_to_datetime('2014122')
+    datetime.datetime(2014, 12, 31, 23, 59, 59)
+
+    # 6-digit padding
+    >>> timestamp_to_datetime('201410')
+    datetime.datetime(2014, 10, 31, 23, 59, 59)
+
+    # 5-digit padding
+    >>> timestamp_to_datetime('20141')
+    datetime.datetime(2014, 12, 31, 23, 59, 59)
+
+    # 4-digit padding
    >>> timestamp_to_datetime('2014')
-    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
-tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
+    datetime.datetime(2014, 12, 31, 23, 59, 59)
+
+    # 3-digit padding
+    >>> timestamp_to_datetime('201')
+    datetime.datetime(2019, 12, 31, 23, 59, 59)
+
+    # 2-digit padding
+    >>> timestamp_to_datetime('20')
+    datetime.datetime(2099, 12, 31, 23, 59, 59)
+
+    # 1-digit padding
+    >>> timestamp_to_datetime('2')
+    datetime.datetime(2999, 12, 31, 23, 59, 59)
+
+    # 1-digit out-of-range padding
+    >>> timestamp_to_datetime('3')
+    datetime.datetime(2999, 12, 31, 23, 59, 59)
+
+    # 0-digit padding
+    >>> timestamp_to_datetime('')
+    datetime.datetime(2999, 12, 31, 23, 59, 59)
+
+    # bad month
+    >>> timestamp_to_datetime('20131709005601')
+    datetime.datetime(2013, 12, 9, 0, 56, 1)
+
+    # all out of range except minutes
+    >>> timestamp_to_datetime('40001965252477')
+    datetime.datetime(2999, 12, 31, 23, 24, 59)
+
    """

-    # Default pad to end of range for comptability
-    return time.strptime(pad_timestamp(string), TIMESTAMP_14)
+    # pad to 6 digits
+    string = _pad_timestamp(string, PAD_6)
+
+
+    def clamp(val, min_, max_):
+        # parse val as int, limited to [min_, max_]; unparseable input yields max_
+        try:
+            val = int(val)
+        except (TypeError, ValueError):
+            return max_
+        return max(min_, min(val, max_))
+
+    def extract(string, start, end, min_, max_):
+        if len(string) >= end:
+            return clamp(string[start:end], min_, max_)
+        else:
+            return max_
+
+    # now parse, clamp to boundary
+    year = extract(string, 0, 4, 1900, 2999)
+    month = extract(string, 4, 6, 1, 12)
+    day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
+    hour = extract(string, 8, 10, 0, 23)
+    minute = extract(string, 10, 12, 0, 59)
+    second = extract(string, 12, 14, 0, 59)
+
+    return datetime.datetime(year=year,
+                             month=month,
+                             day=day,
+                             hour=hour,
+                             minute=minute,
+                             second=second)
+
+    #return time.strptime(pad_timestamp(string), TIMESTAMP_14)


 def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
    1420070399
    """

-    return calendar.timegm(timestamp_to_datetime(string))
+    return calendar.timegm(timestamp_to_datetime(string).utctimetuple())


 if __name__ == "__main__":
--- a/pywb/warc/test/test_loading.py
+++ b/pywb/warc/test/test_loading.py
@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
    except Exception as e:
        print 'Exception: ' + e.__class__.__name__

+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()