Merge branch 'master' into pkg-reorg

2025-03-24 06:59:52 +01:00 · 2014-02-24 21:33:11 -08:00 · 2014-02-24 21:33:11 -08:00 · 3cd7b6b209
commit 3cd7b6b209
parent 51d61a8738 d702b299ae
8 changed files with 249 additions and 47 deletions
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -1,7 +1,7 @@
 from canonicalize import UrlCanonicalizer, calc_search_range
 from cdxops import cdx_load
-from cdxsource import CDXSource, CDXFile, RemoteCDXSource
+from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
 from zipnum import ZipNumCluster
 from cdxobject import CDXObject, CaptureNotFoundException, CDXException
 from cdxdomainspecific import load_domain_specific_cdx_rules
@ -149,10 +149,12 @@ def create_cdx_server(config, ds_rules_file=None):
        paths = config.get('index_paths')
        surt_ordered = config.get('surt_ordered', True)
        perms_checker = config.get('perms_checker')
        pass_config = config
    else:
        paths = config
        surt_ordered = True
        perms_checker = None
        pass_config = None
    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
@ -162,6 +164,7 @@ def create_cdx_server(config, ds_rules_file=None):
        server_cls = CDXServer
    return server_cls(paths,
                      config=pass_config,
                      surt_ordered=surt_ordered,
                      ds_rules=ds_rules_file,
                      perms_checker=perms_checker)
@ -206,6 +209,9 @@ def create_cdx_source(filename, config):
    if is_http(filename):
        return RemoteCDXSource(filename)
    if filename.startswith('redis://'):
        return RedisCDXSource(filename, config)
    if filename.endswith('.cdx'):
        return CDXFile(filename)
@ -213,9 +219,6 @@ def create_cdx_source(filename, config):
        return ZipNumCluster(filename, config)
    return None
    #TODO: support zipnum
   #elif filename.startswith('redis://')
    #    return RedisCDXSource(filename)
 #=================================================================
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -3,7 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader
 import urllib
 import urllib2
-
+import itertools
 #=================================================================
 class CDXSource(object):
@ -80,3 +80,35 @@ class RemoteCDXSource(CDXSource):
    def __str__(self):
        return 'Remote CDX Server: ' + self.remote_url
 #=================================================================
 class RedisCDXSource(CDXSource):
    """
    CDX source that reads cdx lines from redis.
    Each url/surt key maps to one redis key (key prefix + url/surt)
    whose members are the cdx lines without the leading url/surt field.
    """
    DEFAULT_KEY_PREFIX = 'c:'
    def __init__(self, redis_url, config=None):
        """
        Create a redis client from redis_url.
        If config is given, its 'redis_key_prefix' entry (when present)
        overrides DEFAULT_KEY_PREFIX.
        """
        # imported here rather than at module level -- presumably so that
        # redis is only required when a redis:// source is configured
        import redis
        self.redis = redis.StrictRedis.from_url(redis_url)
        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list
        Currently, there is no support for range queries
        Only 'exact' matchType is supported

        params must contain 'key'; only the part before the first space
        (the url/surt) is used for the lookup.
        Returns a lazy iterator of cdx lines, each prefixed with
        the url/surt and a single space.
        """
        # ensure only url/surt is part of key
        key = params['key'].split(' ')[0]
        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
        # key is not part of list, so prepend to each line
        prefix = key + ' '
        return (prefix + cdx for cdx in cdx_list)
--- a/pywb/core/views.py
+++ b/pywb/core/views.py
@ -56,9 +56,9 @@ class J2TemplateView:
    # Filters
    @staticmethod
-    def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
+    def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
-        value = timestamp_to_datetime(value)
+        value = timeutils.timestamp_to_datetime(value)
-        return time.strftime(format, value)
+        return value.strftime(format_)
    @staticmethod
    def get_host(url):
--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@ -1,9 +1,5 @@
 #!/usr/bin/python
 import re
 import rfc3987
 # WbUrl : wb archival url representation for WB
 """
 WbUrl represents the standard wayback archival url format.
 A regular url is a subset of the WbUrl (latest replay).
@ -34,9 +30,38 @@ replay form:
 latest_replay: (no timestamp)
 http://example.com
 Additionally, the BaseWbUrl provides the base components
 (url, timestamp, end_timestamp, modifier, type) which
 can be used to provide a custom representation of the
 wayback url format.
 """
-class WbUrl:
+import re
 import rfc3987
 #=================================================================
 class BaseWbUrl(object):
    """
    Holder for the base components of a wayback url
    (url, timestamp, end_timestamp, modifier, type).
    """
    # identifiers for the supported wayback url types
    QUERY, URL_QUERY = 'query', 'url_query'
    REPLAY, LATEST_REPLAY = 'replay', 'latest_replay'
    def __init__(self, url='', mod='',
                 timestamp='', end_timestamp='', type=None):
        """
        Store the given components as attributes of the same name.
        All components default to the empty string, except type,
        which defaults to None.
        """
        self.type = type
        self.url, self.mod = url, mod
        self.timestamp, self.end_timestamp = timestamp, end_timestamp
 #=================================================================
 class WbUrl(BaseWbUrl):
    """
    # Replay Urls
    # ======================
@ -107,22 +132,14 @@ class WbUrl:
    QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
    REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
    QUERY = 'query'
    URL_QUERY = 'url_query'
    REPLAY = 'replay'
    LATEST_REPLAY = 'latest_replay'
    DEFAULT_SCHEME = 'http://'
    # ======================
    def __init__(self, url):
        super(WbUrl, self).__init__()
        self.original_url = url
        self.type = None
        self.url = ''
        self.timestamp = ''
        self.end_timestamp = ''
        self.mod = ''
        if not any (f(url) for f in [self._init_query, self._init_replay]):
            raise Exception('Invalid WbUrl: ', url)
--- a/pywb/utils/statusandheaders.py
+++ b/pywb/utils/statusandheaders.py
@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
        """
        parse stream for status line and headers
        return a StatusAndHeaders object
        support continuation headers starting with space or tab
        """
        statusline = stream.readline().rstrip()
        protocol_status = self.split_prefix(statusline, self.statuslist)
        if not protocol_status:
-            msg = 'Expected Status Line - Found: ' + statusline
+            msg = 'Expected Status Line starting with {0} - Found: {1}'
            msg = msg.format(self.statuslist, statusline)
            raise StatusAndHeadersParserException(msg, statusline)
        headers = []
        line = stream.readline().rstrip()
-        while line and line != '\r\n':
+        while line:
            name, value = line.split(':', 1)
-            header = (name, value.strip())
+            name = name.rstrip(' \t')
            value = value.lstrip()
            next_line = stream.readline().rstrip()
            # append continuation lines, if any
            while next_line and next_line.startswith((' ', '\t')):
                value += next_line
                next_line = stream.readline().rstrip()
            header = (name, value)
            headers.append(header)
-            line = stream.readline().rstrip()
+            line = next_line
        return StatusAndHeaders(statusline=protocol_status[1].strip(),
                                headers=headers,
--- a/pywb/utils/test/statusandheaders_test.py
+++ b/pywb/utils/test/statusandheaders_test.py
@ -0,0 +1,29 @@
 """
 >>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
 StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
  ('Some', 'Value'),
  ('Multi-Line', 'Value1    Also This')])
 >>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
 Traceback (most recent call last):
 StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
 """
 from pywb.utils.statusandheaders import StatusAndHeadersParser
 import StringIO
 status_headers_1 = "\
 HTTP/1.0 200 OK\r\n\
 Content-Type: ABC\r\n\
 Some: Value\r\n\
 Multi-Line: Value1\r\n\
    Also This\r\n\
 \r\n\
 Body"
 # When executed directly, run the doctests found in this module's docstrings.
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/utils/timeutils.py
+++ b/pywb/utils/timeutils.py
@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
 TIMESTAMP_14 = '%Y%m%d%H%M%S'
-PAD_STAMP_END = '29991231235959'
+#PAD_STAMP_END = '29991231235959'
 PAD_6 = '299912'
 def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
    return datetime_to_timestamp(iso_date_to_datetime(string))
-# default pad is end of range for compatibility
+# pad to certain length (default 6)
-def pad_timestamp(string, pad_str=PAD_STAMP_END):
+def _pad_timestamp(string, pad_str=PAD_6):
    """
-    >>> pad_timestamp('20')
+    >>> _pad_timestamp('20')
-    '20991231235959'
+    '209912'
-    >>> pad_timestamp('2014')
+    >>> _pad_timestamp('2014')
-    '20141231235959'
+    '201412'
-    >>> pad_timestamp('20141011')
+    >>> _pad_timestamp('20141011')
-    '20141011235959'
+    '20141011'
-    >>> pad_timestamp('201410110010')
+    >>> _pad_timestamp('201410110010')
-    '20141011001059'
+    '201410110010'
     """
    str_len = len(string)
    pad_len = len(pad_str)
-    return string if str_len >= pad_len else string + pad_str[str_len:]
+    if str_len < pad_len:
        string = string + pad_str[str_len:]
    return string
 def timestamp_to_datetime(string):
    """
-    >>> timestamp_to_datetime('20131226095010')
+    # >14-digit -- rest ignored
-    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
+    >>> timestamp_to_datetime('2014122609501011')
-tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
+    datetime.datetime(2014, 12, 26, 9, 50, 10)
    # 14-digit
    >>> timestamp_to_datetime('20141226095010')
    datetime.datetime(2014, 12, 26, 9, 50, 10)
    # 13-digit padding
    >>> timestamp_to_datetime('2014122609501')
    datetime.datetime(2014, 12, 26, 9, 50, 59)
    # 12-digit padding
    >>> timestamp_to_datetime('201412260950')
    datetime.datetime(2014, 12, 26, 9, 50, 59)
    # 11-digit padding
    >>> timestamp_to_datetime('20141226095')
    datetime.datetime(2014, 12, 26, 9, 59, 59)
    # 10-digit padding
    >>> timestamp_to_datetime('2014122609')
    datetime.datetime(2014, 12, 26, 9, 59, 59)
    # 9-digit padding
    >>> timestamp_to_datetime('201412260')
    datetime.datetime(2014, 12, 26, 23, 59, 59)
    # 8-digit padding
    >>> timestamp_to_datetime('20141226')
    datetime.datetime(2014, 12, 26, 23, 59, 59)
    # 7-digit padding
    >>> timestamp_to_datetime('2014122')
    datetime.datetime(2014, 12, 31, 23, 59, 59)
    # 6-digit padding
    >>> timestamp_to_datetime('201410')
    datetime.datetime(2014, 10, 31, 23, 59, 59)
    # 5-digit padding
    >>> timestamp_to_datetime('20141')
    datetime.datetime(2014, 12, 31, 23, 59, 59)
    # 4-digit padding
    >>> timestamp_to_datetime('2014')
-    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
+    datetime.datetime(2014, 12, 31, 23, 59, 59)
-tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
+
    # 3-digit padding
    >>> timestamp_to_datetime('201')
    datetime.datetime(2019, 12, 31, 23, 59, 59)
    # 2-digit padding
    >>> timestamp_to_datetime('20')
    datetime.datetime(2099, 12, 31, 23, 59, 59)
    # 1-digit padding
    >>> timestamp_to_datetime('2')
    datetime.datetime(2999, 12, 31, 23, 59, 59)
    # 1-digit out-of-range padding
    >>> timestamp_to_datetime('3')
    datetime.datetime(2999, 12, 31, 23, 59, 59)
    # 0-digit padding
    >>> timestamp_to_datetime('')
    datetime.datetime(2999, 12, 31, 23, 59, 59)
    # bad month
    >>> timestamp_to_datetime('20131709005601')
    datetime.datetime(2013, 12, 9, 0, 56, 1)
    # all out of range except minutes
    >>> timestamp_to_datetime('40001965252477')
    datetime.datetime(2999, 12, 31, 23, 24, 59)
    """
-    # Default pad to end of range for comptability
+    # pad to 6 digits
-    return time.strptime(pad_timestamp(string), TIMESTAMP_14)
+    string = _pad_timestamp(string, PAD_6)
    def clamp(val, min_, max_):
        """
        Convert val to an int and limit it to the range [min_, max_].
        If val can not be converted to an int, return max_.
        """
        try:
            val = int(val)
        except (TypeError, ValueError):
            # only conversion failures fall back to the upper bound;
            # any other exception is allowed to propagate
            return max_
        return max(min_, min(val, max_))
    def extract(string, start, end, min_, max_):
        """
        Return string[start:end] as an int clamped to [min_, max_],
        or max_ if string is shorter than end.
        """
        # field is not fully present in the string: use the upper bound
        if len(string) < end:
            return max_
        field = string[start:end]
        return clamp(field, min_, max_)
    # now parse, clamp to boundary
    year = extract(string, 0, 4, 1900, 2999)
    month = extract(string, 4, 6, 1, 12)
    day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
    hour = extract(string, 8, 10, 0, 23)
    minute = extract(string, 10, 12, 0, 59)
    second = extract(string, 12, 14, 0, 59)
    return datetime.datetime(year=year,
                             month=month,
                             day=day,
                             hour=hour,
                             minute=minute,
                             second=second)
    #return time.strptime(pad_timestamp(string), TIMESTAMP_14)
 def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
    1420070399
    """
-    return calendar.timegm(timestamp_to_datetime(string))
+    return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
 if __name__ == "__main__":
--- a/pywb/warc/test/test_loading.py
+++ b/pywb/warc/test/test_loading.py
@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
    except Exception as e:
        print 'Exception: ' + e.__class__.__name__
 # When executed directly, run the doctests found in this module's docstrings.
 if __name__ == "__main__":
    import doctest
    doctest.testmod()