diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index e6825956..1a68f7e4 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,7 +1,7 @@ from canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load -from cdxsource import CDXSource, CDXFile, RemoteCDXSource +from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from zipnum import ZipNumCluster from cdxobject import CDXObject, CaptureNotFoundException, CDXException from cdxdomainspecific import load_domain_specific_cdx_rules @@ -149,10 +149,12 @@ def create_cdx_server(config, ds_rules_file=None): paths = config.get('index_paths') surt_ordered = config.get('surt_ordered', True) perms_checker = config.get('perms_checker') + pass_config = config else: paths = config surt_ordered = True perms_checker = None + pass_config = None logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) @@ -162,6 +164,7 @@ def create_cdx_server(config, ds_rules_file=None): server_cls = CDXServer return server_cls(paths, + config=pass_config, surt_ordered=surt_ordered, ds_rules=ds_rules_file, perms_checker=perms_checker) @@ -206,6 +209,9 @@ def create_cdx_source(filename, config): if is_http(filename): return RemoteCDXSource(filename) + if filename.startswith('redis://'): + return RedisCDXSource(filename, config) + if filename.endswith('.cdx'): return CDXFile(filename) @@ -213,9 +219,6 @@ def create_cdx_source(filename, config): return ZipNumCluster(filename, config) return None - #TODO: support zipnum - #elif filename.startswith('redis://') - # return RedisCDXSource(filename) #================================================================= diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 39285cf8..783cf36b 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -3,7 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader import urllib import urllib2 - +import itertools #================================================================= class CDXSource(object): @@ -80,3 +80,35 @@ class RemoteCDXSource(CDXSource): def __str__(self): return 'Remote CDX Server: ' + self.remote_url + + +#================================================================= +class RedisCDXSource(CDXSource): + DEFAULT_KEY_PREFIX = 'c:' + + def __init__(self, redis_url, config=None): + import redis + self.redis = redis.StrictRedis.from_url(redis_url) + + self.key_prefix = self.DEFAULT_KEY_PREFIX + if config: + self.key_prefix = config.get('redis_key_prefix', self.key_prefix) + + + def load_cdx(self, params): + """ + Load cdx from redis cache, from an ordered list + + Currently, there is no support for range queries + Only 'exact' matchType is supported + """ + key = params['key'] + + # ensure only url/surt is part of key + key = key.split(' ')[0] + cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1) + + # key is not part of list, so prepend to each line + key += ' ' + cdx_list = itertools.imap(lambda x: key + x, cdx_list) + return cdx_list diff --git a/pywb/core/views.py b/pywb/core/views.py index 961d1af7..e6ca5635 100644 --- a/pywb/core/views.py +++ b/pywb/core/views.py @@ -56,9 +56,9 @@ class J2TemplateView: # Filters @staticmethod - def format_ts(value, format='%a, %b %d %Y %H:%M:%S'): - value = timestamp_to_datetime(value) - return time.strftime(format, value) + def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): + value = timeutils.timestamp_to_datetime(value) + return value.strftime(format_) @staticmethod def get_host(url): diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 77bd437d..6be56b6c 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -1,9 +1,5 @@ #!/usr/bin/python -import re -import rfc3987 - -# WbUrl : wb archival url representation for WB """ WbUrl represents the standard wayback archival url format. A regular url is a subset of the WbUrl (latest replay). @@ -34,9 +30,38 @@ replay form: latest_replay: (no timestamp) http://example.com + +Additionally, the BaseWbUrl provides the base components +(url, timestamp, end_timestamp, modifier, type) which +can be used to provide a custom representation of the +wayback url format. + """ -class WbUrl: +import re +import rfc3987 + + +#================================================================= +class BaseWbUrl(object): + QUERY = 'query' + URL_QUERY = 'url_query' + REPLAY = 'replay' + LATEST_REPLAY = 'latest_replay' + + + def __init__(self, url='', mod='', + timestamp='', end_timestamp='', type=None): + + self.url = url + self.timestamp = timestamp + self.end_timestamp = end_timestamp + self.mod = mod + self.type = type + + +#================================================================= +class WbUrl(BaseWbUrl): """ # Replay Urls # ====================== @@ -107,22 +132,14 @@ class WbUrl: QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$') REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') - QUERY = 'query' - URL_QUERY = 'url_query' - REPLAY = 'replay' - LATEST_REPLAY = 'latest_replay' - DEFAULT_SCHEME = 'http://' # ====================== def __init__(self, url): + super(WbUrl, self).__init__() + self.original_url = url - self.type = None - self.url = '' - self.timestamp = '' - self.end_timestamp = '' - self.mod = '' if not any (f(url) for f in [self._init_query, self._init_replay]): raise Exception('Invalid WbUrl: ', url) diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 93725628..92e897fc 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -65,23 +65,36 @@ class StatusAndHeadersParser(object): """ parse stream for status line and headers return a StatusAndHeaders object + + support continuation headers starting with space or tab """ statusline = stream.readline().rstrip() protocol_status = self.split_prefix(statusline, self.statuslist) if not protocol_status: - msg = 'Expected Status Line - Found: ' + statusline + msg = 'Expected Status Line starting with {0} - Found: {1}' + msg = msg.format(self.statuslist, statusline) raise StatusAndHeadersParserException(msg, statusline) headers = [] line = stream.readline().rstrip() - while line and line != '\r\n': + while line: name, value = line.split(':', 1) - header = (name, value.strip()) + name = name.rstrip(' \t') + value = value.lstrip() + + next_line = stream.readline().rstrip() + + # append continuation lines, if any + while next_line and next_line.startswith((' ', '\t')): + value += next_line + next_line = stream.readline().rstrip() + + header = (name, value) headers.append(header) - line = stream.readline().rstrip() + line = next_line return StatusAndHeaders(statusline=protocol_status[1].strip(), headers=headers, diff --git a/pywb/utils/test/statusandheaders_test.py b/pywb/utils/test/statusandheaders_test.py new file mode 100644 index 00000000..3473e71e --- /dev/null +++ b/pywb/utils/test/statusandheaders_test.py @@ -0,0 +1,29 @@ +""" +>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1)) +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), + ('Some', 'Value'), + ('Multi-Line', 'Value1 Also This')]) + +>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1)) +Traceback (most recent call last): +StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK +""" + + +from pywb.utils.statusandheaders import StatusAndHeadersParser +import StringIO + + +status_headers_1 = "\ +HTTP/1.0 200 OK\r\n\ +Content-Type: ABC\r\n\ +Some: Value\r\n\ +Multi-Line: Value1\r\n\ + Also This\r\n\ +\r\n\ +Body" + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index 62929d50..7af3401f 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]') TIMESTAMP_14 = '%Y%m%d%H%M%S' -PAD_STAMP_END = '29991231235959' +#PAD_STAMP_END = '29991231235959' +PAD_6 = '299912' def iso_date_to_datetime(string): @@ -58,41 +59,145 @@ def iso_date_to_timestamp(string): return datetime_to_timestamp(iso_date_to_datetime(string)) -# default pad is end of range for compatibility -def pad_timestamp(string, pad_str=PAD_STAMP_END): +# pad to certain length (default 6) +def _pad_timestamp(string, pad_str=PAD_6): """ - >>> pad_timestamp('20') - '20991231235959' + >>> _pad_timestamp('20') + '209912' - >>> pad_timestamp('2014') - '20141231235959' + >>> _pad_timestamp('2014') + '201412' - >>> pad_timestamp('20141011') - '20141011235959' + >>> _pad_timestamp('20141011') + '20141011' - >>> pad_timestamp('201410110010') - '20141011001059' + >>> _pad_timestamp('201410110010') + '201410110010' """ str_len = len(string) pad_len = len(pad_str) - return string if str_len >= pad_len else string + pad_str[str_len:] + if str_len < pad_len: + string = string + pad_str[str_len:] + + return string def timestamp_to_datetime(string): """ - >>> timestamp_to_datetime('20131226095010') - time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \ -tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1) + # >14-digit -- rest ignored + >>> timestamp_to_datetime('2014122609501011') + datetime.datetime(2014, 12, 26, 9, 50, 10) + # 14-digit + >>> timestamp_to_datetime('20141226095010') + datetime.datetime(2014, 12, 26, 9, 50, 10) + + # 13-digit padding + >>> timestamp_to_datetime('2014122609501') + datetime.datetime(2014, 12, 26, 9, 50, 59) + + # 12-digit padding + >>> timestamp_to_datetime('201412260950') + datetime.datetime(2014, 12, 26, 9, 50, 59) + + # 11-digit padding + >>> timestamp_to_datetime('20141226095') + datetime.datetime(2014, 12, 26, 9, 59, 59) + + # 10-digit padding + >>> timestamp_to_datetime('2014122609') + datetime.datetime(2014, 12, 26, 9, 59, 59) + + # 9-digit padding + >>> timestamp_to_datetime('201412260') + datetime.datetime(2014, 12, 26, 23, 59, 59) + + # 8-digit padding + >>> timestamp_to_datetime('20141226') + datetime.datetime(2014, 12, 26, 23, 59, 59) + + # 7-digit padding + >>> timestamp_to_datetime('2014122') + datetime.datetime(2014, 12, 31, 23, 59, 59) + + # 6-digit padding + >>> timestamp_to_datetime('201410') + datetime.datetime(2014, 10, 31, 23, 59, 59) + + # 5-digit padding + >>> timestamp_to_datetime('20141') + datetime.datetime(2014, 12, 31, 23, 59, 59) + + # 4-digit padding >>> timestamp_to_datetime('2014') - time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \ -tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1) + datetime.datetime(2014, 12, 31, 23, 59, 59) + + # 3-digit padding + >>> timestamp_to_datetime('201') + datetime.datetime(2019, 12, 31, 23, 59, 59) + + # 2-digit padding + >>> timestamp_to_datetime('20') + datetime.datetime(2099, 12, 31, 23, 59, 59) + + # 1-digit padding + >>> timestamp_to_datetime('2') + datetime.datetime(2999, 12, 31, 23, 59, 59) + + # 1-digit out-of-range padding + >>> timestamp_to_datetime('3') + datetime.datetime(2999, 12, 31, 23, 59, 59) + + # 0-digit padding + >>> timestamp_to_datetime('') + datetime.datetime(2999, 12, 31, 23, 59, 59) + + # bad month + >>> timestamp_to_datetime('20131709005601') + datetime.datetime(2013, 12, 9, 0, 56, 1) + + # all out of range except minutes + >>> timestamp_to_datetime('40001965252477') + datetime.datetime(2999, 12, 31, 23, 24, 59) + """ - # Default pad to end of range for comptability - return time.strptime(pad_timestamp(string), TIMESTAMP_14) + # pad to 6 digits + string = _pad_timestamp(string, PAD_6) + + + def clamp(val, min_, max_): + try: + val = int(val) + val = max(min_, min(val, max_)) + return val + except: + return max_ + + def extract(string, start, end, min_, max_): + if len(string) >= end: + return clamp(string[start:end], min_, max_) + else: + return max_ + + # now parse, clamp to boundary + year = extract(string, 0, 4, 1900, 2999) + month = extract(string, 4, 6, 1, 12) + day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1]) + hour = extract(string, 8, 10, 0, 23) + minute = extract(string, 10, 12, 0, 59) + second = extract(string, 12, 14, 0, 59) + + return datetime.datetime(year=year, + month=month, + day=day, + hour=hour, + minute=minute, + second=second) + + #return time.strptime(pad_timestamp(string), TIMESTAMP_14) def timestamp_to_sec(string): @@ -104,7 +209,7 @@ def timestamp_to_sec(string): 1420070399 """ - return calendar.timegm(timestamp_to_datetime(string)) + return calendar.timegm(timestamp_to_datetime(string).utctimetuple()) if __name__ == "__main__": diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index 47176e3e..02ab54cb 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -213,3 +213,6 @@ def load_from_cdx_test(cdx): except Exception as e: print 'Exception: ' + e.__class__.__name__ +if __name__ == "__main__": + import doctest + doctest.testmod()