1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge branch 'master' into pkg-reorg

This commit is contained in:
Ilya Kreymer 2014-02-24 21:33:11 -08:00
commit 3cd7b6b209
8 changed files with 249 additions and 47 deletions

View File

@ -1,7 +1,7 @@
from canonicalize import UrlCanonicalizer, calc_search_range from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules from cdxdomainspecific import load_domain_specific_cdx_rules
@ -149,10 +149,12 @@ def create_cdx_server(config, ds_rules_file=None):
paths = config.get('index_paths') paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True) surt_ordered = config.get('surt_ordered', True)
perms_checker = config.get('perms_checker') perms_checker = config.get('perms_checker')
pass_config = config
else: else:
paths = config paths = config
surt_ordered = True surt_ordered = True
perms_checker = None perms_checker = None
pass_config = None
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
@ -162,6 +164,7 @@ def create_cdx_server(config, ds_rules_file=None):
server_cls = CDXServer server_cls = CDXServer
return server_cls(paths, return server_cls(paths,
config=pass_config,
surt_ordered=surt_ordered, surt_ordered=surt_ordered,
ds_rules=ds_rules_file, ds_rules=ds_rules_file,
perms_checker=perms_checker) perms_checker=perms_checker)
@ -206,6 +209,9 @@ def create_cdx_source(filename, config):
if is_http(filename): if is_http(filename):
return RemoteCDXSource(filename) return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'): if filename.endswith('.cdx'):
return CDXFile(filename) return CDXFile(filename)
@ -213,9 +219,6 @@ def create_cdx_source(filename, config):
return ZipNumCluster(filename, config) return ZipNumCluster(filename, config)
return None return None
#TODO: support zipnum
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
#================================================================= #=================================================================

View File

@ -3,7 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader
import urllib import urllib
import urllib2 import urllib2
import itertools
#================================================================= #=================================================================
class CDXSource(object): class CDXSource(object):
@ -80,3 +80,35 @@ class RemoteCDXSource(CDXSource):
def __str__(self): def __str__(self):
return 'Remote CDX Server: ' + self.remote_url return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
    """
    CDX source which reads cdx lines from a redis server.

    The redis key for a lookup is ``key_prefix`` + the url/surt key.
    ``key_prefix`` defaults to DEFAULT_KEY_PREFIX and may be overridden
    with the 'redis_key_prefix' entry of the optional config.
    """
    DEFAULT_KEY_PREFIX = 'c:'

    def __init__(self, redis_url, config=None):
        """
        :param redis_url: url passed to redis.StrictRedis.from_url()
        :param config: optional mapping; only 'redis_key_prefix' is read
        """
        # imported lazily so redis is only required when this source is used
        import redis
        self.redis = redis.StrictRedis.from_url(redis_url)

        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)

    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list

        Currently, there is no support for range queries
        Only 'exact' matchType is supported

        :param params: query params; only params['key'] is read
        :return: lazy iterator of cdx lines, each prefixed with the key
        """
        key = params['key']

        # ensure only url/surt is part of key
        key = key.split(' ')[0]
        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)

        # key is not part of list, so prepend to each line
        key += ' '
        return (key + line for line in cdx_list)

View File

@ -56,9 +56,9 @@ class J2TemplateView:
# Filters # Filters
@staticmethod
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
    """
    Template filter: convert a timestamp string to a datetime via
    timeutils.timestamp_to_datetime() and render it with format_.
    """
    parsed = timeutils.timestamp_to_datetime(value)
    return parsed.strftime(format_)
@staticmethod @staticmethod
def get_host(url): def get_host(url):

View File

@ -1,9 +1,5 @@
#!/usr/bin/python #!/usr/bin/python
import re
import rfc3987
# WbUrl : wb archival url representation for WB
""" """
WbUrl represents the standard wayback archival url format. WbUrl represents the standard wayback archival url format.
A regular url is a subset of the WbUrl (latest replay). A regular url is a subset of the WbUrl (latest replay).
@ -34,9 +30,38 @@ replay form:
latest_replay: (no timestamp) latest_replay: (no timestamp)
http://example.com http://example.com
Additionally, the BaseWbUrl provides the base components
(url, timestamp, end_timestamp, modifier, type) which
can be used to provide a custom representation of the
wayback url format.
""" """
class WbUrl: import re
import rfc3987
#=================================================================
class BaseWbUrl(object):
    """
    Plain container for the base components of a wayback url:
    url, timestamp, end_timestamp, modifier (mod) and type.
    """

    # names of the wayback url forms
    QUERY = 'query'
    URL_QUERY = 'url_query'
    REPLAY = 'replay'
    LATEST_REPLAY = 'latest_replay'

    def __init__(self, url='', mod='',
                 timestamp='', end_timestamp='', type=None):
        """Store each component as given; nothing is parsed or validated."""
        self.url = url
        self.timestamp, self.end_timestamp = timestamp, end_timestamp
        self.mod = mod
        self.type = type
#=================================================================
class WbUrl(BaseWbUrl):
""" """
# Replay Urls # Replay Urls
# ====================== # ======================
@ -107,22 +132,14 @@ class WbUrl:
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$') QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
DEFAULT_SCHEME = 'http://' DEFAULT_SCHEME = 'http://'
# ====================== # ======================
def __init__(self, url): def __init__(self, url):
super(WbUrl, self).__init__()
self.original_url = url self.original_url = url
self.type = None
self.url = ''
self.timestamp = ''
self.end_timestamp = ''
self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]): if not any (f(url) for f in [self._init_query, self._init_replay]):
raise Exception('Invalid WbUrl: ', url) raise Exception('Invalid WbUrl: ', url)

View File

@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
""" """
parse stream for status line and headers parse stream for status line and headers
return a StatusAndHeaders object return a StatusAndHeaders object
support continuation headers starting with space or tab
""" """
statusline = stream.readline().rstrip() statusline = stream.readline().rstrip()
protocol_status = self.split_prefix(statusline, self.statuslist) protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status: if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, statusline) raise StatusAndHeadersParserException(msg, statusline)
headers = [] headers = []
line = stream.readline().rstrip() line = stream.readline().rstrip()
while line and line != '\r\n': while line:
name, value = line.split(':', 1) name, value = line.split(':', 1)
header = (name, value.strip()) name = name.rstrip(' \t')
value = value.lstrip()
next_line = stream.readline().rstrip()
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
value += next_line
next_line = stream.readline().rstrip()
header = (name, value)
headers.append(header) headers.append(header)
line = stream.readline().rstrip() line = next_line
return StatusAndHeaders(statusline=protocol_status[1].strip(), return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers, headers=headers,

View File

@ -0,0 +1,29 @@
"""
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser
import StringIO
# Sample response fed to the parser by the doctests in the module docstring:
# a status line, three headers, a blank line, then the body.
# NOTE(review): the expected doctest output joins 'Also This' onto the
# Multi-Line header, so that line presumably must begin with a space or tab
# inside this literal -- confirm the leading whitespace is present.
status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"
# Run the doctests of this module when it is executed directly as a script.
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S' TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959' #PAD_STAMP_END = '29991231235959'
PAD_6 = '299912'
def iso_date_to_datetime(string): def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string)) return datetime_to_timestamp(iso_date_to_datetime(string))
# default pad is end of range for compatibility # pad to certain length (default 6)
def _pad_timestamp(string, pad_str=PAD_6):
    """
    Extend string to the length of pad_str by appending the tail of
    pad_str; a string already at least that long is returned unchanged.

    >>> _pad_timestamp('20')
    '209912'

    >>> _pad_timestamp('2014')
    '201412'

    >>> _pad_timestamp('20141011')
    '20141011'

    >>> _pad_timestamp('201410110010')
    '201410110010'
    """
    have = len(string)

    # nothing to add once the minimum length is reached
    if have >= len(pad_str):
        return string

    return string + pad_str[have:]
def timestamp_to_datetime(string):
    """
    Convert a full or partial timestamp string (YYYYMMDDhhmmss) to a
    datetime.datetime.

    The string is first padded to 6 digits with _pad_timestamp().
    Each field is then read from its fixed position: a field that is
    not fully present, or is not a valid integer, takes its maximum
    value, and a numeric field is clamped to its valid range
    (year 1900-2999, day limited to the length of the month).
    Characters past the 14th are ignored.

    # >14-digit -- rest ignored
    >>> timestamp_to_datetime('2014122609501011')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 14-digit
    >>> timestamp_to_datetime('20141226095010')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 13-digit padding
    >>> timestamp_to_datetime('2014122609501')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 12-digit padding
    >>> timestamp_to_datetime('201412260950')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 11-digit padding
    >>> timestamp_to_datetime('20141226095')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 10-digit padding
    >>> timestamp_to_datetime('2014122609')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 9-digit padding
    >>> timestamp_to_datetime('201412260')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 8-digit padding
    >>> timestamp_to_datetime('20141226')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 7-digit padding
    >>> timestamp_to_datetime('2014122')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 6-digit padding
    >>> timestamp_to_datetime('201410')
    datetime.datetime(2014, 10, 31, 23, 59, 59)

    # 5-digit padding
    >>> timestamp_to_datetime('20141')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 4-digit padding
    >>> timestamp_to_datetime('2014')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 3-digit padding
    >>> timestamp_to_datetime('201')
    datetime.datetime(2019, 12, 31, 23, 59, 59)

    # 2-digit padding
    >>> timestamp_to_datetime('20')
    datetime.datetime(2099, 12, 31, 23, 59, 59)

    # 1-digit padding
    >>> timestamp_to_datetime('2')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 1-digit out-of-range padding
    >>> timestamp_to_datetime('3')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 0-digit padding
    >>> timestamp_to_datetime('')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # bad month
    >>> timestamp_to_datetime('20131709005601')
    datetime.datetime(2013, 12, 9, 0, 56, 1)

    # all out of range except minutes
    >>> timestamp_to_datetime('40001965252477')
    datetime.datetime(2999, 12, 31, 23, 24, 59)
    """

    # pad to 6 digits
    string = _pad_timestamp(string, PAD_6)

    def clamp(val, min_, max_):
        # only a failed int() conversion falls back to the maximum;
        # any other error is a real bug and must not be swallowed
        try:
            val = int(val)
        except ValueError:
            return max_
        return max(min_, min(val, max_))

    def extract(string, start, end, min_, max_):
        # a field that is not fully present defaults to its maximum
        if len(string) >= end:
            return clamp(string[start:end], min_, max_)
        return max_

    # now parse, clamp to boundary
    year = extract(string, 0, 4, 1900, 2999)
    month = extract(string, 4, 6, 1, 12)
    # the last valid day depends on the (already clamped) year and month
    day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
    hour = extract(string, 8, 10, 0, 23)
    minute = extract(string, 10, 12, 0, 59)
    second = extract(string, 12, 14, 0, 59)

    return datetime.datetime(year=year,
                             month=month,
                             day=day,
                             hour=hour,
                             minute=minute,
                             second=second)
def timestamp_to_sec(string): def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
1420070399 1420070399
""" """
return calendar.timegm(timestamp_to_datetime(string)) return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
except Exception as e: except Exception as e:
print 'Exception: ' + e.__class__.__name__ print 'Exception: ' + e.__class__.__name__
if __name__ == "__main__":
import doctest
doctest.testmod()