1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'master' into pkg-reorg

This commit is contained in:
Ilya Kreymer 2014-02-24 21:33:11 -08:00
commit 3cd7b6b209
8 changed files with 249 additions and 47 deletions

View File

@ -1,7 +1,7 @@
from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules
@ -149,10 +149,12 @@ def create_cdx_server(config, ds_rules_file=None):
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
perms_checker = config.get('perms_checker')
pass_config = config
else:
paths = config
surt_ordered = True
perms_checker = None
pass_config = None
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
@ -162,6 +164,7 @@ def create_cdx_server(config, ds_rules_file=None):
server_cls = CDXServer
return server_cls(paths,
config=pass_config,
surt_ordered=surt_ordered,
ds_rules=ds_rules_file,
perms_checker=perms_checker)
@ -206,6 +209,9 @@ def create_cdx_source(filename, config):
if is_http(filename):
return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'):
return CDXFile(filename)
@ -213,9 +219,6 @@ def create_cdx_source(filename, config):
return ZipNumCluster(filename, config)
return None
#TODO: support zipnum
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
#=================================================================

View File

@ -3,7 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader
import urllib
import urllib2
import itertools
#=================================================================
class CDXSource(object):
@ -80,3 +80,35 @@ class RemoteCDXSource(CDXSource):
def __str__(self):
return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
    """
    CDX source which reads cdx lines from redis.

    The lines for a url key are read with zrange from the redis key
    '<key prefix><url key>'. The prefix is DEFAULT_KEY_PREFIX unless
    the config supplies a 'redis_key_prefix' entry.
    """
    DEFAULT_KEY_PREFIX = 'c:'

    def __init__(self, redis_url, config=None):
        # imported here rather than at module level, so the redis package
        # is only loaded when a RedisCDXSource is actually created
        import redis
        self.redis = redis.StrictRedis.from_url(redis_url)

        prefix = self.DEFAULT_KEY_PREFIX
        if config:
            prefix = config.get('redis_key_prefix', prefix)
        self.key_prefix = prefix

    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list

        Currently, there is no support for range queries
        Only 'exact' matchType is supported

        Returns a lazy iterator over the cdx lines for params['key'].
        """
        # only the url/surt part (before the first space) is used as the key
        url_key = params['key'].split(' ')[0]
        members = self.redis.zrange(self.key_prefix + url_key, 0, -1)

        # the stored entries do not include the key, so prepend it to each
        line_prefix = url_key + ' '
        return (line_prefix + member for member in members)

View File

@ -56,9 +56,9 @@ class J2TemplateView:
# Filters
@staticmethod
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
value = timestamp_to_datetime(value)
return time.strftime(format, value)
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
value = timeutils.timestamp_to_datetime(value)
return value.strftime(format_)
@staticmethod
def get_host(url):

View File

@ -1,9 +1,5 @@
#!/usr/bin/python
import re
import rfc3987
# WbUrl : wb archival url representation for WB
"""
WbUrl represents the standard wayback archival url format.
A regular url is a subset of the WbUrl (latest replay).
@ -34,9 +30,38 @@ replay form:
latest_replay: (no timestamp)
http://example.com
Additionally, the BaseWbUrl provides the base components
(url, timestamp, end_timestamp, modifier, type) which
can be used to provide a custom representation of the
wayback url format.
"""
class WbUrl:
import re
import rfc3987
#=================================================================
class BaseWbUrl(object):
    """
    Holder for the base components of a wayback url:
    url, mod (modifier), timestamp, end_timestamp and type.

    The class constants name the url types which 'type' may be set to.
    """
    QUERY = 'query'
    URL_QUERY = 'url_query'
    REPLAY = 'replay'
    LATEST_REPLAY = 'latest_replay'

    def __init__(self, url='', mod='',
                 timestamp='', end_timestamp='', type=None):
        # store every component as given, no parsing or validation here
        self.type = type
        self.url = url
        self.mod = mod
        self.timestamp = timestamp
        self.end_timestamp = end_timestamp
#=================================================================
class WbUrl(BaseWbUrl):
"""
# Replay Urls
# ======================
@ -107,22 +132,14 @@ class WbUrl:
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
DEFAULT_SCHEME = 'http://'
# ======================
def __init__(self, url):
super(WbUrl, self).__init__()
self.original_url = url
self.type = None
self.url = ''
self.timestamp = ''
self.end_timestamp = ''
self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]):
raise Exception('Invalid WbUrl: ', url)

View File

@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
"""
parse stream for status line and headers
return a StatusAndHeaders object
support continuation headers starting with space or tab
"""
statusline = stream.readline().rstrip()
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline
msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, statusline)
headers = []
line = stream.readline().rstrip()
while line and line != '\r\n':
while line:
name, value = line.split(':', 1)
header = (name, value.strip())
name = name.rstrip(' \t')
value = value.lstrip()
next_line = stream.readline().rstrip()
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
value += next_line
next_line = stream.readline().rstrip()
header = (name, value)
headers.append(header)
line = stream.readline().rstrip()
line = next_line
return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers,

View File

@ -0,0 +1,29 @@
"""
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser
import StringIO
# Sample response head used by the doctests in the module docstring:
# a status line, two simple headers, one header folded onto a continuation
# line, the blank separator line and the start of the body.
# The continuation line has to begin with a space or tab: that is how the
# parser recognizes it, and the expected 'Value1 Also This' depends on it.
status_headers_1 = (
    "HTTP/1.0 200 OK\r\n"
    "Content-Type: ABC\r\n"
    "Some: Value\r\n"
    "Multi-Line: Value1\r\n"
    " Also This\r\n"
    "\r\n"
    "Body"
)


if __name__ == "__main__":
    # run the doctests from the module docstring
    import doctest
    doctest.testmod()

View File

@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959'
#PAD_STAMP_END = '29991231235959'
PAD_6 = '299912'
def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string))
# default pad is end of range for compatibility
def pad_timestamp(string, pad_str=PAD_STAMP_END):
# pad to certain length (default 6)
def _pad_timestamp(string, pad_str=PAD_6):
"""
>>> pad_timestamp('20')
'20991231235959'
>>> _pad_timestamp('20')
'209912'
>>> pad_timestamp('2014')
'20141231235959'
>>> _pad_timestamp('2014')
'201412'
>>> pad_timestamp('20141011')
'20141011235959'
>>> _pad_timestamp('20141011')
'20141011'
>>> pad_timestamp('201410110010')
'20141011001059'
>>> _pad_timestamp('201410110010')
'201410110010'
"""
str_len = len(string)
pad_len = len(pad_str)
return string if str_len >= pad_len else string + pad_str[str_len:]
if str_len < pad_len:
string = string + pad_str[str_len:]
return string
def timestamp_to_datetime(string):
"""
>>> timestamp_to_datetime('20131226095010')
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
# >14-digit -- rest ignored
>>> timestamp_to_datetime('2014122609501011')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 14-digit
>>> timestamp_to_datetime('20141226095010')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 13-digit padding
>>> timestamp_to_datetime('2014122609501')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 12-digit padding
>>> timestamp_to_datetime('201412260950')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 11-digit padding
>>> timestamp_to_datetime('20141226095')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 10-digit padding
>>> timestamp_to_datetime('2014122609')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 9-digit padding
>>> timestamp_to_datetime('201412260')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 8-digit padding
>>> timestamp_to_datetime('20141226')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 7-digit padding
>>> timestamp_to_datetime('2014122')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 6-digit padding
>>> timestamp_to_datetime('201410')
datetime.datetime(2014, 10, 31, 23, 59, 59)
# 5-digit padding
>>> timestamp_to_datetime('20141')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 4-digit padding
>>> timestamp_to_datetime('2014')
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 3-digit padding
>>> timestamp_to_datetime('201')
datetime.datetime(2019, 12, 31, 23, 59, 59)
# 2-digit padding
>>> timestamp_to_datetime('20')
datetime.datetime(2099, 12, 31, 23, 59, 59)
# 1-digit padding
>>> timestamp_to_datetime('2')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 1-digit out-of-range padding
>>> timestamp_to_datetime('3')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 0-digit padding
>>> timestamp_to_datetime('')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# bad month
>>> timestamp_to_datetime('20131709005601')
datetime.datetime(2013, 12, 9, 0, 56, 1)
# all out of range except minutes
>>> timestamp_to_datetime('40001965252477')
datetime.datetime(2999, 12, 31, 23, 24, 59)
"""
# Default pad to end of range for compatibility
return time.strptime(pad_timestamp(string), TIMESTAMP_14)
# pad to 6 digits
string = _pad_timestamp(string, PAD_6)
def clamp(val, min_, max_):
    """
    Convert val to an int and limit it to the range min_..max_ (inclusive).

    Returns min_ if the number is below the range and max_ if it is
    above it. If val can not be converted to an int at all, max_ is
    returned as well.
    """
    try:
        number = int(val)
    except (TypeError, ValueError):
        # only conversion failures fall back to the upper bound; anything
        # else (eg. KeyboardInterrupt) must not be silently swallowed
        return max_

    return max(min_, min(number, max_))
def extract(string, start, end, min_, max_):
    """
    Read the field string[start:end] as a number clamped to min_..max_.

    If string is shorter than end, the field is not fully present
    and max_ is returned instead.
    """
    if len(string) < end:
        return max_

    return clamp(string[start:end], min_, max_)
# now parse, clamp to boundary
year = extract(string, 0, 4, 1900, 2999)
month = extract(string, 4, 6, 1, 12)
day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
hour = extract(string, 8, 10, 0, 23)
minute = extract(string, 10, 12, 0, 59)
second = extract(string, 12, 14, 0, 59)
return datetime.datetime(year=year,
month=month,
day=day,
hour=hour,
minute=minute,
second=second)
#return time.strptime(pad_timestamp(string), TIMESTAMP_14)
def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
1420070399
"""
return calendar.timegm(timestamp_to_datetime(string))
return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
if __name__ == "__main__":

View File

@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
except Exception as e:
print 'Exception: ' + e.__class__.__name__
if __name__ == "__main__":
import doctest
doctest.testmod()