mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'master' into pkg-reorg
This commit is contained in:
commit
3cd7b6b209
@ -1,7 +1,7 @@
|
||||
from canonicalize import UrlCanonicalizer, calc_search_range
|
||||
|
||||
from cdxops import cdx_load
|
||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
|
||||
from zipnum import ZipNumCluster
|
||||
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
|
||||
from cdxdomainspecific import load_domain_specific_cdx_rules
|
||||
@ -149,10 +149,12 @@ def create_cdx_server(config, ds_rules_file=None):
|
||||
paths = config.get('index_paths')
|
||||
surt_ordered = config.get('surt_ordered', True)
|
||||
perms_checker = config.get('perms_checker')
|
||||
pass_config = config
|
||||
else:
|
||||
paths = config
|
||||
surt_ordered = True
|
||||
perms_checker = None
|
||||
pass_config = None
|
||||
|
||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||
|
||||
@ -162,6 +164,7 @@ def create_cdx_server(config, ds_rules_file=None):
|
||||
server_cls = CDXServer
|
||||
|
||||
return server_cls(paths,
|
||||
config=pass_config,
|
||||
surt_ordered=surt_ordered,
|
||||
ds_rules=ds_rules_file,
|
||||
perms_checker=perms_checker)
|
||||
@ -206,6 +209,9 @@ def create_cdx_source(filename, config):
|
||||
if is_http(filename):
|
||||
return RemoteCDXSource(filename)
|
||||
|
||||
if filename.startswith('redis://'):
|
||||
return RedisCDXSource(filename, config)
|
||||
|
||||
if filename.endswith('.cdx'):
|
||||
return CDXFile(filename)
|
||||
|
||||
@ -213,9 +219,6 @@ def create_cdx_source(filename, config):
|
||||
return ZipNumCluster(filename, config)
|
||||
|
||||
return None
|
||||
#TODO: support zipnum
|
||||
#elif filename.startswith('redis://')
|
||||
# return RedisCDXSource(filename)
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
@ -3,7 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader
|
||||
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
import itertools
|
||||
|
||||
#=================================================================
|
||||
class CDXSource(object):
|
||||
@ -80,3 +80,35 @@ class RemoteCDXSource(CDXSource):
|
||||
|
||||
def __str__(self):
|
||||
return 'Remote CDX Server: ' + self.remote_url
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RedisCDXSource(CDXSource):
    """
    CDX source that reads cdx lines from a redis sorted set.

    The set is looked up under ``key_prefix + <url/surt key>``. The
    stored members do not include the url/surt key itself, so it is
    prepended to every line returned by load_cdx().
    """

    DEFAULT_KEY_PREFIX = 'c:'

    def __init__(self, redis_url, config=None):
        """
        :param redis_url: url handed to ``redis.StrictRedis.from_url``
        :param config: optional dict-like settings; a ``redis_key_prefix``
                       entry replaces DEFAULT_KEY_PREFIX
        """
        # function-scope import: redis is only imported when a
        # RedisCDXSource is actually constructed
        import redis
        self.redis = redis.StrictRedis.from_url(redis_url)

        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)

    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list

        Currently, there is no support for range queries
        Only 'exact' matchType is supported

        :param params: query params; only ``params['key']`` is read
        :return: lazy iterator of cdx lines, each prefixed with the key
        """
        # ensure only url/surt is part of key
        key = params['key'].split(' ')[0]

        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)

        # key is not part of list, so prepend to each line.
        # A generator expression stays lazy like itertools.imap did,
        # without depending on the Python-2-only imap.
        line_prefix = key + ' '
        return (line_prefix + cdx for cdx in cdx_list)
|
||||
|
@ -56,9 +56,9 @@ class J2TemplateView:
|
||||
|
||||
# Filters
|
||||
@staticmethod
|
||||
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
|
||||
value = timestamp_to_datetime(value)
|
||||
return time.strftime(format, value)
|
||||
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
|
||||
value = timeutils.timestamp_to_datetime(value)
|
||||
return value.strftime(format_)
|
||||
|
||||
@staticmethod
|
||||
def get_host(url):
|
||||
|
@ -1,9 +1,5 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
import re
|
||||
import rfc3987
|
||||
|
||||
# WbUrl : wb archival url representation for WB
|
||||
"""
|
||||
WbUrl represents the standard wayback archival url format.
|
||||
A regular url is a subset of the WbUrl (latest replay).
|
||||
@ -34,9 +30,38 @@ replay form:
|
||||
|
||||
latest_replay: (no timestamp)
|
||||
http://example.com
|
||||
|
||||
Additionally, the BaseWbUrl provides the base components
|
||||
(url, timestamp, end_timestamp, modifier, type) which
|
||||
can be used to provide a custom representation of the
|
||||
wayback url format.
|
||||
|
||||
"""
|
||||
|
||||
class WbUrl:
|
||||
import re
|
||||
import rfc3987
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BaseWbUrl(object):
    """
    Holder for the base components of a wayback url:
    url, mod, timestamp, end_timestamp and type.
    """

    # names of the wayback url forms
    QUERY, URL_QUERY = 'query', 'url_query'
    REPLAY, LATEST_REPLAY = 'replay', 'latest_replay'

    def __init__(self, url='', mod='',
                 timestamp='', end_timestamp='', type=None):
        """
        Store each component as an attribute of the same name.
        All components default to '' except type, which defaults to None.
        """
        # presumably type is one of the class constants above -- not
        # enforced here
        self.type = type
        self.mod = mod
        self.url = url
        self.timestamp, self.end_timestamp = timestamp, end_timestamp
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbUrl(BaseWbUrl):
|
||||
"""
|
||||
# Replay Urls
|
||||
# ======================
|
||||
@ -107,22 +132,14 @@ class WbUrl:
|
||||
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
|
||||
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
|
||||
|
||||
QUERY = 'query'
|
||||
URL_QUERY = 'url_query'
|
||||
REPLAY = 'replay'
|
||||
LATEST_REPLAY = 'latest_replay'
|
||||
|
||||
DEFAULT_SCHEME = 'http://'
|
||||
# ======================
|
||||
|
||||
|
||||
def __init__(self, url):
|
||||
super(WbUrl, self).__init__()
|
||||
|
||||
self.original_url = url
|
||||
self.type = None
|
||||
self.url = ''
|
||||
self.timestamp = ''
|
||||
self.end_timestamp = ''
|
||||
self.mod = ''
|
||||
|
||||
if not any (f(url) for f in [self._init_query, self._init_replay]):
|
||||
raise Exception('Invalid WbUrl: ', url)
|
||||
|
@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
|
||||
"""
|
||||
parse stream for status line and headers
|
||||
return a StatusAndHeaders object
|
||||
|
||||
support continuation headers starting with space or tab
|
||||
"""
|
||||
statusline = stream.readline().rstrip()
|
||||
|
||||
protocol_status = self.split_prefix(statusline, self.statuslist)
|
||||
|
||||
if not protocol_status:
|
||||
msg = 'Expected Status Line - Found: ' + statusline
|
||||
msg = 'Expected Status Line starting with {0} - Found: {1}'
|
||||
msg = msg.format(self.statuslist, statusline)
|
||||
raise StatusAndHeadersParserException(msg, statusline)
|
||||
|
||||
headers = []
|
||||
|
||||
line = stream.readline().rstrip()
|
||||
while line and line != '\r\n':
|
||||
while line:
|
||||
name, value = line.split(':', 1)
|
||||
header = (name, value.strip())
|
||||
name = name.rstrip(' \t')
|
||||
value = value.lstrip()
|
||||
|
||||
next_line = stream.readline().rstrip()
|
||||
|
||||
# append continuation lines, if any
|
||||
while next_line and next_line.startswith((' ', '\t')):
|
||||
value += next_line
|
||||
next_line = stream.readline().rstrip()
|
||||
|
||||
header = (name, value)
|
||||
headers.append(header)
|
||||
line = stream.readline().rstrip()
|
||||
line = next_line
|
||||
|
||||
return StatusAndHeaders(statusline=protocol_status[1].strip(),
|
||||
headers=headers,
|
||||
|
29
pywb/utils/test/statusandheaders_test.py
Normal file
29
pywb/utils/test/statusandheaders_test.py
Normal file
@ -0,0 +1,29 @@
|
||||
"""
|
||||
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
|
||||
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
|
||||
('Some', 'Value'),
|
||||
('Multi-Line', 'Value1 Also This')])
|
||||
|
||||
>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
|
||||
Traceback (most recent call last):
|
||||
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
|
||||
"""
|
||||
|
||||
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
import StringIO
|
||||
|
||||
|
||||
status_headers_1 = "\
|
||||
HTTP/1.0 200 OK\r\n\
|
||||
Content-Type: ABC\r\n\
|
||||
Some: Value\r\n\
|
||||
Multi-Line: Value1\r\n\
|
||||
Also This\r\n\
|
||||
\r\n\
|
||||
Body"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
|
||||
|
||||
TIMESTAMP_14 = '%Y%m%d%H%M%S'
|
||||
|
||||
PAD_STAMP_END = '29991231235959'
|
||||
#PAD_STAMP_END = '29991231235959'
|
||||
PAD_6 = '299912'
|
||||
|
||||
|
||||
def iso_date_to_datetime(string):
|
||||
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
|
||||
return datetime_to_timestamp(iso_date_to_datetime(string))
|
||||
|
||||
|
||||
# default pad is end of range for compatibility
|
||||
def pad_timestamp(string, pad_str=PAD_STAMP_END):
|
||||
# pad to certain length (default 6)
|
||||
def _pad_timestamp(string, pad_str=PAD_6):
|
||||
"""
|
||||
>>> pad_timestamp('20')
|
||||
'20991231235959'
|
||||
>>> _pad_timestamp('20')
|
||||
'209912'
|
||||
|
||||
>>> pad_timestamp('2014')
|
||||
'20141231235959'
|
||||
>>> _pad_timestamp('2014')
|
||||
'201412'
|
||||
|
||||
>>> pad_timestamp('20141011')
|
||||
'20141011235959'
|
||||
>>> _pad_timestamp('20141011')
|
||||
'20141011'
|
||||
|
||||
>>> pad_timestamp('201410110010')
|
||||
'20141011001059'
|
||||
>>> _pad_timestamp('201410110010')
|
||||
'201410110010'
|
||||
"""
|
||||
|
||||
str_len = len(string)
|
||||
pad_len = len(pad_str)
|
||||
|
||||
return string if str_len >= pad_len else string + pad_str[str_len:]
|
||||
if str_len < pad_len:
|
||||
string = string + pad_str[str_len:]
|
||||
|
||||
return string
|
||||
|
||||
|
||||
def timestamp_to_datetime(string):
|
||||
"""
|
||||
>>> timestamp_to_datetime('20131226095010')
|
||||
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
|
||||
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
|
||||
# >14-digit -- rest ignored
|
||||
>>> timestamp_to_datetime('2014122609501011')
|
||||
datetime.datetime(2014, 12, 26, 9, 50, 10)
|
||||
|
||||
# 14-digit
|
||||
>>> timestamp_to_datetime('20141226095010')
|
||||
datetime.datetime(2014, 12, 26, 9, 50, 10)
|
||||
|
||||
# 13-digit padding
|
||||
>>> timestamp_to_datetime('2014122609501')
|
||||
datetime.datetime(2014, 12, 26, 9, 50, 59)
|
||||
|
||||
# 12-digit padding
|
||||
>>> timestamp_to_datetime('201412260950')
|
||||
datetime.datetime(2014, 12, 26, 9, 50, 59)
|
||||
|
||||
# 11-digit padding
|
||||
>>> timestamp_to_datetime('20141226095')
|
||||
datetime.datetime(2014, 12, 26, 9, 59, 59)
|
||||
|
||||
# 10-digit padding
|
||||
>>> timestamp_to_datetime('2014122609')
|
||||
datetime.datetime(2014, 12, 26, 9, 59, 59)
|
||||
|
||||
# 9-digit padding
|
||||
>>> timestamp_to_datetime('201412260')
|
||||
datetime.datetime(2014, 12, 26, 23, 59, 59)
|
||||
|
||||
# 8-digit padding
|
||||
>>> timestamp_to_datetime('20141226')
|
||||
datetime.datetime(2014, 12, 26, 23, 59, 59)
|
||||
|
||||
# 7-digit padding
|
||||
>>> timestamp_to_datetime('2014122')
|
||||
datetime.datetime(2014, 12, 31, 23, 59, 59)
|
||||
|
||||
# 6-digit padding
|
||||
>>> timestamp_to_datetime('201410')
|
||||
datetime.datetime(2014, 10, 31, 23, 59, 59)
|
||||
|
||||
# 5-digit padding
|
||||
>>> timestamp_to_datetime('20141')
|
||||
datetime.datetime(2014, 12, 31, 23, 59, 59)
|
||||
|
||||
# 4-digit padding
|
||||
>>> timestamp_to_datetime('2014')
|
||||
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
|
||||
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
|
||||
datetime.datetime(2014, 12, 31, 23, 59, 59)
|
||||
|
||||
# 3-digit padding
|
||||
>>> timestamp_to_datetime('201')
|
||||
datetime.datetime(2019, 12, 31, 23, 59, 59)
|
||||
|
||||
# 2-digit padding
|
||||
>>> timestamp_to_datetime('20')
|
||||
datetime.datetime(2099, 12, 31, 23, 59, 59)
|
||||
|
||||
# 1-digit padding
|
||||
>>> timestamp_to_datetime('2')
|
||||
datetime.datetime(2999, 12, 31, 23, 59, 59)
|
||||
|
||||
# 1-digit out-of-range padding
|
||||
>>> timestamp_to_datetime('3')
|
||||
datetime.datetime(2999, 12, 31, 23, 59, 59)
|
||||
|
||||
# 0-digit padding
|
||||
>>> timestamp_to_datetime('')
|
||||
datetime.datetime(2999, 12, 31, 23, 59, 59)
|
||||
|
||||
# bad month
|
||||
>>> timestamp_to_datetime('20131709005601')
|
||||
datetime.datetime(2013, 12, 9, 0, 56, 1)
|
||||
|
||||
# all out of range except minutes
|
||||
>>> timestamp_to_datetime('40001965252477')
|
||||
datetime.datetime(2999, 12, 31, 23, 24, 59)
|
||||
|
||||
"""
|
||||
|
||||
# Default pad to end of range for compatibility
|
||||
return time.strptime(pad_timestamp(string), TIMESTAMP_14)
|
||||
# pad to 6 digits
|
||||
string = _pad_timestamp(string, PAD_6)
|
||||
|
||||
|
||||
def clamp(val, min_, max_):
    """
    Convert val to an int and limit it to the range [min_, max_].

    :return: the clamped int, or max_ if val cannot be converted to an int
    """
    try:
        val = int(val)
    except (TypeError, ValueError):
        # only conversion failures fall back to the upper bound; a bare
        # except would also swallow KeyboardInterrupt / SystemExit
        return max_

    return max(min_, min(val, max_))


def extract(string, start, end, min_, max_):
    """
    Parse string[start:end] as an int clamped to [min_, max_].

    :return: the clamped int, or max_ if string is shorter than end
             (the field is missing)
    """
    if len(string) < end:
        return max_

    return clamp(string[start:end], min_, max_)
|
||||
|
||||
# now parse, clamp to boundary
|
||||
year = extract(string, 0, 4, 1900, 2999)
|
||||
month = extract(string, 4, 6, 1, 12)
|
||||
day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
|
||||
hour = extract(string, 8, 10, 0, 23)
|
||||
minute = extract(string, 10, 12, 0, 59)
|
||||
second = extract(string, 12, 14, 0, 59)
|
||||
|
||||
return datetime.datetime(year=year,
|
||||
month=month,
|
||||
day=day,
|
||||
hour=hour,
|
||||
minute=minute,
|
||||
second=second)
|
||||
|
||||
#return time.strptime(pad_timestamp(string), TIMESTAMP_14)
|
||||
|
||||
|
||||
def timestamp_to_sec(string):
|
||||
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
|
||||
1420070399
|
||||
"""
|
||||
|
||||
return calendar.timegm(timestamp_to_datetime(string))
|
||||
return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
|
||||
except Exception as e:
|
||||
print 'Exception: ' + e.__class__.__name__
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
Loading…
x
Reference in New Issue
Block a user