mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge branch 'master' into pkg-reorg
This commit is contained in:
commit
3cd7b6b209
@ -1,7 +1,7 @@
|
|||||||
from canonicalize import UrlCanonicalizer, calc_search_range
|
from canonicalize import UrlCanonicalizer, calc_search_range
|
||||||
|
|
||||||
from cdxops import cdx_load
|
from cdxops import cdx_load
|
||||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
|
||||||
from zipnum import ZipNumCluster
|
from zipnum import ZipNumCluster
|
||||||
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
|
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
|
||||||
from cdxdomainspecific import load_domain_specific_cdx_rules
|
from cdxdomainspecific import load_domain_specific_cdx_rules
|
||||||
@ -149,10 +149,12 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
paths = config.get('index_paths')
|
paths = config.get('index_paths')
|
||||||
surt_ordered = config.get('surt_ordered', True)
|
surt_ordered = config.get('surt_ordered', True)
|
||||||
perms_checker = config.get('perms_checker')
|
perms_checker = config.get('perms_checker')
|
||||||
|
pass_config = config
|
||||||
else:
|
else:
|
||||||
paths = config
|
paths = config
|
||||||
surt_ordered = True
|
surt_ordered = True
|
||||||
perms_checker = None
|
perms_checker = None
|
||||||
|
pass_config = None
|
||||||
|
|
||||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||||
|
|
||||||
@ -162,6 +164,7 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
server_cls = CDXServer
|
server_cls = CDXServer
|
||||||
|
|
||||||
return server_cls(paths,
|
return server_cls(paths,
|
||||||
|
config=pass_config,
|
||||||
surt_ordered=surt_ordered,
|
surt_ordered=surt_ordered,
|
||||||
ds_rules=ds_rules_file,
|
ds_rules=ds_rules_file,
|
||||||
perms_checker=perms_checker)
|
perms_checker=perms_checker)
|
||||||
@ -206,6 +209,9 @@ def create_cdx_source(filename, config):
|
|||||||
if is_http(filename):
|
if is_http(filename):
|
||||||
return RemoteCDXSource(filename)
|
return RemoteCDXSource(filename)
|
||||||
|
|
||||||
|
if filename.startswith('redis://'):
|
||||||
|
return RedisCDXSource(filename, config)
|
||||||
|
|
||||||
if filename.endswith('.cdx'):
|
if filename.endswith('.cdx'):
|
||||||
return CDXFile(filename)
|
return CDXFile(filename)
|
||||||
|
|
||||||
@ -213,9 +219,6 @@ def create_cdx_source(filename, config):
|
|||||||
return ZipNumCluster(filename, config)
|
return ZipNumCluster(filename, config)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
#TODO: support zipnum
|
|
||||||
#elif filename.startswith('redis://')
|
|
||||||
# return RedisCDXSource(filename)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -3,7 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader
|
|||||||
|
|
||||||
import urllib
|
import urllib
|
||||||
import urllib2
|
import urllib2
|
||||||
|
import itertools
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXSource(object):
|
class CDXSource(object):
|
||||||
@ -80,3 +80,35 @@ class RemoteCDXSource(CDXSource):
|
|||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'Remote CDX Server: ' + self.remote_url
|
return 'Remote CDX Server: ' + self.remote_url
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class RedisCDXSource(CDXSource):
|
||||||
|
DEFAULT_KEY_PREFIX = 'c:'
|
||||||
|
|
||||||
|
def __init__(self, redis_url, config=None):
|
||||||
|
import redis
|
||||||
|
self.redis = redis.StrictRedis.from_url(redis_url)
|
||||||
|
|
||||||
|
self.key_prefix = self.DEFAULT_KEY_PREFIX
|
||||||
|
if config:
|
||||||
|
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
|
||||||
|
|
||||||
|
|
||||||
|
def load_cdx(self, params):
|
||||||
|
"""
|
||||||
|
Load cdx from redis cache, from an ordered list
|
||||||
|
|
||||||
|
Currently, there is no support for range queries
|
||||||
|
Only 'exact' matchType is supported
|
||||||
|
"""
|
||||||
|
key = params['key']
|
||||||
|
|
||||||
|
# ensure only url/surt is part of key
|
||||||
|
key = key.split(' ')[0]
|
||||||
|
cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
|
||||||
|
|
||||||
|
# key is not part of list, so prepend to each line
|
||||||
|
key += ' '
|
||||||
|
cdx_list = itertools.imap(lambda x: key + x, cdx_list)
|
||||||
|
return cdx_list
|
||||||
|
@ -56,9 +56,9 @@ class J2TemplateView:
|
|||||||
|
|
||||||
# Filters
|
# Filters
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
|
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
|
||||||
value = timestamp_to_datetime(value)
|
value = timeutils.timestamp_to_datetime(value)
|
||||||
return time.strftime(format, value)
|
return value.strftime(format_)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_host(url):
|
def get_host(url):
|
||||||
|
@ -1,9 +1,5 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
|
|
||||||
import re
|
|
||||||
import rfc3987
|
|
||||||
|
|
||||||
# WbUrl : wb archival url representation for WB
|
|
||||||
"""
|
"""
|
||||||
WbUrl represents the standard wayback archival url format.
|
WbUrl represents the standard wayback archival url format.
|
||||||
A regular url is a subset of the WbUrl (latest replay).
|
A regular url is a subset of the WbUrl (latest replay).
|
||||||
@ -34,9 +30,38 @@ replay form:
|
|||||||
|
|
||||||
latest_replay: (no timestamp)
|
latest_replay: (no timestamp)
|
||||||
http://example.com
|
http://example.com
|
||||||
|
|
||||||
|
Additionally, the BaseWbUrl provides the base components
|
||||||
|
(url, timestamp, end_timestamp, modifier, type) which
|
||||||
|
can be used to provide a custom representation of the
|
||||||
|
wayback url format.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
class WbUrl:
|
import re
|
||||||
|
import rfc3987
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class BaseWbUrl(object):
|
||||||
|
QUERY = 'query'
|
||||||
|
URL_QUERY = 'url_query'
|
||||||
|
REPLAY = 'replay'
|
||||||
|
LATEST_REPLAY = 'latest_replay'
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, url='', mod='',
|
||||||
|
timestamp='', end_timestamp='', type=None):
|
||||||
|
|
||||||
|
self.url = url
|
||||||
|
self.timestamp = timestamp
|
||||||
|
self.end_timestamp = end_timestamp
|
||||||
|
self.mod = mod
|
||||||
|
self.type = type
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class WbUrl(BaseWbUrl):
|
||||||
"""
|
"""
|
||||||
# Replay Urls
|
# Replay Urls
|
||||||
# ======================
|
# ======================
|
||||||
@ -107,22 +132,14 @@ class WbUrl:
|
|||||||
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
|
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
|
||||||
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
|
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
|
||||||
|
|
||||||
QUERY = 'query'
|
|
||||||
URL_QUERY = 'url_query'
|
|
||||||
REPLAY = 'replay'
|
|
||||||
LATEST_REPLAY = 'latest_replay'
|
|
||||||
|
|
||||||
DEFAULT_SCHEME = 'http://'
|
DEFAULT_SCHEME = 'http://'
|
||||||
# ======================
|
# ======================
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
|
super(WbUrl, self).__init__()
|
||||||
|
|
||||||
self.original_url = url
|
self.original_url = url
|
||||||
self.type = None
|
|
||||||
self.url = ''
|
|
||||||
self.timestamp = ''
|
|
||||||
self.end_timestamp = ''
|
|
||||||
self.mod = ''
|
|
||||||
|
|
||||||
if not any (f(url) for f in [self._init_query, self._init_replay]):
|
if not any (f(url) for f in [self._init_query, self._init_replay]):
|
||||||
raise Exception('Invalid WbUrl: ', url)
|
raise Exception('Invalid WbUrl: ', url)
|
||||||
|
@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
|
|||||||
"""
|
"""
|
||||||
parse stream for status line and headers
|
parse stream for status line and headers
|
||||||
return a StatusAndHeaders object
|
return a StatusAndHeaders object
|
||||||
|
|
||||||
|
support continuation headers starting with space or tab
|
||||||
"""
|
"""
|
||||||
statusline = stream.readline().rstrip()
|
statusline = stream.readline().rstrip()
|
||||||
|
|
||||||
protocol_status = self.split_prefix(statusline, self.statuslist)
|
protocol_status = self.split_prefix(statusline, self.statuslist)
|
||||||
|
|
||||||
if not protocol_status:
|
if not protocol_status:
|
||||||
msg = 'Expected Status Line - Found: ' + statusline
|
msg = 'Expected Status Line starting with {0} - Found: {1}'
|
||||||
|
msg = msg.format(self.statuslist, statusline)
|
||||||
raise StatusAndHeadersParserException(msg, statusline)
|
raise StatusAndHeadersParserException(msg, statusline)
|
||||||
|
|
||||||
headers = []
|
headers = []
|
||||||
|
|
||||||
line = stream.readline().rstrip()
|
line = stream.readline().rstrip()
|
||||||
while line and line != '\r\n':
|
while line:
|
||||||
name, value = line.split(':', 1)
|
name, value = line.split(':', 1)
|
||||||
header = (name, value.strip())
|
name = name.rstrip(' \t')
|
||||||
|
value = value.lstrip()
|
||||||
|
|
||||||
|
next_line = stream.readline().rstrip()
|
||||||
|
|
||||||
|
# append continuation lines, if any
|
||||||
|
while next_line and next_line.startswith((' ', '\t')):
|
||||||
|
value += next_line
|
||||||
|
next_line = stream.readline().rstrip()
|
||||||
|
|
||||||
|
header = (name, value)
|
||||||
headers.append(header)
|
headers.append(header)
|
||||||
line = stream.readline().rstrip()
|
line = next_line
|
||||||
|
|
||||||
return StatusAndHeaders(statusline=protocol_status[1].strip(),
|
return StatusAndHeaders(statusline=protocol_status[1].strip(),
|
||||||
headers=headers,
|
headers=headers,
|
||||||
|
29
pywb/utils/test/statusandheaders_test.py
Normal file
29
pywb/utils/test/statusandheaders_test.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
"""
|
||||||
|
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
|
||||||
|
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
|
||||||
|
('Some', 'Value'),
|
||||||
|
('Multi-Line', 'Value1 Also This')])
|
||||||
|
|
||||||
|
>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
|
||||||
|
Traceback (most recent call last):
|
||||||
|
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||||
|
import StringIO
|
||||||
|
|
||||||
|
|
||||||
|
status_headers_1 = "\
|
||||||
|
HTTP/1.0 200 OK\r\n\
|
||||||
|
Content-Type: ABC\r\n\
|
||||||
|
Some: Value\r\n\
|
||||||
|
Multi-Line: Value1\r\n\
|
||||||
|
Also This\r\n\
|
||||||
|
\r\n\
|
||||||
|
Body"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
|
|||||||
|
|
||||||
TIMESTAMP_14 = '%Y%m%d%H%M%S'
|
TIMESTAMP_14 = '%Y%m%d%H%M%S'
|
||||||
|
|
||||||
PAD_STAMP_END = '29991231235959'
|
#PAD_STAMP_END = '29991231235959'
|
||||||
|
PAD_6 = '299912'
|
||||||
|
|
||||||
|
|
||||||
def iso_date_to_datetime(string):
|
def iso_date_to_datetime(string):
|
||||||
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
|
|||||||
return datetime_to_timestamp(iso_date_to_datetime(string))
|
return datetime_to_timestamp(iso_date_to_datetime(string))
|
||||||
|
|
||||||
|
|
||||||
# default pad is end of range for compatibility
|
# pad to certain length (default 6)
|
||||||
def pad_timestamp(string, pad_str=PAD_STAMP_END):
|
def _pad_timestamp(string, pad_str=PAD_6):
|
||||||
"""
|
"""
|
||||||
>>> pad_timestamp('20')
|
>>> _pad_timestamp('20')
|
||||||
'20991231235959'
|
'209912'
|
||||||
|
|
||||||
>>> pad_timestamp('2014')
|
>>> _pad_timestamp('2014')
|
||||||
'20141231235959'
|
'201412'
|
||||||
|
|
||||||
>>> pad_timestamp('20141011')
|
>>> _pad_timestamp('20141011')
|
||||||
'20141011235959'
|
'20141011'
|
||||||
|
|
||||||
>>> pad_timestamp('201410110010')
|
>>> _pad_timestamp('201410110010')
|
||||||
'20141011001059'
|
'201410110010'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
str_len = len(string)
|
str_len = len(string)
|
||||||
pad_len = len(pad_str)
|
pad_len = len(pad_str)
|
||||||
|
|
||||||
return string if str_len >= pad_len else string + pad_str[str_len:]
|
if str_len < pad_len:
|
||||||
|
string = string + pad_str[str_len:]
|
||||||
|
|
||||||
|
return string
|
||||||
|
|
||||||
|
|
||||||
def timestamp_to_datetime(string):
|
def timestamp_to_datetime(string):
|
||||||
"""
|
"""
|
||||||
>>> timestamp_to_datetime('20131226095010')
|
# >14-digit -- rest ignored
|
||||||
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
|
>>> timestamp_to_datetime('2014122609501011')
|
||||||
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
|
datetime.datetime(2014, 12, 26, 9, 50, 10)
|
||||||
|
|
||||||
|
# 14-digit
|
||||||
|
>>> timestamp_to_datetime('20141226095010')
|
||||||
|
datetime.datetime(2014, 12, 26, 9, 50, 10)
|
||||||
|
|
||||||
|
# 13-digit padding
|
||||||
|
>>> timestamp_to_datetime('2014122609501')
|
||||||
|
datetime.datetime(2014, 12, 26, 9, 50, 59)
|
||||||
|
|
||||||
|
# 12-digit padding
|
||||||
|
>>> timestamp_to_datetime('201412260950')
|
||||||
|
datetime.datetime(2014, 12, 26, 9, 50, 59)
|
||||||
|
|
||||||
|
# 11-digit padding
|
||||||
|
>>> timestamp_to_datetime('20141226095')
|
||||||
|
datetime.datetime(2014, 12, 26, 9, 59, 59)
|
||||||
|
|
||||||
|
# 10-digit padding
|
||||||
|
>>> timestamp_to_datetime('2014122609')
|
||||||
|
datetime.datetime(2014, 12, 26, 9, 59, 59)
|
||||||
|
|
||||||
|
# 9-digit padding
|
||||||
|
>>> timestamp_to_datetime('201412260')
|
||||||
|
datetime.datetime(2014, 12, 26, 23, 59, 59)
|
||||||
|
|
||||||
|
# 8-digit padding
|
||||||
|
>>> timestamp_to_datetime('20141226')
|
||||||
|
datetime.datetime(2014, 12, 26, 23, 59, 59)
|
||||||
|
|
||||||
|
# 7-digit padding
|
||||||
|
>>> timestamp_to_datetime('2014122')
|
||||||
|
datetime.datetime(2014, 12, 31, 23, 59, 59)
|
||||||
|
|
||||||
|
# 6-digit padding
|
||||||
|
>>> timestamp_to_datetime('201410')
|
||||||
|
datetime.datetime(2014, 10, 31, 23, 59, 59)
|
||||||
|
|
||||||
|
# 5-digit padding
|
||||||
|
>>> timestamp_to_datetime('20141')
|
||||||
|
datetime.datetime(2014, 12, 31, 23, 59, 59)
|
||||||
|
|
||||||
|
# 4-digit padding
|
||||||
>>> timestamp_to_datetime('2014')
|
>>> timestamp_to_datetime('2014')
|
||||||
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
|
datetime.datetime(2014, 12, 31, 23, 59, 59)
|
||||||
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
|
|
||||||
|
# 3-digit padding
|
||||||
|
>>> timestamp_to_datetime('201')
|
||||||
|
datetime.datetime(2019, 12, 31, 23, 59, 59)
|
||||||
|
|
||||||
|
# 2-digit padding
|
||||||
|
>>> timestamp_to_datetime('20')
|
||||||
|
datetime.datetime(2099, 12, 31, 23, 59, 59)
|
||||||
|
|
||||||
|
# 1-digit padding
|
||||||
|
>>> timestamp_to_datetime('2')
|
||||||
|
datetime.datetime(2999, 12, 31, 23, 59, 59)
|
||||||
|
|
||||||
|
# 1-digit out-of-range padding
|
||||||
|
>>> timestamp_to_datetime('3')
|
||||||
|
datetime.datetime(2999, 12, 31, 23, 59, 59)
|
||||||
|
|
||||||
|
# 0-digit padding
|
||||||
|
>>> timestamp_to_datetime('')
|
||||||
|
datetime.datetime(2999, 12, 31, 23, 59, 59)
|
||||||
|
|
||||||
|
# bad month
|
||||||
|
>>> timestamp_to_datetime('20131709005601')
|
||||||
|
datetime.datetime(2013, 12, 9, 0, 56, 1)
|
||||||
|
|
||||||
|
# all out of range except minutes
|
||||||
|
>>> timestamp_to_datetime('40001965252477')
|
||||||
|
datetime.datetime(2999, 12, 31, 23, 24, 59)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Default pad to end of range for comptability
|
# pad to 6 digits
|
||||||
return time.strptime(pad_timestamp(string), TIMESTAMP_14)
|
string = _pad_timestamp(string, PAD_6)
|
||||||
|
|
||||||
|
|
||||||
|
def clamp(val, min_, max_):
|
||||||
|
try:
|
||||||
|
val = int(val)
|
||||||
|
val = max(min_, min(val, max_))
|
||||||
|
return val
|
||||||
|
except:
|
||||||
|
return max_
|
||||||
|
|
||||||
|
def extract(string, start, end, min_, max_):
|
||||||
|
if len(string) >= end:
|
||||||
|
return clamp(string[start:end], min_, max_)
|
||||||
|
else:
|
||||||
|
return max_
|
||||||
|
|
||||||
|
# now parse, clamp to boundary
|
||||||
|
year = extract(string, 0, 4, 1900, 2999)
|
||||||
|
month = extract(string, 4, 6, 1, 12)
|
||||||
|
day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
|
||||||
|
hour = extract(string, 8, 10, 0, 23)
|
||||||
|
minute = extract(string, 10, 12, 0, 59)
|
||||||
|
second = extract(string, 12, 14, 0, 59)
|
||||||
|
|
||||||
|
return datetime.datetime(year=year,
|
||||||
|
month=month,
|
||||||
|
day=day,
|
||||||
|
hour=hour,
|
||||||
|
minute=minute,
|
||||||
|
second=second)
|
||||||
|
|
||||||
|
#return time.strptime(pad_timestamp(string), TIMESTAMP_14)
|
||||||
|
|
||||||
|
|
||||||
def timestamp_to_sec(string):
|
def timestamp_to_sec(string):
|
||||||
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
|
|||||||
1420070399
|
1420070399
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return calendar.timegm(timestamp_to_datetime(string))
|
return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print 'Exception: ' + e.__class__.__name__
|
print 'Exception: ' + e.__class__.__name__
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user