mirror of https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00

Merge branch 'master' into cdx-server

This commit is contained in: commit 14f4b4d26e

8  .coveragerc  (new file)
@@ -0,0 +1,8 @@
+[run]
+omit =
+    */test/*
+    */tests/*
+
+[report]
+exclude_lines =
+    if __name__ == .__main__.:
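A note on the new .coveragerc: coverage.py treats each exclude_lines entry as a regular expression, so the dots around __main__ match either quote style. A minimal sketch (not part of the commit) verifying that:

    import re
    # the '.' wildcards match both ' and " around __main__
    pattern = re.compile(r"if __name__ == .__main__.:")
    assert pattern.search("if __name__ == '__main__':")
    assert pattern.search('if __name__ == "__main__":')
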
@@ -4,7 +4,14 @@ python:
 # command to install dependencies
 install:
   - "python setup.py -q install"
+  - "pip install python-coveralls"
+  - "pip install pytest-cov"
 # command to run tests
 #script: nosetests --with-doctest
 #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
-script: py.test -v --doctest-module ./tests/*.py ./pywb/
+#script: py.test -v --doctest-module ./tests/*.py ./pywb/
+script:
+  py.test --cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/
+
+after_success:
+  coveralls
@@ -3,13 +3,13 @@ import re
 
 from wbrequestresponse import WbRequest, WbResponse
+from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.wburl import WbUrl
-
 
 #=================================================================
 # ArchivalRouter -- route WB requests in archival mode
 #=================================================================
 class ArchivalRouter:
-    def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
+    def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
         self.routes = routes
         self.fallback = ReferRedirect(hostpaths)
         self.abs_path = abs_path
@@ -69,24 +69,25 @@ class Route:
         if not matcher:
             return None
 
-        rel_prefix = matcher.group(0)
+        matched_str = matcher.group(0)
 
-        if rel_prefix:
-            wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/'
-            wb_url_str = request_uri[len(rel_prefix) + 2:]  # remove the '/' + rel_prefix part of uri
+        if matched_str:
+            rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
+            wb_url_str = request_uri[len(matched_str) + 2:]  # remove the '/' + rel_prefix part of uri
         else:
-            wb_prefix = env['SCRIPT_NAME'] + '/'
+            rel_prefix = env['SCRIPT_NAME'] + '/'
             wb_url_str = request_uri[1:]  # the request_uri is the wb_url, since no coll
 
         coll = matcher.group(self.coll_group)
 
         wbrequest = WbRequest(env,
-                              request_uri = request_uri,
-                              wb_url_str = wb_url_str,
-                              wb_prefix = wb_prefix,
-                              coll = coll,
-                              host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '',
-                              wburl_class = self.handler.get_wburl_type())
+                              request_uri=request_uri,
+                              wb_url_str=wb_url_str,
+                              rel_prefix=rel_prefix,
+                              coll=coll,
+                              use_abs_prefix=use_abs_prefix,
+                              wburl_class=self.handler.get_wburl_type(),
+                              urlrewriter_class=UrlRewriter)
 
 
         # Allow for applying of additional filters
@@ -2,6 +2,7 @@
 """
 
 import surt
+import urlparse
 from cdxobject import CDXException
 
 
@@ -69,6 +70,109 @@ index.html?a=b?c=)/')
         return surt
 
 
+#=================================================================
+def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
+    """
+    Canonicalize a url (either with a custom canonicalizer or the
+    standard canonicalizer, with or without surt),
+    then compute a start and end search range for the given match type.
+
+    Supported match types:
+    * exact
+    * prefix
+    * host
+    * domain (only available for surt ordering)
+
+    Examples below:
+
+    # surt ranges
+    >>> calc_search_range('http://example.com/path/file.html', 'exact')
+    ('com,example)/path/file.html', 'com,example)/path/file.html!')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
+    ('com,example)/path/file.html', 'com,example)/path/file.htmm')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'host')
+    ('com,example)/', 'com,example*')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'domain')
+    ('com,example)/', 'com,example-')
+
+    special case for tld domain range
+    >>> calc_search_range('com', 'domain')
+    ('com,', 'com-')
+
+    # non-surt ranges
+    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
+    ('example.com/path/file.html', 'example.com/path/file.html!')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
+    ('example.com/path/file.html', 'example.com/path/file.htmm')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
+    ('example.com/', 'example.com0')
+
+    # domain range not supported
+    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
+    Traceback (most recent call last):
+    Exception: matchType=domain unsupported for non-surt
+    """
+    def inc_last_char(x):
+        return x[0:-1] + chr(ord(x[-1]) + 1)
+
+    if not url_canon:
+        # make new canon
+        url_canon = UrlCanonicalizer(surt_ordered)
+    else:
+        # ensure surt order matches url_canon
+        surt_ordered = url_canon.surt_ordered
+
+    start_key = url_canon(url)
+
+    if match_type == 'exact':
+        end_key = start_key + '!'
+
+    elif match_type == 'prefix':
+        # add trailing slash if url has it
+        if url.endswith('/') and not start_key.endswith('/'):
+            start_key += '/'
+
+        end_key = inc_last_char(start_key)
+
+    elif match_type == 'host':
+        if surt_ordered:
+            host = start_key.split(')/')[0]
+
+            start_key = host + ')/'
+            end_key = host + '*'
+        else:
+            host = urlparse.urlsplit(url).netloc
+
+            start_key = host + '/'
+            end_key = host + '0'
+
+    elif match_type == 'domain':
+        if not surt_ordered:
+            raise Exception('matchType=domain unsupported for non-surt')
+
+        host = start_key.split(')/')[0]
+
+        # if tld, use com, as start_key
+        # otherwise, stick with com,example)/
+        if ',' not in host:
+            start_key = host + ','
+        else:
+            start_key = host + ')/'
+
+        end_key = host + '-'
+    else:
+        raise Exception('Invalid match_type: ' + match_type)
+
+    return (start_key, end_key)
+
+
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
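The (start_key, end_key) pair is a half-open interval over the sorted index. A minimal sketch (not part of the commit) showing how such a range selects rows from a sorted list of SURT keys using the standard library's bisect:

    import bisect

    sorted_keys = [
        'com,example)/',
        'com,example)/path/file.html',
        'com,example)/path/file.html?a=b',
        'com,example)/zzz',
        'org,iana)/',
    ]

    # 'prefix' match range, as returned by calc_search_range above
    start_key, end_key = 'com,example)/path/file.html', 'com,example)/path/file.htmm'
    lo = bisect.bisect_left(sorted_keys, start_key)
    hi = bisect.bisect_left(sorted_keys, end_key)
    print(sorted_keys[lo:hi])
    # ['com,example)/path/file.html', 'com,example)/path/file.html?a=b']
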
@@ -77,3 +77,34 @@ class CDXObject(OrderedDict):
 
         li = itertools.imap(lambda (n, val): val, self.items())
         return ' '.join(li)
+
+
+#=================================================================
+class IDXObject(OrderedDict):
+
+    FORMAT = ['urlkey', 'part', 'offset', 'length', 'lineno']
+    NUM_REQ_FIELDS = len(FORMAT) - 1  # lineno is an optional field
+
+    def __init__(self, idxline):
+        OrderedDict.__init__(self)
+
+        idxline = idxline.rstrip()
+        fields = idxline.split('\t')
+
+        if len(fields) < self.NUM_REQ_FIELDS:
+            msg = 'invalid idx format: {0} fields found, {1} required'
+            raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))
+
+        for header, field in itertools.izip(self.FORMAT, fields):
+            self[header] = field
+
+        self['offset'] = int(self['offset'])
+        self['length'] = int(self['length'])
+        lineno = self.get('lineno')
+        if lineno:
+            self['lineno'] = int(lineno)
+
+        self.idxline = idxline
+
+    def __str__(self):
+        return self.idxline
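A quick illustration of IDXObject parsing (the idx line below is hypothetical, made up for the example; real lines come from a zipnum .summary file):

    idx = IDXObject('com,example)/\tpart-00000\t3040\t513\t120')
    print(idx['part'])    # part-00000
    print(idx['offset'])  # 3040 (converted to int)
    assert isinstance(idx['length'], int)
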
@@ -1,4 +1,4 @@
-from cdxobject import CDXObject, AccessException
+from cdxobject import CDXObject, IDXObject, AccessException
 from pywb.utils.timeutils import timestamp_to_sec
 
 import bisect
@@ -56,7 +56,7 @@ def cdx_text_out(cdx, fields):
 def cdx_load_and_filter(sources, params):
     cdx_iter = load_cdx_streams(sources, params)
 
-    cdx_iter = make_cdx_iter(cdx_iter)
+    cdx_iter = make_obj_iter(cdx_iter, params)
 
     if params.get('proxyAll'):
         return cdx_iter
@@ -102,9 +102,15 @@ def load_cdx_streams(sources, params):
 
 
 #=================================================================
-# convert text cdx stream to CDXObject
-def make_cdx_iter(text_iter):
-    return itertools.imap(lambda line: CDXObject(line), text_iter)
+# convert text cdx stream to CDXObject/IDXObject
+def make_obj_iter(text_iter, params):
+    # already converted
+    if params.get('showPagedIndex'):
+        cls = IDXObject
+    else:
+        cls = CDXObject
+
+    return itertools.imap(lambda line: cls(line), text_iter)
 
 
 #=================================================================
@@ -1,10 +1,13 @@
-from canonicalize import UrlCanonicalizer
+from canonicalize import UrlCanonicalizer, calc_search_range
 
 from cdxops import cdx_load
-from cdxsource import CDXSource, CDXFile, RemoteCDXSource
+from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
+from zipnum import ZipNumCluster
 from cdxobject import CDXObject, CaptureNotFoundException, CDXException
 from cdxdomainspecific import load_domain_specific_cdx_rules
 
+from pywb.utils.loaders import is_http
+
 from itertools import chain
 import logging
 import os
@@ -14,8 +17,23 @@ import urlparse
 #=================================================================
 class BaseCDXServer(object):
     def __init__(self, **kwargs):
-        self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
-        self.fuzzy_query = kwargs.get('fuzzy_query')
+        ds_rules = kwargs.get('ds_rules')
+        surt_ordered = kwargs.get('surt_ordered', True)
+
+        # load from domain-specific rules
+        if ds_rules:
+            self.url_canon, self.fuzzy_query = (
+                load_domain_specific_cdx_rules(ds_rules, surt_ordered))
+        # or custom passed in canonicalizer
+        else:
+            self.url_canon = kwargs.get('url_canon')
+            self.fuzzy_query = kwargs.get('fuzzy_query')
+
+        # set default canonicalizer if none set thus far
+        if not self.url_canon:
+            self.url_canon = UrlCanonicalizer(surt_ordered)
+
+        # set perms checker, if any
         self.perms_checker = kwargs.get('perms_checker')
 
     def _check_cdx_iter(self, cdx_iter, params):
@@ -66,7 +84,7 @@ class CDXServer(BaseCDXServer):
 
     def __init__(self, paths, **kwargs):
         super(CDXServer, self).__init__(**kwargs)
-        self.sources = create_cdx_sources(paths)
+        self.sources = create_cdx_sources(paths, kwargs.get('config'))
 
     def load_cdx(self, **params):
         # if key not set, assume 'url' is set and needs canonicalization
@@ -77,7 +95,14 @@ class CDXServer(BaseCDXServer):
             msg = 'A url= param must be specified to query the cdx server'
             raise CDXException(msg)
 
-        params['key'] = self.url_canon(url)
+        #params['key'] = self.url_canon(url)
+        match_type = params.get('matchType', 'exact')
+
+        key, end_key = calc_search_range(url=url,
+                                         match_type=match_type,
+                                         url_canon=self.url_canon)
+        params['key'] = key
+        params['end_key'] = end_key
 
         cdx_iter = cdx_load(self.sources, params, self.perms_checker)
 
@@ -124,36 +149,29 @@ def create_cdx_server(config, ds_rules_file=None):
         paths = config.get('index_paths')
         surt_ordered = config.get('surt_ordered', True)
         perms_checker = config.get('perms_checker')
+        pass_config = config
     else:
         paths = config
         surt_ordered = True
        perms_checker = None
+        pass_config = None
 
     logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
 
-    if ds_rules_file:
-        canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
-                                                      surt_ordered)
-    else:
-        canon, fuzzy = None, None
-
-    if not canon:
-        canon = UrlCanonicalizer(surt_ordered)
-
-    if (isinstance(paths, str) and
-            any(paths.startswith(x) for x in ['http://', 'https://'])):
+    if isinstance(paths, str) and is_http(paths):
         server_cls = RemoteCDXServer
     else:
         server_cls = CDXServer
 
     return server_cls(paths,
-                      url_canon=canon,
-                      fuzzy_query=fuzzy,
+                      config=pass_config,
+                      surt_ordered=surt_ordered,
+                      ds_rules=ds_rules_file,
                       perms_checker=perms_checker)
 
 
 #=================================================================
-def create_cdx_sources(paths):
+def create_cdx_sources(paths, config=None):
     sources = []
 
     if not isinstance(paths, list):
@@ -161,13 +179,13 @@ def create_cdx_sources(paths):
 
     for path in paths:
         if isinstance(path, CDXSource):
-            add_cdx_source(sources, path)
+            add_cdx_source(sources, path, config)
         elif isinstance(path, str):
             if os.path.isdir(path):
                 for file in os.listdir(path):
-                    add_cdx_source(sources, path + file)
+                    add_cdx_source(sources, path + file, config)
             else:
-                add_cdx_source(sources, path)
+                add_cdx_source(sources, path, config)
 
     if len(sources) == 0:
         logging.exception('No CDX Sources Found from: ' + str(sources))
@@ -176,9 +194,9 @@ def create_cdx_sources(paths):
 
 
 #=================================================================
-def add_cdx_source(sources, source):
+def add_cdx_source(sources, source, config):
     if not isinstance(source, CDXSource):
-        source = create_cdx_source(source)
+        source = create_cdx_source(source, config)
         if not source:
             return
 
@@ -187,19 +205,20 @@ def add_cdx_source(sources, source):
 
 
 #=================================================================
-def create_cdx_source(filename):
-    if filename.startswith('http://') or filename.startswith('https://'):
+def create_cdx_source(filename, config):
+    if is_http(filename):
         return RemoteCDXSource(filename)
 
+    if filename.startswith('redis://'):
+        return RedisCDXSource(filename, config)
+
     if filename.endswith('.cdx'):
         return CDXFile(filename)
 
+    if filename.endswith('.summary'):
+        return ZipNumCluster(filename, config)
+
     return None
-    #TODO: support zipnum
-    #elif filename.endswith('.summary')
-    #    return ZipNumCDXSource(filename)
-    #elif filename.startswith('redis://')
-    #    return RedisCDXSource(filename)
 
 
 #=================================================================
@@ -1,9 +1,9 @@
-from pywb.utils.binsearch import iter_exact, iter_prefix
+from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader
 
 import urllib
 import urllib2
-
+import itertools
 
 #=================================================================
 class CDXSource(object):
@@ -24,17 +24,7 @@ class CDXFile(CDXSource):
 
     def load_cdx(self, params):
         source = SeekableTextFileReader(self.filename)
-
-        match_type = params.get('matchType')
-
-        if match_type == 'prefix':
-            iter_func = iter_prefix
-        else:
-            iter_func = iter_exact
-
-        key = params.get('key')
-
-        return iter_func(source, key)
+        return iter_range(source, params.get('key'), params.get('end_key'))
 
     def __str__(self):
         return 'CDX File - ' + self.filename
@@ -90,3 +80,35 @@ class RemoteCDXSource(CDXSource):
 
     def __str__(self):
         return 'Remote CDX Server: ' + self.remote_url
+
+
+#=================================================================
+class RedisCDXSource(CDXSource):
+    DEFAULT_KEY_PREFIX = 'c:'
+
+    def __init__(self, redis_url, config=None):
+        import redis
+        self.redis = redis.StrictRedis.from_url(redis_url)
+
+        self.key_prefix = self.DEFAULT_KEY_PREFIX
+        if config:
+            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
+
+    def load_cdx(self, params):
+        """
+        Load cdx from redis cache, from an ordered list
+
+        Currently, there is no support for range queries
+        Only 'exact' matchType is supported
+        """
+        key = params['key']
+
+        # ensure only url/surt is part of key
+        key = key.split(' ')[0]
+        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
+
+        # key is not part of list, so prepend to each line
+        key += ' '
+        cdx_list = itertools.imap(lambda x: key + x, cdx_list)
+        return cdx_list
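A minimal sketch (not part of the commit) of how a matching Redis key could be populated so RedisCDXSource can read it back. The key layout is an assumption based on the code above: one sorted-set member per CDX line (minus the urlkey), stored under 'c:' + urlkey; the score value here is arbitrary since load_cdx reads the whole range:

    import redis

    r = redis.StrictRedis.from_url('redis://localhost:6379/0')
    urlkey = 'com,example)/'
    line = '20140126200624 http://example.com/ text/html 200 ...'  # hypothetical
    # redis-py 3.x signature shown; older 2.x used r.zadd(key, score, member)
    r.zadd('c:' + urlkey, {line: 20140126200624})

    src = RedisCDXSource('redis://localhost:6379/0')
    for cdx in src.load_cdx({'key': urlkey}):
        print(cdx)
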
@@ -132,8 +132,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
     ('filename', 'dupes.warc.gz')]
 
 # NOTE: external dependency -- need self-contained test
->>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
->>> pprint.pprint(x.next().items())
+#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
+#>>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
  ('timestamp', '20020120142510'),
  ('original', 'http://example.com:80/'),
203  pywb/cdx/zipnum.py  (new file)
@@ -0,0 +1,203 @@
+import os
+import collections
+import itertools
+import logging
+from cStringIO import StringIO
+import datetime
+
+from cdxsource import CDXSource
+from cdxobject import IDXObject
+
+from pywb.utils.loaders import BlockLoader
+from pywb.utils.loaders import SeekableTextFileReader
+from pywb.utils.bufferedreaders import gzip_decompressor
+from pywb.utils.binsearch import iter_range, linearsearch
+
+
+#=================================================================
+class ZipBlocks:
+    def __init__(self, part, offset, length, count):
+        self.part = part
+        self.offset = offset
+        self.length = length
+        self.count = count
+
+
+#=================================================================
+def readline_to_iter(stream):
+    try:
+        count = 0
+        buff = stream.readline()
+        while buff:
+            count += 1
+            yield buff
+            buff = stream.readline()
+
+    finally:
+        stream.close()
+
+
+#=================================================================
+class ZipNumCluster(CDXSource):
+    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
+    DEFAULT_MAX_BLOCKS = 50
+
+    def __init__(self, summary, config=None):
+
+        loc = None
+        cookie_maker = None
+        self.max_blocks = self.DEFAULT_MAX_BLOCKS
+        reload_ival = self.DEFAULT_RELOAD_INTERVAL
+
+        if config:
+            loc = config.get('zipnum_loc')
+            cookie_maker = config.get('cookie_maker')
+
+            self.max_blocks = config.get('max_blocks', self.max_blocks)
+
+            reload_ival = config.get('reload_interval', reload_ival)
+
+        if not loc:
+            splits = os.path.splitext(summary)
+            loc = splits[0] + '.loc'
+
+        self.summary = summary
+        self.loc_filename = loc
+
+        # initial loc map
+        self.loc_map = {}
+        self.loc_mtime = 0
+        self.load_loc()
+
+        # reload interval
+        self.loc_update_time = datetime.datetime.now()
+        self.reload_interval = datetime.timedelta(minutes=reload_ival)
+
+        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
+
+    def load_loc(self):
+        # check modified time of current file before loading
+        new_mtime = os.path.getmtime(self.loc_filename)
+        if (new_mtime == self.loc_mtime):
+            return
+
+        # update loc file mtime
+        self.loc_mtime = new_mtime
+
+        logging.debug('Loading loc from: ' + self.loc_filename)
+        with open(self.loc_filename) as fh:
+            for line in fh:
+                parts = line.rstrip().split('\t')
+                self.loc_map[parts[0]] = parts[1:]
+
+    @staticmethod
+    def reload_timed(timestamp, val, delta, func):
+        now = datetime.datetime.now()
+        if now - timestamp >= delta:
+            func()
+            return now
+        return None
+
+    def reload_loc(self):
+        reload_time = self.reload_timed(self.loc_update_time,
+                                        self.loc_map,
+                                        self.reload_interval,
+                                        self.load_loc)
+
+        if reload_time:
+            self.loc_update_time = reload_time
+
+    def lookup_loc(self, part):
+        return self.loc_map[part]
+
+    def load_cdx(self, params):
+        self.reload_loc()
+
+        reader = SeekableTextFileReader(self.summary)
+
+        idx_iter = iter_range(reader,
+                              params['key'],
+                              params['end_key'],
+                              prev_size=1)
+
+        if params.get('showPagedIndex'):
+            params['proxyAll'] = True
+            return idx_iter
+        else:
+            blocks = self.idx_to_cdx(idx_iter, params)
+
+            def gen_cdx():
+                for blk in blocks:
+                    for cdx in blk:
+                        yield cdx
+
+            return gen_cdx()
+
+    def idx_to_cdx(self, idx_iter, params):
+        blocks = None
+        ranges = []
+
+        for idx in idx_iter:
+            idx = IDXObject(idx)
+
+            if (blocks and blocks.part == idx['part'] and
+                    blocks.offset + blocks.length == idx['offset'] and
+                    blocks.count < self.max_blocks):
+
+                blocks.length += idx['length']
+                blocks.count += 1
+                ranges.append(idx['length'])
+
+            else:
+                if blocks:
+                    yield self.block_to_cdx_iter(blocks, ranges, params)
+
+                blocks = ZipBlocks(idx['part'],
+                                   idx['offset'],
+                                   idx['length'],
+                                   1)
+
+                ranges = [blocks.length]
+
+        if blocks:
+            yield self.block_to_cdx_iter(blocks, ranges, params)
+
+    def block_to_cdx_iter(self, blocks, ranges, params):
+        last_exc = None
+        last_traceback = None
+
+        for location in self.lookup_loc(blocks.part):
+            try:
+                return self.load_blocks(location, blocks, ranges, params)
+            except Exception as exc:
+                last_exc = exc
+                import sys
+                last_traceback = sys.exc_info()[2]
+
+        if last_exc:
+            raise last_exc, None, last_traceback
+        else:
+            raise Exception('No Locations Found for: ' + blocks.part)
+
+    def load_blocks(self, location, blocks, ranges, params):
+
+        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
+            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
+            logging.debug(msg.format(b=blocks, loc=location))
+
+        reader = self.blk_loader.load(location, blocks.offset, blocks.length)
+
+        def decompress_block(range_):
+            decomp = gzip_decompressor()
+            buff = decomp.decompress(reader.read(range_))
+            return readline_to_iter(StringIO(buff))
+
+        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
+
+        # start bound
+        iter_ = linearsearch(iter_, params['key'])
+
+        # end bound
+        end = params['end_key']
+        iter_ = itertools.takewhile(lambda line: line < end, iter_)
+        return iter_
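A minimal sketch (not part of the commit) of the zipnum layout this class reads: the cluster file is a series of independently gzipped blocks of CDX lines, and the .summary/idx file records, per block, the first key plus part name, offset and length. All names and values below are hypothetical:

    import zlib

    def gzip_member(lines):
        # wbits of 16 + MAX_WBITS produces a gzip container
        co = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
        return co.compress(''.join(lines)) + co.flush()

    block1 = gzip_member(['com,example)/ 20140126 ...\n'])
    block2 = gzip_member(['org,iana)/ 20140126 ...\n'])

    cluster = block1 + block2
    idx_lines = [
        'com,example)/\tpart-00000\t0\t%d\n' % len(block1),
        'org,iana)/\tpart-00000\t%d\t%d\n' % (len(block1), len(block2)),
    ]
    # decompressing one block independently, as decompress_block() does:
    print(zlib.decompress(cluster[:len(block1)], 16 + zlib.MAX_WBITS))
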
@@ -10,19 +10,28 @@ from wbexceptions import WbException, NotFoundException
 from views import TextCapturesView
 
 
-class BaseHandler:
-    @staticmethod
-    def get_wburl_type():
-        return WbUrl
+#=================================================================
+class BaseHandler(object):
+    def __call__(self, wbrequest):
+        return wbrequest
+
+    def get_wburl_type(self):
+        return None
+
+
+#=================================================================
+class WbUrlHandler(BaseHandler):
+    def get_wburl_type(self):
+        return WbUrl
 
 
 #=================================================================
 # Standard WB Handler
 #=================================================================
-class WBHandler(BaseHandler):
-    def __init__(self, index_reader, replay, html_view = None, search_view = None):
+class WBHandler(WbUrlHandler):
+    def __init__(self, index_reader, replay,
+                 html_view=None, search_view=None):
+
         self.index_reader = index_reader
         self.replay = replay
 
@@ -31,7 +40,6 @@ class WBHandler(BaseHandler):
         self.html_view = html_view
         self.search_view = search_view
 
-
     def __call__(self, wbrequest):
         if wbrequest.wb_url_str == '/':
             return self.render_search_page(wbrequest)
 
@@ -61,6 +69,7 @@ class WBHandler(BaseHandler):
     def __str__(self):
         return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
 
+
 #=================================================================
 # CDX-Server Handler -- pass all params to cdx server
 #=================================================================
 
@@ -75,11 +84,6 @@ class CDXHandler(BaseHandler):
 
         return self.view.render_response(wbrequest, cdx_lines)
 
-
-    @staticmethod
-    def get_wburl_type():
-        return None
-
     def __str__(self):
         return 'Index Reader: ' + str(self.index_reader)
 
@@ -115,10 +119,6 @@ class StaticHandler(BaseHandler):
         except IOError:
             raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
 
-    @staticmethod
-    def get_wburl_type():
-        return None
-
     def __str__(self):
         return 'Static files from ' + self.static_path
 
@@ -130,6 +130,7 @@ class DebugEchoEnvHandler(BaseHandler):
     def __call__(self, wbrequest):
         return WbResponse.text_response(str(wbrequest.env))
 
+
 #=================================================================
 class DebugEchoHandler(BaseHandler):
     def __call__(self, wbrequest):
 
@@ -150,5 +151,3 @@ class PerfTimer:
         self.end = time.clock()
         if self.perfdict is not None:
             self.perfdict[self.name] = str(self.end - self.start)
-
-
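The refactor above replaces the per-class get_wburl_type staticmethods with instance methods: handlers that parse wayback urls subclass WbUrlHandler, everything else inherits the None default from BaseHandler. A hypothetical handler illustrating the new pattern (EchoUrlHandler is not part of pywb):

    class EchoUrlHandler(WbUrlHandler):
        def __call__(self, wbrequest):
            # wb_url is parsed because get_wburl_type() returns WbUrl
            return WbResponse.text_response(str(wbrequest.wb_url))
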
@@ -37,7 +37,7 @@ class IndexReader(object):
     def load_cdx(self, **params):
         return self.cdx_server.load_cdx(**params)
 
-    def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
+    def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
         if wburl.type == wburl.URL_QUERY:
             raise NotImplementedError('Url Query Not Yet Supported')
 
@@ -45,14 +45,14 @@ class ProxyRouter:
             return None
 
         wbrequest = WbRequest(env,
-                              request_uri = url,
-                              wb_url_str = url,
-                              wb_prefix = '',
-                              coll = '',
-                              host_prefix = self.hostpaths[0],
-                              wburl_class = self.handler.get_wburl_type(),
-                              url_rewriter_class = ProxyHttpsUrlRewriter,
-                              is_proxy = True)
+                              request_uri=url,
+                              wb_url_str=url,
+                              #rel_prefix=url,
+                              #host_prefix=self.hostpaths[0],
+                              wburl_class=self.handler.get_wburl_type(),
+                              urlrewriter_class=ProxyHttpsUrlRewriter,
+                              use_abs_prefix=False,
+                              is_proxy=True)
 
         return self.handler(wbrequest)
@@ -7,7 +7,6 @@ from wbrequestresponse import WbResponse
 from wbexceptions import CaptureException, InternalRedirect
 from pywb.warc.recordloader import ArchiveLoadFailed
 
-
 #=================================================================
 class ReplayView:
     def __init__(self, content_loader, content_rewriter, head_insert_view = None,
 
@@ -49,6 +48,9 @@ class ReplayView:
             # check if redir is needed
             self._redirect_if_needed(wbrequest, cdx)
 
+            # one more check for referrer-based self-redirect
+            self._reject_referrer_self_redirect(wbrequest, status_headers)
+
             response = None
 
             if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
 
@@ -148,6 +150,7 @@ class ReplayView:
 
 
     def _reject_self_redirect(self, wbrequest, cdx, status_headers):
+        # self-redirect via location
         if status_headers.statusline.startswith('3'):
             request_url = wbrequest.wb_url.url.lower()
             location_url = status_headers.get_header('Location').lower()
 
@@ -156,3 +159,16 @@ class ReplayView:
 
             if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
                 raise CaptureException('Self Redirect: ' + str(cdx))
+
+    def _reject_referrer_self_redirect(self, wbrequest, status_headers):
+        # at correct timestamp now, but must check for referrer redirect
+        # indirect self-redirect, via meta-refresh, if referrer is same as current url
+        if status_headers.statusline.startswith('2'):
+            # build full url even if using relative-rewriting
+            request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
+            referrer_url = wbrequest.referrer
+            if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
+                raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
@@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
 from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
 
 from pywb.utils.statusandheaders import StatusAndHeaders
-from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader
+from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
 
 class RewriteContent:
 
@@ -54,7 +54,7 @@ class RewriteContent:
         # =========================================================================
         # special case -- need to ungzip the body
         if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
-            stream = BufferedReader(stream, decomp_type='gzip')
+            stream = DecompressingBufferedReader(stream, decomp_type='gzip')
 
         if rewritten_headers.charset:
             encoding = rewritten_headers.charset
@@ -24,9 +24,9 @@ def test_example_2():
 
 
 
-def test_example_3():
-    status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
+#def test_example_3():
+#    status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
 
-    assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
+#    assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
 
 
@@ -103,10 +103,12 @@ class UrlRewriter:
 
         return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
 
+    def set_base_url(self, newUrl):
+        self.wburl.url = newUrl
+
     def __repr__(self):
         return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
 
     @staticmethod
     def strip_protocol(url):
         for protocol in UrlRewriter.PROTOCOLS:
@@ -1,9 +1,5 @@
 #!/usr/bin/python
 
-import re
-import rfc3987
-
-# WbUrl : wb archival url representation for WB
 """
 WbUrl represents the standard wayback archival url format.
 A regular url is a subset of the WbUrl (latest replay).
 
@@ -34,9 +30,38 @@ replay form:
 
 latest_replay: (no timestamp)
 http://example.com
 
+Additionally, the BaseWbUrl provides the base components
+(url, timestamp, end_timestamp, modifier, type) which
+can be used to provide a custom representation of the
+wayback url format.
+
 """
 
-class WbUrl:
+import re
+import rfc3987
+
+
+#=================================================================
+class BaseWbUrl(object):
+    QUERY = 'query'
+    URL_QUERY = 'url_query'
+    REPLAY = 'replay'
+    LATEST_REPLAY = 'latest_replay'
+
+    def __init__(self, url='', mod='',
+                 timestamp='', end_timestamp='', type=None):
+
+        self.url = url
+        self.timestamp = timestamp
+        self.end_timestamp = end_timestamp
+        self.mod = mod
+        self.type = type
+
+
+#=================================================================
+class WbUrl(BaseWbUrl):
+    """
     # Replay Urls
     # ======================
 
@@ -107,22 +132,14 @@ class WbUrl:
     QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
     REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
 
-    QUERY = 'query'
-    URL_QUERY = 'url_query'
-    REPLAY = 'replay'
-    LATEST_REPLAY = 'latest_replay'
-
     DEFAULT_SCHEME = 'http://'
     # ======================
 
     def __init__(self, url):
+        super(WbUrl, self).__init__()
+
         self.original_url = url
-        self.type = None
-        self.url = ''
-        self.timestamp = ''
-        self.end_timestamp = ''
-        self.mod = ''
 
         if not any (f(url) for f in [self._init_query, self._init_replay]):
             raise Exception('Invalid WbUrl: ', url)
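For reference, how WbUrl's REPLAY_REGEX (shown above) splits a wayback path into (timestamp, modifier, url) — a standalone check of the pattern:

    import re
    REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
    print(REPLAY_REGEX.match('20131226101010im_/http://example.com').groups())
    # ('20131226101010', 'im_', 'http://example.com')
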
@@ -1,13 +1,19 @@
 """
-Test Route
-# route with relative path
->>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
-{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
+# Test WbRequest parsed via a Route
+# route with relative path, print resulting wbrequest
+>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False))
+{'coll': 'web',
+ 'request_uri': '/web/test.example.com',
+ 'wb_prefix': '/web/',
+ 'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')}
 
-# route with absolute path, running at script /my_pywb
->>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
-{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
+# route with absolute path, running at script /my_pywb, print resulting wbrequest
+>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True))
+{'coll': 'web',
+ 'request_uri': '/web/2013im_/test.example.com',
+ 'wb_prefix': 'https://localhost:8081/my_pywb/web/',
+ 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
 
 # not matching route -- skipped
 >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
 
@@ -65,7 +71,12 @@ False
 """
 
 from pywb.archivalrouter import Route, ReferRedirect
-from pywb.handlers import BaseHandler
+from pywb.handlers import BaseHandler, WbUrlHandler
+import pprint
 
+def print_req(req):
+    varlist = vars(req)
+    pprint.pprint({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
+
 
 def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
 
@@ -74,7 +85,7 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
     if http_host:
         env['HTTP_HOST'] = http_host
 
-    routes = [Route(coll, BaseHandler())]
+    routes = [Route(coll, WbUrlHandler())]
 
     redir = ReferRedirect(match_host)
     #req = WbRequest.from_uri(request_uri, env)
 
@@ -85,4 +96,6 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
     return rep.status_headers.get_header('Location')
 
 
-
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
87  pywb/test/test_wbrequestresponse.py  (new file)
@@ -0,0 +1,87 @@
"""
|
||||
# WbRequest Tests
|
||||
# =================
|
||||
>>> print_req_from_uri('/save/_embed/example.com/?a=b')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
|
||||
|
||||
>>> print_req_from_uri('/2345/20101024101112im_/example.com/?b=c')
|
||||
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
|
||||
|
||||
>>> print_req_from_uri('/2010/example.com')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
>>> print_req_from_uri('../example.com')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
|
||||
|
||||
# Abs path
|
||||
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
# No Scheme, so stick to relative
|
||||
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
|
||||
|
||||
# WbResponse Tests
|
||||
# =================
|
||||
>>> WbResponse.text_response('Test')
|
||||
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
|
||||
|
||||
>>> WbResponse.text_stream(['Test', 'Another'], '404')
|
||||
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
|
||||
|
||||
>>> WbResponse.redir_response('http://example.com/otherfile')
|
||||
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.wbrequestresponse import WbRequest, WbResponse
|
||||
|
||||
|
||||
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
|
||||
response = req_from_uri(request_uri, env, use_abs_prefix)
|
||||
varlist = vars(response)
|
||||
print str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
|
||||
|
||||
|
||||
def req_from_uri(request_uri, env={}, use_abs_prefix=False):
|
||||
if not request_uri:
|
||||
request_uri = env.get('REL_REQUEST_URI')
|
||||
|
||||
parts = request_uri.split('/', 2)
|
||||
|
||||
# Has coll prefix
|
||||
if len(parts) == 3:
|
||||
rel_prefix = '/' + parts[1] + '/'
|
||||
wb_url_str = parts[2]
|
||||
coll = parts[1]
|
||||
# No Coll Prefix
|
||||
elif len(parts) == 2:
|
||||
rel_prefix = '/'
|
||||
wb_url_str = parts[1]
|
||||
coll = ''
|
||||
else:
|
||||
rel_prefix = '/'
|
||||
wb_url_str = parts[0]
|
||||
coll = ''
|
||||
|
||||
return WbRequest(env,
|
||||
request_uri=request_uri,
|
||||
rel_prefix=rel_prefix,
|
||||
wb_url_str=wb_url_str,
|
||||
coll=coll,
|
||||
wburl_class=WbUrl,
|
||||
urlrewriter_class=UrlRewriter,
|
||||
use_abs_prefix=use_abs_prefix)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@@ -35,6 +35,58 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
     return min_ * block_size
 
 
+#=================================================================
+def binsearch(reader, key, compare_func=cmp, block_size=8192):
+    """
+    Perform a binary search for a specified key to within a 'block_size'
+    (default 8192) granularity, and return first full line found.
+    """
+
+    min_ = binsearch_offset(reader, key, compare_func, block_size)
+
+    reader.seek(min_)
+
+    if min_ > 0:
+        reader.readline()  # skip partial line
+
+    def gen_iter(line):
+        while line:
+            yield line.rstrip()
+            line = reader.readline()
+
+    return gen_iter(reader.readline())
+
+
+#=================================================================
+def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
+    """
+    Perform a linear search over iterator until
+    current_line >= key
+
+    optionally also tracking up to N previous lines, which are
+    returned before the first matched line.
+
+    if end of stream is reached before a match is found,
+    nothing is returned (prev lines discarded also)
+    """
+
+    prev_deque = deque(maxlen=prev_size + 1)
+
+    matched = False
+
+    for line in iter_:
+        prev_deque.append(line)
+        if compare_func(line, key) >= 0:
+            matched = True
+            break
+
+    # no matches, so return empty iterator
+    if not matched:
+        return []
+
+    return itertools.chain(prev_deque, iter_)
+
+
 #=================================================================
 def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
     """
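linearsearch works over any sorted line iterator, which is how ZipNumCluster uses it after decompressing blocks. A quick in-memory demonstration (not part of the commit):

    lines = iter(['a 1', 'b 2', 'c 3', 'd 4'])
    # keep one previous line, as ZipNumCluster does with prev_size=1
    print(list(linearsearch(lines, 'c', prev_size=1)))
    # ['b 2', 'c 3', 'd 4']
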
@@ -45,46 +97,27 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
     When performing linear search, keep track of up to N previous lines before
     first matching line.
     """
-    min_ = binsearch_offset(reader, key, compare_func, block_size)
-
-    reader.seek(min_)
-
-    if min_ > 0:
-        reader.readline()  # skip partial line
-
-    if prev_size > 1:
-        prev_deque = deque(max_len=prev_size)
-
-    line = None
-
-    while True:
-        line = reader.readline()
-        if not line:
-            break
-        if compare_func(line, key) >= 0:
-            break
-
-        if prev_size == 1:
-            prev = line
-        elif prev_size > 1:
-            prev_deque.append(line)
-
-    def gen_iter(line):
-        """
-        Create iterator over any previous lines to
-        current matched line
-        """
-        if prev_size == 1:
-            yield prev.rstrip()
-        elif prev_size > 1:
-            for i in prev_deque:
-                yield i.rstrip()
-
-        while line:
-            yield line.rstrip()
-            line = reader.readline()
-
-    return gen_iter(line)
+    iter_ = binsearch(reader, key, compare_func, block_size)
+    iter_ = linearsearch(iter_,
+                         key, prev_size=prev_size,
+                         compare_func=compare_func)
+    return iter_
+
+
+#=================================================================
+def iter_range(reader, start, end, prev_size=0):
+    """
+    Creates an iterator which iterates over lines where
+    start <= line < end (end exclusive)
+    """
+
+    iter_ = search(reader, start, prev_size=prev_size)
+
+    end_iter = itertools.takewhile(
+        lambda line: line < end,
+        iter_)
+
+    return end_iter
 
 
 #=================================================================
@@ -11,7 +11,7 @@ def gzip_decompressor():
 
 
 #=================================================================
-class BufferedReader(object):
+class DecompressingBufferedReader(object):
     """
     A wrapping line reader which wraps an existing reader.
     Read operations operate on underlying buffer, which is filled to
 
@@ -29,7 +29,7 @@ class DecompressingBufferedReader(object):
 
     DECOMPRESSORS = {'gzip': gzip_decompressor}
 
-    def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
+    def __init__(self, stream, block_size=1024, decomp_type=None):
         self.stream = stream
         self.block_size = block_size
 
@@ -44,24 +44,19 @@ class DecompressingBufferedReader(object):
 
         self.buff = None
         self.num_read = 0
-        self.max_len = max_len
 
     def _fillbuff(self, block_size=None):
+        if not block_size:
+            block_size = self.block_size
+
         if not self.buff or self.buff.pos >= self.buff.len:
-            if self.max_len > 0:
-                to_read = min(self.max_len - self.num_read, self.block_size)
-            else:
-                to_read = self.block_size
-
-            data = self.stream.read(to_read)
+            data = self.stream.read(block_size)
             self._process_read(data)
 
     def _process_read(self, data):
         data = self._decompress(data)
-        self.num_read += len(data)
+        self.buff_size = len(data)
+        self.num_read += self.buff_size
         self.buff = StringIO.StringIO(data)
 
     def _decompress(self, data):
 
@@ -78,12 +73,40 @@ class DecompressingBufferedReader(object):
         return data
 
     def read(self, length=None):
+        """
+        Fill bytes and read some number of bytes
+        (up to length if specified)
+        < length bytes may be read if reached the end of input
+        or at a buffer boundary. If at a boundary, the subsequent
+        call will fill buffer anew.
+        """
         self._fillbuff()
         return self.buff.read(length)
 
     def readline(self, length=None):
+        """
+        Fill buffer and read a full line from the buffer
+        (up to specified length, if provided)
+        If no newline found at end, try filling buffer again in case
+        at buffer boundary.
+        """
         self._fillbuff()
-        return self.buff.readline(length)
+        linebuff = self.buff.readline(length)
+
+        # we may be at a boundary
+        while not linebuff.endswith('\n'):
+            if length:
+                length -= len(linebuff)
+                if length <= 0:
+                    break
+
+            self._fillbuff()
+
+            if self.buff_size == 0:
+                break
+
+            linebuff += self.buff.readline(length)
+
+        return linebuff
 
     def close(self):
         if self.stream:
 
@@ -97,7 +120,7 @@ class ChunkedDataException(Exception):
 
 
 #=================================================================
-class ChunkedDataReader(BufferedReader):
+class ChunkedDataReader(DecompressingBufferedReader):
     r"""
     A ChunkedDataReader is a BufferedReader which also supports de-chunking
     of the data if it happens to be http 'chunk-encoded'.
 
@@ -133,7 +156,7 @@ class ChunkedDataReader(DecompressingBufferedReader):
 
     def _fillbuff(self, block_size=None):
         if self.not_chunked:
-            return BufferedReader._fillbuff(self, block_size)
+            return super(ChunkedDataReader, self)._fillbuff(block_size)
 
         if self.all_chunks_read:
             return
@@ -9,18 +9,50 @@ import urllib2
 import time
 
 
+def is_http(filename):
+    return any(filename.startswith(x) for x in ['http://', 'https://'])
+
+
 #=================================================================
 # load a reader from http
 #=================================================================
-class HttpLoader(object):
+class BlockLoader(object):
     """
-    Load a file-like reader over http using range requests
-    and an optional cookie created via a cookie_maker
+    a loader which can stream blocks of content
+    given a uri, offset and optional length.
+    Currently supports: http/https and file/local file system
     """
     def __init__(self, cookie_maker=None):
         self.cookie_maker = cookie_maker
 
     def load(self, url, offset, length):
+        """
+        Determine loading method based on uri
+        """
+        if is_http(url):
+            return self.load_http(url, offset, length)
+        else:
+            return self.load_file(url, offset, length)
+
+    def load_file(self, url, offset, length):
+        """
+        Load a file-like reader from the local file system
+        """
+
+        if url.startswith('file://'):
+            url = url[len('file://'):]
+
+        afile = open(url, 'rb')
+        afile.seek(offset)
+
+        if length > 0:
+            return LimitReader(afile, length)
+        else:
+            return afile
+
+    def load_http(self, url, offset, length):
+        """
+        Load a file-like reader over http using range requests
+        and an optional cookie created via a cookie_maker
+        """
         if length > 0:
             range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
         else:
 
@@ -71,25 +103,6 @@ class HMACCookieMaker(object):
         return cookie
 
 
-#=================================================================
-# load a reader from local filesystem
-#=================================================================
-class FileLoader(object):
-    """
-    Load a file-like reader from the local file system
-    """
-
-    def load(self, url, offset, length):
-        if url.startswith('file://'):
-            url = url[len('file://'):]
-
-        afile = open(url, 'rb')
-        afile.seek(offset)
-
-        if length > 0:
-            return LimitReader(afile, length)
-
-
 #=================================================================
 # Limit Reader
 #=================================================================
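For reference, the Range header BlockLoader.load_http computes is a standard inclusive byte range. Using the offset/length of one record from the sample CDX lines above (18365 bytes at offset 672225):

    offset, length = 672225, 18365
    range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
    print(range_header)  # bytes=672225-690589
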
@@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
         """
         parse stream for status line and headers
         return a StatusAndHeaders object
+
+        support continuation headers starting with space or tab
         """
         statusline = stream.readline().rstrip()
 
         protocol_status = self.split_prefix(statusline, self.statuslist)
 
         if not protocol_status:
-            msg = 'Expected Status Line - Found: ' + statusline
+            msg = 'Expected Status Line starting with {0} - Found: {1}'
+            msg = msg.format(self.statuslist, statusline)
             raise StatusAndHeadersParserException(msg, statusline)
 
         headers = []
 
         line = stream.readline().rstrip()
-        while line and line != '\r\n':
+        while line:
             name, value = line.split(':', 1)
-            header = (name, value.strip())
+            name = name.rstrip(' \t')
+            value = value.lstrip()
+
+            next_line = stream.readline().rstrip()
+
+            # append continuation lines, if any
+            while next_line and next_line.startswith((' ', '\t')):
+                value += next_line
+                next_line = stream.readline().rstrip()
+
+            header = (name, value)
             headers.append(header)
-            line = stream.readline().rstrip()
+            line = next_line
 
         return StatusAndHeaders(statusline=protocol_status[1].strip(),
                                 headers=headers,
 
@@ -107,4 +120,3 @@ class StatusAndHeadersParserException(Exception):
     def __init__(self, msg, statusline):
         super(StatusAndHeadersParserException, self).__init__(msg)
         self.statusline = statusline
-
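A minimal sketch (not part of the commit) of the new continuation-header handling, assuming the parser is constructed with the list of accepted protocol prefixes as elsewhere in pywb; note the folded line is appended as-is, leading whitespace included:

    import StringIO

    raw = ('HTTP/1.0 200 OK\r\n'
           'Content-Type: text/html\r\n'
           'X-Folded: part one\r\n'
           '    part two\r\n'
           '\r\n')

    parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
    print(parser.parse(StringIO.StringIO(raw)).get_header('X-Folded'))
    # part one    part two
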
@ -9,6 +9,7 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
# Exact Search
|
||||
>>> print_binsearch_results('org,iana)/domains/root', iter_exact)
|
||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz

@ -19,18 +20,45 @@ org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3G
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz

# Exact Search
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz

# Exact search -- no matches
>>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact)

>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz

# Range Search (end exclusive)
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz


# Range Search -- exact
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz

# Range Search -- exact + 1 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz

# Range Search -- exact + 2 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=2)
org,iana)/_js/2013.1/jquery.js 20140126201248 http://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 544 765491 iana.warc.gz
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz


"""


#=================================================================
import os
from pywb.utils.binsearch import iter_prefix, iter_exact
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
from pywb.utils.loaders import SeekableTextFileReader

from pywb import get_test_dir
@ -45,6 +73,13 @@ def print_binsearch_results(key, iter_func):
        print line


def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
    cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')

    for line in iter_func(cdx, key, end_key, prev_size=prev_size):
        print line


if __name__ == "__main__":
    import doctest
    doctest.testmod()
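
For orientation, a minimal usage sketch of the iterators exercised by the
doctests above (the CDX path is illustrative; the tests resolve it via
get_test_dir()):

    from pywb.utils.binsearch import iter_exact, iter_range
    from pywb.utils.loaders import SeekableTextFileReader

    # open the sorted CDX file with a seekable reader (path is illustrative)
    cdx = SeekableTextFileReader('sample-data/iana.cdx')

    # exact lookup of a single SURT key
    for line in iter_exact(cdx, 'org,iana)/time-zones'):
        print line

    # end-exclusive range scan, emitting one extra line before the start key
    for line in iter_range(cdx, 'org,iana)/about', 'org,iana)/domains', prev_size=1):
        print line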
@ -10,9 +10,9 @@
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'

# FileLoader Tests (includes LimitReader)
# BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100

# SeekableTextFileReader Test
@ -23,25 +23,39 @@
>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'

#BufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
# Buffered Reader Tests
#=================================================================

#DecompressingBufferedReader readline()
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'

#BufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
#DecompressingBufferedReader readline() with decompression
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'

>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'

# test very small block size
>>> dbr = DecompressingBufferedReader(StringIO.StringIO('ABCDEFG\\nHIJKLMN\\nOPQR\\nXYZ'), block_size = 3)
>>> dbr.readline(); dbr.readline(4); dbr.readline(); dbr.readline(); dbr.readline(2); dbr.readline(); dbr.readline()
'ABCDEFG\\n'
'HIJK'
'LMN\\n'
'OPQR\\n'
'XY'
'Z'
''
"""


#=================================================================
import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader

from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
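
Taken together, a short sketch of how the renamed pieces compose (file paths
are illustrative): BlockLoader fetches a raw byte range, and
DecompressingBufferedReader wraps any stream, optionally gunzipping it on the
fly.

    from pywb.utils.loaders import BlockLoader
    from pywb.utils.bufferedreaders import DecompressingBufferedReader

    # fetch 100 raw bytes starting at offset 0 of a local file
    raw = BlockLoader().load('sample-data/iana.cdx', 0, 100)
    print raw.read()

    # read the first line of a gzipped CDX, decompressing as needed
    stream = DecompressingBufferedReader(open('sample-data/iana.cdx.gz', 'rb'),
                                         decomp_type='gzip')
    print stream.readline()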
29
pywb/utils/test/statusandheaders_test.py
Normal file
@ -0,0 +1,29 @@
"""
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])

>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
"""


from pywb.utils.statusandheaders import StatusAndHeadersParser
import StringIO


status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"


if __name__ == "__main__":
    import doctest
    doctest.testmod()
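
A sketch of the parser in isolation (the statusline and headers attribute
names are inferred from the repr shown in the doctest above):

    import StringIO
    from pywb.utils.statusandheaders import StatusAndHeadersParser

    raw = 'HTTP/1.0 302 Found\r\nLocation: http://example.com/\r\n\r\n'
    status_headers = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(raw))

    print status_headers.statusline    # '302 Found'
    print status_headers.headers       # [('Location', 'http://example.com/')]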
@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')

TIMESTAMP_14 = '%Y%m%d%H%M%S'

PAD_STAMP_END = '29991231235959'
#PAD_STAMP_END = '29991231235959'
PAD_6 = '299912'


def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
    return datetime_to_timestamp(iso_date_to_datetime(string))


# default pad is end of range for compatibility
def pad_timestamp(string, pad_str=PAD_STAMP_END):
# pad to certain length (default 6)
def _pad_timestamp(string, pad_str=PAD_6):
    """
    >>> pad_timestamp('20')
    '20991231235959'
    >>> _pad_timestamp('20')
    '209912'

    >>> pad_timestamp('2014')
    '20141231235959'
    >>> _pad_timestamp('2014')
    '201412'

    >>> pad_timestamp('20141011')
    '20141011235959'
    >>> _pad_timestamp('20141011')
    '20141011'

    >>> pad_timestamp('201410110010')
    '20141011001059'
    >>> _pad_timestamp('201410110010')
    '201410110010'
    """

    str_len = len(string)
    pad_len = len(pad_str)

    return string if str_len >= pad_len else string + pad_str[str_len:]
    if str_len < pad_len:
        string = string + pad_str[str_len:]

    return string


def timestamp_to_datetime(string):
    """
    >>> timestamp_to_datetime('20131226095010')
    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
    tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
    # >14-digit -- rest ignored
    >>> timestamp_to_datetime('2014122609501011')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 14-digit
    >>> timestamp_to_datetime('20141226095010')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 13-digit padding
    >>> timestamp_to_datetime('2014122609501')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 12-digit padding
    >>> timestamp_to_datetime('201412260950')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 11-digit padding
    >>> timestamp_to_datetime('20141226095')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 10-digit padding
    >>> timestamp_to_datetime('2014122609')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 9-digit padding
    >>> timestamp_to_datetime('201412260')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 8-digit padding
    >>> timestamp_to_datetime('20141226')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 7-digit padding
    >>> timestamp_to_datetime('2014122')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 6-digit padding
    >>> timestamp_to_datetime('201410')
    datetime.datetime(2014, 10, 31, 23, 59, 59)

    # 5-digit padding
    >>> timestamp_to_datetime('20141')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 4-digit padding
    >>> timestamp_to_datetime('2014')
    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
    tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 3-digit padding
    >>> timestamp_to_datetime('201')
    datetime.datetime(2019, 12, 31, 23, 59, 59)

    # 2-digit padding
    >>> timestamp_to_datetime('20')
    datetime.datetime(2099, 12, 31, 23, 59, 59)

    # 1-digit padding
    >>> timestamp_to_datetime('2')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 1-digit out-of-range padding
    >>> timestamp_to_datetime('3')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 0-digit padding
    >>> timestamp_to_datetime('')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # bad month
    >>> timestamp_to_datetime('20131709005601')
    datetime.datetime(2013, 12, 9, 0, 56, 1)

    # all out of range except minutes
    >>> timestamp_to_datetime('40001965252477')
    datetime.datetime(2999, 12, 31, 23, 24, 59)
    """

    # Default pad to end of range for compatibility
    return time.strptime(pad_timestamp(string), TIMESTAMP_14)
    # pad to 6 digits
    string = _pad_timestamp(string, PAD_6)


    def clamp(val, min_, max_):
        try:
            val = int(val)
            val = max(min_, min(val, max_))
            return val
        except:
            return max_

    def extract(string, start, end, min_, max_):
        if len(string) >= end:
            return clamp(string[start:end], min_, max_)
        else:
            return max_

    # now parse, clamp to boundary
    year = extract(string, 0, 4, 1900, 2999)
    month = extract(string, 4, 6, 1, 12)
    day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
    hour = extract(string, 8, 10, 0, 23)
    minute = extract(string, 10, 12, 0, 59)
    second = extract(string, 12, 14, 0, 59)

    return datetime.datetime(year=year,
                             month=month,
                             day=day,
                             hour=hour,
                             minute=minute,
                             second=second)

    #return time.strptime(pad_timestamp(string), TIMESTAMP_14)


def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
    1420070399
    """

    return calendar.timegm(timestamp_to_datetime(string))
    return calendar.timegm(timestamp_to_datetime(string).utctimetuple())


if __name__ == "__main__":
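
In short, a partial timestamp is first padded to at least six digits
(year + month), then each field is parsed and clamped to its valid range.
A sketch against the functions above (module path assumed):

    from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec

    # month precision: '2014' pads to '201412', then day/hour/min/sec clamp to max
    print timestamp_to_datetime('2014')            # datetime(2014, 12, 31, 23, 59, 59)

    # out-of-range fields are clamped rather than raising (month 17 -> 12)
    print timestamp_to_datetime('20131709005601')  # datetime(2013, 12, 9, 0, 56, 1)

    # epoch seconds now via utctimetuple()
    print timestamp_to_sec('20141231235959')       # 1420070399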
@ -56,9 +56,9 @@ class J2TemplateView:

    # Filters
    @staticmethod
    def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
    def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
        value = timeutils.timestamp_to_datetime(value)
        return time.strftime(format, value)
        return value.strftime(format_)

    @staticmethod
    def get_host(url):
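
The change matters because timestamp_to_datetime now returns a datetime
rather than a time.struct_time, so the filter formats the value directly.
A quick sketch of the equivalent call (import path assumed):

    from pywb.utils import timeutils

    value = timeutils.timestamp_to_datetime('20140126200624')
    print value.strftime('%a, %b %d %Y %H:%M:%S')  # 'Sun, Jan 26 2014 20:06:24'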
@ -6,8 +6,8 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException

from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader

#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
@ -32,24 +32,12 @@ class ArcWarcRecordLoader:
    ARC_HEADERS = ["uri", "ip-address", "creation-date",
                   "content-type", "length"]

    @staticmethod
    def create_default_loaders(cookie_maker=None):
        http = HttpLoader(cookie_maker)
        file = FileLoader()
        return {
            'http': http,
            'https': http,
            'file': file,
            '': file
        }
    def __init__(self, loader=None, cookie_maker=None, block_size=8192):
        if not loader:
            loader = BlockLoader(cookie_maker)

    def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
        self.loaders = loaders

        if not self.loaders:
            self.loaders = self.create_default_loaders(cookie_maker)

        self.chunk_size = chunk_size
        self.loader = loader
        self.block_size = block_size

        self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)

@ -60,22 +48,25 @@ class ArcWarcRecordLoader:
    def load(self, url, offset, length):
        url_parts = urlparse.urlsplit(url)

        loader = self.loaders.get(url_parts.scheme)
        if not loader:
            raise ArchiveLoadFailed('Unknown Protocol', url)
        #loader = self.loaders.get(url_parts.scheme)
        #if not loader:
        #    raise ArchiveLoadFailed('Unknown Protocol', url)

        try:
            length = int(length)
        except:
            length = -1

        raw = loader.load(url, long(offset), length)
        raw = self.loader.load(url, long(offset), length)

        decomp_type = 'gzip'

        stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
        # Create decompressing stream
        stream = DecompressingBufferedReader(stream = raw,
                                             decomp_type = decomp_type,
                                             block_size = self.block_size)

        (the_format, rec_headers) = self._load_headers(stream)
        (the_format, rec_headers) = self._detect_type_load_headers(stream)

        if the_format == 'arc':
            rec_type = 'response'
@ -111,7 +102,7 @@ class ArcWarcRecordLoader:
        return ArcWarcRecord((the_format, rec_type),
                             rec_headers, stream, status_headers)

    def _load_headers(self, stream):
    def _detect_type_load_headers(self, stream):
        """
        Try parsing record as WARC, then try parsing as ARC.
        If neither one succeeds, we're out of luck.

@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
    except Exception as e:
        print 'Exception: ' + e.__class__.__name__

if __name__ == "__main__":
    import doctest
    doctest.testmod()
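
A usage sketch for the refactored loader (module path and local file layout
are assumptions; the offset and length come from the sample CDX line earlier
in this diff):

    from pywb.warc.recordloader import ArcWarcRecordLoader  # path assumed

    loader = ArcWarcRecordLoader()  # defaults to BlockLoader + gzip decompression
    record = loader.load('iana.warc.gz', 657746, 2691)

    # record is the ArcWarcRecord namedtuple built above:
    # ((format, rec_type), rec_headers, stream, status_headers)
    print record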
@ -1,99 +1,75 @@
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders

import pprint
#WB Request and Response


#=================================================================
class WbRequest:
    """
    >>> WbRequest.from_uri('/save/_embed/example.com/?a=b')
    {'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
    Represents the main pywb request object.

    >>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c')
    {'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
    Contains various info from the wsgi env, and additional info
    about the request, such as coll, relative prefix,
    host prefix, absolute prefix.

    >>> WbRequest.from_uri('/2010/example.com')
    {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}

    >>> WbRequest.from_uri('../example.com')
    {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}

    # Abs path
    >>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
    {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}

    # No Scheme, so stick to relative
    >>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
    {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
    If a wburl and url rewriter classes are specified, the class
    also contains the url rewriter.

    """

    @staticmethod
    def from_uri(request_uri, env = {}, use_abs_prefix = False):
        if not request_uri:
            request_uri = env.get('REL_REQUEST_URI')

        parts = request_uri.split('/', 2)

        # Has coll prefix
        if len(parts) == 3:
            wb_prefix = '/' + parts[1] + '/'
            wb_url_str = parts[2]
            coll = parts[1]
        # No Coll Prefix
        elif len(parts) == 2:
            wb_prefix = '/'
            wb_url_str = parts[1]
            coll = ''
        else:
            wb_prefix = '/'
            wb_url_str = parts[0]
            coll = ''

        host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else ''

        return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix)


    @staticmethod
    def make_host_prefix(env):
        try:
            return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST']
            host = env.get('HTTP_HOST')
            if not host:
                host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']

            return env['wsgi.url_scheme'] + '://' + host
        except KeyError:
            return ''


    def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll,
                 host_prefix = '',
                 wburl_class = WbUrl,
                 url_rewriter_class = UrlRewriter,
                 is_proxy = False):
    def __init__(self, env,
                 request_uri=None,
                 rel_prefix='',
                 wb_url_str='/',
                 coll='',
                 host_prefix='',
                 use_abs_prefix=False,
                 wburl_class=None,
                 urlrewriter_class=None,
                 is_proxy=False):

        self.env = env

        self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')

        self.host_prefix = host_prefix
        self.coll = coll

        if not host_prefix:
            host_prefix = self.make_host_prefix(env)

        self.host_prefix = host_prefix
        self.rel_prefix = rel_prefix

        if use_abs_prefix:
            self.wb_prefix = host_prefix + rel_prefix
        else:
            self.wb_prefix = rel_prefix

        self.wb_prefix = host_prefix + wb_prefix

        if not wb_url_str:
            wb_url_str = '/'

        self.wb_url_str = wb_url_str

        # wb_url present and not root page
        if wb_url_str != '/' and wburl_class:
            self.wb_url_str = wb_url_str
            self.wb_url = wburl_class(wb_url_str)
            self.urlrewriter = url_rewriter_class(self.wb_url, self.wb_prefix)
            self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
        else:
            # no wb_url, just store blank wb_url
            self.wb_url_str = wb_url_str
            self.wb_url = None
            self.urlrewriter = None

        self.coll = coll

        self.referrer = env.get('HTTP_REFERER')

        self.is_ajax = self._is_ajax()
@ -122,24 +98,19 @@ class WbRequest:


    def __repr__(self):
        #return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
        #return str(vars(self))
        varlist = vars(self)
        return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
        varstr = pprint.pformat(varlist)
        return varstr


#=================================================================
class WbResponse:
    """
    >>> WbResponse.text_response('Test')
    {'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
    Represents a pywb wsgi response object.

    >>> WbResponse.text_stream(['Test', 'Another'], '404')
    {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}

    >>> WbResponse.redir_response('http://example.com/otherfile')
    {'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
    Holds a status_headers object and a response iter, to be
    returned to the wsgi container.
    """

    def __init__(self, status_headers, value = []):
        self.status_headers = status_headers
        self.body = value
@ -180,8 +151,3 @@ class WbResponse:

    def __repr__(self):
        return str(vars(self))

if __name__ == "__main__":
    import doctest
    doctest.testmod()
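
A construction sketch using the new keyword-based __init__ (env values are
illustrative; the import path for WbRequest is assumed):

    from pywb.rewrite.wburl import WbUrl
    from pywb.rewrite.url_rewriter import UrlRewriter
    from wbrequestresponse import WbRequest  # path assumed

    env = {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}

    wbrequest = WbRequest(env,
                          request_uri='/pywb/20140126200624/http://www.iana.org/',
                          rel_prefix='/pywb/',
                          wb_url_str='20140126200624/http://www.iana.org/',
                          coll='pywb',
                          use_abs_prefix=True,
                          wburl_class=WbUrl,
                          urlrewriter_class=UrlRewriter)

    print wbrequest.wb_prefix  # 'http://localhost:8080/pywb/'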
@ -75,6 +75,11 @@ class TestWb:
        assert 'wb.js' in resp.body
        assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body

    def test_replay_content_length_1(self):
        # test larger file, rewritten file (svg!)
        resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
        assert resp.headers['Content-Length'] == str(len(resp.body))


    def test_redirect_1(self):
        resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
@ -119,6 +124,20 @@ class TestWb:
        assert resp.content_type == 'text/css'


    def test_referrer_self_redirect(self):
        uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
        host = 'somehost:8082'
        referrer = 'http://' + host + uri

        # capture is normally a 200
        resp = self.testapp.get(uri)
        assert resp.status_int == 200

        # redirect causes skip of this capture, redirect to next
        resp = self.testapp.get(uri, headers = [('Referer', referrer), ('Host', host)], status = 302)
        assert resp.status_int == 302


    def test_excluded_content(self):
        resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
        assert resp.status_int == 403
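
These tests drive the full WSGI app through webtest; a minimal sketch of the
pattern (app construction is omitted, since it depends on the pywb bootstrap
used by the test fixture):

    from webtest import TestApp

    testapp = TestApp(app)  # app: the pywb wsgi application (placeholder)

    uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
    referrer = 'http://somehost:8082' + uri

    # a matching referrer marks this capture as a self-redirect -> expect 302
    resp = testapp.get(uri,
                       headers=[('Referer', referrer), ('Host', 'somehost:8082')],
                       status=302)
    assert resp.status_int == 302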