diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..63400c07 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,8 @@ +[run] +omit = + */test/* + */tests/* + +[report] +exclude_lines = + if __name__ == .__main__.: diff --git a/.travis.yml b/.travis.yml index 81d946f7..bab78128 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,14 @@ python: # command to install dependencies install: - "python setup.py -q install" + - "pip install python-coveralls" + - "pip install pytest-cov" # command to run tests #script: nosetests --with-doctest #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py -script: py.test -v --doctest-module ./tests/*.py ./pywb/ +#script: py.test -v --doctest-module ./tests/*.py ./pywb/ +script: + py.test --cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/ + +after_success: + coveralls diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 354edddd..4d28b57e 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -3,13 +3,13 @@ import re from wbrequestresponse import WbRequest, WbResponse from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.rewrite.wburl import WbUrl + #================================================================= # ArchivalRouter -- route WB requests in archival mode #================================================================= class ArchivalRouter: - def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None): + def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None): self.routes = routes self.fallback = ReferRedirect(hostpaths) self.abs_path = abs_path @@ -69,24 +69,25 @@ class Route: if not matcher: return None - rel_prefix = matcher.group(0) + matched_str = matcher.group(0) - if rel_prefix: - wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/' - wb_url_str = request_uri[len(rel_prefix) + 2:] # remove the '/' + rel_prefix part of uri + if matched_str: + 
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' + wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri else: - wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix = env['SCRIPT_NAME'] + '/' wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll coll = matcher.group(self.coll_group) wbrequest = WbRequest(env, - request_uri = request_uri, - wb_url_str = wb_url_str, - wb_prefix = wb_prefix, - coll = coll, - host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '', - wburl_class = self.handler.get_wburl_type()) + request_uri=request_uri, + wb_url_str=wb_url_str, + rel_prefix=rel_prefix, + coll=coll, + use_abs_prefix=use_abs_prefix, + wburl_class = self.handler.get_wburl_type(), + urlrewriter_class=UrlRewriter) # Allow for applying of additional filters diff --git a/pywb/cdx/canonicalize.py b/pywb/cdx/canonicalize.py index e0adb5c1..e2f818b9 100644 --- a/pywb/cdx/canonicalize.py +++ b/pywb/cdx/canonicalize.py @@ -2,6 +2,7 @@ """ import surt +import urlparse from cdxobject import CDXException @@ -69,6 +70,109 @@ index.html?a=b?c=)/') return surt +#================================================================= +def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): + """ + Canonicalize a url (either with custom canonicalizer or + standard canonicalizer with or without surt) + + Then, compute a start and end search url search range + for a given match type. 
+ + Support match types: + * exact + * prefix + * host + * domain (only available when for surt ordering) + + Examples below: + + # surt ranges + >>> calc_search_range('http://example.com/path/file.html', 'exact') + ('com,example)/path/file.html', 'com,example)/path/file.html!') + + >>> calc_search_range('http://example.com/path/file.html', 'prefix') + ('com,example)/path/file.html', 'com,example)/path/file.htmm') + + >>> calc_search_range('http://example.com/path/file.html', 'host') + ('com,example)/', 'com,example*') + + >>> calc_search_range('http://example.com/path/file.html', 'domain') + ('com,example)/', 'com,example-') + + special case for tld domain range + >>> calc_search_range('com', 'domain') + ('com,', 'com-') + + # non-surt ranges + >>> calc_search_range('http://example.com/path/file.html', 'exact', False) + ('example.com/path/file.html', 'example.com/path/file.html!') + + >>> calc_search_range('http://example.com/path/file.html', 'prefix', False) + ('example.com/path/file.html', 'example.com/path/file.htmm') + + >>> calc_search_range('http://example.com/path/file.html', 'host', False) + ('example.com/', 'example.com0') + + # domain range not supported + >>> calc_search_range('http://example.com/path/file.html', 'domain', False) + Traceback (most recent call last): + Exception: matchType=domain unsupported for non-surt + """ + def inc_last_char(x): + return x[0:-1] + chr(ord(x[-1]) + 1) + + if not url_canon: + # make new canon + url_canon = UrlCanonicalizer(surt_ordered) + else: + # ensure surt order matches url_canon + surt_ordered = url_canon.surt_ordered + + start_key = url_canon(url) + + if match_type == 'exact': + end_key = start_key + '!' 
+ + elif match_type == 'prefix': + # add trailing slash if url has it + if url.endswith('/') and not start_key.endswith('/'): + start_key += '/' + + end_key = inc_last_char(start_key) + + elif match_type == 'host': + if surt_ordered: + host = start_key.split(')/')[0] + + start_key = host + ')/' + end_key = host + '*' + else: + host = urlparse.urlsplit(url).netloc + + start_key = host + '/' + end_key = host + '0' + + elif match_type == 'domain': + if not surt_ordered: + raise Exception('matchType=domain unsupported for non-surt') + + host = start_key.split(')/')[0] + + # if tld, use com, as start_key + # otherwise, stick with com,example)/ + if not ',' in host: + start_key = host + ',' + else: + start_key = host + ')/' + + end_key = host + '-' + else: + raise Exception('Invalid match_type: ' + match_type) + + return (start_key, end_key) + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 203cb7ef..4eba8025 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -77,3 +77,34 @@ class CDXObject(OrderedDict): li = itertools.imap(lambda (n, val): val, self.items()) return ' '.join(li) + + +#================================================================= +class IDXObject(OrderedDict): + + FORMAT = ['urlkey', 'part', 'offset', 'length', 'lineno'] + NUM_REQ_FIELDS = len(FORMAT) - 1 # lineno is an optional field + + def __init__(self, idxline): + OrderedDict.__init__(self) + + idxline = idxline.rstrip() + fields = idxline.split('\t') + + if len(fields) < self.NUM_REQ_FIELDS: + msg = 'invalid idx format: {0} fields found, {1} required' + raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS)) + + for header, field in itertools.izip(self.FORMAT, fields): + self[header] = field + + self['offset'] = int(self['offset']) + self['length'] = int(self['length']) + lineno = self.get('lineno') + if lineno: + self['lineno'] = int(lineno) + + self.idxline = idxline + + def __str__(self): + 
return self.idxline diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 58bd920b..247f3d18 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -1,4 +1,4 @@ -from cdxobject import CDXObject, AccessException +from cdxobject import CDXObject, IDXObject, AccessException from pywb.utils.timeutils import timestamp_to_sec import bisect @@ -56,7 +56,7 @@ def cdx_text_out(cdx, fields): def cdx_load_and_filter(sources, params): cdx_iter = load_cdx_streams(sources, params) - cdx_iter = make_cdx_iter(cdx_iter) + cdx_iter = make_obj_iter(cdx_iter, params) if params.get('proxyAll'): return cdx_iter @@ -102,9 +102,15 @@ def load_cdx_streams(sources, params): #================================================================= -# convert text cdx stream to CDXObject -def make_cdx_iter(text_iter): - return itertools.imap(lambda line: CDXObject(line), text_iter) +# convert text cdx stream to CDXObject/IDXObject +def make_obj_iter(text_iter, params): + # already converted + if params.get('showPagedIndex'): + cls = IDXObject + else: + cls = CDXObject + + return itertools.imap(lambda line: cls(line), text_iter) #================================================================= diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 69f19d21..1a68f7e4 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,10 +1,13 @@ -from canonicalize import UrlCanonicalizer +from canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load -from cdxsource import CDXSource, CDXFile, RemoteCDXSource +from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource +from zipnum import ZipNumCluster from cdxobject import CDXObject, CaptureNotFoundException, CDXException from cdxdomainspecific import load_domain_specific_cdx_rules +from pywb.utils.loaders import is_http + from itertools import chain import logging import os @@ -14,8 +17,23 @@ import urlparse #================================================================= class 
BaseCDXServer(object): def __init__(self, **kwargs): - self.url_canon = kwargs.get('url_canon', UrlCanonicalizer()) - self.fuzzy_query = kwargs.get('fuzzy_query') + ds_rules = kwargs.get('ds_rules') + surt_ordered = kwargs.get('surt_ordered', True) + + # load from domain-specific rules + if ds_rules: + self.url_canon, self.fuzzy_query = ( + load_domain_specific_cdx_rules(ds_rules, surt_ordered)) + # or custom passed in canonicalizer + else: + self.url_canon = kwargs.get('url_canon') + self.fuzzy_query = kwargs.get('fuzzy_query') + + # set default canonicalizer if none set thus far + if not self.url_canon: + self.url_canon = UrlCanonicalizer(surt_ordered) + + # set perms checker, if any self.perms_checker = kwargs.get('perms_checker') def _check_cdx_iter(self, cdx_iter, params): @@ -66,7 +84,7 @@ class CDXServer(BaseCDXServer): def __init__(self, paths, **kwargs): super(CDXServer, self).__init__(**kwargs) - self.sources = create_cdx_sources(paths) + self.sources = create_cdx_sources(paths, kwargs.get('config')) def load_cdx(self, **params): # if key not set, assume 'url' is set and needs canonicalization @@ -77,7 +95,14 @@ class CDXServer(BaseCDXServer): msg = 'A url= param must be specified to query the cdx server' raise CDXException(msg) - params['key'] = self.url_canon(url) + #params['key'] = self.url_canon(url) + match_type = params.get('matchType', 'exact') + + key, end_key = calc_search_range(url=url, + match_type=match_type, + url_canon=self.url_canon) + params['key'] = key + params['end_key'] = end_key cdx_iter = cdx_load(self.sources, params, self.perms_checker) @@ -124,36 +149,29 @@ def create_cdx_server(config, ds_rules_file=None): paths = config.get('index_paths') surt_ordered = config.get('surt_ordered', True) perms_checker = config.get('perms_checker') + pass_config = config else: paths = config surt_ordered = True perms_checker = None + pass_config = None logging.debug('CDX Surt-Ordered? 
' + str(surt_ordered)) - if ds_rules_file: - canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file, - surt_ordered) - else: - canon, fuzzy = None, None - - if not canon: - canon = UrlCanonicalizer(surt_ordered) - - if (isinstance(paths, str) and - any(paths.startswith(x) for x in ['http://', 'https://'])): + if isinstance(paths, str) and is_http(paths): server_cls = RemoteCDXServer else: server_cls = CDXServer return server_cls(paths, - url_canon=canon, - fuzzy_query=fuzzy, + config=pass_config, + surt_ordered=surt_ordered, + ds_rules=ds_rules_file, perms_checker=perms_checker) #================================================================= -def create_cdx_sources(paths): +def create_cdx_sources(paths, config=None): sources = [] if not isinstance(paths, list): @@ -161,13 +179,13 @@ def create_cdx_sources(paths): for path in paths: if isinstance(path, CDXSource): - add_cdx_source(sources, path) + add_cdx_source(sources, path, config) elif isinstance(path, str): if os.path.isdir(path): for file in os.listdir(path): - add_cdx_source(sources, path + file) + add_cdx_source(sources, path + file, config) else: - add_cdx_source(sources, path) + add_cdx_source(sources, path, config) if len(sources) == 0: logging.exception('No CDX Sources Found from: ' + str(sources)) @@ -176,9 +194,9 @@ def create_cdx_sources(paths): #================================================================= -def add_cdx_source(sources, source): +def add_cdx_source(sources, source, config): if not isinstance(source, CDXSource): - source = create_cdx_source(source) + source = create_cdx_source(source, config) if not source: return @@ -187,19 +205,20 @@ def add_cdx_source(sources, source): #================================================================= -def create_cdx_source(filename): - if filename.startswith('http://') or filename.startswith('https://'): +def create_cdx_source(filename, config): + if is_http(filename): return RemoteCDXSource(filename) + if 
filename.startswith('redis://'): + return RedisCDXSource(filename, config) + if filename.endswith('.cdx'): return CDXFile(filename) + if filename.endswith('.summary'): + return ZipNumCluster(filename, config) + return None - #TODO: support zipnum - #elif filename.endswith('.summary') - # return ZipNumCDXSource(filename) - #elif filename.startswith('redis://') - # return RedisCDXSource(filename) #================================================================= diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index a8c92be5..783cf36b 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -1,9 +1,9 @@ -from pywb.utils.binsearch import iter_exact, iter_prefix +from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader import urllib import urllib2 - +import itertools #================================================================= class CDXSource(object): @@ -24,17 +24,7 @@ class CDXFile(CDXSource): def load_cdx(self, params): source = SeekableTextFileReader(self.filename) - - match_type = params.get('matchType') - - if match_type == 'prefix': - iter_func = iter_prefix - else: - iter_func = iter_exact - - key = params.get('key') - - return iter_func(source, key) + return iter_range(source, params.get('key'), params.get('end_key')) def __str__(self): return 'CDX File - ' + self.filename @@ -90,3 +80,35 @@ class RemoteCDXSource(CDXSource): def __str__(self): return 'Remote CDX Server: ' + self.remote_url + + +#================================================================= +class RedisCDXSource(CDXSource): + DEFAULT_KEY_PREFIX = 'c:' + + def __init__(self, redis_url, config=None): + import redis + self.redis = redis.StrictRedis.from_url(redis_url) + + self.key_prefix = self.DEFAULT_KEY_PREFIX + if config: + self.key_prefix = config.get('redis_key_prefix', self.key_prefix) + + + def load_cdx(self, params): + """ + Load cdx from redis cache, from an ordered list + + Currently, there is no support for range 
queries + Only 'exact' matchType is supported + """ + key = params['key'] + + # ensure only url/surt is part of key + key = key.split(' ')[0] + cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1) + + # key is not part of list, so prepend to each line + key += ' ' + cdx_list = itertools.imap(lambda x: key + x, cdx_list) + return cdx_list diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 2d023729..0e799ce9 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -132,8 +132,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('filename', 'dupes.warc.gz')] # NOTE: external dependency -- need self-contained test ->>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') ->>> pprint.pprint(x.next().items()) +#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +#>>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), ('timestamp', '20020120142510'), ('original', 'http://example.com:80/'), diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py new file mode 100644 index 00000000..847c660f --- /dev/null +++ b/pywb/cdx/zipnum.py @@ -0,0 +1,203 @@ +import os +import collections +import itertools +import logging +from cStringIO import StringIO +import datetime + +from cdxsource import CDXSource +from cdxobject import IDXObject + +from pywb.utils.loaders import BlockLoader +from pywb.utils.loaders import SeekableTextFileReader +from pywb.utils.bufferedreaders import gzip_decompressor +from pywb.utils.binsearch import iter_range, linearsearch + + +#================================================================= +class ZipBlocks: + def __init__(self, part, offset, length, count): + self.part = part + self.offset = offset + self.length = length + self.count = count + + +#================================================================= +def 
readline_to_iter(stream): + try: + count = 0 + buff = stream.readline() + while buff: + count += 1 + yield buff + buff = stream.readline() + + finally: + stream.close() + + +#================================================================= +class ZipNumCluster(CDXSource): + DEFAULT_RELOAD_INTERVAL = 10 # in minutes + DEFAULT_MAX_BLOCKS = 50 + + def __init__(self, summary, config=None): + + loc = None + cookie_maker = None + self.max_blocks = self.DEFAULT_MAX_BLOCKS + reload_ival = self.DEFAULT_RELOAD_INTERVAL + + if config: + loc = config.get('zipnum_loc') + cookie_maker = config.get('cookie_maker') + + self.max_blocks = config.get('max_blocks', self.max_blocks) + + reload_ival = config.get('reload_interval', reload_ival) + + if not loc: + splits = os.path.splitext(summary) + loc = splits[0] + '.loc' + + self.summary = summary + self.loc_filename = loc + + # initial loc map + self.loc_map = {} + self.loc_mtime = 0 + self.load_loc() + + # reload interval + self.loc_update_time = datetime.datetime.now() + self.reload_interval = datetime.timedelta(minutes=reload_ival) + + self.blk_loader = BlockLoader(cookie_maker=cookie_maker) + + def load_loc(self): + # check modified time of current file before loading + new_mtime = os.path.getmtime(self.loc_filename) + if (new_mtime == self.loc_mtime): + return + + # update loc file mtime + self.loc_mtime = new_mtime + + logging.debug('Loading loc from: ' + self.loc_filename) + with open(self.loc_filename) as fh: + for line in fh: + parts = line.rstrip().split('\t') + self.loc_map[parts[0]] = parts[1:] + + @staticmethod + def reload_timed(timestamp, val, delta, func): + now = datetime.datetime.now() + if now - timestamp >= delta: + func() + return now + return None + + def reload_loc(self): + reload_time = self.reload_timed(self.loc_update_time, + self.loc_map, + self.reload_interval, + self.load_loc) + + if reload_time: + self.loc_update_time = reload_time + + def lookup_loc(self, part): + return self.loc_map[part] + + def 
load_cdx(self, params): + self.reload_loc() + + reader = SeekableTextFileReader(self.summary) + + idx_iter = iter_range(reader, + params['key'], + params['end_key'], + prev_size=1) + + if params.get('showPagedIndex'): + params['proxyAll'] = True + return idx_iter + else: + blocks = self.idx_to_cdx(idx_iter, params) + + def gen_cdx(): + for blk in blocks: + for cdx in blk: + yield cdx + + return gen_cdx() + + def idx_to_cdx(self, idx_iter, params): + blocks = None + ranges = [] + + for idx in idx_iter: + idx = IDXObject(idx) + + if (blocks and blocks.part == idx['part'] and + blocks.offset + blocks.length == idx['offset'] and + blocks.count < self.max_blocks): + + blocks.length += idx['length'] + blocks.count += 1 + ranges.append(idx['length']) + + else: + if blocks: + yield self.block_to_cdx_iter(blocks, ranges, params) + + blocks = ZipBlocks(idx['part'], + idx['offset'], + idx['length'], + 1) + + ranges = [blocks.length] + + if blocks: + yield self.block_to_cdx_iter(blocks, ranges, params) + + def block_to_cdx_iter(self, blocks, ranges, params): + last_exc = None + last_traceback = None + + for location in self.lookup_loc(blocks.part): + try: + return self.load_blocks(location, blocks, ranges, params) + except Exception as exc: + last_exc = exc + import sys + last_traceback = sys.exc_info()[2] + + if last_exc: + raise exc, None, last_traceback + else: + raise Exception('No Locations Found for: ' + block.part) + + def load_blocks(self, location, blocks, ranges, params): + + if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG): + msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' + logging.debug(msg.format(b=blocks, loc=location)) + + reader = self.blk_loader.load(location, blocks.offset, blocks.length) + + def decompress_block(range_): + decomp = gzip_decompressor() + buff = decomp.decompress(reader.read(range_)) + return readline_to_iter(StringIO(buff)) + + iter_ = itertools.chain(*itertools.imap(decompress_block, ranges)) + + # start bound 
+ iter_ = linearsearch(iter_, params['key']) + + # end bound + end = params['end_key'] + iter_ = itertools.takewhile(lambda line: line < end, iter_) + return iter_ diff --git a/pywb/handlers.py b/pywb/handlers.py index 4be855e3..c82db7fe 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -10,19 +10,28 @@ from wbexceptions import WbException, NotFoundException from views import TextCapturesView -class BaseHandler: - @staticmethod - def get_wburl_type(): - return WbUrl - +#================================================================= +class BaseHandler(object): def __call__(self, wbrequest): return wbrequest + def get_wburl_type(self): + return None + + +#================================================================= +class WbUrlHandler(BaseHandler): + def get_wburl_type(self): + return WbUrl + + #================================================================= # Standard WB Handler #================================================================= -class WBHandler(BaseHandler): - def __init__(self, index_reader, replay, html_view = None, search_view = None): +class WBHandler(WbUrlHandler): + def __init__(self, index_reader, replay, + html_view=None, search_view=None): + self.index_reader = index_reader self.replay = replay @@ -31,7 +40,6 @@ class WBHandler(BaseHandler): self.html_view = html_view self.search_view = search_view - def __call__(self, wbrequest): if wbrequest.wb_url_str == '/': return self.render_search_page(wbrequest) @@ -61,6 +69,7 @@ class WBHandler(BaseHandler): def __str__(self): return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) + #================================================================= # CDX-Server Handler -- pass all params to cdx server #================================================================= @@ -75,11 +84,6 @@ class CDXHandler(BaseHandler): return self.view.render_response(wbrequest, cdx_lines) - - @staticmethod - def get_wburl_type(): - return None - def __str__(self): return 'Index 
Reader: ' + str(self.index_reader) @@ -115,10 +119,6 @@ class StaticHandler(BaseHandler): except IOError: raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str) - @staticmethod - def get_wburl_type(): - return None - def __str__(self): return 'Static files from ' + self.static_path @@ -130,6 +130,7 @@ class DebugEchoEnvHandler(BaseHandler): def __call__(self, wbrequest): return WbResponse.text_response(str(wbrequest.env)) + #================================================================= class DebugEchoHandler(BaseHandler): def __call__(self, wbrequest): @@ -150,5 +151,3 @@ class PerfTimer: self.end = time.clock() if self.perfdict is not None: self.perfdict[self.name] = str(self.end - self.start) - - diff --git a/pywb/indexreader.py b/pywb/indexreader.py index b55de029..cea27a8f 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -37,7 +37,7 @@ class IndexReader(object): def load_cdx(self, **params): return self.cdx_server.load_cdx(**params) - def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10): + def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100): if wburl.type == wburl.URL_QUERY: raise NotImplementedError('Url Query Not Yet Supported') diff --git a/pywb/proxy.py b/pywb/proxy.py index 107f9d96..fc14d1e5 100644 --- a/pywb/proxy.py +++ b/pywb/proxy.py @@ -45,14 +45,14 @@ class ProxyRouter: return None wbrequest = WbRequest(env, - request_uri = url, - wb_url_str = url, - wb_prefix = '', - coll = '', - host_prefix = self.hostpaths[0], - wburl_class = self.handler.get_wburl_type(), - url_rewriter_class = ProxyHttpsUrlRewriter, - is_proxy = True) + request_uri=url, + wb_url_str=url, + #rel_prefix=url, + #host_prefix=self.hostpaths[0], + wburl_class=self.handler.get_wburl_type(), + urlrewriter_class=ProxyHttpsUrlRewriter, + use_abs_prefix=False, + is_proxy=True) return self.handler(wbrequest) diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 
f5f9c504..4c6907eb 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -7,7 +7,6 @@ from wbrequestresponse import WbResponse from wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed - #================================================================= class ReplayView: def __init__(self, content_loader, content_rewriter, head_insert_view = None, @@ -49,6 +48,9 @@ class ReplayView: # check if redir is needed self._redirect_if_needed(wbrequest, cdx) + # one more check for referrer-based self-redirect + self._reject_referrer_self_redirect(wbrequest, status_headers) + response = None if self.content_rewriter and wbrequest.wb_url.mod != 'id_': @@ -148,6 +150,7 @@ class ReplayView: def _reject_self_redirect(self, wbrequest, cdx, status_headers): + # self-redirect via location if status_headers.statusline.startswith('3'): request_url = wbrequest.wb_url.url.lower() location_url = status_headers.get_header('Location').lower() @@ -156,3 +159,16 @@ class ReplayView: if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): raise CaptureException('Self Redirect: ' + str(cdx)) + def _reject_referrer_self_redirect(self, wbrequest, status_headers): + # at correct timestamp now, but must check for referrer redirect + # indirect self-redirect, via meta-refresh, if referrer is same as current url + if status_headers.statusline.startswith('2'): + # build full url even if using relative-rewriting + request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url) + referrer_url = wbrequest.referrer + if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)): + raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) + + + + diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 9f904764..81cd23c9 100644 --- a/pywb/rewrite/rewrite_content.py +++ 
b/pywb/rewrite/rewrite_content.py @@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader +from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader class RewriteContent: @@ -54,7 +54,7 @@ class RewriteContent: # ========================================================================= # special case -- need to ungzip the body if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')): - stream = BufferedReader(stream, decomp_type='gzip') + stream = DecompressingBufferedReader(stream, decomp_type='gzip') if rewritten_headers.charset: encoding = rewritten_headers.charset diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 691bec6d..6d66ce60 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -24,9 +24,9 @@ def test_example_2(): -def test_example_3(): - status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) +#def test_example_3(): +# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) - assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff +# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index c4cc4054..6889fc92 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -103,10 +103,12 @@ class UrlRewriter: return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url) - def set_base_url(self, newUrl): self.wburl.url = newUrl + def __repr__(self): + return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix) + @staticmethod def strip_protocol(url): for protocol in UrlRewriter.PROTOCOLS: diff 
--git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 77bd437d..6be56b6c 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -1,9 +1,5 @@ #!/usr/bin/python -import re -import rfc3987 - -# WbUrl : wb archival url representation for WB """ WbUrl represents the standard wayback archival url format. A regular url is a subset of the WbUrl (latest replay). @@ -34,9 +30,38 @@ replay form: latest_replay: (no timestamp) http://example.com + +Additionally, the BaseWbUrl provides the base components +(url, timestamp, end_timestamp, modifier, type) which +can be used to provide a custom representation of the +wayback url format. + """ -class WbUrl: +import re +import rfc3987 + + +#================================================================= +class BaseWbUrl(object): + QUERY = 'query' + URL_QUERY = 'url_query' + REPLAY = 'replay' + LATEST_REPLAY = 'latest_replay' + + + def __init__(self, url='', mod='', + timestamp='', end_timestamp='', type=None): + + self.url = url + self.timestamp = timestamp + self.end_timestamp = end_timestamp + self.mod = mod + self.type = type + + +#================================================================= +class WbUrl(BaseWbUrl): """ # Replay Urls # ====================== @@ -107,22 +132,14 @@ class WbUrl: QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$') REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') - QUERY = 'query' - URL_QUERY = 'url_query' - REPLAY = 'replay' - LATEST_REPLAY = 'latest_replay' - DEFAULT_SCHEME = 'http://' # ====================== def __init__(self, url): + super(WbUrl, self).__init__() + self.original_url = url - self.type = None - self.url = '' - self.timestamp = '' - self.end_timestamp = '' - self.mod = '' if not any (f(url) for f in [self._init_query, self._init_replay]): raise Exception('Invalid WbUrl: ', url) diff --git a/tests/test_archivalrouter.py b/pywb/test/test_archivalrouter.py similarity index 73% rename from tests/test_archivalrouter.py rename to 
pywb/test/test_archivalrouter.py index 415626e6..4379fbfd 100644 --- a/tests/test_archivalrouter.py +++ b/pywb/test/test_archivalrouter.py @@ -1,13 +1,19 @@ """ -Test Route -# route with relative path ->>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False) -{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'} +# Test WbRequest parsed via a Route +# route with relative path, print resulting wbrequest +>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)) +{'coll': 'web', + 'request_uri': '/web/test.example.com', + 'wb_prefix': '/web/', + 'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')} -# route with absolute path, running at script /my_pywb ->>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True) -{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'} +# route with absolute path, running at script /my_pywb, print resultingwbrequest +>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)) +{'coll': 'web', + 'request_uri': '/web/2013im_/test.example.com', + 'wb_prefix': 'https://localhost:8081/my_pywb/web/', + 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')} # not matching route -- skipped >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) @@ -65,7 +71,12 @@ False """ from pywb.archivalrouter 
import Route, ReferRedirect -from pywb.handlers import BaseHandler +from pywb.handlers import BaseHandler, WbUrlHandler +import pprint + +def print_req(req): + varlist = vars(req) + pprint.pprint({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')}) def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None): @@ -74,7 +85,7 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col if http_host: env['HTTP_HOST'] = http_host - routes = [Route(coll, BaseHandler())] + routes = [Route(coll, WbUrlHandler())] redir = ReferRedirect(match_host) #req = WbRequest.from_uri(request_uri, env) @@ -85,4 +96,6 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col return rep.status_headers.get_header('Location') - +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/test/test_wbrequestresponse.py b/pywb/test/test_wbrequestresponse.py new file mode 100644 index 00000000..600ec926 --- /dev/null +++ b/pywb/test/test_wbrequestresponse.py @@ -0,0 +1,87 @@ +""" +# WbRequest Tests +# ================= +>>> print_req_from_uri('/save/_embed/example.com/?a=b') +{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'} + +>>> print_req_from_uri('/2345/20101024101112im_/example.com/?b=c') +{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'} + +>>> print_req_from_uri('/2010/example.com') +{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} + +>>> print_req_from_uri('../example.com') +{'wb_url': ('latest_replay', '', '', 'http://example.com', 
'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'} + +# Abs path +>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) +{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'} + +# No Scheme, so stick to relative +>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) +{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} + + + +# WbResponse Tests +# ================= +>>> WbResponse.text_response('Test') +{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])} + +>>> WbResponse.text_stream(['Test', 'Another'], '404') +{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} + +>>> WbResponse.redir_response('http://example.com/otherfile') +{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])} + +""" + + +from pywb.rewrite.wburl import WbUrl +from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.utils.statusandheaders import StatusAndHeaders + +from pywb.wbrequestresponse import WbRequest, WbResponse + + +def print_req_from_uri(request_uri, env={}, use_abs_prefix=False): + response = req_from_uri(request_uri, env, use_abs_prefix) + varlist = vars(response) + print str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')}) + + +def req_from_uri(request_uri, env={}, use_abs_prefix=False): + if not request_uri: + request_uri = env.get('REL_REQUEST_URI') + + parts = request_uri.split('/', 2) + + # Has coll 
prefix + if len(parts) == 3: + rel_prefix = '/' + parts[1] + '/' + wb_url_str = parts[2] + coll = parts[1] + # No Coll Prefix + elif len(parts) == 2: + rel_prefix = '/' + wb_url_str = parts[1] + coll = '' + else: + rel_prefix = '/' + wb_url_str = parts[0] + coll = '' + + return WbRequest(env, + request_uri=request_uri, + rel_prefix=rel_prefix, + wb_url_str=wb_url_str, + coll=coll, + wburl_class=WbUrl, + urlrewriter_class=UrlRewriter, + use_abs_prefix=use_abs_prefix) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + diff --git a/pywb/utils/binsearch.py b/pywb/utils/binsearch.py index 96b2e9de..7d939c18 100644 --- a/pywb/utils/binsearch.py +++ b/pywb/utils/binsearch.py @@ -35,6 +35,58 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192): return min_ * block_size +#================================================================= +def binsearch(reader, key, compare_func=cmp, block_size=8192): + """ + Perform a binary search for a specified key to within a 'block_size' + (default 8192) granularity, and return first full line found. + """ + + min_ = binsearch_offset(reader, key, compare_func, block_size) + + reader.seek(min_) + + if min_ > 0: + reader.readline() # skip partial line + + def gen_iter(line): + while line: + yield line.rstrip() + line = reader.readline() + + return gen_iter(reader.readline()) + + +#================================================================= +def linearsearch(iter_, key, prev_size=0, compare_func=cmp): + """ + Perform a linear search over iterator until + current_line >= key + + optionally also tracking upto N previous lines, which are + returned before the first matched line. 
+ + if end of stream is reached before a match is found, + nothing is returned (prev lines discarded also) + """ + + prev_deque = deque(maxlen=prev_size + 1) + + matched = False + + for line in iter_: + prev_deque.append(line) + if compare_func(line, key) >= 0: + matched = True + break + + # no matches, so return empty iterator + if not matched: + return [] + + return itertools.chain(prev_deque, iter_) + + #================================================================= def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192): """ @@ -45,46 +97,27 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192): When performin_g linear search, keep track of up to N previous lines before first matching line. """ - min_ = binsearch_offset(reader, key, compare_func, block_size) + iter_ = binsearch(reader, key, compare_func, block_size) + iter_ = linearsearch(iter_, + key, prev_size=prev_size, + compare_func=compare_func) + return iter_ - reader.seek(min_) - if min_ > 0: - reader.readline() # skip partial line +#================================================================= +def iter_range(reader, start, end, prev_size=0): + """ + Creates an iterator which iterates over lines where + start <= line < end (end exclusive) + """ - if prev_size > 1: - prev_deque = deque(max_len=prev_size) + iter_ = search(reader, start, prev_size=prev_size) - line = None + end_iter = itertools.takewhile( + lambda line: line < end, + iter_) - while True: - line = reader.readline() - if not line: - break - if compare_func(line, key) >= 0: - break - - if prev_size == 1: - prev = line - elif prev_size > 1: - prev_deque.append(line) - - def gen_iter(line): - """ - Create iterator over any previous lines to - current matched line - """ - if prev_size == 1: - yield prev.rstrip() - elif prev_size > 1: - for i in prev_deque: - yield i.rstrip() - - while line: - yield line.rstrip() - line = reader.readline() - - return gen_iter(line) + return end_iter 
#================================================================= diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index 27a3ed33..6be38b85 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -11,7 +11,7 @@ def gzip_decompressor(): #================================================================= -class BufferedReader(object): +class DecompressingBufferedReader(object): """ A wrapping line reader which wraps an existing reader. Read operations operate on underlying buffer, which is filled to @@ -29,7 +29,7 @@ class BufferedReader(object): DECOMPRESSORS = {'gzip': gzip_decompressor} - def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None): + def __init__(self, stream, block_size=1024, decomp_type=None): self.stream = stream self.block_size = block_size @@ -44,24 +44,19 @@ class BufferedReader(object): self.buff = None self.num_read = 0 - self.max_len = max_len def _fillbuff(self, block_size=None): if not block_size: block_size = self.block_size if not self.buff or self.buff.pos >= self.buff.len: - if self.max_len > 0: - to_read = min(self.max_len - self.num_read, self.block_size) - else: - to_read = self.block_size - - data = self.stream.read(to_read) + data = self.stream.read(block_size) self._process_read(data) def _process_read(self, data): data = self._decompress(data) - self.num_read += len(data) + self.buff_size = len(data) + self.num_read += self.buff_size self.buff = StringIO.StringIO(data) def _decompress(self, data): @@ -78,12 +73,40 @@ class BufferedReader(object): return data def read(self, length=None): + """ + Fill bytes and read some number of bytes + (up to length if specified) + < length bytes may be read if reached the end of input + or at a buffer boundary. If at a boundary, the subsequent + call will fill buffer anew. 
+ """ self._fillbuff() return self.buff.read(length) def readline(self, length=None): + """ + Fill buffer and read a full line from the buffer + (up to specified length, if provided) + If no newline found at end, try filling buffer again in case + at buffer boundary. + """ self._fillbuff() - return self.buff.readline(length) + linebuff = self.buff.readline(length) + # we may be at a boundary + while not linebuff.endswith('\n'): + if length: + length -= len(linebuff) + if length <= 0: + break + + self._fillbuff() + + if self.buff_size == 0: + break + + linebuff += self.buff.readline(length) + + return linebuff def close(self): if self.stream: @@ -97,7 +120,7 @@ class ChunkedDataException(Exception): #================================================================= -class ChunkedDataReader(BufferedReader): +class ChunkedDataReader(DecompressingBufferedReader): r""" A ChunkedDataReader is a BufferedReader which also supports de-chunking of the data if it happens to be http 'chunk-encoded'. @@ -133,7 +156,7 @@ class ChunkedDataReader(BufferedReader): def _fillbuff(self, block_size=None): if self.not_chunked: - return BufferedReader._fillbuff(self, block_size) + return super(ChunkedDataReader, self)._fillbuff(block_size) if self.all_chunks_read: return diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 4d458738..a117f539 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -9,18 +9,50 @@ import urllib2 import time +def is_http(filename): + return any(filename.startswith(x) for x in ['http://', 'https://']) + + #================================================================= -# load a reader from http -#================================================================= -class HttpLoader(object): +class BlockLoader(object): """ - Load a file-like reader over http using range requests - and an optional cookie created via a cookie_maker + a loader which can stream blocks of content + given a uri, offset and optional length. 
+ Currently supports: http/https and file/local file system """ def __init__(self, cookie_maker=None): self.cookie_maker = cookie_maker def load(self, url, offset, length): + """ + Determine loading method based on uri + """ + if is_http(url): + return self.load_http(url, offset, length) + else: + return self.load_file(url, offset, length) + + def load_file(self, url, offset, length): + """ + Load a file-like reader from the local file system + """ + + if url.startswith('file://'): + url = url[len('file://'):] + + afile = open(url, 'rb') + afile.seek(offset) + + if length > 0: + return LimitReader(afile, length) + else: + return afile + + def load_http(self, url, offset, length): + """ + Load a file-like reader over http using range requests + and an optional cookie created via a cookie_maker + """ if length > 0: range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1) else: @@ -71,25 +103,6 @@ class HMACCookieMaker(object): return cookie -#================================================================= -# load a reader from local filesystem -#================================================================= -class FileLoader(object): - """ - Load a file-like reader from the local file system - """ - - def load(self, url, offset, length): - if url.startswith('file://'): - url = url[len('file://'):] - - afile = open(url, 'rb') - afile.seek(offset) - - if length > 0: - return LimitReader(afile, length) - - #================================================================= # Limit Reader #================================================================= diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 01bb6614..92e897fc 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -65,23 +65,36 @@ class StatusAndHeadersParser(object): """ parse stream for status line and headers return a StatusAndHeaders object + + support continuation headers starting with space or tab """ statusline = 
stream.readline().rstrip() protocol_status = self.split_prefix(statusline, self.statuslist) if not protocol_status: - msg = 'Expected Status Line - Found: ' + statusline + msg = 'Expected Status Line starting with {0} - Found: {1}' + msg = msg.format(self.statuslist, statusline) raise StatusAndHeadersParserException(msg, statusline) headers = [] line = stream.readline().rstrip() - while line and line != '\r\n': + while line: name, value = line.split(':', 1) - header = (name, value.strip()) + name = name.rstrip(' \t') + value = value.lstrip() + + next_line = stream.readline().rstrip() + + # append continuation lines, if any + while next_line and next_line.startswith((' ', '\t')): + value += next_line + next_line = stream.readline().rstrip() + + header = (name, value) headers.append(header) - line = stream.readline().rstrip() + line = next_line return StatusAndHeaders(statusline=protocol_status[1].strip(), headers=headers, @@ -107,4 +120,3 @@ class StatusAndHeadersParserException(Exception): def __init__(self, msg, statusline): super(StatusAndHeadersParserException, self).__init__(msg) self.statusline = statusline - diff --git a/pywb/utils/test/binsearch_test.py b/pywb/utils/test/binsearch_test.py index d35551ec..40ea1f58 100644 --- a/pywb/utils/test/binsearch_test.py +++ b/pywb/utils/test/binsearch_test.py @@ -9,6 +9,7 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz +# Exact Search >>> print_binsearch_results('org,iana)/domains/root', iter_exact) org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz @@ -19,18 +20,45 @@ 
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3G org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz -# Exact Search +>>> print_binsearch_results('org,iana)/time-zones', iter_exact) +org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz + +# Exact search -- no matches >>> print_binsearch_results('org,iaana)/', iter_exact) >>> print_binsearch_results('org,ibna)/', iter_exact) ->>> print_binsearch_results('org,iana)/time-zones', iter_exact) -org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz + +# Range Search (end exclusive) +>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range) +org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz +org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz +org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz +org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz +org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz + + +# Range Search -- exact +>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range) 
+org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz + +# Range Search -- exact + 1 prev +>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=1) +org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz +org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz + +# Range Search -- exact + 2 prev +>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=2) +org,iana)/_js/2013.1/jquery.js 20140126201248 http://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 544 765491 iana.warc.gz +org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz +org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz + + """ #================================================================= import os -from pywb.utils.binsearch import iter_prefix, iter_exact +from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range from pywb.utils.loaders import SeekableTextFileReader from pywb import get_test_dir @@ -45,6 +73,13 @@ def print_binsearch_results(key, iter_func): print line +def print_binsearch_results_range(key, end_key, iter_func, prev_size=0): + cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') + + for line in iter_func(cdx, key, end_key, prev_size=prev_size): + print line + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/utils/test/loaders_test.py b/pywb/utils/test/loaders_test.py index 73d4b3dd..7dc42d83 100644 --- a/pywb/utils/test/loaders_test.py +++ 
b/pywb/utils/test/loaders_test.py @@ -10,9 +10,9 @@ >>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) 'efghji' -# FileLoader Tests (includes LimitReader) +# BlockLoader Tests (includes LimitReader) # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes ->>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400')) +>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400')) 100 # SeekableTextFileReader Test @@ -23,25 +23,39 @@ >>> seek_read_full(sr, 100) 'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n' -#BufferedReader readline() ->>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() +# Buffered Reader Tests +#================================================================= + +#DecompressingBufferedReader readline() +>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() ' CDX N b a m s k r M S V g\\n' -#BufferedReader readline() with decompression ->>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() +#DecompressingBufferedReader readline() with decompression +>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() ' CDX N b a m s k r M S V g\\n' ->>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() +>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() 'Example Domain' + +# test very small block size +>>> dbr = DecompressingBufferedReader(StringIO.StringIO('ABCDEFG\\nHIJKLMN\\nOPQR\\nXYZ'), block_size = 3) +>>> dbr.readline(); dbr.readline(4); dbr.readline(); dbr.readline(); dbr.readline(2); dbr.readline(); dbr.readline() +'ABCDEFG\\n' +'HIJK' +'LMN\\n' +'OPQR\\n' +'XY' +'Z' +'' """ 
#================================================================= import os import StringIO -from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker +from pywb.utils.loaders import BlockLoader, HMACCookieMaker from pywb.utils.loaders import LimitReader, SeekableTextFileReader -from pywb.utils.bufferedreaders import BufferedReader +from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb import get_test_dir #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' diff --git a/pywb/utils/test/statusandheaders_test.py b/pywb/utils/test/statusandheaders_test.py new file mode 100644 index 00000000..3473e71e --- /dev/null +++ b/pywb/utils/test/statusandheaders_test.py @@ -0,0 +1,29 @@ +""" +>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1)) +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), + ('Some', 'Value'), + ('Multi-Line', 'Value1 Also This')]) + +>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1)) +Traceback (most recent call last): +StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK +""" + + +from pywb.utils.statusandheaders import StatusAndHeadersParser +import StringIO + + +status_headers_1 = "\ +HTTP/1.0 200 OK\r\n\ +Content-Type: ABC\r\n\ +Some: Value\r\n\ +Multi-Line: Value1\r\n\ + Also This\r\n\ +\r\n\ +Body" + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index 62929d50..7af3401f 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]') TIMESTAMP_14 = '%Y%m%d%H%M%S' -PAD_STAMP_END = '29991231235959' +#PAD_STAMP_END = '29991231235959' +PAD_6 = '299912' def iso_date_to_datetime(string): @@ -58,41 +59,145 @@ def iso_date_to_timestamp(string): return 
datetime_to_timestamp(iso_date_to_datetime(string)) -# default pad is end of range for compatibility -def pad_timestamp(string, pad_str=PAD_STAMP_END): +# pad to certain length (default 6) +def _pad_timestamp(string, pad_str=PAD_6): """ - >>> pad_timestamp('20') - '20991231235959' + >>> _pad_timestamp('20') + '209912' - >>> pad_timestamp('2014') - '20141231235959' + >>> _pad_timestamp('2014') + '201412' - >>> pad_timestamp('20141011') - '20141011235959' + >>> _pad_timestamp('20141011') + '20141011' - >>> pad_timestamp('201410110010') - '20141011001059' + >>> _pad_timestamp('201410110010') + '201410110010' """ str_len = len(string) pad_len = len(pad_str) - return string if str_len >= pad_len else string + pad_str[str_len:] + if str_len < pad_len: + string = string + pad_str[str_len:] + + return string def timestamp_to_datetime(string): """ - >>> timestamp_to_datetime('20131226095010') - time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \ -tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1) + # >14-digit -- rest ignored + >>> timestamp_to_datetime('2014122609501011') + datetime.datetime(2014, 12, 26, 9, 50, 10) + # 14-digit + >>> timestamp_to_datetime('20141226095010') + datetime.datetime(2014, 12, 26, 9, 50, 10) + + # 13-digit padding + >>> timestamp_to_datetime('2014122609501') + datetime.datetime(2014, 12, 26, 9, 50, 59) + + # 12-digit padding + >>> timestamp_to_datetime('201412260950') + datetime.datetime(2014, 12, 26, 9, 50, 59) + + # 11-digit padding + >>> timestamp_to_datetime('20141226095') + datetime.datetime(2014, 12, 26, 9, 59, 59) + + # 10-digit padding + >>> timestamp_to_datetime('2014122609') + datetime.datetime(2014, 12, 26, 9, 59, 59) + + # 9-digit padding + >>> timestamp_to_datetime('201412260') + datetime.datetime(2014, 12, 26, 23, 59, 59) + + # 8-digit padding + >>> timestamp_to_datetime('20141226') + datetime.datetime(2014, 12, 26, 23, 59, 59) + + # 7-digit padding + >>> timestamp_to_datetime('2014122') + 
datetime.datetime(2014, 12, 31, 23, 59, 59) + + # 6-digit padding + >>> timestamp_to_datetime('201410') + datetime.datetime(2014, 10, 31, 23, 59, 59) + + # 5-digit padding + >>> timestamp_to_datetime('20141') + datetime.datetime(2014, 12, 31, 23, 59, 59) + + # 4-digit padding >>> timestamp_to_datetime('2014') - time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \ -tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1) + datetime.datetime(2014, 12, 31, 23, 59, 59) + + # 3-digit padding + >>> timestamp_to_datetime('201') + datetime.datetime(2019, 12, 31, 23, 59, 59) + + # 2-digit padding + >>> timestamp_to_datetime('20') + datetime.datetime(2099, 12, 31, 23, 59, 59) + + # 1-digit padding + >>> timestamp_to_datetime('2') + datetime.datetime(2999, 12, 31, 23, 59, 59) + + # 1-digit out-of-range padding + >>> timestamp_to_datetime('3') + datetime.datetime(2999, 12, 31, 23, 59, 59) + + # 0-digit padding + >>> timestamp_to_datetime('') + datetime.datetime(2999, 12, 31, 23, 59, 59) + + # bad month + >>> timestamp_to_datetime('20131709005601') + datetime.datetime(2013, 12, 9, 0, 56, 1) + + # all out of range except minutes + >>> timestamp_to_datetime('40001965252477') + datetime.datetime(2999, 12, 31, 23, 24, 59) + """ - # Default pad to end of range for comptability - return time.strptime(pad_timestamp(string), TIMESTAMP_14) + # pad to 6 digits + string = _pad_timestamp(string, PAD_6) + + + def clamp(val, min_, max_): + try: + val = int(val) + val = max(min_, min(val, max_)) + return val + except: + return max_ + + def extract(string, start, end, min_, max_): + if len(string) >= end: + return clamp(string[start:end], min_, max_) + else: + return max_ + + # now parse, clamp to boundary + year = extract(string, 0, 4, 1900, 2999) + month = extract(string, 4, 6, 1, 12) + day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1]) + hour = extract(string, 8, 10, 0, 23) + minute = extract(string, 10, 12, 0, 59) + second = extract(string, 12, 14, 0, 59) 
+ + return datetime.datetime(year=year, + month=month, + day=day, + hour=hour, + minute=minute, + second=second) + + #return time.strptime(pad_timestamp(string), TIMESTAMP_14) def timestamp_to_sec(string): @@ -104,7 +209,7 @@ def timestamp_to_sec(string): 1420070399 """ - return calendar.timegm(timestamp_to_datetime(string)) + return calendar.timegm(timestamp_to_datetime(string).utctimetuple()) if __name__ == "__main__": diff --git a/pywb/views.py b/pywb/views.py index f693d1e6..67f928d6 100644 --- a/pywb/views.py +++ b/pywb/views.py @@ -56,9 +56,9 @@ class J2TemplateView: # Filters @staticmethod - def format_ts(value, format='%a, %b %d %Y %H:%M:%S'): + def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): value = timeutils.timestamp_to_datetime(value) - return time.strftime(format, value) + return value.strftime(format_) @staticmethod def get_host(url): diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 05973f6b..446e0da3 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -6,8 +6,8 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParserException -from pywb.utils.loaders import FileLoader, HttpLoader -from pywb.utils.bufferedreaders import BufferedReader +from pywb.utils.loaders import BlockLoader +from pywb.utils.bufferedreaders import DecompressingBufferedReader #================================================================= ArcWarcRecord = collections.namedtuple('ArchiveRecord', @@ -32,24 +32,12 @@ class ArcWarcRecordLoader: ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"] - @staticmethod - def create_default_loaders(cookie_maker=None): - http = HttpLoader(cookie_maker) - file = FileLoader() - return { - 'http': http, - 'https': http, - 'file': file, - '': file - } + def __init__(self, loader=None, cookie_maker=None, block_size=8192): + if not loader: + 
loader = BlockLoader(cookie_maker) - def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192): - self.loaders = loaders - - if not self.loaders: - self.loaders = self.create_default_loaders(cookie_maker) - - self.chunk_size = chunk_size + self.loader = loader + self.block_size = block_size self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) @@ -60,22 +48,25 @@ class ArcWarcRecordLoader: def load(self, url, offset, length): url_parts = urlparse.urlsplit(url) - loader = self.loaders.get(url_parts.scheme) - if not loader: - raise ArchiveLoadFailed('Unknown Protocol', url) + #loader = self.loaders.get(url_parts.scheme) + #if not loader: + # raise ArchiveLoadFailed('Unknown Protocol', url) try: length = int(length) except: length = -1 - raw = loader.load(url, long(offset), length) + raw = self.loader.load(url, long(offset), length) decomp_type = 'gzip' - stream = BufferedReader(raw, length, self.chunk_size, decomp_type) + # Create decompressing stream + stream = DecompressingBufferedReader(stream = raw, + decomp_type = decomp_type, + block_size = self.block_size) - (the_format, rec_headers) = self._load_headers(stream) + (the_format, rec_headers) = self._detect_type_load_headers(stream) if the_format == 'arc': rec_type = 'response' @@ -111,7 +102,7 @@ class ArcWarcRecordLoader: return ArcWarcRecord((the_format, rec_type), rec_headers, stream, status_headers) - def _load_headers(self, stream): + def _detect_type_load_headers(self, stream): """ Try parsing record as WARC, then try parsing as ARC. if neither one succeeds, we're out of luck. 
diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index 47176e3e..02ab54cb 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -213,3 +213,6 @@ def load_from_cdx_test(cdx): except Exception as e: print 'Exception: ' + e.__class__.__name__ +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index e2715177..4a459c4b 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -1,99 +1,75 @@ -from pywb.rewrite.wburl import WbUrl -from pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders - import pprint -#WB Request and Response + +#================================================================= class WbRequest: """ - >>> WbRequest.from_uri('/save/_embed/example.com/?a=b') - {'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'} + Represents the main pywb request object. - >>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c') - {'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'} + Contains various info from wsgi env, add additional info + about the request, such as coll, relative prefix, + host prefix, absolute prefix. 
- >>> WbRequest.from_uri('/2010/example.com') - {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} - - >>> WbRequest.from_uri('../example.com') - {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'} - - # Abs path - >>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) - {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'} - - # No Scheme, so stick to relative - >>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) - {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} + If a wburl and url rewriter classes are specified, the class + also contains the url rewriter. 
""" - - @staticmethod - def from_uri(request_uri, env = {}, use_abs_prefix = False): - if not request_uri: - request_uri = env.get('REL_REQUEST_URI') - - parts = request_uri.split('/', 2) - - # Has coll prefix - if len(parts) == 3: - wb_prefix = '/' + parts[1] + '/' - wb_url_str = parts[2] - coll = parts[1] - # No Coll Prefix - elif len(parts) == 2: - wb_prefix = '/' - wb_url_str = parts[1] - coll = '' - else: - wb_prefix = '/' - wb_url_str = parts[0] - coll = '' - - host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '' - - return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix) - - @staticmethod def make_host_prefix(env): try: - return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + host = env.get('HTTP_HOST') + if not host: + host = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] + + return env['wsgi.url_scheme'] + '://' + host except KeyError: return '' - def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, - host_prefix = '', - wburl_class = WbUrl, - url_rewriter_class = UrlRewriter, - is_proxy = False): + def __init__(self, env, + request_uri=None, + rel_prefix='', + wb_url_str='/', + coll='', + host_prefix='', + use_abs_prefix=False, + wburl_class=None, + urlrewriter_class=None, + is_proxy=False): self.env = env self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') - self.host_prefix = host_prefix + self.coll = coll + + if not host_prefix: + host_prefix = self.make_host_prefix(env) + + self.host_prefix = host_prefix + self.rel_prefix = rel_prefix + + if use_abs_prefix: + self.wb_prefix = host_prefix + rel_prefix + else: + self.wb_prefix = rel_prefix - self.wb_prefix = host_prefix + wb_prefix if not wb_url_str: wb_url_str = '/' + self.wb_url_str = wb_url_str + # wb_url present and not root page if wb_url_str != '/' and wburl_class: - self.wb_url_str = wb_url_str self.wb_url = wburl_class(wb_url_str) - self.urlrewriter = url_rewriter_class(self.wb_url, self.wb_prefix) + 
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix) else: # no wb_url, just store blank wb_url - self.wb_url_str = wb_url_str self.wb_url = None self.urlrewriter = None - self.coll = coll - self.referrer = env.get('HTTP_REFERER') self.is_ajax = self._is_ajax() @@ -122,24 +98,19 @@ class WbRequest: def __repr__(self): - #return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')" - #return str(vars(self)) varlist = vars(self) - return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')}) + varstr = pprint.pformat(varlist) + return varstr +#================================================================= class WbResponse: """ - >>> WbResponse.text_response('Test') - {'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])} + Represents a pywb wsgi response object. - >>> WbResponse.text_stream(['Test', 'Another'], '404') - {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} - - >>> WbResponse.redir_response('http://example.com/otherfile') - {'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])} + Holds a status_headers object and a response iter, to be + returned to wsgi container. 
""" - def __init__(self, status_headers, value = []): self.status_headers = status_headers self.body = value @@ -180,8 +151,3 @@ class WbResponse: def __repr__(self): return str(vars(self)) - -if __name__ == "__main__": - import doctest - doctest.testmod() - diff --git a/tests/test_integration.py b/tests/test_integration.py index ec7fd6bd..1a7a943c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -75,6 +75,11 @@ class TestWb: assert 'wb.js' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body + def test_replay_content_length_1(self): + # test larger file, rewritten file (svg!) + resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg') + assert resp.headers['Content-Length'] == str(len(resp.body)) + def test_redirect_1(self): resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/') @@ -119,6 +124,20 @@ class TestWb: assert resp.content_type == 'text/css' + def test_referrer_self_redirect(self): + uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' + host = 'somehost:8082' + referrer = 'http://' + host + uri + + # capture is normally a 200 + resp = self.testapp.get(uri) + assert resp.status_int == 200 + + # redirect causes skip of this capture, redirect to next + resp = self.testapp.get(uri, headers = [('Referer', referrer), ('Host', host)], status = 302) + assert resp.status_int == 302 + + def test_excluded_content(self): resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403) assert resp.status_int == 403