1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge branch 'master' into cdx-server

This commit is contained in:
Kenji Nagahashi 2014-02-25 23:14:15 +00:00
commit 14f4b4d26e
33 changed files with 1120 additions and 342 deletions

8
.coveragerc Normal file
View File

@ -0,0 +1,8 @@
[run]
omit =
*/test/*
*/tests/*
[report]
exclude_lines =
if __name__ == .__main__.:

View File

@ -4,7 +4,14 @@ python:
# command to install dependencies # command to install dependencies
install: install:
- "python setup.py -q install" - "python setup.py -q install"
- "pip install python-coveralls"
- "pip install pytest-cov"
# command to run tests # command to run tests
#script: nosetests --with-doctest #script: nosetests --with-doctest
#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
script: py.test -v --doctest-module ./tests/*.py ./pywb/ #script: py.test -v --doctest-module ./tests/*.py ./pywb/
script:
py.test --cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/
after_success:
coveralls

View File

@ -3,13 +3,13 @@ import re
from wbrequestresponse import WbRequest, WbResponse from wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
#================================================================= #=================================================================
# ArchivalRouter -- route WB requests in archival mode # ArchivalRouter -- route WB requests in archival mode
#================================================================= #=================================================================
class ArchivalRouter: class ArchivalRouter:
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None): def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
self.routes = routes self.routes = routes
self.fallback = ReferRedirect(hostpaths) self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path self.abs_path = abs_path
@ -69,24 +69,25 @@ class Route:
if not matcher: if not matcher:
return None return None
rel_prefix = matcher.group(0) matched_str = matcher.group(0)
if rel_prefix: if matched_str:
wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/' rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
wb_url_str = request_uri[len(rel_prefix) + 2:] # remove the '/' + rel_prefix part of uri wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
else: else:
wb_prefix = env['SCRIPT_NAME'] + '/' rel_prefix = env['SCRIPT_NAME'] + '/'
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
coll = matcher.group(self.coll_group) coll = matcher.group(self.coll_group)
wbrequest = WbRequest(env, wbrequest = WbRequest(env,
request_uri = request_uri, request_uri=request_uri,
wb_url_str = wb_url_str, wb_url_str=wb_url_str,
wb_prefix = wb_prefix, rel_prefix=rel_prefix,
coll = coll, coll=coll,
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '', use_abs_prefix=use_abs_prefix,
wburl_class = self.handler.get_wburl_type()) wburl_class = self.handler.get_wburl_type(),
urlrewriter_class=UrlRewriter)
# Allow for applying of additional filters # Allow for applying of additional filters

View File

@ -2,6 +2,7 @@
""" """
import surt import surt
import urlparse
from cdxobject import CDXException from cdxobject import CDXException
@ -69,6 +70,109 @@ index.html?a=b?c=)/')
return surt return surt
#=================================================================
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    """
    Canonicalize a url (either with a custom canonicalizer or the
    standard canonicalizer, with or without surt ordering), then
    compute a (start_key, end_key) search range for the given
    match type.

    Supported match types:
    * exact
    * prefix
    * host
    * domain (only available with surt ordering)

    Raises an Exception for an unknown match type, or for
    matchType=domain with non-surt ordering.

    Examples below:

    # surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact')
    ('com,example)/path/file.html', 'com,example)/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
    ('com,example)/path/file.html', 'com,example)/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host')
    ('com,example)/', 'com,example*')

    >>> calc_search_range('http://example.com/path/file.html', 'domain')
    ('com,example)/', 'com,example-')

    special case for tld domain range
    >>> calc_search_range('com', 'domain')
    ('com,', 'com-')

    # non-surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
    ('example.com/path/file.html', 'example.com/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
    ('example.com/path/file.html', 'example.com/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

    # domain range not supported
    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
    Exception: matchType=domain unsupported for non-surt
    """
    def inc_last_char(x):
        # smallest string that sorts after every string prefixed by x
        return x[0:-1] + chr(ord(x[-1]) + 1)

    if not url_canon:
        # make new canon
        url_canon = UrlCanonicalizer(surt_ordered)
    else:
        # ensure surt order matches the custom canonicalizer's setting
        surt_ordered = url_canon.surt_ordered

    start_key = url_canon(url)

    if match_type == 'exact':
        end_key = start_key + '!'

    elif match_type == 'prefix':
        # add trailing slash if url has it
        if url.endswith('/') and not start_key.endswith('/'):
            start_key += '/'

        end_key = inc_last_char(start_key)

    elif match_type == 'host':
        if surt_ordered:
            host = start_key.split(')/')[0]

            start_key = host + ')/'
            end_key = host + '*'
        else:
            host = urlparse.urlsplit(url).netloc

            start_key = host + '/'
            end_key = host + '0'

    elif match_type == 'domain':
        if not surt_ordered:
            raise Exception('matchType=domain unsupported for non-surt')

        host = start_key.split(')/')[0]

        # if tld, use 'com,' as the start_key
        # otherwise, stick with 'com,example)/'
        if ',' not in host:
            start_key = host + ','
        else:
            start_key = host + ')/'

        end_key = host + '-'
    else:
        raise Exception('Invalid match_type: ' + match_type)

    return (start_key, end_key)
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -77,3 +77,34 @@ class CDXObject(OrderedDict):
li = itertools.imap(lambda (n, val): val, self.items()) li = itertools.imap(lambda (n, val): val, self.items())
return ' '.join(li) return ' '.join(li)
#=================================================================
class IDXObject(OrderedDict):
    """
    A single line of a zipnum cluster .idx/.summary index, parsed into
    an ordered mapping of named fields.

    Required fields: urlkey, part, offset, length
    Optional field: lineno
    """

    FORMAT = ['urlkey', 'part', 'offset', 'length', 'lineno']
    NUM_REQ_FIELDS = len(FORMAT) - 1  # lineno is an optional field

    def __init__(self, idxline):
        """
        Parse a tab-delimited idx line.

        :param idxline: one raw line from the .idx/.summary file
        :raises Exception: if fewer than NUM_REQ_FIELDS fields are present
        """
        OrderedDict.__init__(self)

        idxline = idxline.rstrip()
        fields = idxline.split('\t')

        if len(fields) < self.NUM_REQ_FIELDS:
            msg = 'invalid idx format: {0} fields found, {1} required'
            raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))

        # zip() (instead of py2-only itertools.izip) pairs each header
        # with its field; a missing optional lineno is simply skipped
        for header, field in zip(self.FORMAT, fields):
            self[header] = field

        # numeric fields used for seeking into the compressed cluster
        self['offset'] = int(self['offset'])
        self['length'] = int(self['length'])

        lineno = self.get('lineno')
        if lineno:
            self['lineno'] = int(lineno)

        self.idxline = idxline

    def __str__(self):
        # the original raw line (rstripped)
        return self.idxline

View File

@ -1,4 +1,4 @@
from cdxobject import CDXObject, AccessException from cdxobject import CDXObject, IDXObject, AccessException
from pywb.utils.timeutils import timestamp_to_sec from pywb.utils.timeutils import timestamp_to_sec
import bisect import bisect
@ -56,7 +56,7 @@ def cdx_text_out(cdx, fields):
def cdx_load_and_filter(sources, params): def cdx_load_and_filter(sources, params):
cdx_iter = load_cdx_streams(sources, params) cdx_iter = load_cdx_streams(sources, params)
cdx_iter = make_cdx_iter(cdx_iter) cdx_iter = make_obj_iter(cdx_iter, params)
if params.get('proxyAll'): if params.get('proxyAll'):
return cdx_iter return cdx_iter
@ -102,9 +102,15 @@ def load_cdx_streams(sources, params):
#================================================================= #=================================================================
# convert text cdx stream to CDXObject # convert text cdx stream to CDXObject/IDXObject
def make_cdx_iter(text_iter): def make_obj_iter(text_iter, params):
return itertools.imap(lambda line: CDXObject(line), text_iter) # already converted
if params.get('showPagedIndex'):
cls = IDXObject
else:
cls = CDXObject
return itertools.imap(lambda line: cls(line), text_iter)
#================================================================= #=================================================================

View File

@ -1,10 +1,13 @@
from canonicalize import UrlCanonicalizer from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules from cdxdomainspecific import load_domain_specific_cdx_rules
from pywb.utils.loaders import is_http
from itertools import chain from itertools import chain
import logging import logging
import os import os
@ -14,8 +17,23 @@ import urlparse
#================================================================= #=================================================================
class BaseCDXServer(object): class BaseCDXServer(object):
def __init__(self, **kwargs): def __init__(self, **kwargs):
self.url_canon = kwargs.get('url_canon', UrlCanonicalizer()) ds_rules = kwargs.get('ds_rules')
self.fuzzy_query = kwargs.get('fuzzy_query') surt_ordered = kwargs.get('surt_ordered', True)
# load from domain-specific rules
if ds_rules:
self.url_canon, self.fuzzy_query = (
load_domain_specific_cdx_rules(ds_rules, surt_ordered))
# or custom passed in canonicalizer
else:
self.url_canon = kwargs.get('url_canon')
self.fuzzy_query = kwargs.get('fuzzy_query')
# set default canonicalizer if none set thus far
if not self.url_canon:
self.url_canon = UrlCanonicalizer(surt_ordered)
# set perms checker, if any
self.perms_checker = kwargs.get('perms_checker') self.perms_checker = kwargs.get('perms_checker')
def _check_cdx_iter(self, cdx_iter, params): def _check_cdx_iter(self, cdx_iter, params):
@ -66,7 +84,7 @@ class CDXServer(BaseCDXServer):
def __init__(self, paths, **kwargs): def __init__(self, paths, **kwargs):
super(CDXServer, self).__init__(**kwargs) super(CDXServer, self).__init__(**kwargs)
self.sources = create_cdx_sources(paths) self.sources = create_cdx_sources(paths, kwargs.get('config'))
def load_cdx(self, **params): def load_cdx(self, **params):
# if key not set, assume 'url' is set and needs canonicalization # if key not set, assume 'url' is set and needs canonicalization
@ -77,7 +95,14 @@ class CDXServer(BaseCDXServer):
msg = 'A url= param must be specified to query the cdx server' msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg) raise CDXException(msg)
params['key'] = self.url_canon(url) #params['key'] = self.url_canon(url)
match_type = params.get('matchType', 'exact')
key, end_key = calc_search_range(url=url,
match_type=match_type,
url_canon=self.url_canon)
params['key'] = key
params['end_key'] = end_key
cdx_iter = cdx_load(self.sources, params, self.perms_checker) cdx_iter = cdx_load(self.sources, params, self.perms_checker)
@ -124,36 +149,29 @@ def create_cdx_server(config, ds_rules_file=None):
paths = config.get('index_paths') paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True) surt_ordered = config.get('surt_ordered', True)
perms_checker = config.get('perms_checker') perms_checker = config.get('perms_checker')
pass_config = config
else: else:
paths = config paths = config
surt_ordered = True surt_ordered = True
perms_checker = None perms_checker = None
pass_config = None
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if ds_rules_file: if isinstance(paths, str) and is_http(paths):
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
surt_ordered)
else:
canon, fuzzy = None, None
if not canon:
canon = UrlCanonicalizer(surt_ordered)
if (isinstance(paths, str) and
any(paths.startswith(x) for x in ['http://', 'https://'])):
server_cls = RemoteCDXServer server_cls = RemoteCDXServer
else: else:
server_cls = CDXServer server_cls = CDXServer
return server_cls(paths, return server_cls(paths,
url_canon=canon, config=pass_config,
fuzzy_query=fuzzy, surt_ordered=surt_ordered,
ds_rules=ds_rules_file,
perms_checker=perms_checker) perms_checker=perms_checker)
#================================================================= #=================================================================
def create_cdx_sources(paths): def create_cdx_sources(paths, config=None):
sources = [] sources = []
if not isinstance(paths, list): if not isinstance(paths, list):
@ -161,13 +179,13 @@ def create_cdx_sources(paths):
for path in paths: for path in paths:
if isinstance(path, CDXSource): if isinstance(path, CDXSource):
add_cdx_source(sources, path) add_cdx_source(sources, path, config)
elif isinstance(path, str): elif isinstance(path, str):
if os.path.isdir(path): if os.path.isdir(path):
for file in os.listdir(path): for file in os.listdir(path):
add_cdx_source(sources, path + file) add_cdx_source(sources, path + file, config)
else: else:
add_cdx_source(sources, path) add_cdx_source(sources, path, config)
if len(sources) == 0: if len(sources) == 0:
logging.exception('No CDX Sources Found from: ' + str(sources)) logging.exception('No CDX Sources Found from: ' + str(sources))
@ -176,9 +194,9 @@ def create_cdx_sources(paths):
#================================================================= #=================================================================
def add_cdx_source(sources, source): def add_cdx_source(sources, source, config):
if not isinstance(source, CDXSource): if not isinstance(source, CDXSource):
source = create_cdx_source(source) source = create_cdx_source(source, config)
if not source: if not source:
return return
@ -187,19 +205,20 @@ def add_cdx_source(sources, source):
#================================================================= #=================================================================
def create_cdx_source(filename): def create_cdx_source(filename, config):
if filename.startswith('http://') or filename.startswith('https://'): if is_http(filename):
return RemoteCDXSource(filename) return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'): if filename.endswith('.cdx'):
return CDXFile(filename) return CDXFile(filename)
if filename.endswith('.summary'):
return ZipNumCluster(filename, config)
return None return None
#TODO: support zipnum
#elif filename.endswith('.summary')
# return ZipNumCDXSource(filename)
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
#================================================================= #=================================================================

View File

@ -1,9 +1,9 @@
from pywb.utils.binsearch import iter_exact, iter_prefix from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
import urllib import urllib
import urllib2 import urllib2
import itertools
#================================================================= #=================================================================
class CDXSource(object): class CDXSource(object):
@ -24,17 +24,7 @@ class CDXFile(CDXSource):
def load_cdx(self, params): def load_cdx(self, params):
source = SeekableTextFileReader(self.filename) source = SeekableTextFileReader(self.filename)
return iter_range(source, params.get('key'), params.get('end_key'))
match_type = params.get('matchType')
if match_type == 'prefix':
iter_func = iter_prefix
else:
iter_func = iter_exact
key = params.get('key')
return iter_func(source, key)
def __str__(self): def __str__(self):
return 'CDX File - ' + self.filename return 'CDX File - ' + self.filename
@ -90,3 +80,35 @@ class RemoteCDXSource(CDXSource):
def __str__(self): def __str__(self):
return 'Remote CDX Server: ' + self.remote_url return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
    """
    CDX source backed by a redis sorted set.

    Capture lines are stored (without their urlkey) in a zset whose key
    is ``key_prefix + urlkey``; lines are returned with the urlkey
    prepended so they read like ordinary cdx lines.
    """
    DEFAULT_KEY_PREFIX = 'c:'

    def __init__(self, redis_url, config=None):
        # lazy import: redis is only required when this source is used
        import redis
        self.redis = redis.StrictRedis.from_url(redis_url)

        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)

    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list

        Currently, there is no support for range queries
        Only 'exact' matchType is supported
        """
        # ensure only url/surt is part of key
        surt_key = params['key'].split(' ')[0]

        members = self.redis.zrange(self.key_prefix + surt_key, 0, -1)

        # key is not part of the stored list, so prepend it to each line
        prefix = surt_key + ' '
        return (prefix + line for line in members)

View File

@ -132,8 +132,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('filename', 'dupes.warc.gz')] ('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test # NOTE: external dependency -- need self-contained test
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') #>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items()) #>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'), [('urlkey', 'com,example)/'),
('timestamp', '20020120142510'), ('timestamp', '20020120142510'),
('original', 'http://example.com:80/'), ('original', 'http://example.com:80/'),

203
pywb/cdx/zipnum.py Normal file
View File

@ -0,0 +1,203 @@
import os
import collections
import itertools
import logging
from cStringIO import StringIO
import datetime
from cdxsource import CDXSource
from cdxobject import IDXObject
from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch
#=================================================================
class ZipBlocks:
    """A contiguous run of compressed cdx blocks within one cluster part."""
    def __init__(self, part, offset, length, count):
        # record all range fields in one unpacking assignment
        (self.part,
         self.offset,
         self.length,
         self.count) = (part, offset, length, count)
#=================================================================
def readline_to_iter(stream):
    """
    Yield lines from *stream* via readline() until exhausted, closing
    the stream when iteration finishes (or the generator is abandoned).

    (Removed the previous unused ``count`` accumulator.)
    """
    try:
        buff = stream.readline()
        while buff:
            yield buff
            buff = stream.readline()
    finally:
        stream.close()
#=================================================================
class ZipNumCluster(CDXSource):
DEFAULT_RELOAD_INTERVAL = 10 # in minutes
DEFAULT_MAX_BLOCKS = 50
def __init__(self, summary, config=None):
loc = None
cookie_maker = None
self.max_blocks = self.DEFAULT_MAX_BLOCKS
reload_ival = self.DEFAULT_RELOAD_INTERVAL
if config:
loc = config.get('zipnum_loc')
cookie_maker = config.get('cookie_maker')
self.max_blocks = config.get('max_blocks', self.max_blocks)
reload_ival = config.get('reload_interval', reload_ival)
if not loc:
splits = os.path.splitext(summary)
loc = splits[0] + '.loc'
self.summary = summary
self.loc_filename = loc
# initial loc map
self.loc_map = {}
self.loc_mtime = 0
self.load_loc()
# reload interval
self.loc_update_time = datetime.datetime.now()
self.reload_interval = datetime.timedelta(minutes=reload_ival)
self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
def load_loc(self):
# check modified time of current file before loading
new_mtime = os.path.getmtime(self.loc_filename)
if (new_mtime == self.loc_mtime):
return
# update loc file mtime
self.loc_mtime = new_mtime
logging.debug('Loading loc from: ' + self.loc_filename)
with open(self.loc_filename) as fh:
for line in fh:
parts = line.rstrip().split('\t')
self.loc_map[parts[0]] = parts[1:]
@staticmethod
def reload_timed(timestamp, val, delta, func):
now = datetime.datetime.now()
if now - timestamp >= delta:
func()
return now
return None
def reload_loc(self):
reload_time = self.reload_timed(self.loc_update_time,
self.loc_map,
self.reload_interval,
self.load_loc)
if reload_time:
self.loc_update_time = reload_time
def lookup_loc(self, part):
return self.loc_map[part]
def load_cdx(self, params):
self.reload_loc()
reader = SeekableTextFileReader(self.summary)
idx_iter = iter_range(reader,
params['key'],
params['end_key'],
prev_size=1)
if params.get('showPagedIndex'):
params['proxyAll'] = True
return idx_iter
else:
blocks = self.idx_to_cdx(idx_iter, params)
def gen_cdx():
for blk in blocks:
for cdx in blk:
yield cdx
return gen_cdx()
def idx_to_cdx(self, idx_iter, params):
blocks = None
ranges = []
for idx in idx_iter:
idx = IDXObject(idx)
if (blocks and blocks.part == idx['part'] and
blocks.offset + blocks.length == idx['offset'] and
blocks.count < self.max_blocks):
blocks.length += idx['length']
blocks.count += 1
ranges.append(idx['length'])
else:
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
blocks = ZipBlocks(idx['part'],
idx['offset'],
idx['length'],
1)
ranges = [blocks.length]
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
def block_to_cdx_iter(self, blocks, ranges, params):
last_exc = None
last_traceback = None
for location in self.lookup_loc(blocks.part):
try:
return self.load_blocks(location, blocks, ranges, params)
except Exception as exc:
last_exc = exc
import sys
last_traceback = sys.exc_info()[2]
if last_exc:
raise exc, None, last_traceback
else:
raise Exception('No Locations Found for: ' + block.part)
def load_blocks(self, location, blocks, ranges, params):
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
logging.debug(msg.format(b=blocks, loc=location))
reader = self.blk_loader.load(location, blocks.offset, blocks.length)
def decompress_block(range_):
decomp = gzip_decompressor()
buff = decomp.decompress(reader.read(range_))
return readline_to_iter(StringIO(buff))
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
# start bound
iter_ = linearsearch(iter_, params['key'])
# end bound
end = params['end_key']
iter_ = itertools.takewhile(lambda line: line < end, iter_)
return iter_

View File

@ -10,19 +10,28 @@ from wbexceptions import WbException, NotFoundException
from views import TextCapturesView from views import TextCapturesView
class BaseHandler: #=================================================================
@staticmethod class BaseHandler(object):
def get_wburl_type():
return WbUrl
def __call__(self, wbrequest): def __call__(self, wbrequest):
return wbrequest return wbrequest
def get_wburl_type(self):
return None
#=================================================================
class WbUrlHandler(BaseHandler):
def get_wburl_type(self):
return WbUrl
#================================================================= #=================================================================
# Standard WB Handler # Standard WB Handler
#================================================================= #=================================================================
class WBHandler(BaseHandler): class WBHandler(WbUrlHandler):
def __init__(self, index_reader, replay, html_view = None, search_view = None): def __init__(self, index_reader, replay,
html_view=None, search_view=None):
self.index_reader = index_reader self.index_reader = index_reader
self.replay = replay self.replay = replay
@ -31,7 +40,6 @@ class WBHandler(BaseHandler):
self.html_view = html_view self.html_view = html_view
self.search_view = search_view self.search_view = search_view
def __call__(self, wbrequest): def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/': if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest) return self.render_search_page(wbrequest)
@ -61,6 +69,7 @@ class WBHandler(BaseHandler):
def __str__(self): def __str__(self):
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
#================================================================= #=================================================================
# CDX-Server Handler -- pass all params to cdx server # CDX-Server Handler -- pass all params to cdx server
#================================================================= #=================================================================
@ -75,11 +84,6 @@ class CDXHandler(BaseHandler):
return self.view.render_response(wbrequest, cdx_lines) return self.view.render_response(wbrequest, cdx_lines)
@staticmethod
def get_wburl_type():
return None
def __str__(self): def __str__(self):
return 'Index Reader: ' + str(self.index_reader) return 'Index Reader: ' + str(self.index_reader)
@ -115,10 +119,6 @@ class StaticHandler(BaseHandler):
except IOError: except IOError:
raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str) raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
@staticmethod
def get_wburl_type():
return None
def __str__(self): def __str__(self):
return 'Static files from ' + self.static_path return 'Static files from ' + self.static_path
@ -130,6 +130,7 @@ class DebugEchoEnvHandler(BaseHandler):
def __call__(self, wbrequest): def __call__(self, wbrequest):
return WbResponse.text_response(str(wbrequest.env)) return WbResponse.text_response(str(wbrequest.env))
#================================================================= #=================================================================
class DebugEchoHandler(BaseHandler): class DebugEchoHandler(BaseHandler):
def __call__(self, wbrequest): def __call__(self, wbrequest):
@ -150,5 +151,3 @@ class PerfTimer:
self.end = time.clock() self.end = time.clock()
if self.perfdict is not None: if self.perfdict is not None:
self.perfdict[self.name] = str(self.end - self.start) self.perfdict[self.name] = str(self.end - self.start)

View File

@ -37,7 +37,7 @@ class IndexReader(object):
def load_cdx(self, **params): def load_cdx(self, **params):
return self.cdx_server.load_cdx(**params) return self.cdx_server.load_cdx(**params)
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10): def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
if wburl.type == wburl.URL_QUERY: if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported') raise NotImplementedError('Url Query Not Yet Supported')

View File

@ -45,14 +45,14 @@ class ProxyRouter:
return None return None
wbrequest = WbRequest(env, wbrequest = WbRequest(env,
request_uri = url, request_uri=url,
wb_url_str = url, wb_url_str=url,
wb_prefix = '', #rel_prefix=url,
coll = '', #host_prefix=self.hostpaths[0],
host_prefix = self.hostpaths[0], wburl_class=self.handler.get_wburl_type(),
wburl_class = self.handler.get_wburl_type(), urlrewriter_class=ProxyHttpsUrlRewriter,
url_rewriter_class = ProxyHttpsUrlRewriter, use_abs_prefix=False,
is_proxy = True) is_proxy=True)
return self.handler(wbrequest) return self.handler(wbrequest)

View File

@ -7,7 +7,6 @@ from wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect from wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed from pywb.warc.recordloader import ArchiveLoadFailed
#================================================================= #=================================================================
class ReplayView: class ReplayView:
def __init__(self, content_loader, content_rewriter, head_insert_view = None, def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@ -49,6 +48,9 @@ class ReplayView:
# check if redir is needed # check if redir is needed
self._redirect_if_needed(wbrequest, cdx) self._redirect_if_needed(wbrequest, cdx)
# one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest, status_headers)
response = None response = None
if self.content_rewriter and wbrequest.wb_url.mod != 'id_': if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
@ -148,6 +150,7 @@ class ReplayView:
def _reject_self_redirect(self, wbrequest, cdx, status_headers): def _reject_self_redirect(self, wbrequest, cdx, status_headers):
# self-redirect via location
if status_headers.statusline.startswith('3'): if status_headers.statusline.startswith('3'):
request_url = wbrequest.wb_url.url.lower() request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location').lower() location_url = status_headers.get_header('Location').lower()
@ -156,3 +159,16 @@ class ReplayView:
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx)) raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest, status_headers):
# at correct timestamp now, but must check for referrer redirect
# indirect self-redirect, via meta-refresh, if referrer is same as current url
if status_headers.statusline.startswith('2'):
# build full url even if using relative-rewriting
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
referrer_url = wbrequest.referrer
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))

View File

@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
class RewriteContent: class RewriteContent:
@ -54,7 +54,7 @@ class RewriteContent:
# ========================================================================= # =========================================================================
# special case -- need to ungzip the body # special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')): if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, decomp_type='gzip') stream = DecompressingBufferedReader(stream, decomp_type='gzip')
if rewritten_headers.charset: if rewritten_headers.charset:
encoding = rewritten_headers.charset encoding = rewritten_headers.charset

View File

@ -24,9 +24,9 @@ def test_example_2():
def test_example_3(): #def test_example_3():
status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) # status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff # assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff

View File

@ -103,10 +103,12 @@ class UrlRewriter:
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url) return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def set_base_url(self, newUrl): def set_base_url(self, newUrl):
self.wburl.url = newUrl self.wburl.url = newUrl
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@staticmethod @staticmethod
def strip_protocol(url): def strip_protocol(url):
for protocol in UrlRewriter.PROTOCOLS: for protocol in UrlRewriter.PROTOCOLS:

View File

@ -1,9 +1,5 @@
#!/usr/bin/python #!/usr/bin/python
import re
import rfc3987
# WbUrl : wb archival url representation for WB
""" """
WbUrl represents the standard wayback archival url format. WbUrl represents the standard wayback archival url format.
A regular url is a subset of the WbUrl (latest replay). A regular url is a subset of the WbUrl (latest replay).
@ -34,9 +30,38 @@ replay form:
latest_replay: (no timestamp) latest_replay: (no timestamp)
http://example.com http://example.com
Additionally, the BaseWbUrl provides the base components
(url, timestamp, end_timestamp, modifier, type) which
can be used to provide a custom representation of the
wayback url format.
""" """
class WbUrl: import re
import rfc3987
#=================================================================
class BaseWbUrl(object):
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
def __init__(self, url='', mod='',
timestamp='', end_timestamp='', type=None):
self.url = url
self.timestamp = timestamp
self.end_timestamp = end_timestamp
self.mod = mod
self.type = type
#=================================================================
class WbUrl(BaseWbUrl):
""" """
# Replay Urls # Replay Urls
# ====================== # ======================
@ -107,22 +132,14 @@ class WbUrl:
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$') QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
DEFAULT_SCHEME = 'http://' DEFAULT_SCHEME = 'http://'
# ====================== # ======================
def __init__(self, url): def __init__(self, url):
super(WbUrl, self).__init__()
self.original_url = url self.original_url = url
self.type = None
self.url = ''
self.timestamp = ''
self.end_timestamp = ''
self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]): if not any (f(url) for f in [self._init_query, self._init_replay]):
raise Exception('Invalid WbUrl: ', url) raise Exception('Invalid WbUrl: ', url)

View File

@ -1,13 +1,19 @@
""" """
Test Route # Test WbRequest parsed via a Route
# route with relative path # route with relative path, print resulting wbrequest
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False) >>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False))
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'} {'coll': 'web',
'request_uri': '/web/test.example.com',
'wb_prefix': '/web/',
'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')}
# route with absolute path, running at script /my_pywb
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
# route with absolute path, running at script /my_pywb, print resultingwbrequest
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True))
{'coll': 'web',
'request_uri': '/web/2013im_/test.example.com',
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
# not matching route -- skipped # not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
@ -65,7 +71,12 @@ False
""" """
from pywb.archivalrouter import Route, ReferRedirect from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler from pywb.handlers import BaseHandler, WbUrlHandler
import pprint
def print_req(req):
varlist = vars(req)
pprint.pprint({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None): def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
@ -74,7 +85,7 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
if http_host: if http_host:
env['HTTP_HOST'] = http_host env['HTTP_HOST'] = http_host
routes = [Route(coll, BaseHandler())] routes = [Route(coll, WbUrlHandler())]
redir = ReferRedirect(match_host) redir = ReferRedirect(match_host)
#req = WbRequest.from_uri(request_uri, env) #req = WbRequest.from_uri(request_uri, env)
@ -85,4 +96,6 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
return rep.status_headers.get_header('Location') return rep.status_headers.get_header('Location')
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -0,0 +1,87 @@
"""
# WbRequest Tests
# =================
>>> print_req_from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
>>> print_req_from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
>>> print_req_from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> print_req_from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# WbResponse Tests
# =================
>>> WbResponse.text_response('Test')
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
"""
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.wbrequestresponse import WbRequest, WbResponse
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
response = req_from_uri(request_uri, env, use_abs_prefix)
varlist = vars(response)
print str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
def req_from_uri(request_uri, env={}, use_abs_prefix=False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
rel_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
rel_prefix = '/'
wb_url_str = parts[1]
coll = ''
else:
rel_prefix = '/'
wb_url_str = parts[0]
coll = ''
return WbRequest(env,
request_uri=request_uri,
rel_prefix=rel_prefix,
wb_url_str=wb_url_str,
coll=coll,
wburl_class=WbUrl,
urlrewriter_class=UrlRewriter,
use_abs_prefix=use_abs_prefix)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -35,6 +35,58 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
return min_ * block_size return min_ * block_size
#=================================================================
def binsearch(reader, key, compare_func=cmp, block_size=8192):
"""
Perform a binary search for a specified key to within a 'block_size'
(default 8192) granularity, and return first full line found.
"""
min_ = binsearch_offset(reader, key, compare_func, block_size)
reader.seek(min_)
if min_ > 0:
reader.readline() # skip partial line
def gen_iter(line):
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(reader.readline())
#=================================================================
def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
"""
Perform a linear search over iterator until
current_line >= key
optionally also tracking upto N previous lines, which are
returned before the first matched line.
if end of stream is reached before a match is found,
nothing is returned (prev lines discarded also)
"""
prev_deque = deque(maxlen=prev_size + 1)
matched = False
for line in iter_:
prev_deque.append(line)
if compare_func(line, key) >= 0:
matched = True
break
# no matches, so return empty iterator
if not matched:
return []
return itertools.chain(prev_deque, iter_)
#================================================================= #=================================================================
def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192): def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
""" """
@ -45,46 +97,27 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
When performin_g linear search, keep track of up to N previous lines before When performin_g linear search, keep track of up to N previous lines before
first matching line. first matching line.
""" """
min_ = binsearch_offset(reader, key, compare_func, block_size) iter_ = binsearch(reader, key, compare_func, block_size)
iter_ = linearsearch(iter_,
key, prev_size=prev_size,
compare_func=compare_func)
return iter_
reader.seek(min_)
if min_ > 0: #=================================================================
reader.readline() # skip partial line def iter_range(reader, start, end, prev_size=0):
"""
Creates an iterator which iterates over lines where
start <= line < end (end exclusive)
"""
if prev_size > 1: iter_ = search(reader, start, prev_size=prev_size)
prev_deque = deque(max_len=prev_size)
line = None end_iter = itertools.takewhile(
lambda line: line < end,
iter_)
while True: return end_iter
line = reader.readline()
if not line:
break
if compare_func(line, key) >= 0:
break
if prev_size == 1:
prev = line
elif prev_size > 1:
prev_deque.append(line)
def gen_iter(line):
"""
Create iterator over any previous lines to
current matched line
"""
if prev_size == 1:
yield prev.rstrip()
elif prev_size > 1:
for i in prev_deque:
yield i.rstrip()
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(line)
#================================================================= #=================================================================

View File

@ -11,7 +11,7 @@ def gzip_decompressor():
#================================================================= #=================================================================
class BufferedReader(object): class DecompressingBufferedReader(object):
""" """
A wrapping line reader which wraps an existing reader. A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to Read operations operate on underlying buffer, which is filled to
@ -29,7 +29,7 @@ class BufferedReader(object):
DECOMPRESSORS = {'gzip': gzip_decompressor} DECOMPRESSORS = {'gzip': gzip_decompressor}
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None): def __init__(self, stream, block_size=1024, decomp_type=None):
self.stream = stream self.stream = stream
self.block_size = block_size self.block_size = block_size
@ -44,24 +44,19 @@ class BufferedReader(object):
self.buff = None self.buff = None
self.num_read = 0 self.num_read = 0
self.max_len = max_len
def _fillbuff(self, block_size=None): def _fillbuff(self, block_size=None):
if not block_size: if not block_size:
block_size = self.block_size block_size = self.block_size
if not self.buff or self.buff.pos >= self.buff.len: if not self.buff or self.buff.pos >= self.buff.len:
if self.max_len > 0: data = self.stream.read(block_size)
to_read = min(self.max_len - self.num_read, self.block_size)
else:
to_read = self.block_size
data = self.stream.read(to_read)
self._process_read(data) self._process_read(data)
def _process_read(self, data): def _process_read(self, data):
data = self._decompress(data) data = self._decompress(data)
self.num_read += len(data) self.buff_size = len(data)
self.num_read += self.buff_size
self.buff = StringIO.StringIO(data) self.buff = StringIO.StringIO(data)
def _decompress(self, data): def _decompress(self, data):
@ -78,12 +73,40 @@ class BufferedReader(object):
return data return data
def read(self, length=None): def read(self, length=None):
"""
Fill bytes and read some number of bytes
(up to length if specified)
< length bytes may be read if reached the end of input
or at a buffer boundary. If at a boundary, the subsequent
call will fill buffer anew.
"""
self._fillbuff() self._fillbuff()
return self.buff.read(length) return self.buff.read(length)
def readline(self, length=None): def readline(self, length=None):
"""
Fill buffer and read a full line from the buffer
(up to specified length, if provided)
If no newline found at end, try filling buffer again in case
at buffer boundary.
"""
self._fillbuff() self._fillbuff()
return self.buff.readline(length) linebuff = self.buff.readline(length)
# we may be at a boundary
while not linebuff.endswith('\n'):
if length:
length -= len(linebuff)
if length <= 0:
break
self._fillbuff()
if self.buff_size == 0:
break
linebuff += self.buff.readline(length)
return linebuff
def close(self): def close(self):
if self.stream: if self.stream:
@ -97,7 +120,7 @@ class ChunkedDataException(Exception):
#================================================================= #=================================================================
class ChunkedDataReader(BufferedReader): class ChunkedDataReader(DecompressingBufferedReader):
r""" r"""
A ChunkedDataReader is a BufferedReader which also supports de-chunking A ChunkedDataReader is a BufferedReader which also supports de-chunking
of the data if it happens to be http 'chunk-encoded'. of the data if it happens to be http 'chunk-encoded'.
@ -133,7 +156,7 @@ class ChunkedDataReader(BufferedReader):
def _fillbuff(self, block_size=None): def _fillbuff(self, block_size=None):
if self.not_chunked: if self.not_chunked:
return BufferedReader._fillbuff(self, block_size) return super(ChunkedDataReader, self)._fillbuff(block_size)
if self.all_chunks_read: if self.all_chunks_read:
return return

View File

@ -9,18 +9,50 @@ import urllib2
import time import time
def is_http(filename):
return any(filename.startswith(x) for x in ['http://', 'https://'])
#================================================================= #=================================================================
# load a reader from http class BlockLoader(object):
#=================================================================
class HttpLoader(object):
""" """
Load a file-like reader over http using range requests a loader which can stream blocks of content
and an optional cookie created via a cookie_maker given a uri, offset and optional length.
Currently supports: http/https and file/local file system
""" """
def __init__(self, cookie_maker=None): def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker self.cookie_maker = cookie_maker
def load(self, url, offset, length): def load(self, url, offset, length):
"""
Determine loading method based on uri
"""
if is_http(url):
return self.load_http(url, offset, length)
else:
return self.load_file(url, offset, length)
def load_file(self, url, offset, length):
"""
Load a file-like reader from the local file system
"""
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
else:
return afile
def load_http(self, url, offset, length):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
if length > 0: if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1) range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else: else:
@ -71,25 +103,6 @@ class HMACCookieMaker(object):
return cookie return cookie
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
"""
Load a file-like reader from the local file system
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
#================================================================= #=================================================================
# Limit Reader # Limit Reader
#================================================================= #=================================================================

View File

@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
""" """
parse stream for status line and headers parse stream for status line and headers
return a StatusAndHeaders object return a StatusAndHeaders object
support continuation headers starting with space or tab
""" """
statusline = stream.readline().rstrip() statusline = stream.readline().rstrip()
protocol_status = self.split_prefix(statusline, self.statuslist) protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status: if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, statusline) raise StatusAndHeadersParserException(msg, statusline)
headers = [] headers = []
line = stream.readline().rstrip() line = stream.readline().rstrip()
while line and line != '\r\n': while line:
name, value = line.split(':', 1) name, value = line.split(':', 1)
header = (name, value.strip()) name = name.rstrip(' \t')
value = value.lstrip()
next_line = stream.readline().rstrip()
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
value += next_line
next_line = stream.readline().rstrip()
header = (name, value)
headers.append(header) headers.append(header)
line = stream.readline().rstrip() line = next_line
return StatusAndHeaders(statusline=protocol_status[1].strip(), return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers, headers=headers,
@ -107,4 +120,3 @@ class StatusAndHeadersParserException(Exception):
def __init__(self, msg, statusline): def __init__(self, msg, statusline):
super(StatusAndHeadersParserException, self).__init__(msg) super(StatusAndHeadersParserException, self).__init__(msg)
self.statusline = statusline self.statusline = statusline

View File

@ -9,6 +9,7 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# Exact Search
>>> print_binsearch_results('org,iana)/domains/root', iter_exact) >>> print_binsearch_results('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
@ -19,18 +20,45 @@ org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3G
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
# Exact Search >>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# Exact search -- no matches
>>> print_binsearch_results('org,iaana)/', iter_exact) >>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact) >>> print_binsearch_results('org,ibna)/', iter_exact)
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz # Range Search (end exclusive)
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
# Range Search -- exact
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Range Search -- exact + 1 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Range Search -- exact + 2 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=2)
org,iana)/_js/2013.1/jquery.js 20140126201248 http://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 544 765491 iana.warc.gz
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
""" """
#================================================================= #=================================================================
import os import os
from pywb.utils.binsearch import iter_prefix, iter_exact from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
from pywb import get_test_dir from pywb import get_test_dir
@ -45,6 +73,13 @@ def print_binsearch_results(key, iter_func):
print line print line
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
print line
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -10,9 +10,9 @@
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) >>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji' 'efghji'
# FileLoader Tests (includes LimitReader) # BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400')) >>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100 100
# SeekableTextFileReader Test # SeekableTextFileReader Test
@ -23,25 +23,39 @@
>>> seek_read_full(sr, 100) >>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n' 'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
#BufferedReader readline() # Buffered Reader Tests
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() #=================================================================
#DecompressingBufferedReader readline()
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n' ' CDX N b a m s k r M S V g\\n'
#BufferedReader readline() with decompression #DecompressingBufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n' ' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() >>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain' 'Example Domain'
# test very small block size
>>> dbr = DecompressingBufferedReader(StringIO.StringIO('ABCDEFG\\nHIJKLMN\\nOPQR\\nXYZ'), block_size = 3)
>>> dbr.readline(); dbr.readline(4); dbr.readline(); dbr.readline(); dbr.readline(2); dbr.readline(); dbr.readline()
'ABCDEFG\\n'
'HIJK'
'LMN\\n'
'OPQR\\n'
'XY'
'Z'
''
""" """
#================================================================= #=================================================================
import os import os
import StringIO import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'

View File

@ -0,0 +1,29 @@
"""
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser
import StringIO
status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S' TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959' #PAD_STAMP_END = '29991231235959'
PAD_6 = '299912'
def iso_date_to_datetime(string): def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string)) return datetime_to_timestamp(iso_date_to_datetime(string))
# default pad is end of range for compatibility # pad to certain length (default 6)
def pad_timestamp(string, pad_str=PAD_STAMP_END): def _pad_timestamp(string, pad_str=PAD_6):
""" """
>>> pad_timestamp('20') >>> _pad_timestamp('20')
'20991231235959' '209912'
>>> pad_timestamp('2014') >>> _pad_timestamp('2014')
'20141231235959' '201412'
>>> pad_timestamp('20141011') >>> _pad_timestamp('20141011')
'20141011235959' '20141011'
>>> pad_timestamp('201410110010') >>> _pad_timestamp('201410110010')
'20141011001059' '201410110010'
""" """
str_len = len(string) str_len = len(string)
pad_len = len(pad_str) pad_len = len(pad_str)
return string if str_len >= pad_len else string + pad_str[str_len:] if str_len < pad_len:
string = string + pad_str[str_len:]
return string
def timestamp_to_datetime(string): def timestamp_to_datetime(string):
""" """
>>> timestamp_to_datetime('20131226095010') # >14-digit -- rest ignored
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \ >>> timestamp_to_datetime('2014122609501011')
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1) datetime.datetime(2014, 12, 26, 9, 50, 10)
# 14-digit
>>> timestamp_to_datetime('20141226095010')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 13-digit padding
>>> timestamp_to_datetime('2014122609501')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 12-digit padding
>>> timestamp_to_datetime('201412260950')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 11-digit padding
>>> timestamp_to_datetime('20141226095')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 10-digit padding
>>> timestamp_to_datetime('2014122609')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 9-digit padding
>>> timestamp_to_datetime('201412260')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 8-digit padding
>>> timestamp_to_datetime('20141226')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 7-digit padding
>>> timestamp_to_datetime('2014122')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 6-digit padding
>>> timestamp_to_datetime('201410')
datetime.datetime(2014, 10, 31, 23, 59, 59)
# 5-digit padding
>>> timestamp_to_datetime('20141')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 4-digit padding
>>> timestamp_to_datetime('2014') >>> timestamp_to_datetime('2014')
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \ datetime.datetime(2014, 12, 31, 23, 59, 59)
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
# 3-digit padding
>>> timestamp_to_datetime('201')
datetime.datetime(2019, 12, 31, 23, 59, 59)
# 2-digit padding
>>> timestamp_to_datetime('20')
datetime.datetime(2099, 12, 31, 23, 59, 59)
# 1-digit padding
>>> timestamp_to_datetime('2')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 1-digit out-of-range padding
>>> timestamp_to_datetime('3')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 0-digit padding
>>> timestamp_to_datetime('')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# bad month
>>> timestamp_to_datetime('20131709005601')
datetime.datetime(2013, 12, 9, 0, 56, 1)
# all out of range except minutes
>>> timestamp_to_datetime('40001965252477')
datetime.datetime(2999, 12, 31, 23, 24, 59)
""" """
# Default pad to end of range for comptability # pad to 6 digits
return time.strptime(pad_timestamp(string), TIMESTAMP_14) string = _pad_timestamp(string, PAD_6)
def clamp(val, min_, max_):
try:
val = int(val)
val = max(min_, min(val, max_))
return val
except:
return max_
def extract(string, start, end, min_, max_):
if len(string) >= end:
return clamp(string[start:end], min_, max_)
else:
return max_
# now parse, clamp to boundary
year = extract(string, 0, 4, 1900, 2999)
month = extract(string, 4, 6, 1, 12)
day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
hour = extract(string, 8, 10, 0, 23)
minute = extract(string, 10, 12, 0, 59)
second = extract(string, 12, 14, 0, 59)
return datetime.datetime(year=year,
month=month,
day=day,
hour=hour,
minute=minute,
second=second)
#return time.strptime(pad_timestamp(string), TIMESTAMP_14)
def timestamp_to_sec(string): def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
1420070399 1420070399
""" """
return calendar.timegm(timestamp_to_datetime(string)) return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -56,9 +56,9 @@ class J2TemplateView:
# Filters # Filters
@staticmethod @staticmethod
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'): def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
value = timeutils.timestamp_to_datetime(value) value = timeutils.timestamp_to_datetime(value)
return time.strftime(format, value) return value.strftime(format_)
@staticmethod @staticmethod
def get_host(url): def get_host(url):

View File

@ -6,8 +6,8 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
#================================================================= #=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord', ArcWarcRecord = collections.namedtuple('ArchiveRecord',
@ -32,24 +32,12 @@ class ArcWarcRecordLoader:
ARC_HEADERS = ["uri", "ip-address", "creation-date", ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"] "content-type", "length"]
@staticmethod def __init__(self, loader=None, cookie_maker=None, block_size=8192):
def create_default_loaders(cookie_maker=None): if not loader:
http = HttpLoader(cookie_maker) loader = BlockLoader(cookie_maker)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192): self.loader = loader
self.loaders = loaders self.block_size = block_size
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
self.chunk_size = chunk_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
@ -60,22 +48,25 @@ class ArcWarcRecordLoader:
def load(self, url, offset, length): def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url) url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme) #loader = self.loaders.get(url_parts.scheme)
if not loader: #if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url) # raise ArchiveLoadFailed('Unknown Protocol', url)
try: try:
length = int(length) length = int(length)
except: except:
length = -1 length = -1
raw = loader.load(url, long(offset), length) raw = self.loader.load(url, long(offset), length)
decomp_type = 'gzip' decomp_type = 'gzip'
stream = BufferedReader(raw, length, self.chunk_size, decomp_type) # Create decompressing stream
stream = DecompressingBufferedReader(stream = raw,
decomp_type = decomp_type,
block_size = self.block_size)
(the_format, rec_headers) = self._load_headers(stream) (the_format, rec_headers) = self._detect_type_load_headers(stream)
if the_format == 'arc': if the_format == 'arc':
rec_type = 'response' rec_type = 'response'
@ -111,7 +102,7 @@ class ArcWarcRecordLoader:
return ArcWarcRecord((the_format, rec_type), return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers) rec_headers, stream, status_headers)
def _load_headers(self, stream): def _detect_type_load_headers(self, stream):
""" """
Try parsing record as WARC, then try parsing as ARC. Try parsing record as WARC, then try parsing as ARC.
if neither one succeeds, we're out of luck. if neither one succeeds, we're out of luck.

View File

@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
except Exception as e: except Exception as e:
print 'Exception: ' + e.__class__.__name__ print 'Exception: ' + e.__class__.__name__
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,99 +1,75 @@
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
import pprint import pprint
#WB Request and Response
#=================================================================
class WbRequest: class WbRequest:
""" """
>>> WbRequest.from_uri('/save/_embed/example.com/?a=b') Represents the main pywb request object.
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
>>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c') Contains various info from wsgi env, add additional info
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'} about the request, such as coll, relative prefix,
host prefix, absolute prefix.
>>> WbRequest.from_uri('/2010/example.com') If a wburl and url rewriter classes are specified, the class
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} also contains the url rewriter.
>>> WbRequest.from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
""" """
@staticmethod
def from_uri(request_uri, env = {}, use_abs_prefix = False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
wb_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
wb_prefix = '/'
wb_url_str = parts[1]
coll = ''
else:
wb_prefix = '/'
wb_url_str = parts[0]
coll = ''
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else ''
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix)
@staticmethod @staticmethod
def make_host_prefix(env): def make_host_prefix(env):
try: try:
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] host = env.get('HTTP_HOST')
if not host:
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
return env['wsgi.url_scheme'] + '://' + host
except KeyError: except KeyError:
return '' return ''
def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, def __init__(self, env,
host_prefix = '', request_uri=None,
wburl_class = WbUrl, rel_prefix='',
url_rewriter_class = UrlRewriter, wb_url_str='/',
is_proxy = False): coll='',
host_prefix='',
use_abs_prefix=False,
wburl_class=None,
urlrewriter_class=None,
is_proxy=False):
self.env = env self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
self.host_prefix = host_prefix self.coll = coll
if not host_prefix:
host_prefix = self.make_host_prefix(env)
self.host_prefix = host_prefix
self.rel_prefix = rel_prefix
if use_abs_prefix:
self.wb_prefix = host_prefix + rel_prefix
else:
self.wb_prefix = rel_prefix
self.wb_prefix = host_prefix + wb_prefix
if not wb_url_str: if not wb_url_str:
wb_url_str = '/' wb_url_str = '/'
self.wb_url_str = wb_url_str
# wb_url present and not root page # wb_url present and not root page
if wb_url_str != '/' and wburl_class: if wb_url_str != '/' and wburl_class:
self.wb_url_str = wb_url_str
self.wb_url = wburl_class(wb_url_str) self.wb_url = wburl_class(wb_url_str)
self.urlrewriter = url_rewriter_class(self.wb_url, self.wb_prefix) self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
else: else:
# no wb_url, just store blank wb_url # no wb_url, just store blank wb_url
self.wb_url_str = wb_url_str
self.wb_url = None self.wb_url = None
self.urlrewriter = None self.urlrewriter = None
self.coll = coll
self.referrer = env.get('HTTP_REFERER') self.referrer = env.get('HTTP_REFERER')
self.is_ajax = self._is_ajax() self.is_ajax = self._is_ajax()
@ -122,24 +98,19 @@ class WbRequest:
def __repr__(self): def __repr__(self):
#return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
#return str(vars(self))
varlist = vars(self) varlist = vars(self)
return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')}) varstr = pprint.pformat(varlist)
return varstr
#=================================================================
class WbResponse: class WbResponse:
""" """
>>> WbResponse.text_response('Test') Represnts a pywb wsgi response object.
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404') Holds a status_headers object and a response iter, to be
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} returned to wsgi container.
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
""" """
def __init__(self, status_headers, value = []): def __init__(self, status_headers, value = []):
self.status_headers = status_headers self.status_headers = status_headers
self.body = value self.body = value
@ -180,8 +151,3 @@ class WbResponse:
def __repr__(self): def __repr__(self):
return str(vars(self)) return str(vars(self))
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -75,6 +75,11 @@ class TestWb:
assert 'wb.js' in resp.body assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
def test_replay_content_length_1(self):
# test larger file, rewritten file (svg!)
resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
assert resp.headers['Content-Length'] == str(len(resp.body))
def test_redirect_1(self): def test_redirect_1(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/') resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
@ -119,6 +124,20 @@ class TestWb:
assert resp.content_type == 'text/css' assert resp.content_type == 'text/css'
def test_referrer_self_redirect(self):
uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
host = 'somehost:8082'
referrer = 'http://' + host + uri
# capture is normally a 200
resp = self.testapp.get(uri)
assert resp.status_int == 200
# redirect causes skip of this capture, redirect to next
resp = self.testapp.get(uri, headers = [('Referer', referrer), ('Host', host)], status = 302)
assert resp.status_int == 302
def test_excluded_content(self): def test_excluded_content(self):
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403) resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
assert resp.status_int == 403 assert resp.status_int == 403