1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactoring of binsearch and cdxserver into separate packages

also move complicated doctests and integration tests to tests/
This commit is contained in:
Ilya Kreymer 2014-02-12 13:16:07 -08:00
parent e4f409b2a4
commit 2528ee0a7c
23 changed files with 947 additions and 755 deletions

View File

@ -6,4 +6,5 @@ install:
- "python setup.py -q install"
# command to run tests
#script: nosetests --with-doctest
script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
script: py.test -v --doctest-module ./tests/*.py ./pywb/

View File

@ -13,6 +13,9 @@ from wbrequestresponse import StatusAndHeaders
#=================================================================
class HttpLoader:
"""
Load content over http with range request and optional signature
"""
def __init__(self, hmac = None, hmac_duration = 30):
self.hmac = hmac
self.hmac_duration = hmac_duration
@ -38,6 +41,8 @@ class HttpLoader:
#=================================================================
class FileLoader:
"""
Load content from local file-system
# Ensure attempt to read more than 100 bytes, only reads 100 bytes
>>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read('400'))
100

View File

@ -1,147 +0,0 @@
from collections import deque
import os
import itertools
class FileReader:
    """
    A very simple file-like object wrapper that knows its size.

    The file is opened in binary mode on construction and stays open
    until close() is called. The reader may also be used as a context
    manager, in which case it is closed when the with-block exits.
    """
    def __init__(self, filename):
        self.fh = open(filename, 'rb')
        self.filename = filename
        # size is sampled once, at construction time
        self.size = os.path.getsize(filename)

    def getsize(self):
        """Return the file size in bytes recorded at construction."""
        return self.size

    def readline(self):
        """Return the next line read from the current position."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the given byte offset."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle."""
        return self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # never suppress an exception raised inside the with-block
        return False
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """
    Binary search a sorted text file for the block in which 'key' belongs.

    The file is treated as a sequence of block_size-byte blocks. The value
    returned is the byte offset of a block boundary (a multiple of
    block_size), not of a line: the first line comparing >= key, if there
    is one, is reached by scanning forward from that offset.

    reader must provide getsize(), seek(offset) and readline().
    compare_func(a, b) is a three-way comparison returning a negative,
    zero or positive number; when None, natural ordering is used.
    """
    if compare_func is None:
        compare_func = lambda a, b: (a > b) - (a < b)

    low = 0
    # floor division: block indexes must stay integral
    high = reader.getsize() // block_size

    while high - low > 1:
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        # An empty read means no complete line starts in or after this
        # block, so the key can only be located in an earlier block.
        if line and compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size


def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """
    Binary search down to a block_size block, then scan linearly for the
    first line comparing >= key.

    Returns an iterator which yields up to prev_size lines that were seen
    immediately before that line during the linear scan, then that line
    and every following line up to the end of the file. All lines are
    yielded with trailing whitespace stripped.

    The linear scan runs immediately; the remaining lines are read lazily
    as the iterator is consumed.
    """
    if compare_func is None:
        compare_func = lambda a, b: (a > b) - (a < b)

    offset = binsearch_offset(reader, key, compare_func, block_size)
    reader.seek(offset)

    if offset > 0:
        reader.readline()  # skip partial line

    # Bounded buffer of the most recent non-matching lines. It simply stays
    # empty when no line precedes the match, which previously raised a
    # NameError for prev_size == 1.
    prev_lines = deque(maxlen=prev_size if prev_size > 0 else 0)

    while True:
        line = reader.readline()
        if not line or compare_func(line, key) >= 0:
            break
        prev_lines.append(line)

    def gen_iter(line):
        for prev in prev_lines:
            yield prev.rstrip()

        while line:
            yield line.rstrip()
            line = reader.readline()

    return gen_iter(line)
# Iterate over prefix matches
def iter_prefix(reader, key):
    """
    Iterate over the lines of a sorted text file which start with key,
    stopping at the first line which does not.

    >>> print_test_cdx('org,iana)/domains/root', iter_prefix)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
    """
    def has_prefix(line):
        return line.startswith(key)

    return itertools.takewhile(has_prefix, search(reader, key))
def iter_exact(reader, key, tok = ' '):
    """
    Iterate over the lines of a sorted text file whose first tok-separated
    field equals key, stopping at the first line where it does not.

    >>> print_test_cdx('org,iana)/domains/root', iter_exact)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    >>> print_test_cdx('org,iana)/', iter_exact)
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
    >>> print_test_cdx('org,iana)/domains/root/db', iter_exact)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    >>> print_test_cdx('org,iaana)/', iter_exact)
    >>> print_test_cdx('org,ibna)/', iter_exact)
    >>> print_test_cdx('org,iana)/time-zones', iter_exact)
    org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
    """
    # the key of a line is everything before the first separator
    same_key = lambda line: line.split(tok, 1)[0] == key

    candidates = search(reader, key)
    return itertools.takewhile(same_key, candidates)
import utils
# Doctest support: set up only when this file is run as a script or when
# utils.enable_doctests() returns a true value.
if __name__ == "__main__" or utils.enable_doctests():
# Open a FileReader on a file under the 'cdx/' folder of the test data dir.
def create_test_cdx(test_file):
path = utils.test_data_dir() + 'cdx/' + test_file
return FileReader(path)
# Reader over iana.cdx, reused by every doctest call that names no file.
test_cdx = create_test_cdx('iana.cdx')
# Print each line yielded by iter_func(cdx, key); called from the doctests.
def print_test_cdx(key, iter_func, filename = None):
cdx = test_cdx if not filename else create_test_cdx(filename)
for line in iter_func(cdx, key):
print line
#cdx.close()
import doctest
doctest.testmod()

View File

123
pywb/binsearch/binsearch.py Normal file
View File

@ -0,0 +1,123 @@
from collections import deque
import os
import itertools
#=================================================================
# Binary Search over a text file
#=================================================================
class FileReader:
    """
    A very simple file-like object wrapper that knows its size
    getsize() method returns the filesize

    The file is opened in binary mode on construction and stays open
    until close() is called. The reader may also be used as a context
    manager, in which case it is closed when the with-block exits.
    """
    def __init__(self, filename):
        self.fh = open(filename, 'rb')
        self.filename = filename
        # size is sampled once, at construction time
        self.size = os.path.getsize(filename)

    def getsize(self):
        """Return the file size in bytes recorded at construction."""
        return self.size

    def readline(self):
        """Return the next line read from the current position."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the given byte offset."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle."""
        return self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # never suppress an exception raised inside the with-block
        return False
#=================================================================
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """
    Binary search a sorted text file for the block in which 'key' belongs.

    File is subdivided into block_size (default 8192) sized blocks.
    The value returned is the byte offset of a block boundary (a multiple
    of block_size), not of a line: the first line comparing >= key, if
    there is one, is reached by scanning forward from that offset.

    reader must provide getsize(), seek(offset) and readline().
    Optional compare_func(a, b) is a three-way comparison returning a
    negative, zero or positive number; when None, natural ordering is used.
    """
    if compare_func is None:
        compare_func = lambda a, b: (a > b) - (a < b)

    low = 0
    # floor division: block indexes must stay integral
    high = reader.getsize() // block_size

    while high - low > 1:
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        # An empty read means no complete line starts in or after this
        # block, so the key can only be located in an earlier block.
        if line and compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size


def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """
    Perform a binsearch for a specified key down to block_size (8192) sized
    blocks, followed by linear search within the block to find the first
    line comparing >= key.

    Returns an iterator which yields up to prev_size lines that were seen
    immediately before that line during the linear search, then that line
    and every following line up to the end of the file. All lines are
    yielded with trailing whitespace stripped.

    The linear search runs immediately; the remaining lines are read
    lazily as the iterator is consumed.
    """
    if compare_func is None:
        compare_func = lambda a, b: (a > b) - (a < b)

    offset = binsearch_offset(reader, key, compare_func, block_size)
    reader.seek(offset)

    if offset > 0:
        reader.readline()  # skip partial line

    # Bounded buffer of the most recent non-matching lines. It simply stays
    # empty when no line precedes the match, which previously raised a
    # NameError for prev_size == 1.
    prev_lines = deque(maxlen=prev_size if prev_size > 0 else 0)

    while True:
        line = reader.readline()
        if not line or compare_func(line, key) >= 0:
            break
        prev_lines.append(line)

    def gen_iter(line):
        for prev in prev_lines:
            yield prev.rstrip()

        while line:
            yield line.rstrip()
            line = reader.readline()

    return gen_iter(line)
# Iterate over prefix matches
def iter_prefix(reader, key):
    """
    Build an iterator over the prefix matches for a key in a sorted text file.
    Lines are taken for as long as they start with key; iteration ends at
    the first line which does not.
    """
    def has_prefix(line):
        return line.startswith(key)

    candidates = search(reader, key)
    return itertools.takewhile(has_prefix, candidates)
def iter_exact(reader, key, token=' '):
    """
    Build an iterator over the exact matches for a key in a sorted text file.
    An exact match is a line starting with the key immediately followed by
    the terminating token (default ' ').
    """
    terminated_key = key + token
    return iter_prefix(reader, terminated_key)

View File

@ -1,358 +0,0 @@
import binsearch
import indexreader
import bisect
import itertools
import re
from heapq import merge
from collections import deque
#=================================================================
def cdx_text_out(cdx, fields):
    """
    Render a cdx record as text.
    fields is a comma-separated list of field names to output, joined by
    single spaces; when empty or None the full record is returned via str().
    """
    if fields:
        return ' '.join(cdx[name] for name in fields.split(','))

    return str(cdx)
#=================================================================
def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
    """
    Load the cdx lines matching key from all sources and run them through
    the processing steps selected by params, always in this order:
    revisit resolution, filtering, time/status collapsing, reversal,
    closest-to sorting and limit.

    Returns an iterator of cdx objects when params['output'] is 'raw',
    otherwise an iterator of newline-terminated text lines restricted to
    params['fields'] when that is set.
    """
    cdx_iter = make_cdx_iter(merge_sort_streams(sources, key, match_func))

    if params.get('resolve_revisits', False):
        cdx_iter = cdx_resolve_revisits(cdx_iter)

    filters = params.get('filter', None)
    if filters:
        cdx_iter = cdx_filter(cdx_iter, filters)

    collapse_time = params.get('collapse_time', None)
    if collapse_time:
        cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)

    limit = int(params.get('limit', 1000000))

    if params.get('reverse', False):
        cdx_iter = cdx_reverse(cdx_iter, limit)

    closest_to = params.get('closest_to', None)
    if closest_to:
        cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)

    if limit:
        cdx_iter = cdx_limit(cdx_iter, limit)

    # output raw cdx objects
    if params.get('output') == 'raw':
        return cdx_iter

    fields = params.get('fields')
    return (cdx_text_out(cdx, fields) + '\n' for cdx in cdx_iter)
#=================================================================
# merge multiple cdx streams
def merge_sort_streams(sources, key, iter_func):
    """
    Open every source file, select the lines matching key with iter_func
    and merge the resulting sorted streams into a single sorted stream.

    >>> test_cdx(key = 'org,iana)/', sources = [test_dir + 'dupes.cdx', test_dir + 'iana.cdx'])
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
    org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
    org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
    """
    def load_src(source):
        return iter_func(binsearch.FileReader(source), key)

    # Optimize: no need to merge if just one input
    if len(sources) == 1:
        return load_src(sources[0])

    streams = [load_src(source) for source in sources]
    return merge(*streams)
#=================================================================
# convert text cdx stream to CDXCaptureResult
def make_cdx_iter(text_iter):
    """
    Lazily wrap each text line of text_iter in an
    indexreader.CDXCaptureResult.
    """
    return (indexreader.CDXCaptureResult(line) for line in text_iter)
#=================================================================
# limit cdx to at most limit
def cdx_limit(cdx_iter, limit):
    """
    Yield at most limit records from cdx_iter.

    Exactly the yielded records are pulled from cdx_iter: no extra record
    is read and thrown away once the limit has been reached. A limit of
    zero or less yields nothing.

    >>> test_cdx('org,iana)/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
    """
    # islice rejects negative stop values, so clamp to keep "yield nothing"
    for cdx in itertools.islice(cdx_iter, max(limit, 0)):
        yield cdx
#=================================================================
# reverse cdx
def cdx_reverse(cdx_iter, limit):
    """
    Return the last limit records of cdx_iter, most recent first.
    The whole of cdx_iter is consumed before returning; the result is a
    list when limit is 1 and a deque otherwise.

    >>> test_cdx('org,iana)/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
    >>> test_cdx('org,iana)/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
    org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
    # no match, single result
    >>> test_cdx('org,iana)/dont_have_this', reverse = True, resolve_revisits = True, limit = 1)
    """
    # optimize for single last
    if limit == 1:
        last = None

        for cdx in cdx_iter:
            last = cdx

        # compare against None so a falsy final record is still returned
        return [last] if last is not None else []

    # appendleft on a full bounded deque discards the oldest record
    reverse_cdxs = deque(maxlen = limit)

    for cdx in cdx_iter:
        reverse_cdxs.appendleft(cdx)

    return reverse_cdxs
#=================================================================
# filter cdx by regex if each filter is field:regex form,
# apply filter to cdx[field]
def cdx_filter(cdx_iter, filter_strings):
    """
    Yield only the records of cdx_iter which pass every filter.

    Each filter is a regex, optionally prefixed with 'field:' to apply it
    to cdx[field] instead of the whole record text, and optionally
    prefixed with '!' to invert it. The regex is applied with re.match,
    i.e. anchored at the start of the value.
    filter_strings may be a list of filters or a single filter string.

    >>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filter = ['mimetype:text/html'])
    org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
    org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
    org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
    org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
    org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
    >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', filter = 'statuscode:200')
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
    """
    # Support single strings as well
    if isinstance(filter_strings, str):
        filter_strings = [filter_strings]

    def make_filter(string):
        # invert filter
        invert = string.startswith('!')
        if invert:
            string = string[1:]

        field, sep, pattern = string.partition(':')
        if not sep:
            # no field set, apply filter to entire cdx
            field, pattern = '', string

        regex = re.compile(pattern)

        def matches(cdx):
            val = cdx[field] if field else str(cdx)
            return (regex.match(val) is not None) != invert

        return matches

    # a real list: the filters are re-applied to every record
    filters = [make_filter(string) for string in filter_strings]

    for cdx in cdx_iter:
        if all(check(cdx) for check in filters):
            yield cdx
#=================================================================
# collapse by timestamp and status code
def cdx_collapse_time_status(cdx_iter, timelen = 10):
    """
    Collapse consecutive records which share both the first timelen
    characters of the timestamp and the status code, yielding only the
    first record of each such run. timelen may be an int or a numeric
    string.

    # unresolved revisits, different statuscode results in an extra repeat
    >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = 11)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
    # resolved revisits
    >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
    """
    timelen = int(timelen)

    def collapse_token(cdx):
        return (cdx['timestamp'][:timelen], cdx['statuscode'])

    # groupby forms runs of consecutive equal tokens; keep the run's head
    for _, run in itertools.groupby(cdx_iter, collapse_token):
        yield next(run)
#=================================================================
# sort CDXCaptureResult by closest to timestamp
def cdx_sort_closest(closest, cdx_iter, limit = 10):
    """
    Return an iterator over up to limit records of cdx_iter, ordered by
    the absolute distance in seconds between their timestamp and the
    closest timestamp. Records at equal distance keep the order in which
    they were read. The whole of cdx_iter is consumed before returning.

    >>> test_cdx(closest_to = '20140126200826', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
    20140126200826
    20140126200816
    20140126200805
    20140126200912
    20140126200738
    20140126200930
    20140126200718
    20140126200706
    20140126200654
    20140126200625
    >>> test_cdx(closest_to = '20140126201306', key = 'org,iana)/dnssec', resolve_revisits = True, sources = [test_dir + 'dupes.cdx', test_dir + 'iana.cdx'])
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
    >>> test_cdx(closest_to = '20140126201307', key = 'org,iana)/dnssec', resolve_revisits = True)
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
    # equal dist prefer earlier
    >>> test_cdx(closest_to = '20140126200700', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
    >>> test_cdx(closest_to = '20140126200659', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200654
    20140126200706
    >>> test_cdx(closest_to = '20140126200701', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200706
    20140126200654
    """
    closest_cdx = []
    # distances kept in a parallel list so that ordering never falls back
    # to comparing the cdx records themselves
    closest_keys = []
    closest_sec = utils.timestamp_to_sec(closest)

    # NOTE(review): no early exit here; that would need the input to be
    # ordered by timestamp, which is not established in this function.
    for cdx in cdx_iter:
        sec = utils.timestamp_to_sec(cdx['timestamp'])
        key = abs(closest_sec - sec)

        # bisect_right places equal distances after the ones already seen
        index = bisect.bisect_right(closest_keys, key)
        closest_keys.insert(index, key)
        closest_cdx.insert(index, cdx)

        if len(closest_cdx) > limit:
            closest_keys.pop()
            closest_cdx.pop()

    return iter(closest_cdx)
#=================================================================
# resolve revisits
# Fields to append from cdx original to revisit
ORIG_TUPLE = ['length', 'offset', 'filename']

def cdx_resolve_revisits(cdx_iter):
    """
    Add 'orig.length', 'orig.offset' and 'orig.filename' to every record.

    A record is a revisit when its mimetype is 'warc/revisit' or its
    filename is '-'. The first non-revisit record seen for a digest is
    remembered as the original for that digest. A revisit whose original
    is known receives the original's mimetype and statuscode, and the
    orig.* fields are copied from the original; every other record gets
    '-' in the orig.* fields.

    >>> test_cdx('org,iana)/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
    >>> test_cdx('org,iana)/domains/root/db', resolve_revisits = True)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
    """
    originals = {}

    for cdx in cdx_iter:
        is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
        digest = cdx['digest']
        original_cdx = originals.get(digest)

        if not original_cdx and not is_revisit:
            originals[digest] = cdx

        resolved = bool(original_cdx and is_revisit)

        if resolved:
            # Transfer mimetype and statuscode
            cdx['mimetype'] = original_cdx['mimetype']
            cdx['statuscode'] = original_cdx['statuscode']

        # Always add either the original or empty '- - -'
        for field in ORIG_TUPLE:
            cdx['orig.' + field] = original_cdx[field] if resolved else '-'

        yield cdx
import utils
# Doctest support: set up only when this file is run as a script or when
# utils.enable_doctests() returns a true value.
if __name__ == "__main__" or utils.enable_doctests():
import os
import sys
# Folder holding the sample cdx files referenced by the doctests.
test_dir = utils.test_data_dir() + 'cdx/'
# Run cdx_serve with the keyword arguments as query params and write each
# resulting text line to stdout; sources defaults to iana.cdx only.
def test_cdx(key, match_func = binsearch.iter_exact, sources = [test_dir + 'iana.cdx'], **kwparams):
for x in cdx_serve(key, kwparams, sources, match_func):
sys.stdout.write(x)
import doctest
doctest.testmod()

View File

42
pywb/cdxserver/cdxapp.py Normal file
View File

@ -0,0 +1,42 @@
from cdxserver import CDXServer
import logging
import os
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../sample_archive/cdx/'
#=================================================================
def main(config = None):
    """
    Configure logging and return a WSGI application answering cdx queries.

    config is the list of cdx sources passed to CDXServer; when it is empty
    or omitted, the sample archive directory test_cdx_dir is used.

    The application replies '200 OK' with the cdx lines as text/plain, or
    '400 Error' with the exception message when the query fails.
    """
    logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)

    if not config:
        config = [test_cdx_dir]

    cdxserver = CDXServer(config)

    def application(env, start_response):
        try:
            # Materialize the response before sending any headers: if
            # iterating fails, start_response has not been called yet and
            # the error branch below makes the only call to it.
            response = list(cdxserver.load_cdx_from_request(env))
            start_response('200 OK', [('Content-Type', 'text/plain')])

        except Exception as exc:
            # logs the message together with the current traceback
            logging.exception('Error processing cdx request')
            start_response('400 Error', [('Content-Type', 'text/plain')])
            response = [str(exc)]

        return response

    return application
# Build the default application only when this file is imported rather
# than executed directly; running it as a script does nothing.
if __name__ != "__main__":
    application = main()

View File

@ -0,0 +1,57 @@
from collections import OrderedDict
import itertools
#=================================================================
class CDXObject(OrderedDict):
    """
    A single cdx line parsed into an ordered mapping of
    field name -> string value.

    The field names are taken from the entry of CDX_FORMATS which has as
    many names as the line has space-separated fields; an Exception is
    raised when no entry has that length.
    """
    CDX_FORMATS = [
        # Public CDX Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],

        # CDX 11 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

        # CDX 9 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
         "orig.length","orig.offset","orig.filename"],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
         "orig.length","orig.offset","orig.filename"]
    ]

    def __init__(self, cdxline):
        OrderedDict.__init__(self)

        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        cdxformat = None
        for candidate in self.CDX_FORMATS:
            if len(candidate) == len(fields):
                cdxformat = candidate

        if not cdxformat:
            raise Exception('unknown {0}-field cdx format'.format(len(fields)))

        for header, field in zip(cdxformat, fields):
            self[header] = field

        # must be assigned last: each __setitem__ above reset it to None
        self.cdxline = cdxline

    def __setitem__(self, key, value):
        OrderedDict.__setitem__(self, key, value)

        # force regen on next __str__ call
        self.cdxline = None

    def __str__(self):
        """Return the original line, or the current values joined by spaces
        once any field has been set after construction."""
        if self.cdxline:
            return self.cdxline

        return ' '.join(self.values())

228
pywb/cdxserver/cdxops.py Normal file
View File

@ -0,0 +1,228 @@
from cdxobject import CDXObject
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
import timeutils
import bisect
import itertools
import re
from heapq import merge
from collections import deque
#=================================================================
def cdx_text_out(cdx, fields):
    """
    Render a cdx record as text.
    fields is a comma-separated list of field names to output, joined by
    single spaces; when empty or None the full record is returned via str().
    """
    if fields:
        return ' '.join(cdx[name] for name in fields.split(','))

    return str(cdx)
#=================================================================
def cdx_load(sources, params):
    """
    Load cdx lines from all sources and run them through the processing
    steps selected by params, always in this order: revisit resolution,
    filtering, time/status collapsing, reversal, closest-to sorting and
    limit.

    Returns an iterator of cdx objects when params['output'] is 'raw',
    otherwise an iterator of newline-terminated text lines restricted to
    params['fields'] when that is set.
    """
    cdx_iter = make_cdx_iter(load_cdx_streams(sources, params))

    if params.get('resolve_revisits', False):
        cdx_iter = cdx_resolve_revisits(cdx_iter)

    filters = params.get('filter', None)
    if filters:
        cdx_iter = cdx_filter(cdx_iter, filters)

    collapse_time = params.get('collapse_time', None)
    if collapse_time:
        cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)

    limit = int(params.get('limit', 1000000))

    if params.get('reverse', False):
        cdx_iter = cdx_reverse(cdx_iter, limit)

    closest_to = params.get('closest_to', None)
    if closest_to:
        cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)

    if limit:
        cdx_iter = cdx_limit(cdx_iter, limit)

    # output raw cdx objects
    if params.get('output') == 'raw':
        return cdx_iter

    fields = params.get('fields')
    return (cdx_text_out(cdx, fields) + '\n' for cdx in cdx_iter)
#=================================================================
# load and source merge cdx streams
def load_cdx_streams(sources, params):
    """
    Call load_cdx(params) on every source and merge the resulting sorted
    line streams into a single sorted stream.
    """
    # Optimize: no need to merge if just one input
    if len(sources) == 1:
        return sources[0].load_cdx(params)

    streams = [src.load_cdx(params) for src in sources]
    return merge(*streams)
#=================================================================
# convert text cdx stream to CDXObject
def make_cdx_iter(text_iter):
    """
    Lazily wrap each text line of text_iter in a CDXObject.
    """
    return (CDXObject(line) for line in text_iter)
#=================================================================
# limit cdx to at most limit
def cdx_limit(cdx_iter, limit):
    """
    Yield at most limit records from cdx_iter.

    Exactly the yielded records are pulled from cdx_iter: no extra record
    is read and thrown away once the limit has been reached. A limit of
    zero or less yields nothing.
    """
    # islice rejects negative stop values, so clamp to keep "yield nothing"
    for cdx in itertools.islice(cdx_iter, max(limit, 0)):
        yield cdx
#=================================================================
# reverse cdx
def cdx_reverse(cdx_iter, limit):
    """
    Return the last limit records of cdx_iter, most recent first.
    The whole of cdx_iter is consumed before returning; the result is a
    list when limit is 1 and a deque otherwise.
    """
    # optimize for single last
    if limit == 1:
        last = None

        for cdx in cdx_iter:
            last = cdx

        # compare against None so a falsy final record is still returned
        return [last] if last is not None else []

    # appendleft on a full bounded deque discards the oldest record
    reverse_cdxs = deque(maxlen = limit)

    for cdx in cdx_iter:
        reverse_cdxs.appendleft(cdx)

    return reverse_cdxs
#=================================================================
# filter cdx by regex if each filter is field:regex form,
# apply filter to cdx[field]
def cdx_filter(cdx_iter, filter_strings):
    """
    Yield only the records of cdx_iter which pass every filter.

    Each filter is a regex, optionally prefixed with 'field:' to apply it
    to cdx[field] instead of the whole record text, and optionally
    prefixed with '!' to invert it. The regex is applied with re.match,
    i.e. anchored at the start of the value.
    filter_strings may be a list of filters or a single filter string.
    """
    # Support single strings as well
    if isinstance(filter_strings, str):
        filter_strings = [filter_strings]

    def make_filter(string):
        # invert filter
        invert = string.startswith('!')
        if invert:
            string = string[1:]

        field, sep, pattern = string.partition(':')
        if not sep:
            # no field set, apply filter to entire cdx
            field, pattern = '', string

        regex = re.compile(pattern)

        def matches(cdx):
            val = cdx[field] if field else str(cdx)
            return (regex.match(val) is not None) != invert

        return matches

    # a real list: the filters are re-applied to every record
    filters = [make_filter(string) for string in filter_strings]

    for cdx in cdx_iter:
        if all(check(cdx) for check in filters):
            yield cdx
#=================================================================
# collapse by timestamp and status code
def cdx_collapse_time_status(cdx_iter, timelen = 10):
    """
    Collapse consecutive records which share both the first timelen
    characters of the timestamp and the status code, yielding only the
    first record of each such run. timelen may be an int or a numeric
    string.
    """
    timelen = int(timelen)

    def collapse_token(cdx):
        return (cdx['timestamp'][:timelen], cdx['statuscode'])

    # groupby forms runs of consecutive equal tokens; keep the run's head
    for _, run in itertools.groupby(cdx_iter, collapse_token):
        yield next(run)
#=================================================================
# sort CDXCaptureResult by closest to timestamp
def cdx_sort_closest(closest, cdx_iter, limit = 10):
    """
    Return an iterator over up to limit records of cdx_iter, ordered by
    the absolute distance in seconds between their timestamp and the
    closest timestamp. Records at equal distance keep the order in which
    they were read. The whole of cdx_iter is consumed before returning.
    """
    closest_cdx = []
    # distances kept in a parallel list so that ordering never falls back
    # to comparing the cdx records themselves
    closest_keys = []
    closest_sec = timeutils.timestamp_to_sec(closest)

    # NOTE(review): no early exit here; that would need the input to be
    # ordered by timestamp, which is not established in this function.
    for cdx in cdx_iter:
        sec = timeutils.timestamp_to_sec(cdx['timestamp'])
        key = abs(closest_sec - sec)

        # bisect_right places equal distances after the ones already seen
        index = bisect.bisect_right(closest_keys, key)
        closest_keys.insert(index, key)
        closest_cdx.insert(index, cdx)

        if len(closest_cdx) > limit:
            closest_keys.pop()
            closest_cdx.pop()

    return iter(closest_cdx)
#=================================================================
# resolve revisits
# Fields to append from cdx original to revisit
ORIG_TUPLE = ['length', 'offset', 'filename']

def cdx_resolve_revisits(cdx_iter):
    """
    Add 'orig.length', 'orig.offset' and 'orig.filename' to every record.

    A record is a revisit when its mimetype is 'warc/revisit' or its
    filename is '-'. The first non-revisit record seen for a digest is
    remembered as the original for that digest. A revisit whose original
    is known receives the original's mimetype and statuscode, and the
    orig.* fields are copied from the original; every other record gets
    '-' in the orig.* fields.
    """
    originals = {}

    for cdx in cdx_iter:
        is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
        digest = cdx['digest']
        original_cdx = originals.get(digest)

        if not original_cdx and not is_revisit:
            originals[digest] = cdx

        resolved = bool(original_cdx and is_revisit)

        if resolved:
            # Transfer mimetype and statuscode
            cdx['mimetype'] = original_cdx['mimetype']
            cdx['statuscode'] = original_cdx['statuscode']

        # Always add either the original or empty '- - -'
        for field in ORIG_TUPLE:
            cdx['orig.' + field] = original_cdx[field] if resolved else '-'

        yield cdx

160
pywb/cdxserver/cdxserver.py Normal file
View File

@ -0,0 +1,160 @@
import surt
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
from cdxops import cdx_load
import itertools
import logging
import os
import urlparse
#=================================================================
class CDXFile:
    """
    A cdx source backed by a single sorted plain-text cdx file.
    """
    def __init__(self, filename):
        self.filename = filename

    def load_cdx(self, params):
        """
        Return an iterator over the lines of the file matching
        params['key']. Prefix matching is used when params['match_type']
        is 'prefix', exact matching otherwise.

        The file is opened by this call and closed again once iteration
        over the returned iterator finishes.
        """
        if params.get('match_type') == 'prefix':
            iter_func = iter_prefix
        else:
            iter_func = iter_exact

        source = FileReader(self.filename)

        try:
            lines = iter_func(source, params.get('key'))
        except Exception:
            # do not leave the handle open when the lookup itself fails
            source.close()
            raise

        return self._close_when_done(source, lines)

    @staticmethod
    def _close_when_done(source, lines):
        """Yield every item of lines, then close source."""
        try:
            for line in lines:
                yield line
        finally:
            source.close()

    def __str__(self):
        return 'CDX File - ' + self.filename
#=================================================================
class CDXException(Exception):
    """
    Error raised for a cdx query which cannot be processed.
    Carries the offending url, if any, and reports a 400 status line.
    """
    def __init__(self, msg, url = None):
        super(CDXException, self).__init__(msg)
        self.url = url

    def status(self):
        return '400 Bad Request'
#=================================================================
class CDXServer:
    """
    Top-level cdx server object which maintains a list of cdx sources,
    responds to queries and dispatches to the cdx ops for processing
    """

    def __init__(self, sources, surt_ordered = True):
        """
        sources is a list of cdx file paths and/or directories; every
        entry of a directory is offered to add_cdx_loader.
        surt_ordered states whether the index keys are in surt form.
        """
        self.sources = []
        self.surt_ordered = surt_ordered
        logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

        for src in sources:
            if os.path.isdir(src):
                for name in os.listdir(src):
                    # join() also works when src has no trailing separator
                    self.add_cdx_loader(os.path.join(src, name))
            else:
                self.add_cdx_loader(src)

        if len(self.sources) == 0:
            # plain error: there is no active exception to attach here
            logging.error('No CDX Sources Found!')

    def add_cdx_loader(self, filename):
        """Add a source for filename, unless no loader supports it."""
        source = self.create_cdx_loader(filename)
        if not source:
            return

        logging.debug('Adding CDX Source: ' + str(source))
        self.sources.append(source)

    @staticmethod
    def create_cdx_loader(filename):
        """Return a cdx source for filename, or None if unsupported."""
        if filename.endswith('.cdx'):
            return CDXFile(filename)
        return None
        #TODO: support zipnum
        #elif filename.endswith('.summary')
        #    return ZipNumCDXSource(filename)
        #elif filename.startswith('redis://')
        #    return RedisCDXSource(filename)

    def load_cdx(self, **params):
        """
        Run a cdx query. The url param is required; it is converted to the
        lookup key and stored in params['key'] before calling cdx_load.
        Raises CDXException when url is missing or cannot be converted.
        """
        # canonicalize to surt (canonicalization is part of surt conversion)
        try:
            url = params['url']
        except KeyError:
            raise CDXException('The url= param must be specified to query the cdx server')

        try:
            key = surt.surt(url)
        except Exception:
            # include the url in the message itself, not only as attribute
            raise CDXException('Invalid url: ' + str(url), url)

        # if not surt, unsurt the surt to get canonicalized non-surt url
        if not self.surt_ordered:
            key = unsurt(key)

        params['key'] = key

        return cdx_load(self.sources, params)

    def load_cdx_from_request(self, env):
        """
        Run the cdx query described by the QUERY_STRING of a WSGI environ.
        The output param defaults to 'text'.
        """
        #url = wbrequest.wb_url.url

        # use url= param to get actual url
        params = urlparse.parse_qs(env['QUERY_STRING'])

        # parse_qs produces arrays for single values
        # cdxreader expects singleton params for all except filters, so convert here
        # use first value of the list
        for name in list(params):
            if name != 'filter':
                params[name] = params[name][0]

        # applied after the flattening above, otherwise the default would
        # itself be reduced to its first character
        if 'output' not in params:
            params['output'] = 'text'

        return self.load_cdx(**params)

    def __str__(self):
        return 'load cdx indexes from ' + str(self.sources)
#=================================================================
def unsurt(surt):
    """
    Reverse the comma-separated host portion of a surt (everything
    before the first ')/') back into dotted order, leaving the rest
    untouched. Input without a ')/' is returned unchanged.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com)/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
    """
    split_at = surt.find(')/')

    # May not be a valid surt
    if split_at < 0:
        return surt

    labels = surt[:split_at].split(',')
    dotted_host = '.'.join(reversed(labels))
    return dotted_host + surt[split_at:]

103
pywb/cdxserver/timeutils.py Normal file
View File

@ -0,0 +1,103 @@
import re
import time
import datetime
import calendar
#=================================================================
# str <-> datetime conversion
#=================================================================
# Splits a date string on every non-digit character.
# Raw string: '\d' is not a valid escape in a plain string literal and
# triggers an invalid escape sequence warning on newer interpreters.
DATE_TIMESPLIT = re.compile(r'[^\d]')

# strftime / strptime format of a full 14-digit timestamp
TIMESTAMP_14 = '%Y%m%d%H%M%S'

# Source of the digits used to fill in a partial timestamp
PAD_STAMP_END = '29991231235959'
def iso_date_to_datetime(string):
    """
    Build a datetime from the runs of digits found in string, in order
    (year, month, day, ...); a single trailing separator is tolerated

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """
    fields = DATE_TIMESPLIT.split(string)

    # a trailing separator (such as 'Z') leaves an empty last field
    if not fields[-1]:
        fields.pop()

    return datetime.datetime(*[int(field) for field in fields])
def datetime_to_timestamp(dt):
    """
    Format dt as a 14-digit timestamp string

    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """
    stamp = dt.strftime(TIMESTAMP_14)
    return stamp
def iso_date_to_timestamp(string):
    """
    Convert a date string accepted by iso_date_to_datetime() directly
    into a 14-digit timestamp string

    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """
    parsed = iso_date_to_datetime(string)
    return datetime_to_timestamp(parsed)
# default pad is end of range for compatibility
def pad_timestamp(string, pad_str = PAD_STAMP_END):
    """
    Extend a partial timestamp to the length of pad_str by appending
    the tail of pad_str; a string already that long is returned as is

    >>> pad_timestamp('20')
    '20991231235959'

    >>> pad_timestamp('2014')
    '20141231235959'

    >>> pad_timestamp('20141011')
    '20141011235959'

    >>> pad_timestamp('201410110010')
    '20141011001059'
    """
    have = len(string)

    if have >= len(pad_str):
        return string

    return string + pad_str[have:]
def timestamp_to_datetime(string):
    """
    Parse a (possibly partial) timestamp string. Note the result is a
    time.struct_time as produced by time.strptime, not a datetime object

    >>> timestamp_to_datetime('20131226095010')
    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)

    >>> timestamp_to_datetime('2014')
    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
    """
    # Default pad to end of range for compatibility
    full_stamp = pad_timestamp(string)
    return time.strptime(full_stamp, TIMESTAMP_14)
def timestamp_to_sec(string):
    """
    Convert a (possibly partial) timestamp string to seconds since the
    epoch, using calendar.timegm on the parsed time

    >>> timestamp_to_sec('20131226095010')
    1388051410

    >>> timestamp_to_sec('2014')
    1420070399
    """
    parsed = timestamp_to_datetime(string)
    return calendar.timegm(parsed)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -66,28 +66,12 @@ class WBHandler(BaseHandler):
# CDX-Server Handler -- pass all params to cdx server
#=================================================================
class CDXHandler(BaseHandler):
def __init__(self, cdx_reader, view = None):
self.cdx_reader = cdx_reader
def __init__(self, cdx_server, view = None):
self.cdx_server = cdx_server
self.view = view if view else views.TextCapturesView()
def __call__(self, wbrequest):
#url = wbrequest.wb_url.url
# use url= param to get actual url
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
# parse_qs produces arrays for single values
# cdxreader expects singleton params for all except filters, so convert here
# use first value of the list
for name, val in params.iteritems():
if name != 'filter':
params[name] = val[0]
url = params.get('url')
if not url:
raise WbException('Must specify a url= param to query cdx server')
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env)
return self.view.render_response(wbrequest, cdx_lines)
@ -97,7 +81,7 @@ class CDXHandler(BaseHandler):
return None
def __str__(self):
return 'CDX Server: ' + str(self.cdx_reader)
return 'CDX Server: ' + str(self.cdx_server)
#=================================================================

View File

@ -1,15 +1,13 @@
import urllib
import urllib2
import wbexceptions
import itertools
import wbrequestresponse
import surt
from collections import OrderedDict
import binsearch
import cdxserve
from cdxserver.cdxserver import CDXServer, CDXException
from cdxserver.cdxobject import CDXObject
import logging
import os
#=================================================================
class IndexReader:
@ -26,7 +24,13 @@ class IndexReader:
if wbrequest.custom_params:
params.update(wbrequest.custom_params)
cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)
#params['url'] = wburl.url
output = 'raw' if parsed_cdx else 'text'
try:
cdxlines = self.load_cdx(url = wburl.url, output = output, **params)
except CDXException:
raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url)
cdxlines = utils.peek_iter(cdxlines)
@ -53,7 +57,7 @@ class IndexReader:
# for now, list implies local sources
if isinstance(paths, list):
if len(paths) > 1:
return LocalCDXServer(paths, surt_ordered)
return EmbeddedCDXServer(paths, surt_ordered)
else:
# treat as non-list
paths = paths[0]
@ -66,66 +70,13 @@ class IndexReader:
cookie = config.get('cookie', None)
return RemoteCDXServer(uri, cookie = cookie)
else:
return LocalCDXServer([uri], surt_ordered)
return EmbeddedCDXServer([uri], surt_ordered)
#=================================================================
class LocalCDXServer(IndexReader):
"""
>>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1)
>>> pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('redirect', '-'),
('robotflags', '-'),
('length', '1046'),
('offset', '334'),
('filename', 'dupes.warc.gz')]
"""
def __init__(self, sources, surt_ordered = True):
self.sources = []
self.surt_ordered = surt_ordered
logging.info('CDX Surt-Ordered? ' + str(surt_ordered))
for src in sources:
if os.path.isdir(src):
for file in os.listdir(src):
if file.endswith('.cdx'):
full = src + file
logging.info('Adding CDX: ' + full)
self.sources.append(full)
else:
logging.info('Adding CDX: ' + src)
self.sources.append(src)
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
# canonicalize to surt (canonicalization is part of surt conversion)
try:
key = surt.surt(url)
except Exception as e:
raise wbexceptions.BadUrlException('Bad Request Url: ' + url)
# if not surt, unsurt the surt to get canonicalized non-surt url
if not self.surt_ordered:
key = utils.unsurt(key)
match_func = binsearch.iter_exact
params.update(**kwvalues)
params['output'] = 'raw' if parsed_cdx else 'text'
return cdxserve.cdx_serve(key, params, self.sources, match_func)
class EmbeddedCDXServer(CDXServer, IndexReader):
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
if wburl.type == wburl.URL_QUERY:
@ -198,7 +149,7 @@ class RemoteCDXServer(IndexReader):
raise
if parsed_cdx:
return (CDXCaptureResult(cdx) for cdx in response)
return (CDXObject(cdx) for cdx in response)
else:
return iter(response)
@ -238,62 +189,6 @@ class RemoteCDXServer(IndexReader):
return 'server cdx from ' + self.server_url
#=================================================================
class CDXCaptureResult(OrderedDict):
CDX_FORMATS = [
# Public CDX Format
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
# CDX 11 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
# CDX 9 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
# CDX 11 Format + 3 revisit resolve fields
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
"orig.length","orig.offset","orig.filename"],
# CDX 9 Format + 3 revisit resolve fields
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
"orig.length","orig.offset","orig.filename"]
]
def __init__(self, cdxline):
OrderedDict.__init__(self)
cdxline = cdxline.rstrip()
fields = cdxline.split(' ')
cdxformat = None
for i in CDXCaptureResult.CDX_FORMATS:
if len(i) == len(fields):
cdxformat = i
if not cdxformat:
raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
for header, field in itertools.izip(cdxformat, fields):
self[header] = field
self.cdxline = cdxline
def __setitem__(self, key, value):
OrderedDict.__setitem__(self, key, value)
# force regen on next __str__ call
self.cdxline = None
def __str__(self):
if self.cdxline:
return self.cdxline
li = itertools.imap(lambda (n, val): val, self.items())
return ' '.join(li)
# Testing
import utils

View File

@ -1,5 +1,5 @@
import redis
import binsearch
import binsearch.binsearch
import urlparse
import os
@ -46,10 +46,10 @@ class RedisResolver:
class PathIndexResolver:
def __init__(self, pathindex_file):
self.pathindex_file = pathindex_file
self.reader = binsearch.FileReader(pathindex_file)
self.reader = binsearch.binsearch.FileReader(pathindex_file)
def __call__(self, filename):
result = binsearch.iter_exact(self.reader, filename, '\t')
result = binsearch.binsearch.iter_exact(self.reader, filename, '\t')
def gen_list(result):
for pathline in result:

View File

@ -43,100 +43,6 @@ class PerfTimer:
self.perfdict[self.name] = str(self.end - self.start)
#=================================================================
# str <-> datetime conversion
#=================================================================
DATE_TIMESPLIT = re.compile('[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959'
def iso_date_to_datetime(string):
"""
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
datetime.datetime(2013, 12, 26, 10, 11, 12)
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
datetime.datetime(2013, 12, 26, 10, 11, 12)
"""
nums = DATE_TIMESPLIT.split(string)
if nums[-1] == '':
nums = nums[:-1]
dt = datetime.datetime(*map(int, nums))
return dt
def datetime_to_timestamp(dt):
"""
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
'20131226101112'
"""
return dt.strftime(TIMESTAMP_14)
def iso_date_to_timestamp(string):
"""
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
'20131226101112'
>>> iso_date_to_timestamp('2013-12-26T10:11:12')
'20131226101112'
"""
return datetime_to_timestamp(iso_date_to_datetime(string))
# default pad is end of range for compatibility
def pad_timestamp(string, pad_str = PAD_STAMP_END):
"""
>>> pad_timestamp('20')
'20991231235959'
>>> pad_timestamp('2014')
'20141231235959'
>>> pad_timestamp('20141011')
'20141011235959'
>>> pad_timestamp('201410110010')
'20141011001059'
"""
str_len = len(string)
pad_len = len(pad_str)
return string if str_len >= pad_len else string + pad_str[str_len:]
def timestamp_to_datetime(string):
"""
>>> timestamp_to_datetime('20131226095010')
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
>>> timestamp_to_datetime('2014')
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
"""
# Default pad to end of range for comptability
return time.strptime(pad_timestamp(string), TIMESTAMP_14)
def timestamp_to_sec(string):
"""
>>> timestamp_to_sec('20131226095010')
1388051410
>>> timestamp_to_sec('2014')
1420070399
"""
return calendar.timegm(timestamp_to_datetime(string))
#=================================================================
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3

View File

@ -1,9 +1,9 @@
import indexreader
import utils
import cdxserver.timeutils as timeutils
import wbrequestresponse
import wbexceptions
import time
import urlparse
import time
from os import path
from itertools import imap
@ -58,7 +58,7 @@ class J2TemplateView:
# Filters
@staticmethod
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
value = utils.timestamp_to_datetime(value)
value = timeutils.timestamp_to_datetime(value)
return time.strftime(format, value)
@staticmethod

View File

@ -2,6 +2,7 @@ import utils
import wbexceptions
from wbrequestresponse import WbResponse, StatusAndHeaders
from cdxserver.cdxserver import CDXException
import os
import importlib
@ -33,7 +34,7 @@ def create_wb_app(wb_router):
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
response = handle_exception(env, wb_router.error_view, e, False)
except wbexceptions.WbException as wbe:
except (wbexceptions.WbException, CDXException) as wbe:
response = handle_exception(env, wb_router.error_view, wbe, False)
except Exception as e:

View File

@ -11,8 +11,8 @@ setuptools.setup(name='pywb',
author_email='ilya@archive.org',
long_description=open('README.md').read(),
license='GPL',
packages=['pywb'],
provides=['pywb'],
packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
package_data={'pywb': ['ui/*', 'static/*']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],

0
tests/__init__.py Normal file
View File

43
tests/test_binsearch.py Normal file
View File

@ -0,0 +1,43 @@
import os
from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
def binsearch_cdx_test(key, iter_func):
    """
    Print each line of the sample iana.cdx that iter_func yields for key

    # Prefix Search
    >>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root', iter_exact)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/', iter_exact)
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz

    # Exact Search
    >>> binsearch_cdx_test('org,iaana)/', iter_exact)
    >>> binsearch_cdx_test('org,ibna)/', iter_exact)

    >>> binsearch_cdx_test('org,iana)/time-zones', iter_exact)
    org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
    """
    reader = FileReader(test_cdx_dir + 'iana.cdx')
    matches = iter_func(reader, key)

    for cdx_line in matches:
        print(cdx_line)
if __name__ == "__main__":
import doctest
doctest.testmod()

149
tests/test_cdxserve.py Normal file
View File

@ -0,0 +1,149 @@
from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader
from ..pywb.cdxserver.cdxserver import CDXServer
import os
import sys
import pprint
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
"""
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
# Filter cdx
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625
>>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706
>>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('redirect', '-'),
('robotflags', '-'),
('length', '1046'),
('offset', '334'),
('filename', 'dupes.warc.gz')]
"""
kwparams['url'] = url
kwparams['output'] = 'text'
server = CDXServer(sources)
results = server.load_cdx(**kwparams)
for x in results:
sys.stdout.write(x)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,14 +1,14 @@
import webtest
import pywb.pywb_init
from pywb.indexreader import CDXCaptureResult
from ..pywb.pywb_init import pywb_config
from ..pywb.wbapp import create_wb_app
from ..pywb.cdxserver.cdxobject import CDXObject
class TestWb:
TEST_CONFIG = 'test_config.yaml'
def setup(self):
import pywb.wbapp
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config(self.TEST_CONFIG))
self.app = create_wb_app(pywb_config(self.TEST_CONFIG))
self.testapp = webtest.TestApp(self.app)
def _assert_basic_html(self, resp):
@ -144,8 +144,8 @@ class TestWb:
# combine collapsing, reversing and revisit resolving
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')
# convert back to CDXCaptureResult
cdxs = map(CDXCaptureResult, resp.body.rstrip().split('\n'))
# convert back to CDXObject
cdxs = map(CDXObject, resp.body.rstrip().split('\n'))
assert len(cdxs) == 3, len(cdxs)
# verify timestamps