mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactoring of binsearch and cdxserver into separate packages
also move complicated doctests and integration tests to tests/
This commit is contained in:
parent
e4f409b2a4
commit
2528ee0a7c
@ -6,4 +6,5 @@ install:
|
||||
- "python setup.py -q install"
|
||||
# command to run tests
|
||||
#script: nosetests --with-doctest
|
||||
script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
|
||||
#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
|
||||
script: py.test -v --doctest-module ./tests/*.py ./pywb/
|
||||
|
@ -13,6 +13,9 @@ from wbrequestresponse import StatusAndHeaders
|
||||
#=================================================================
|
||||
|
||||
class HttpLoader:
|
||||
"""
|
||||
Load content over http with range request and optional signature
|
||||
"""
|
||||
def __init__(self, hmac = None, hmac_duration = 30):
|
||||
self.hmac = hmac
|
||||
self.hmac_duration = hmac_duration
|
||||
@ -38,6 +41,8 @@ class HttpLoader:
|
||||
#=================================================================
|
||||
class FileLoader:
|
||||
"""
|
||||
Load content from local file-system
|
||||
|
||||
# Ensure attempt to read more than 100 bytes, only reads 100 bytes
|
||||
>>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read('400'))
|
||||
100
|
||||
|
@ -1,147 +0,0 @@
|
||||
from collections import deque
|
||||
import os
|
||||
import itertools
|
||||
|
||||
class FileReader:
|
||||
def __init__(self, filename):
|
||||
self.fh = open(filename, 'rb')
|
||||
self.filename = filename
|
||||
self.size = os.path.getsize(filename)
|
||||
|
||||
def getsize(self):
|
||||
return self.size
|
||||
|
||||
def readline(self):
|
||||
return self.fh.readline()
|
||||
|
||||
def seek(self, offset):
|
||||
return self.fh.seek(offset)
|
||||
|
||||
def close(self):
|
||||
return self.fh.close()
|
||||
|
||||
|
||||
def binsearch_offset(reader, key, compare_func = cmp, block_size = 8192):
|
||||
min = 0
|
||||
max = reader.getsize() / block_size
|
||||
|
||||
while (max - min > 1):
|
||||
mid = min + ((max - min) / 2)
|
||||
reader.seek(mid * block_size)
|
||||
|
||||
if mid > 0:
|
||||
reader.readline() # skip partial line
|
||||
|
||||
line = reader.readline()
|
||||
|
||||
if compare_func(key, line) > 0:
|
||||
min = mid
|
||||
else:
|
||||
max = mid
|
||||
|
||||
return (min * block_size)
|
||||
|
||||
|
||||
def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192):
|
||||
min = binsearch_offset(reader, key, compare_func, block_size)
|
||||
|
||||
reader.seek(min)
|
||||
|
||||
if min > 0:
|
||||
reader.readline() # skip partial line
|
||||
|
||||
if prev_size > 1:
|
||||
prev_deque = deque(maxlen = prev_size)
|
||||
|
||||
line = None
|
||||
|
||||
while True:
|
||||
line = reader.readline()
|
||||
if not line:
|
||||
break
|
||||
if compare_func(line, key) >= 0:
|
||||
break
|
||||
|
||||
if prev_size == 1:
|
||||
prev = line
|
||||
elif prev_size > 1:
|
||||
prev_deque.append(line)
|
||||
|
||||
def gen_iter(line):
|
||||
if prev_size == 1:
|
||||
yield prev.rstrip()
|
||||
elif prev_size > 1:
|
||||
for i in prev_deque:
|
||||
yield i.rstrip()
|
||||
|
||||
while line:
|
||||
yield line.rstrip()
|
||||
line = reader.readline()
|
||||
|
||||
return gen_iter(line)
|
||||
|
||||
|
||||
# Iterate over prefix matches
|
||||
def iter_prefix(reader, key):
|
||||
"""
|
||||
>>> print_test_cdx('org,iana)/domains/root', iter_prefix)
|
||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
"""
|
||||
|
||||
lines = search(reader, key)
|
||||
return itertools.takewhile(lambda line: line.startswith(key), lines)
|
||||
|
||||
|
||||
def iter_exact(reader, key, tok = ' '):
|
||||
"""
|
||||
>>> print_test_cdx('org,iana)/domains/root', iter_exact)
|
||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
||||
|
||||
>>> print_test_cdx('org,iana)/', iter_exact)
|
||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
||||
|
||||
>>> print_test_cdx('org,iana)/domains/root/db', iter_exact)
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
|
||||
>>> print_test_cdx('org,iaana)/', iter_exact)
|
||||
>>> print_test_cdx('org,ibna)/', iter_exact)
|
||||
|
||||
>>> print_test_cdx('org,iana)/time-zones', iter_exact)
|
||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
"""
|
||||
|
||||
lines = search(reader, key)
|
||||
|
||||
def check_key(line):
|
||||
line_key = line.split(tok, 1)[0]
|
||||
return line_key == key
|
||||
|
||||
return itertools.takewhile(check_key, lines)
|
||||
|
||||
|
||||
import utils
|
||||
if __name__ == "__main__" or utils.enable_doctests():
|
||||
|
||||
def create_test_cdx(test_file):
|
||||
path = utils.test_data_dir() + 'cdx/' + test_file
|
||||
return FileReader(path)
|
||||
|
||||
test_cdx = create_test_cdx('iana.cdx')
|
||||
|
||||
def print_test_cdx(key, iter_func, filename = None):
|
||||
cdx = test_cdx if not filename else create_test_cdx(filename)
|
||||
for line in iter_func(cdx, key):
|
||||
print line
|
||||
|
||||
#cdx.close()
|
||||
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
||||
|
||||
|
0
pywb/binsearch/__init__.py
Normal file
0
pywb/binsearch/__init__.py
Normal file
123
pywb/binsearch/binsearch.py
Normal file
123
pywb/binsearch/binsearch.py
Normal file
@ -0,0 +1,123 @@
|
||||
from collections import deque
|
||||
import os
|
||||
import itertools
|
||||
|
||||
#=================================================================
|
||||
# Binary Search over a text file
|
||||
#=================================================================
|
||||
class FileReader:
    """
    A very simple file-like object wrapper that knows its size

    getsize() method returns the filesize, as measured when the
    reader was created.

    The reader may also be used as a context manager, in which case the
    underlying file handle is closed on leaving the ``with`` block.
    """
    def __init__(self, filename):
        # opened in binary mode, so readline() returns raw bytes of each line
        self.fh = open(filename, 'rb')
        self.filename = filename
        self.size = os.path.getsize(filename)

    def getsize(self):
        """Return the file size in bytes recorded at construction time."""
        return self.size

    def readline(self):
        """Read and return the next line from the current file position."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the file position to the absolute byte offset."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle."""
        return self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # never suppress an exception raised inside the with-block
        return False
|
||||
|
||||
|
||||
#=================================================================
|
||||
def _default_compare(a, b):
    """Three-way comparison: negative, zero or positive as a <, ==, > b."""
    return (a > b) - (a < b)


def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """
    Binary search a sorted text file, block by block, for the block from
    which a linear scan for 'key' should begin.

    File is subdivided into block_size (default 8192) sized blocks. For each
    probed block, the first full line after the block start is compared
    against the key.

    reader must provide getsize(), seek(offset) and readline().
    Optional compare_func(a, b) may be specified as a cmp-style three-way
    comparison; when omitted, natural ordering is used.

    Returns the byte offset of the start of the selected block (always a
    multiple of block_size), not the offset of the matching line itself.
    """
    if compare_func is None:
        compare_func = _default_compare

    low = 0
    high = reader.getsize() // block_size

    while high - low > 1:
        # high - low >= 2 here, so mid is always >= 1 and the seek always
        # lands at a non-zero offset
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)

        reader.readline()  # skip partial line
        line = reader.readline()

        if compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size


def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """
    Perform a binsearch for a specified key down to block_size (8192) sized blocks,
    followed by linear search within the block to find first matching line.

    When performing linear search, keep track of up to prev_size previous lines
    before the first matching line. Fewer (possibly zero) previous lines are
    produced when fewer were scanned before the match was reached.

    Optional compare_func(a, b) is a cmp-style three-way comparison; when
    omitted, natural ordering is used.

    Returns a generator which yields the remembered previous lines (if any),
    then the first line comparing >= key and every line after it until the
    end of the file. Lines are yielded with trailing whitespace stripped and
    are read from reader lazily as the generator is consumed.
    """
    if compare_func is None:
        compare_func = _default_compare

    offset = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(offset)

    if offset > 0:
        reader.readline()  # skip partial line

    # A bounded deque covers prev_size == 1 as well as larger values, and is
    # simply empty when the very first scanned line already matches.
    prev_lines = deque(maxlen=prev_size) if prev_size > 0 else ()

    line = reader.readline()

    while line and compare_func(line, key) < 0:
        if prev_size > 0:
            prev_lines.append(line)

        line = reader.readline()

    def gen_iter(line):
        for prev_line in prev_lines:
            yield prev_line.rstrip()

        while line:
            yield line.rstrip()
            line = reader.readline()

    return gen_iter(line)
|
||||
|
||||
|
||||
# Iterate over prefix matches
|
||||
def iter_prefix(reader, key):
    """
    Build an iterator over the prefix matches for a key in a sorted text file.

    Lines are taken from search(reader, key) for as long as each one
    starts with key; iteration stops at the first line that does not.
    """
    candidates = search(reader, key)

    def has_prefix(line):
        return line.startswith(key)

    return itertools.takewhile(has_prefix, candidates)
|
||||
|
||||
|
||||
def iter_exact(reader, key, token=' '):
    """
    Build an iterator over the exact matches for a key in a sorted text file.

    The key is considered terminated by token (default ' '), so an exact
    match is expressed as a prefix match on key followed by token.
    """
    terminated_key = key + token
    return iter_prefix(reader, terminated_key)
|
||||
|
358
pywb/cdxserve.py
358
pywb/cdxserve.py
@ -1,358 +0,0 @@
|
||||
import binsearch
|
||||
import indexreader
|
||||
import bisect
|
||||
import itertools
|
||||
import re
|
||||
|
||||
from heapq import merge
|
||||
from collections import deque
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_text_out(cdx, fields):
|
||||
if not fields:
|
||||
return str(cdx)
|
||||
else:
|
||||
return ' '.join(map(lambda x: cdx[x], fields.split(',')))
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
|
||||
cdx_iter = merge_sort_streams(sources, key, match_func)
|
||||
|
||||
cdx_iter = make_cdx_iter(cdx_iter)
|
||||
|
||||
resolve_revisits = params.get('resolve_revisits', False)
|
||||
if resolve_revisits:
|
||||
cdx_iter = cdx_resolve_revisits(cdx_iter)
|
||||
|
||||
filters = params.get('filter', None)
|
||||
if filters:
|
||||
cdx_iter = cdx_filter(cdx_iter, filters)
|
||||
|
||||
collapse_time = params.get('collapse_time', None)
|
||||
if collapse_time:
|
||||
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
|
||||
|
||||
limit = int(params.get('limit', 1000000))
|
||||
|
||||
reverse = params.get('reverse', False)
|
||||
if reverse:
|
||||
cdx_iter = cdx_reverse(cdx_iter, limit)
|
||||
|
||||
closest_to = params.get('closest_to', None)
|
||||
if closest_to:
|
||||
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
|
||||
|
||||
if limit:
|
||||
cdx_iter = cdx_limit(cdx_iter, limit)
|
||||
|
||||
# output raw cdx objects
|
||||
if params.get('output') == 'raw':
|
||||
return cdx_iter
|
||||
|
||||
def write_cdx(fields):
|
||||
for cdx in cdx_iter:
|
||||
yield cdx_text_out(cdx, fields) + '\n'
|
||||
|
||||
return write_cdx(params.get('fields'))
|
||||
|
||||
|
||||
#=================================================================
|
||||
# merge multiple cdx streams
|
||||
def merge_sort_streams(sources, key, iter_func):
|
||||
"""
|
||||
>>> test_cdx(key = 'org,iana)/', sources = [test_dir + 'dupes.cdx', test_dir + 'iana.cdx'])
|
||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
||||
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
|
||||
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
|
||||
"""
|
||||
|
||||
def load_src(source):
|
||||
source = binsearch.FileReader(source)
|
||||
source = iter_func(source, key)
|
||||
return source
|
||||
|
||||
# Optimize: no need to merge if just one input
|
||||
if len(sources) == 1:
|
||||
return load_src(sources[0])
|
||||
|
||||
source_iters = map(load_src, sources)
|
||||
merged_stream = merge(*(source_iters))
|
||||
return merged_stream
|
||||
|
||||
#=================================================================
|
||||
# convert text cdx stream to CDXCaptureResult
|
||||
def make_cdx_iter(text_iter):
|
||||
return itertools.imap(lambda line: indexreader.CDXCaptureResult(line), text_iter)
|
||||
|
||||
|
||||
#=================================================================
|
||||
# limit cdx to at most limit
|
||||
def cdx_limit(cdx_iter, limit):
|
||||
"""
|
||||
>>> test_cdx('org,iana)/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
|
||||
|
||||
"""
|
||||
|
||||
for cdx, _ in itertools.izip(cdx_iter, xrange(limit)):
|
||||
yield cdx
|
||||
|
||||
|
||||
#=================================================================
|
||||
# reverse cdx
|
||||
def cdx_reverse(cdx_iter, limit):
|
||||
"""
|
||||
>>> test_cdx('org,iana)/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
|
||||
>>> test_cdx('org,iana)/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
|
||||
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
||||
|
||||
# no match, single result
|
||||
>>> test_cdx('org,iana)/dont_have_this', reverse = True, resolve_revisits = True, limit = 1)
|
||||
"""
|
||||
|
||||
# optimize for single last
|
||||
if limit == 1:
|
||||
last = None
|
||||
|
||||
for cdx in cdx_iter:
|
||||
last = cdx
|
||||
|
||||
return [last] if last else []
|
||||
|
||||
reverse_cdxs = deque(maxlen = limit)
|
||||
|
||||
for cdx in cdx_iter:
|
||||
reverse_cdxs.appendleft(cdx)
|
||||
|
||||
return reverse_cdxs
|
||||
|
||||
|
||||
#=================================================================
|
||||
# filter cdx by regex if each filter is field:regex form,
|
||||
# apply filter to cdx[field]
|
||||
def cdx_filter(cdx_iter, filter_strings):
|
||||
"""
|
||||
>>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filter = ['mimetype:text/html'])
|
||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
|
||||
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
|
||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
|
||||
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', filter = 'statuscode:200')
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
"""
|
||||
|
||||
# Support single strings as well
|
||||
if isinstance(filter_strings, str):
|
||||
filter_strings = [filter_strings]
|
||||
|
||||
filters = []
|
||||
|
||||
class Filter:
|
||||
def __init__(self, string):
|
||||
# invert filter
|
||||
self.invert = string.startswith('!')
|
||||
if self.invert:
|
||||
string = string[1:]
|
||||
|
||||
parts = string.split(':', 1)
|
||||
# no field set, apply filter to entire cdx
|
||||
if len(parts) == 1:
|
||||
self.field = ''
|
||||
else:
|
||||
# apply filter to cdx[field]
|
||||
self.field = parts[0]
|
||||
string = parts[1]
|
||||
|
||||
self.regex = re.compile(string)
|
||||
|
||||
def __call__(self, cdx):
|
||||
val = cdx[self.field] if self.field else str(cdx)
|
||||
matched = self.regex.match(val) is not None
|
||||
return matched ^ self.invert
|
||||
|
||||
filters = map(Filter, filter_strings)
|
||||
|
||||
for cdx in cdx_iter:
|
||||
if all (x(cdx) for x in filters):
|
||||
yield cdx
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
# collapse by timestamp and status code
|
||||
def cdx_collapse_time_status(cdx_iter, timelen = 10):
|
||||
"""
|
||||
# unresolved revisits, different statuscode results in an extra repeat
|
||||
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = 11)
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
|
||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
||||
|
||||
# resolved revisits
|
||||
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
||||
|
||||
"""
|
||||
|
||||
timelen = int(timelen)
|
||||
|
||||
last_token = None
|
||||
|
||||
for cdx in cdx_iter:
|
||||
curr_token = (cdx['timestamp'][:timelen], cdx['statuscode'])
|
||||
|
||||
# yield if last_dedup_time is diff, otherwise skip
|
||||
if curr_token != last_token:
|
||||
last_token = curr_token
|
||||
yield cdx
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
# sort CDXCaptureResult by closest to timestamp
|
||||
def cdx_sort_closest(closest, cdx_iter, limit = 10):
|
||||
"""
|
||||
>>> test_cdx(closest_to = '20140126200826', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
|
||||
20140126200826
|
||||
20140126200816
|
||||
20140126200805
|
||||
20140126200912
|
||||
20140126200738
|
||||
20140126200930
|
||||
20140126200718
|
||||
20140126200706
|
||||
20140126200654
|
||||
20140126200625
|
||||
|
||||
>>> test_cdx(closest_to = '20140126201306', key = 'org,iana)/dnssec', resolve_revisits = True, sources = [test_dir + 'dupes.cdx', test_dir + 'iana.cdx'])
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||
|
||||
|
||||
>>> test_cdx(closest_to = '20140126201307', key = 'org,iana)/dnssec', resolve_revisits = True)
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||
|
||||
# equal dist prefer earlier
|
||||
>>> test_cdx(closest_to = '20140126200700', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
|
||||
>>> test_cdx(closest_to = '20140126200659', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
|
||||
20140126200654
|
||||
20140126200706
|
||||
|
||||
>>> test_cdx(closest_to = '20140126200701', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
|
||||
20140126200706
|
||||
20140126200654
|
||||
|
||||
"""
|
||||
closest_cdx = []
|
||||
|
||||
closest_sec = utils.timestamp_to_sec(closest)
|
||||
|
||||
for cdx in cdx_iter:
|
||||
sec = utils.timestamp_to_sec(cdx['timestamp'])
|
||||
key = abs(closest_sec - sec)
|
||||
|
||||
# create tuple to sort by key
|
||||
bisect.insort(closest_cdx, (key, cdx))
|
||||
|
||||
if len(closest_cdx) == limit:
|
||||
# assuming cdx in ascending order and keys have started increasing
|
||||
if key > closest_cdx[-1]:
|
||||
break
|
||||
|
||||
if len(closest_cdx) > limit:
|
||||
closest_cdx.pop()
|
||||
|
||||
|
||||
return itertools.imap(lambda x: x[1], closest_cdx)
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
# resolve revisits
|
||||
|
||||
# Fields to append from cdx original to revisit
|
||||
ORIG_TUPLE = ['length', 'offset', 'filename']
|
||||
|
||||
def cdx_resolve_revisits(cdx_iter):
|
||||
"""
|
||||
>>> test_cdx('org,iana)/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
|
||||
>>> test_cdx('org,iana)/domains/root/db', resolve_revisits = True)
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
|
||||
"""
|
||||
|
||||
|
||||
originals = {}
|
||||
|
||||
for cdx in cdx_iter:
|
||||
is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
|
||||
|
||||
digest = cdx['digest']
|
||||
|
||||
original_cdx = originals.get(digest)
|
||||
|
||||
if not original_cdx and not is_revisit:
|
||||
originals[digest] = cdx
|
||||
|
||||
|
||||
if original_cdx and is_revisit:
|
||||
fill_orig = lambda field: original_cdx[field]
|
||||
# Transfer mimetype and statuscode
|
||||
cdx['mimetype'] = original_cdx['mimetype']
|
||||
cdx['statuscode'] = original_cdx['statuscode']
|
||||
else:
|
||||
fill_orig = lambda field: '-'
|
||||
|
||||
# Always add either the original or empty '- - -'
|
||||
for field in ORIG_TUPLE:
|
||||
cdx['orig.' + field] = fill_orig(field)
|
||||
|
||||
yield cdx
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
import utils
|
||||
if __name__ == "__main__" or utils.enable_doctests():
|
||||
import os
|
||||
import sys
|
||||
|
||||
test_dir = utils.test_data_dir() + 'cdx/'
|
||||
|
||||
def test_cdx(key, match_func = binsearch.iter_exact, sources = [test_dir + 'iana.cdx'], **kwparams):
|
||||
for x in cdx_serve(key, kwparams, sources, match_func):
|
||||
sys.stdout.write(x)
|
||||
|
||||
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
0
pywb/cdxserver/__init__.py
Normal file
0
pywb/cdxserver/__init__.py
Normal file
42
pywb/cdxserver/cdxapp.py
Normal file
42
pywb/cdxserver/cdxapp.py
Normal file
@ -0,0 +1,42 @@
|
||||
from cdxserver import CDXServer
|
||||
import logging
|
||||
import os
|
||||
|
||||
|
||||
# Path to the sample_archive/cdx/ directory, located two levels above the
# directory containing this module; used by main() when no config is given.
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../sample_archive/cdx/'
|
||||
|
||||
#=================================================================
|
||||
def main(config=None):
    """
    Configure logging and build a WSGI application around a CDXServer.

    config is passed to CDXServer; when it is empty or None, a list holding
    test_cdx_dir is used instead.

    Returns a WSGI callable ``application(env, start_response)`` which
    responds with 'text/plain': the result of
    cdxserver.load_cdx_from_request(env) with status '200 OK', or the
    exception message with status '400 Error' if the request fails.
    """
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG)

    if not config:
        config = [test_cdx_dir]

    cdxserver = CDXServer(config)

    def application(env, start_response):
        try:
            # Materialize the (possibly lazy) result before any status is
            # sent, so that a failure while iterating is still reported as
            # an error instead of calling start_response a second time.
            response = list(cdxserver.load_cdx_from_request(env))

        except Exception as exc:
            import traceback
            # format_exc() formats the exception currently being handled;
            # its only parameter is a frame limit, not the exception
            err_details = traceback.format_exc()
            start_response('400 Error', [('Content-Type', 'text/plain')])
            response = [str(exc)]
            print(err_details)

        else:
            start_response('200 OK', [('Content-Type', 'text/plain')])

        return response

    return application
|
||||
|
||||
|
||||
# Nothing happens when this file is run as a script; when it is imported,
# a module-level `application` is created from main() with its default config.
if __name__ != "__main__":
    application = main()
|
||||
|
||||
|
57
pywb/cdxserver/cdxobject.py
Normal file
57
pywb/cdxserver/cdxobject.py
Normal file
@ -0,0 +1,57 @@
|
||||
from collections import OrderedDict
|
||||
import itertools
|
||||
|
||||
#=================================================================
|
||||
class CDXObject(OrderedDict):
    """
    Ordered mapping of cdx field name -> field value, parsed from one
    space-delimited cdx line.

    The field names are chosen from CDX_FORMATS according to the number of
    fields found in the line. str() of the object gives back the cdx line;
    it is regenerated from the current values after any field is set.
    """

    CDX_FORMATS = [
        # Public CDX Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],

        # CDX 11 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

        # CDX 9 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
         "orig.length","orig.offset","orig.filename"],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
         "orig.length","orig.offset","orig.filename"]
    ]

    def __init__(self, cdxline):
        """
        Parse cdxline (trailing whitespace is ignored) into named fields.

        Raises Exception if the number of fields matches no known format.
        """
        OrderedDict.__init__(self)

        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        # Scan in reverse and stop at the first hit: this selects the same
        # format as a full forward scan in which the last match wins.
        cdxformat = next((candidate for candidate in reversed(self.CDX_FORMATS)
                          if len(candidate) == len(fields)), None)

        if cdxformat is None:
            raise Exception('unknown {0}-field cdx format'.format(len(fields)))

        for header, field in zip(cdxformat, fields):
            self[header] = field

        # set last: each assignment above resets the cached line to None
        self.cdxline = cdxline

    def __setitem__(self, key, value):
        OrderedDict.__setitem__(self, key, value)

        # force regen on next __str__ call
        self.cdxline = None

    def __str__(self):
        """Return the cached cdx line, or the current values joined by spaces."""
        if self.cdxline:
            return self.cdxline

        return ' '.join(self.values())
|
||||
|
||||
|
228
pywb/cdxserver/cdxops.py
Normal file
228
pywb/cdxserver/cdxops.py
Normal file
@ -0,0 +1,228 @@
|
||||
from cdxobject import CDXObject
|
||||
|
||||
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
|
||||
|
||||
import timeutils
|
||||
import bisect
|
||||
import itertools
|
||||
import re
|
||||
|
||||
from heapq import merge
|
||||
from collections import deque
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_text_out(cdx, fields):
    """
    Render one cdx record as a line of text.

    When fields (a comma-separated string of field names) is given, only
    those fields are output, space-delimited, in the order listed.
    Otherwise the str() form of the whole record is returned.
    """
    if fields:
        return ' '.join([cdx[name] for name in fields.split(',')])

    return str(cdx)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_load(sources, params):
    """
    Load cdx records from sources and apply the operations requested in params.

    The stages are applied in this fixed order, each only if its param is set:
    resolve_revisits, filter, collapse_time, reverse, closest_to and
    finally limit (defaults to 1000000).

    Returns an iterator of cdx objects if params['output'] is 'raw',
    otherwise a generator of newline-terminated text lines, restricted to
    params['fields'] when given.
    """
    stream = make_cdx_iter(load_cdx_streams(sources, params))

    if params.get('resolve_revisits', False):
        stream = cdx_resolve_revisits(stream)

    filter_strings = params.get('filter', None)
    if filter_strings:
        stream = cdx_filter(stream, filter_strings)

    collapse_len = params.get('collapse_time', None)
    if collapse_len:
        stream = cdx_collapse_time_status(stream, collapse_len)

    # limit is also handed to the reverse and closest stages below
    max_results = int(params.get('limit', 1000000))

    if params.get('reverse', False):
        stream = cdx_reverse(stream, max_results)

    closest_ts = params.get('closest_to', None)
    if closest_ts:
        stream = cdx_sort_closest(closest_ts, stream, max_results)

    if max_results:
        stream = cdx_limit(stream, max_results)

    # output raw cdx objects
    if params.get('output') == 'raw':
        return stream

    def text_lines(fields):
        for record in stream:
            yield cdx_text_out(record, fields) + '\n'

    return text_lines(params.get('fields'))
|
||||
|
||||
|
||||
#=================================================================
|
||||
# load and source merge cdx streams
|
||||
def load_cdx_streams(sources, params):
    """
    Open every cdx source and return a single stream of cdx lines.

    Each source must provide load_cdx(params) returning an iterable of
    lines. A lone source is returned directly; several sources are
    combined with heapq.merge, which expects every input to be sorted
    already.
    """
    # Optimize: no need to merge if just one input
    if len(sources) == 1:
        only_source = sources[0]
        return only_source.load_cdx(params)

    streams = [source.load_cdx(params) for source in sources]
    return merge(*streams)
|
||||
|
||||
#=================================================================
|
||||
# convert text cdx stream to CDXObject
|
||||
def make_cdx_iter(text_iter):
    """
    Lazily wrap every text line of `text_iter` in a CDXObject.

    Nothing is parsed until the returned iterator is advanced.
    """
    return (CDXObject(line) for line in text_iter)
|
||||
|
||||
|
||||
#=================================================================
|
||||
# limit cdx to at most limit
|
||||
def cdx_limit(cdx_iter, limit):
    """
    Yield at most `limit` records from `cdx_iter`.

    cdx_iter -- any iterable of cdx records
    limit    -- maximum number of records to yield; zero or a negative
                value yields nothing

    Exactly `limit` records are pulled from the underlying iterator.
    Pairing the records with a counter via izip() would fetch (and
    silently drop) one extra record before noticing that the counter
    had run out.
    """
    # clamp so that a negative limit keeps meaning "nothing"
    # (islice itself rejects negative stop values)
    for cdx in itertools.islice(cdx_iter, max(limit, 0)):
        yield cdx
|
||||
|
||||
|
||||
#=================================================================
|
||||
# reverse cdx
|
||||
def cdx_reverse(cdx_iter, limit):
    """
    Return the last `limit` records of `cdx_iter`, newest (last read)
    first.

    The input is consumed completely. For limit == 1 a plain list with
    zero or one element is returned; otherwise a deque bounded to
    `limit` entries.
    """
    # optimize for single last
    if limit == 1:
        newest = None
        for newest in cdx_iter:
            pass

        return [newest] if newest else []

    # a bounded deque keeps only the trailing `limit` records while the
    # stream is read; flipping it afterwards puts the last record first
    tail = deque(cdx_iter, maxlen = limit)
    tail.reverse()
    return tail
|
||||
|
||||
|
||||
#=================================================================
|
||||
# filter cdx by regex if each filter is field:regex form,
|
||||
# apply filter to cdx[field]
|
||||
def cdx_filter(cdx_iter, filter_strings):
    """
    Yield only the records of `cdx_iter` accepted by every filter.

    filter_strings -- one filter string or a list of them. Each has the
                      form [!][field:]regex:
                        field:regex  match the regex against cdx[field]
                        regex        match against str(cdx)
                        !            prefix that inverts the filter

    The regex is applied with re.match, i.e. anchored at the start of
    the value. The text is split at the first ':', so a pattern that
    itself contains ':' has to be given together with a field name.
    """
    # Support single strings as well
    if isinstance(filter_strings, str):
        filter_strings = [filter_strings]

    class Filter:
        """One parsed filter; calling it with a cdx returns True to keep it."""

        def __init__(self, string):
            # invert filter
            self.invert = string.startswith('!')
            if self.invert:
                string = string[1:]

            parts = string.split(':', 1)
            if len(parts) == 1:
                # no field set, apply filter to entire cdx
                self.field = ''
            else:
                # apply filter to cdx[field]
                self.field = parts[0]
                string = parts[1]

            self.regex = re.compile(string)

        def __call__(self, cdx):
            val = cdx[self.field] if self.field else str(cdx)
            matched = self.regex.match(val) is not None
            # an inverted filter keeps exactly the records that do not match
            return matched != self.invert

    # A real list is required: the filters are walked again for every
    # record, which a one-shot iterator would not survive.
    filters = [Filter(string) for string in filter_strings]

    for cdx in cdx_iter:
        if all(accepts(cdx) for accepts in filters):
            yield cdx
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
# collapse by timestamp and status code
|
||||
def cdx_collapse_time_status(cdx_iter, timelen = 10):
    """
    Collapse runs of consecutive records that share the same timestamp
    prefix and status code, yielding only the first record of each run.

    timelen -- number of leading timestamp characters to compare
               (converted with int(), so a numeric string is accepted)
    """
    timelen = int(timelen)

    def collapse_token(cdx):
        return (cdx['timestamp'][:timelen], cdx['statuscode'])

    # groupby starts a new group whenever the token changes between
    # neighbouring records; only the head of every group is kept
    for _, run in itertools.groupby(cdx_iter, collapse_token):
        yield next(run)
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
# sort cdx objects by closeness to the given timestamp
|
||||
def cdx_sort_closest(closest, cdx_iter, limit = 10):
    """
    Return an iterator over the (at most) `limit` records whose
    timestamp is nearest to `closest`, nearest first.

    closest  -- target timestamp string
    cdx_iter -- iterable of cdx records; consumed completely before
                this function returns
    limit    -- maximum number of records to keep

    Records are ordered purely by their distance in seconds from
    `closest`; records at the same distance keep their input order.
    The distances are tracked in a list of their own so that the cdx
    records themselves are never compared with each other.

    NOTE(review): the earlier early-exit check ("keys have started
    increasing") compared the distance with a (distance, cdx) tuple and
    ran right after the insertion, so it could never trigger. It is
    left out here; an early exit would only be valid if the input is
    guaranteed to be in ascending timestamp order -- confirm against
    the callers before reintroducing it.
    """
    closest_cdx = []
    closest_keys = []

    closest_sec = timeutils.timestamp_to_sec(closest)

    for cdx in cdx_iter:
        sec = timeutils.timestamp_to_sec(cdx['timestamp'])
        key = abs(closest_sec - sec)

        # insert to the right of equal distances so ties stay stable
        pos = bisect.bisect_right(closest_keys, key)
        closest_keys.insert(pos, key)
        closest_cdx.insert(pos, cdx)

        # drop the farthest record once the window overflows
        if len(closest_cdx) > limit:
            closest_keys.pop()
            closest_cdx.pop()

    return iter(closest_cdx)
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
# resolve revisits
|
||||
|
||||
# Fields to append from cdx original to revisit
|
||||
ORIG_TUPLE = ['length', 'offset', 'filename']


def cdx_resolve_revisits(cdx_iter):
    """
    Annotate every record with 'orig.length', 'orig.offset' and
    'orig.filename' and yield it.

    A record counts as a revisit when its mimetype is 'warc/revisit' or
    its filename is '-'. The first non-revisit record seen for a digest
    is remembered as the original for that digest. A revisit with a
    known original takes over the original's mimetype and statuscode
    and gets the original's ORIG_TUPLE values; every other record gets
    '-' for the three orig.* fields.

    Records are modified in place.
    """
    seen_originals = {}

    for cdx in cdx_iter:
        is_revisit = cdx['mimetype'] == 'warc/revisit' or cdx['filename'] == '-'
        digest = cdx['digest']
        original_cdx = seen_originals.get(digest)

        # first non-revisit capture of this digest becomes the original
        if not (original_cdx or is_revisit):
            seen_originals[digest] = cdx

        if original_cdx and is_revisit:
            # Transfer mimetype and statuscode
            cdx['mimetype'] = original_cdx['mimetype']
            cdx['statuscode'] = original_cdx['statuscode']
            orig_values = [original_cdx[field] for field in ORIG_TUPLE]
        else:
            orig_values = ['-'] * len(ORIG_TUPLE)

        # Always add either the original or empty '- - -'
        for field, value in zip(ORIG_TUPLE, orig_values):
            cdx['orig.' + field] = value

        yield cdx
|
||||
|
||||
|
160
pywb/cdxserver/cdxserver.py
Normal file
160
pywb/cdxserver/cdxserver.py
Normal file
@ -0,0 +1,160 @@
|
||||
import surt
|
||||
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
|
||||
from cdxops import cdx_load
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
import urlparse
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXFile:
    """
    A cdx source backed by a single plain-text cdx file.
    """

    def __init__(self, filename):
        self.filename = filename

    def load_cdx(self, params):
        """
        Open the file and return an iterator over the lines matching
        params['key'].

        params['match_type'] == 'prefix' selects a prefix search
        (iter_prefix); any other value, or none, an exact search
        (iter_exact).
        """
        reader = FileReader(self.filename)

        prefix_query = params.get('match_type') == 'prefix'
        search = iter_prefix if prefix_query else iter_exact

        return search(reader, params.get('key'))

    def __str__(self):
        return 'CDX File - ' + self.filename
|
||||
|
||||
#=================================================================
|
||||
class CDXException(Exception):
    """
    Raised for cdx queries that cannot be served.

    msg -- the exception message
    url -- optional url the failing query was made for
    """

    def __init__(self, msg, url = None):
        super(CDXException, self).__init__(msg)
        self.url = url

    def status(self):
        """Return the HTTP status line to report for this error."""
        return '400 Bad Request'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXServer:
    """
    Top-level cdx server object which maintains a list of cdx sources,
    responds to queries and dispatches to the cdx ops for processing
    """

    def __init__(self, sources, surt_ordered = True):
        """
        sources      -- iterable of paths; a directory contributes every
                        file in it that create_cdx_loader() accepts,
                        any other path is passed to it directly
        surt_ordered -- whether the cdx keys are in surt form; when
                        false, query keys are converted with unsurt()
        """
        self.sources = []
        self.surt_ordered = surt_ordered
        logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

        for src in sources:
            if os.path.isdir(src):
                for name in os.listdir(src):
                    # join properly: the directory may be given without
                    # a trailing separator
                    self.add_cdx_loader(os.path.join(src, name))
            else:
                self.add_cdx_loader(src)

        if not self.sources:
            # no exception is being handled here, so log a plain error
            # rather than logging.exception() with an empty traceback
            logging.error('No CDX Sources Found!')

    def add_cdx_loader(self, filename):
        """Register `filename` as a source if a loader exists for it."""
        source = self.create_cdx_loader(filename)
        if not source:
            return

        logging.debug('Adding CDX Source: ' + str(source))
        self.sources.append(source)

    @staticmethod
    def create_cdx_loader(filename):
        """Return a cdx source for `filename`, or None if unsupported."""
        if filename.endswith('.cdx'):
            return CDXFile(filename)
        return None
        #TODO: support zipnum
        #elif filename.endswith('.summary')
        #    return ZipNumCDXSource(filename)
        #elif filename.startswith('redis://')
        #    return RedisCDXSource(filename)

    def load_cdx(self, **params):
        """
        Query the cdx sources.

        params must contain 'url'; all params are passed on to
        cdx_load() together with the computed 'key'.

        Raises CDXException if url is missing or cannot be converted
        to a surt key.
        """
        # canonicalize to surt (canonicalization is part of surt conversion)
        try:
            url = params['url']
        except KeyError:
            raise CDXException('The url= param must be specified to query the cdx server')

        try:
            key = surt.surt(url)
        except Exception:
            # include the offending url in the message itself, and keep
            # passing it as the exception's url
            raise CDXException('Invalid url: %s' % (url,), url)

        # if not surt, unsurt the surt to get canonicalized non-surt url
        if not self.surt_ordered:
            key = unsurt(key)

        params['key'] = key

        return cdx_load(self.sources, params)

    def load_cdx_from_request(self, env):
        """
        Run a query described by the QUERY_STRING of the WSGI
        environment `env`; 'output' defaults to 'text'.
        """
        # use url= param to get actual url
        params = urlparse.parse_qs(env['QUERY_STRING'])

        # parse_qs produces arrays for single values
        # cdxreader expects singleton params for all except filters, so convert here
        # use first value of the list
        for name in list(params):
            if name != 'filter':
                params[name] = params[name][0]

        # Apply the default only after the flattening above; set before
        # it, the plain string 'text' would be reduced to 't'.
        params.setdefault('output', 'text')

        return self.load_cdx(**params)

    def __str__(self):
        return 'load cdx indexes from ' + str(self.sources)
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
def unsurt(surt):
    """
    Convert the host part of a surt key back to dotted form, leaving
    everything from the first ')/' onwards untouched. A string without
    ')/' is returned unchanged.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com)/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
    """

    split_at = surt.find(')/')
    if split_at < 0:
        # May not be a valid surt
        return surt

    labels = surt[:split_at].split(',')
    return '.'.join(reversed(labels)) + surt[split_at:]
|
||||
|
||||
|
103
pywb/cdxserver/timeutils.py
Normal file
103
pywb/cdxserver/timeutils.py
Normal file
@ -0,0 +1,103 @@
|
||||
import re
|
||||
import time
|
||||
import datetime
|
||||
import calendar
|
||||
|
||||
#=================================================================
|
||||
# str <-> datetime conversion
|
||||
#=================================================================
|
||||
|
||||
# Splits a date string at every non-digit character. Raw string, so
# that \d reaches the regex engine instead of being an (invalid)
# string escape.
DATE_TIMESPLIT = re.compile(r'[^\d]')

# strftime/strptime format of a full 14-digit timestamp
TIMESTAMP_14 = '%Y%m%d%H%M%S'

# Supplies the missing trailing digits when a partial timestamp is padded
PAD_STAMP_END = '29991231235959'
|
||||
|
||||
|
||||
|
||||
def iso_date_to_datetime(string):
    """
    Parse a date string whose numeric parts are separated by non-digit
    characters into a datetime.datetime. The parts are passed to the
    datetime constructor in the order they appear.

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """

    parts = DATE_TIMESPLIT.split(string)

    # a trailing separator such as 'Z' leaves an empty last element
    if parts[-1] == '':
        parts.pop()

    return datetime.datetime(*[int(part) for part in parts])
|
||||
|
||||
def datetime_to_timestamp(dt):
    """
    Format `dt` as a 14-digit timestamp string.

    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """

    stamp = dt.strftime(TIMESTAMP_14)
    return stamp
|
||||
|
||||
def iso_date_to_timestamp(string):
    """
    Convert a date string accepted by iso_date_to_datetime() into a
    14-digit timestamp string.

    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """

    parsed = iso_date_to_datetime(string)
    return datetime_to_timestamp(parsed)
|
||||
|
||||
|
||||
# default pad is end of range for compatibility
|
||||
def pad_timestamp(string, pad_str = PAD_STAMP_END):
    """
    Extend a partial timestamp to the length of `pad_str` by appending
    the trailing characters of `pad_str`. A string that is already at
    least that long comes back unchanged.

    >>> pad_timestamp('20')
    '20991231235959'

    >>> pad_timestamp('2014')
    '20141231235959'

    >>> pad_timestamp('20141011')
    '20141011235959'

    >>> pad_timestamp('201410110010')
    '20141011001059'
    """

    # slicing past the end of pad_str gives '', which covers the case
    # of an input that needs no padding
    return string + pad_str[len(string):]
|
||||
|
||||
|
||||
def timestamp_to_datetime(string):
    """
    Parse a (possibly partial) timestamp string. Despite the name, the
    result is a time.struct_time as returned by time.strptime.

    >>> timestamp_to_datetime('20131226095010')
    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)

    >>> timestamp_to_datetime('2014')
    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
    """

    # Default pad to end of range for compatibility
    padded = pad_timestamp(string)
    return time.strptime(padded, TIMESTAMP_14)
|
||||
|
||||
|
||||
def timestamp_to_sec(string):
    """
    Convert a (possibly partial) timestamp string to seconds since the
    epoch; calendar.timegm treats the parsed time as UTC.

    >>> timestamp_to_sec('20131226095010')
    1388051410

    >>> timestamp_to_sec('2014')
    1420070399
    """

    parsed = timestamp_to_datetime(string)
    return calendar.timegm(parsed)
|
||||
|
||||
|
||||
# Executed as a script: run the doctests embedded in this module
if __name__ == "__main__":
    import doctest
    doctest.testmod()
|
@ -66,28 +66,12 @@ class WBHandler(BaseHandler):
|
||||
# CDX-Server Handler -- pass all params to cdx server
|
||||
#=================================================================
|
||||
class CDXHandler(BaseHandler):
|
||||
def __init__(self, cdx_reader, view = None):
|
||||
self.cdx_reader = cdx_reader
|
||||
def __init__(self, cdx_server, view = None):
|
||||
self.cdx_server = cdx_server
|
||||
self.view = view if view else views.TextCapturesView()
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
#url = wbrequest.wb_url.url
|
||||
|
||||
# use url= param to get actual url
|
||||
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
|
||||
|
||||
# parse_qs produces arrays for single values
|
||||
# cdxreader expects singleton params for all except filters, so convert here
|
||||
# use first value of the list
|
||||
for name, val in params.iteritems():
|
||||
if name != 'filter':
|
||||
params[name] = val[0]
|
||||
|
||||
url = params.get('url')
|
||||
if not url:
|
||||
raise WbException('Must specify a url= param to query cdx server')
|
||||
|
||||
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
|
||||
cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env)
|
||||
|
||||
return self.view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
@ -97,7 +81,7 @@ class CDXHandler(BaseHandler):
|
||||
return None
|
||||
|
||||
def __str__(self):
|
||||
return 'CDX Server: ' + str(self.cdx_reader)
|
||||
return 'CDX Server: ' + str(self.cdx_server)
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
@ -1,15 +1,13 @@
|
||||
import urllib
|
||||
import urllib2
|
||||
import wbexceptions
|
||||
import itertools
|
||||
import wbrequestresponse
|
||||
import surt
|
||||
from collections import OrderedDict
|
||||
|
||||
import binsearch
|
||||
import cdxserve
|
||||
from cdxserver.cdxserver import CDXServer, CDXException
|
||||
from cdxserver.cdxobject import CDXObject
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
#=================================================================
|
||||
class IndexReader:
|
||||
@ -26,7 +24,13 @@ class IndexReader:
|
||||
if wbrequest.custom_params:
|
||||
params.update(wbrequest.custom_params)
|
||||
|
||||
cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)
|
||||
#params['url'] = wburl.url
|
||||
output = 'raw' if parsed_cdx else 'text'
|
||||
|
||||
try:
|
||||
cdxlines = self.load_cdx(url = wburl.url, output = output, **params)
|
||||
except CDXException:
|
||||
raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url)
|
||||
|
||||
cdxlines = utils.peek_iter(cdxlines)
|
||||
|
||||
@ -53,7 +57,7 @@ class IndexReader:
|
||||
# for now, list implies local sources
|
||||
if isinstance(paths, list):
|
||||
if len(paths) > 1:
|
||||
return LocalCDXServer(paths, surt_ordered)
|
||||
return EmbeddedCDXServer(paths, surt_ordered)
|
||||
else:
|
||||
# treat as non-list
|
||||
paths = paths[0]
|
||||
@ -66,66 +70,13 @@ class IndexReader:
|
||||
cookie = config.get('cookie', None)
|
||||
return RemoteCDXServer(uri, cookie = cookie)
|
||||
else:
|
||||
return LocalCDXServer([uri], surt_ordered)
|
||||
return EmbeddedCDXServer([uri], surt_ordered)
|
||||
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LocalCDXServer(IndexReader):
|
||||
"""
|
||||
>>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1)
|
||||
>>> pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20140127171200'),
|
||||
('original', 'http://example.com'),
|
||||
('mimetype', 'text/html'),
|
||||
('statuscode', '200'),
|
||||
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
|
||||
('redirect', '-'),
|
||||
('robotflags', '-'),
|
||||
('length', '1046'),
|
||||
('offset', '334'),
|
||||
('filename', 'dupes.warc.gz')]
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, sources, surt_ordered = True):
|
||||
self.sources = []
|
||||
self.surt_ordered = surt_ordered
|
||||
logging.info('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||
|
||||
for src in sources:
|
||||
if os.path.isdir(src):
|
||||
for file in os.listdir(src):
|
||||
if file.endswith('.cdx'):
|
||||
full = src + file
|
||||
logging.info('Adding CDX: ' + full)
|
||||
self.sources.append(full)
|
||||
else:
|
||||
logging.info('Adding CDX: ' + src)
|
||||
self.sources.append(src)
|
||||
|
||||
|
||||
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
|
||||
# canonicalize to surt (canonicalization is part of surt conversion)
|
||||
try:
|
||||
key = surt.surt(url)
|
||||
except Exception as e:
|
||||
raise wbexceptions.BadUrlException('Bad Request Url: ' + url)
|
||||
|
||||
# if not surt, unsurt the surt to get canonicalized non-surt url
|
||||
if not self.surt_ordered:
|
||||
key = utils.unsurt(key)
|
||||
|
||||
match_func = binsearch.iter_exact
|
||||
|
||||
params.update(**kwvalues)
|
||||
params['output'] = 'raw' if parsed_cdx else 'text'
|
||||
|
||||
return cdxserve.cdx_serve(key, params, self.sources, match_func)
|
||||
|
||||
|
||||
class EmbeddedCDXServer(CDXServer, IndexReader):
|
||||
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
|
||||
|
||||
if wburl.type == wburl.URL_QUERY:
|
||||
@ -198,7 +149,7 @@ class RemoteCDXServer(IndexReader):
|
||||
raise
|
||||
|
||||
if parsed_cdx:
|
||||
return (CDXCaptureResult(cdx) for cdx in response)
|
||||
return (CDXObject(cdx) for cdx in response)
|
||||
else:
|
||||
return iter(response)
|
||||
|
||||
@ -238,62 +189,6 @@ class RemoteCDXServer(IndexReader):
|
||||
return 'server cdx from ' + self.server_url
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXCaptureResult(OrderedDict):
|
||||
CDX_FORMATS = [
|
||||
# Public CDX Format
|
||||
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
|
||||
|
||||
# CDX 11 Format
|
||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
|
||||
|
||||
# CDX 9 Format
|
||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
|
||||
|
||||
# CDX 11 Format + 3 revisit resolve fields
|
||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
|
||||
"orig.length","orig.offset","orig.filename"],
|
||||
|
||||
# CDX 9 Format + 3 revisit resolve fields
|
||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
|
||||
"orig.length","orig.offset","orig.filename"]
|
||||
]
|
||||
|
||||
def __init__(self, cdxline):
|
||||
OrderedDict.__init__(self)
|
||||
|
||||
cdxline = cdxline.rstrip()
|
||||
fields = cdxline.split(' ')
|
||||
|
||||
cdxformat = None
|
||||
for i in CDXCaptureResult.CDX_FORMATS:
|
||||
if len(i) == len(fields):
|
||||
cdxformat = i
|
||||
|
||||
if not cdxformat:
|
||||
raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
|
||||
|
||||
for header, field in itertools.izip(cdxformat, fields):
|
||||
self[header] = field
|
||||
|
||||
self.cdxline = cdxline
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
OrderedDict.__setitem__(self, key, value)
|
||||
|
||||
# force regen on next __str__ call
|
||||
self.cdxline = None
|
||||
|
||||
|
||||
def __str__(self):
|
||||
if self.cdxline:
|
||||
return self.cdxline
|
||||
|
||||
li = itertools.imap(lambda (n, val): val, self.items())
|
||||
return ' '.join(li)
|
||||
|
||||
|
||||
|
||||
# Testing
|
||||
|
||||
import utils
|
||||
|
@ -1,5 +1,5 @@
|
||||
import redis
|
||||
import binsearch
|
||||
import binsearch.binsearch
|
||||
|
||||
import urlparse
|
||||
import os
|
||||
@ -46,10 +46,10 @@ class RedisResolver:
|
||||
class PathIndexResolver:
|
||||
def __init__(self, pathindex_file):
|
||||
self.pathindex_file = pathindex_file
|
||||
self.reader = binsearch.FileReader(pathindex_file)
|
||||
self.reader = binsearch.binsearch.FileReader(pathindex_file)
|
||||
|
||||
def __call__(self, filename):
|
||||
result = binsearch.iter_exact(self.reader, filename, '\t')
|
||||
result = binsearch.binsearch.iter_exact(self.reader, filename, '\t')
|
||||
|
||||
def gen_list(result):
|
||||
for pathline in result:
|
||||
|
@ -43,100 +43,6 @@ class PerfTimer:
|
||||
self.perfdict[self.name] = str(self.end - self.start)
|
||||
|
||||
|
||||
#=================================================================
|
||||
# str <-> datetime conversion
|
||||
#=================================================================
|
||||
|
||||
DATE_TIMESPLIT = re.compile('[^\d]')
|
||||
|
||||
TIMESTAMP_14 = '%Y%m%d%H%M%S'
|
||||
|
||||
PAD_STAMP_END = '29991231235959'
|
||||
|
||||
|
||||
|
||||
def iso_date_to_datetime(string):
|
||||
"""
|
||||
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
|
||||
datetime.datetime(2013, 12, 26, 10, 11, 12)
|
||||
|
||||
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
|
||||
datetime.datetime(2013, 12, 26, 10, 11, 12)
|
||||
"""
|
||||
|
||||
nums = DATE_TIMESPLIT.split(string)
|
||||
if nums[-1] == '':
|
||||
nums = nums[:-1]
|
||||
|
||||
dt = datetime.datetime(*map(int, nums))
|
||||
return dt
|
||||
|
||||
def datetime_to_timestamp(dt):
|
||||
"""
|
||||
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
|
||||
'20131226101112'
|
||||
"""
|
||||
|
||||
return dt.strftime(TIMESTAMP_14)
|
||||
|
||||
def iso_date_to_timestamp(string):
|
||||
"""
|
||||
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
|
||||
'20131226101112'
|
||||
|
||||
>>> iso_date_to_timestamp('2013-12-26T10:11:12')
|
||||
'20131226101112'
|
||||
"""
|
||||
|
||||
return datetime_to_timestamp(iso_date_to_datetime(string))
|
||||
|
||||
|
||||
# default pad is end of range for compatibility
|
||||
def pad_timestamp(string, pad_str = PAD_STAMP_END):
|
||||
"""
|
||||
>>> pad_timestamp('20')
|
||||
'20991231235959'
|
||||
|
||||
>>> pad_timestamp('2014')
|
||||
'20141231235959'
|
||||
|
||||
>>> pad_timestamp('20141011')
|
||||
'20141011235959'
|
||||
|
||||
>>> pad_timestamp('201410110010')
|
||||
'20141011001059'
|
||||
"""
|
||||
|
||||
str_len = len(string)
|
||||
pad_len = len(pad_str)
|
||||
|
||||
return string if str_len >= pad_len else string + pad_str[str_len:]
|
||||
|
||||
|
||||
def timestamp_to_datetime(string):
|
||||
"""
|
||||
>>> timestamp_to_datetime('20131226095010')
|
||||
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
|
||||
|
||||
>>> timestamp_to_datetime('2014')
|
||||
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
|
||||
"""
|
||||
|
||||
# Default pad to end of range for comptability
|
||||
return time.strptime(pad_timestamp(string), TIMESTAMP_14)
|
||||
|
||||
|
||||
def timestamp_to_sec(string):
|
||||
"""
|
||||
>>> timestamp_to_sec('20131226095010')
|
||||
1388051410
|
||||
|
||||
>>> timestamp_to_sec('2014')
|
||||
1420070399
|
||||
"""
|
||||
|
||||
return calendar.timegm(timestamp_to_datetime(string))
|
||||
|
||||
#=================================================================
|
||||
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
|
||||
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
||||
|
@ -1,9 +1,9 @@
|
||||
import indexreader
|
||||
import utils
|
||||
import cdxserver.timeutils as timeutils
|
||||
|
||||
import wbrequestresponse
|
||||
import wbexceptions
|
||||
import time
|
||||
import urlparse
|
||||
import time
|
||||
|
||||
from os import path
|
||||
from itertools import imap
|
||||
@ -58,7 +58,7 @@ class J2TemplateView:
|
||||
# Filters
|
||||
@staticmethod
|
||||
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
|
||||
value = utils.timestamp_to_datetime(value)
|
||||
value = timeutils.timestamp_to_datetime(value)
|
||||
return time.strftime(format, value)
|
||||
|
||||
@staticmethod
|
||||
|
@ -2,6 +2,7 @@ import utils
|
||||
import wbexceptions
|
||||
|
||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||
from cdxserver.cdxserver import CDXException
|
||||
|
||||
import os
|
||||
import importlib
|
||||
@ -33,7 +34,7 @@ def create_wb_app(wb_router):
|
||||
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
|
||||
response = handle_exception(env, wb_router.error_view, e, False)
|
||||
|
||||
except wbexceptions.WbException as wbe:
|
||||
except (wbexceptions.WbException, CDXException) as wbe:
|
||||
response = handle_exception(env, wb_router.error_view, wbe, False)
|
||||
|
||||
except Exception as e:
|
||||
|
4
setup.py
4
setup.py
@ -11,8 +11,8 @@ setuptools.setup(name='pywb',
|
||||
author_email='ilya@archive.org',
|
||||
long_description=open('README.md').read(),
|
||||
license='GPL',
|
||||
packages=['pywb'],
|
||||
provides=['pywb'],
|
||||
packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
|
||||
provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
|
||||
package_data={'pywb': ['ui/*', 'static/*']},
|
||||
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
|
||||
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
43
tests/test_binsearch.py
Normal file
43
tests/test_binsearch.py
Normal file
@ -0,0 +1,43 @@
|
||||
import os
|
||||
from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader
|
||||
|
||||
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
|
||||
|
||||
def binsearch_cdx_test(key, iter_func):
    """
    Print every line of the sample iana.cdx index that `iter_func`
    (iter_prefix or iter_exact) finds for `key`.

    # Prefix Search
    >>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root', iter_exact)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/', iter_exact)
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz

    # Exact Search
    >>> binsearch_cdx_test('org,iaana)/', iter_exact)
    >>> binsearch_cdx_test('org,ibna)/', iter_exact)

    >>> binsearch_cdx_test('org,iana)/time-zones', iter_exact)
    org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
    """

    cdx = FileReader(test_cdx_dir + 'iana.cdx')

    try:
        # the results are consumed completely before the reader is closed
        for line in iter_func(cdx, key):
            print(line)
    finally:
        # every call opens the index anew; do not leave the handle open
        cdx.close()
|
||||
|
||||
|
||||
# Executed as a script: run the doctests embedded in this module
if __name__ == "__main__":
    import doctest
    doctest.testmod()
|
||||
|
||||
|
149
tests/test_cdxserve.py
Normal file
149
tests/test_cdxserve.py
Normal file
@ -0,0 +1,149 @@
|
||||
from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader
|
||||
from ..pywb.cdxserver.cdxserver import CDXServer
|
||||
import os
|
||||
import sys
|
||||
import pprint
|
||||
|
||||
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
|
||||
|
||||
def cdx_ops_test(url, sources = None, **kwparams):
    """
    Run a CDX query and write every resulting line to stdout.

    A CDXServer is created over ``sources`` and its ``load_cdx`` is called
    with ``url`` plus any extra keyword params; ``output`` is always forced
    to 'text'. When ``sources`` is omitted, a one-element list containing
    ``test_cdx_dir + 'iana.cdx'`` is used (built per call, so no list object
    is shared between calls).

    # Merge Sort Multiple CDX Sources
    >>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
    org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
    org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz


    # Limit CDX Stream
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz


    # Reverse CDX Stream
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz

    >>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
    org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz

    # No matching results
    >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)


    # Filter cdx
    >>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
    org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
    org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
    org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
    org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
    org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz


    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz


    # Collapse by timestamp
    # unresolved revisits, different statuscode results in an extra repeat
    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz

    # resolved revisits
    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz


    # Sort by closest timestamp + field select output
    >>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
    20140126200826
    20140126200816
    20140126200805
    20140126200912
    20140126200738
    20140126200930
    20140126200718
    20140126200706
    20140126200654
    20140126200625

    >>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -


    >>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -

    # equal dist prefer earlier
    >>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz

    >>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200654
    20140126200706

    >>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200706
    20140126200654


    # Resolve Revisits
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz

    >>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -


    # CDX Server init
    >>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
    >>> pprint.pprint(x.next().items())
    [('urlkey', 'com,example)/'),
     ('timestamp', '20140127171200'),
     ('original', 'http://example.com'),
     ('mimetype', 'text/html'),
     ('statuscode', '200'),
     ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
     ('redirect', '-'),
     ('robotflags', '-'),
     ('length', '1046'),
     ('offset', '334'),
     ('filename', 'dupes.warc.gz')]

    """

    # Resolve the default here rather than in the signature: a list literal
    # in the signature is a single object shared by every call.
    if sources is None:
        sources = [test_cdx_dir + 'iana.cdx']

    kwparams['url'] = url
    # The doctests above compare plain text lines, so always request text.
    kwparams['output'] = 'text'

    server = CDXServer(sources)
    results = server.load_cdx(**kwparams)

    # Each result is written verbatim; no separator is added here.
    for x in results:
        sys.stdout.write(x)
|
||||
|
||||
|
||||
|
||||
# Script entry point: run the doctests defined in this module.
if __name__ == "__main__":
    from doctest import testmod
    testmod()
|
||||
|
||||
|
@ -1,14 +1,14 @@
|
||||
import webtest
|
||||
import pywb.pywb_init
|
||||
from pywb.indexreader import CDXCaptureResult
|
||||
from ..pywb.pywb_init import pywb_config
|
||||
from ..pywb.wbapp import create_wb_app
|
||||
from ..pywb.cdxserver.cdxobject import CDXObject
|
||||
|
||||
class TestWb:
|
||||
TEST_CONFIG = 'test_config.yaml'
|
||||
|
||||
def setup(self):
|
||||
import pywb.wbapp
|
||||
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
||||
self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config(self.TEST_CONFIG))
|
||||
self.app = create_wb_app(pywb_config(self.TEST_CONFIG))
|
||||
self.testapp = webtest.TestApp(self.app)
|
||||
|
||||
def _assert_basic_html(self, resp):
|
||||
@ -144,8 +144,8 @@ class TestWb:
|
||||
# combine collapsing, reversing and revisit resolving
|
||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')
|
||||
|
||||
# convert back to CDXCaptureResult
|
||||
cdxs = map(CDXCaptureResult, resp.body.rstrip().split('\n'))
|
||||
# convert back to CDXObject
|
||||
cdxs = map(CDXObject, resp.body.rstrip().split('\n'))
|
||||
assert len(cdxs) == 3, len(cdxs)
|
||||
|
||||
# verify timestamps
|
Loading…
x
Reference in New Issue
Block a user