mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactoring of binsearch and cdxserver into separate packages
also move complicated doctests and integration tests to tests/
This commit is contained in:
parent
e4f409b2a4
commit
2528ee0a7c
@ -6,4 +6,5 @@ install:
|
|||||||
- "python setup.py -q install"
|
- "python setup.py -q install"
|
||||||
# command to run tests
|
# command to run tests
|
||||||
#script: nosetests --with-doctest
|
#script: nosetests --with-doctest
|
||||||
script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
|
#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
|
||||||
|
script: py.test -v --doctest-module ./tests/*.py ./pywb/
|
||||||
|
@ -13,6 +13,9 @@ from wbrequestresponse import StatusAndHeaders
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
|
|
||||||
class HttpLoader:
|
class HttpLoader:
|
||||||
|
"""
|
||||||
|
Load content over http with range request and optional signature
|
||||||
|
"""
|
||||||
def __init__(self, hmac = None, hmac_duration = 30):
|
def __init__(self, hmac = None, hmac_duration = 30):
|
||||||
self.hmac = hmac
|
self.hmac = hmac
|
||||||
self.hmac_duration = hmac_duration
|
self.hmac_duration = hmac_duration
|
||||||
@ -38,6 +41,8 @@ class HttpLoader:
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
class FileLoader:
|
class FileLoader:
|
||||||
"""
|
"""
|
||||||
|
Load content from local file-system
|
||||||
|
|
||||||
# Ensure attempt to read more than 100 bytes, only reads 100 bytes
|
# Ensure attempt to read more than 100 bytes, only reads 100 bytes
|
||||||
>>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read('400'))
|
>>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read('400'))
|
||||||
100
|
100
|
||||||
|
@ -1,147 +0,0 @@
|
|||||||
from collections import deque
|
|
||||||
import os
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
class FileReader:
|
|
||||||
def __init__(self, filename):
|
|
||||||
self.fh = open(filename, 'rb')
|
|
||||||
self.filename = filename
|
|
||||||
self.size = os.path.getsize(filename)
|
|
||||||
|
|
||||||
def getsize(self):
|
|
||||||
return self.size
|
|
||||||
|
|
||||||
def readline(self):
|
|
||||||
return self.fh.readline()
|
|
||||||
|
|
||||||
def seek(self, offset):
|
|
||||||
return self.fh.seek(offset)
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
return self.fh.close()
|
|
||||||
|
|
||||||
|
|
||||||
def binsearch_offset(reader, key, compare_func = cmp, block_size = 8192):
|
|
||||||
min = 0
|
|
||||||
max = reader.getsize() / block_size
|
|
||||||
|
|
||||||
while (max - min > 1):
|
|
||||||
mid = min + ((max - min) / 2)
|
|
||||||
reader.seek(mid * block_size)
|
|
||||||
|
|
||||||
if mid > 0:
|
|
||||||
reader.readline() # skip partial line
|
|
||||||
|
|
||||||
line = reader.readline()
|
|
||||||
|
|
||||||
if compare_func(key, line) > 0:
|
|
||||||
min = mid
|
|
||||||
else:
|
|
||||||
max = mid
|
|
||||||
|
|
||||||
return (min * block_size)
|
|
||||||
|
|
||||||
|
|
||||||
def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192):
|
|
||||||
min = binsearch_offset(reader, key, compare_func, block_size)
|
|
||||||
|
|
||||||
reader.seek(min)
|
|
||||||
|
|
||||||
if min > 0:
|
|
||||||
reader.readline() # skip partial line
|
|
||||||
|
|
||||||
if prev_size > 1:
|
|
||||||
prev_deque = deque(maxlen = prev_size)
|
|
||||||
|
|
||||||
line = None
|
|
||||||
|
|
||||||
while True:
|
|
||||||
line = reader.readline()
|
|
||||||
if not line:
|
|
||||||
break
|
|
||||||
if compare_func(line, key) >= 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
if prev_size == 1:
|
|
||||||
prev = line
|
|
||||||
elif prev_size > 1:
|
|
||||||
prev_deque.append(line)
|
|
||||||
|
|
||||||
def gen_iter(line):
|
|
||||||
if prev_size == 1:
|
|
||||||
yield prev.rstrip()
|
|
||||||
elif prev_size > 1:
|
|
||||||
for i in prev_deque:
|
|
||||||
yield i.rstrip()
|
|
||||||
|
|
||||||
while line:
|
|
||||||
yield line.rstrip()
|
|
||||||
line = reader.readline()
|
|
||||||
|
|
||||||
return gen_iter(line)
|
|
||||||
|
|
||||||
|
|
||||||
# Iterate over prefix matches
|
|
||||||
def iter_prefix(reader, key):
|
|
||||||
"""
|
|
||||||
>>> print_test_cdx('org,iana)/domains/root', iter_prefix)
|
|
||||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
|
||||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
|
||||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
|
||||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
|
||||||
"""
|
|
||||||
|
|
||||||
lines = search(reader, key)
|
|
||||||
return itertools.takewhile(lambda line: line.startswith(key), lines)
|
|
||||||
|
|
||||||
|
|
||||||
def iter_exact(reader, key, tok = ' '):
|
|
||||||
"""
|
|
||||||
>>> print_test_cdx('org,iana)/domains/root', iter_exact)
|
|
||||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
|
||||||
|
|
||||||
>>> print_test_cdx('org,iana)/', iter_exact)
|
|
||||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
|
||||||
|
|
||||||
>>> print_test_cdx('org,iana)/domains/root/db', iter_exact)
|
|
||||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
|
||||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
|
||||||
|
|
||||||
>>> print_test_cdx('org,iaana)/', iter_exact)
|
|
||||||
>>> print_test_cdx('org,ibna)/', iter_exact)
|
|
||||||
|
|
||||||
>>> print_test_cdx('org,iana)/time-zones', iter_exact)
|
|
||||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
|
||||||
"""
|
|
||||||
|
|
||||||
lines = search(reader, key)
|
|
||||||
|
|
||||||
def check_key(line):
|
|
||||||
line_key = line.split(tok, 1)[0]
|
|
||||||
return line_key == key
|
|
||||||
|
|
||||||
return itertools.takewhile(check_key, lines)
|
|
||||||
|
|
||||||
|
|
||||||
import utils
|
|
||||||
if __name__ == "__main__" or utils.enable_doctests():
|
|
||||||
|
|
||||||
def create_test_cdx(test_file):
|
|
||||||
path = utils.test_data_dir() + 'cdx/' + test_file
|
|
||||||
return FileReader(path)
|
|
||||||
|
|
||||||
test_cdx = create_test_cdx('iana.cdx')
|
|
||||||
|
|
||||||
def print_test_cdx(key, iter_func, filename = None):
|
|
||||||
cdx = test_cdx if not filename else create_test_cdx(filename)
|
|
||||||
for line in iter_func(cdx, key):
|
|
||||||
print line
|
|
||||||
|
|
||||||
#cdx.close()
|
|
||||||
|
|
||||||
import doctest
|
|
||||||
doctest.testmod()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
0
pywb/binsearch/__init__.py
Normal file
0
pywb/binsearch/__init__.py
Normal file
123
pywb/binsearch/binsearch.py
Normal file
123
pywb/binsearch/binsearch.py
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
from collections import deque
|
||||||
|
import os
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# Binary Search over a text file
|
||||||
|
#=================================================================
|
||||||
|
class FileReader:
    """
    A very simple file-like object wrapper that knows its size.

    The file is opened in binary mode and its size is recorded once,
    at construction time, via os.path.getsize().

    Instances may also be used as a context manager, in which case the
    underlying file handle is closed when the with-block exits.
    """
    def __init__(self, filename):
        # filename: path of the file to open for binary reading
        self.fh = open(filename, 'rb')
        self.filename = filename
        self.size = os.path.getsize(filename)

    def getsize(self):
        """Return the file size recorded when the reader was created."""
        return self.size

    def readline(self):
        """Read and return the next line from the current position."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position of the underlying file to offset."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle."""
        return self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.close()
        # never suppress an exception raised inside the with-block
        return False
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
try:
    cmp
except NameError:
    def cmp(a, b):
        """Three-way comparison fallback where the builtin cmp() is absent."""
        return (a > b) - (a < b)


def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
    """
    Binary search a sorted text file, block by block, for 'key'.

    The file is treated as a sequence of block_size (default 8192) sized
    blocks. Each probe seeks to the start of a block, discards the
    (possibly partial) line found there and compares key against the
    next full line.

    reader       -- object providing getsize(), seek(offset) and readline()
    key          -- value to search for
    compare_func -- optional three-way comparison, called as
                    compare_func(key, line)
    block_size   -- size of each block, in the units used by reader.seek()

    Returns a block-aligned offset (a multiple of block_size): the start
    of the last probed block whose first full line compared lower than
    key, or 0 if there was no such block. The matching line itself still
    has to be located by a linear scan from that offset.
    """
    lo = 0
    # floor division keeps block indices integral under true division too
    hi = reader.getsize() // block_size

    while hi - lo > 1:
        mid = lo + (hi - lo) // 2
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        if compare_func(key, line) > 0:
            lo = mid
        else:
            hi = mid

    return lo * block_size
|
||||||
|
|
||||||
|
|
||||||
|
try:
    cmp
except NameError:
    def cmp(a, b):
        """Three-way comparison fallback where the builtin cmp() is absent."""
        return (a > b) - (a < b)


def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
    """
    Perform a binsearch for a specified key down to block_size (8192) sized
    blocks, followed by a linear search within the block to find the first
    line that compares greater than or equal to key.

    reader       -- object providing getsize(), seek(offset) and readline()
    key          -- value to search for
    prev_size    -- number of lines preceding the first match to yield as
                    well; only lines read during the linear scan are
                    remembered, so fewer (or none) may be available
    compare_func -- optional three-way comparison, called as
                    compare_func(line, key)
    block_size   -- block size handed to binsearch_offset()

    Returns a generator yielding the remembered previous lines (if any),
    then the first matching line and every following line up to the end
    of the file, each with trailing whitespace stripped.
    """
    offset = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(offset)

    if offset > 0:
        reader.readline()  # skip partial line

    # Must be bound even when the very first scanned line already matches;
    # otherwise the generator below fails with a NameError for prev_size 1.
    prev = None

    if prev_size > 1:
        prev_deque = deque(maxlen=prev_size)

    line = None

    while True:
        line = reader.readline()
        if not line:
            break
        if compare_func(line, key) >= 0:
            break

        if prev_size == 1:
            prev = line
        elif prev_size > 1:
            prev_deque.append(line)

    def gen_iter(line):
        if prev_size == 1:
            # nothing to emit if no line preceded the match in this scan
            if prev is not None:
                yield prev.rstrip()
        elif prev_size > 1:
            for prev_line in prev_deque:
                yield prev_line.rstrip()

        while line:
            yield line.rstrip()
            line = reader.readline()

    return gen_iter(line)
|
||||||
|
|
||||||
|
|
||||||
|
# Iterate over prefix matches
|
||||||
|
def iter_prefix(reader, key):
    """
    Build an iterator over the prefix matches for key in a sorted text file.

    Lines are taken from search(reader, key) for as long as each one
    starts with key; iteration stops at the first line that does not.
    """
    def has_prefix(line):
        return line.startswith(key)

    candidates = search(reader, key)
    return itertools.takewhile(has_prefix, candidates)
|
||||||
|
|
||||||
|
|
||||||
|
def iter_exact(reader, key, token=' '):
    """
    Build an iterator over the exact matches for key in a sorted text file.

    An exact match is a line starting with key immediately followed by
    the terminating token (default ' '), so the lookup is delegated to
    iter_prefix() using key + token as the prefix.
    """
    terminated_key = key + token
    return iter_prefix(reader, terminated_key)
|
||||||
|
|
358
pywb/cdxserve.py
358
pywb/cdxserve.py
@ -1,358 +0,0 @@
|
|||||||
import binsearch
|
|
||||||
import indexreader
|
|
||||||
import bisect
|
|
||||||
import itertools
|
|
||||||
import re
|
|
||||||
|
|
||||||
from heapq import merge
|
|
||||||
from collections import deque
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def cdx_text_out(cdx, fields):
|
|
||||||
if not fields:
|
|
||||||
return str(cdx)
|
|
||||||
else:
|
|
||||||
return ' '.join(map(lambda x: cdx[x], fields.split(',')))
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
|
|
||||||
cdx_iter = merge_sort_streams(sources, key, match_func)
|
|
||||||
|
|
||||||
cdx_iter = make_cdx_iter(cdx_iter)
|
|
||||||
|
|
||||||
resolve_revisits = params.get('resolve_revisits', False)
|
|
||||||
if resolve_revisits:
|
|
||||||
cdx_iter = cdx_resolve_revisits(cdx_iter)
|
|
||||||
|
|
||||||
filters = params.get('filter', None)
|
|
||||||
if filters:
|
|
||||||
cdx_iter = cdx_filter(cdx_iter, filters)
|
|
||||||
|
|
||||||
collapse_time = params.get('collapse_time', None)
|
|
||||||
if collapse_time:
|
|
||||||
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
|
|
||||||
|
|
||||||
limit = int(params.get('limit', 1000000))
|
|
||||||
|
|
||||||
reverse = params.get('reverse', False)
|
|
||||||
if reverse:
|
|
||||||
cdx_iter = cdx_reverse(cdx_iter, limit)
|
|
||||||
|
|
||||||
closest_to = params.get('closest_to', None)
|
|
||||||
if closest_to:
|
|
||||||
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
cdx_iter = cdx_limit(cdx_iter, limit)
|
|
||||||
|
|
||||||
# output raw cdx objects
|
|
||||||
if params.get('output') == 'raw':
|
|
||||||
return cdx_iter
|
|
||||||
|
|
||||||
def write_cdx(fields):
|
|
||||||
for cdx in cdx_iter:
|
|
||||||
yield cdx_text_out(cdx, fields) + '\n'
|
|
||||||
|
|
||||||
return write_cdx(params.get('fields'))
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# merge multiple cdx streams
|
|
||||||
def merge_sort_streams(sources, key, iter_func):
|
|
||||||
"""
|
|
||||||
>>> test_cdx(key = 'org,iana)/', sources = [test_dir + 'dupes.cdx', test_dir + 'iana.cdx'])
|
|
||||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
|
||||||
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
|
|
||||||
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
|
|
||||||
"""
|
|
||||||
|
|
||||||
def load_src(source):
|
|
||||||
source = binsearch.FileReader(source)
|
|
||||||
source = iter_func(source, key)
|
|
||||||
return source
|
|
||||||
|
|
||||||
# Optimize: no need to merge if just one input
|
|
||||||
if len(sources) == 1:
|
|
||||||
return load_src(sources[0])
|
|
||||||
|
|
||||||
source_iters = map(load_src, sources)
|
|
||||||
merged_stream = merge(*(source_iters))
|
|
||||||
return merged_stream
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# convert text cdx stream to CDXCaptureResult
|
|
||||||
def make_cdx_iter(text_iter):
|
|
||||||
return itertools.imap(lambda line: indexreader.CDXCaptureResult(line), text_iter)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# limit cdx to at most limit
|
|
||||||
def cdx_limit(cdx_iter, limit):
|
|
||||||
"""
|
|
||||||
>>> test_cdx('org,iana)/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
|
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
for cdx, _ in itertools.izip(cdx_iter, xrange(limit)):
|
|
||||||
yield cdx
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# reverse cdx
|
|
||||||
def cdx_reverse(cdx_iter, limit):
|
|
||||||
"""
|
|
||||||
>>> test_cdx('org,iana)/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
|
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
|
||||||
|
|
||||||
>>> test_cdx('org,iana)/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
|
|
||||||
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
|
||||||
|
|
||||||
# no match, single result
|
|
||||||
>>> test_cdx('org,iana)/dont_have_this', reverse = True, resolve_revisits = True, limit = 1)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# optimize for single last
|
|
||||||
if limit == 1:
|
|
||||||
last = None
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
|
||||||
last = cdx
|
|
||||||
|
|
||||||
return [last] if last else []
|
|
||||||
|
|
||||||
reverse_cdxs = deque(maxlen = limit)
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
|
||||||
reverse_cdxs.appendleft(cdx)
|
|
||||||
|
|
||||||
return reverse_cdxs
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# filter cdx by regex if each filter is field:regex form,
|
|
||||||
# apply filter to cdx[field]
|
|
||||||
def cdx_filter(cdx_iter, filter_strings):
|
|
||||||
"""
|
|
||||||
>>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filter = ['mimetype:text/html'])
|
|
||||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
|
||||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
|
||||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
|
||||||
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
|
|
||||||
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
|
|
||||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
|
||||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
|
||||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
|
||||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
|
||||||
|
|
||||||
|
|
||||||
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', filter = 'statuscode:200')
|
|
||||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Support single strings as well
|
|
||||||
if isinstance(filter_strings, str):
|
|
||||||
filter_strings = [filter_strings]
|
|
||||||
|
|
||||||
filters = []
|
|
||||||
|
|
||||||
class Filter:
|
|
||||||
def __init__(self, string):
|
|
||||||
# invert filter
|
|
||||||
self.invert = string.startswith('!')
|
|
||||||
if self.invert:
|
|
||||||
string = string[1:]
|
|
||||||
|
|
||||||
parts = string.split(':', 1)
|
|
||||||
# no field set, apply filter to entire cdx
|
|
||||||
if len(parts) == 1:
|
|
||||||
self.field = ''
|
|
||||||
else:
|
|
||||||
# apply filter to cdx[field]
|
|
||||||
self.field = parts[0]
|
|
||||||
string = parts[1]
|
|
||||||
|
|
||||||
self.regex = re.compile(string)
|
|
||||||
|
|
||||||
def __call__(self, cdx):
|
|
||||||
val = cdx[self.field] if self.field else str(cdx)
|
|
||||||
matched = self.regex.match(val) is not None
|
|
||||||
return matched ^ self.invert
|
|
||||||
|
|
||||||
filters = map(Filter, filter_strings)
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
|
||||||
if all (x(cdx) for x in filters):
|
|
||||||
yield cdx
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# collapse by timestamp and status code
|
|
||||||
def cdx_collapse_time_status(cdx_iter, timelen = 10):
|
|
||||||
"""
|
|
||||||
# unresolved revisits, different statuscode results in an extra repeat
|
|
||||||
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = 11)
|
|
||||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
|
||||||
|
|
||||||
# resolved revisits
|
|
||||||
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
|
|
||||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
|
||||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
timelen = int(timelen)
|
|
||||||
|
|
||||||
last_token = None
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
|
||||||
curr_token = (cdx['timestamp'][:timelen], cdx['statuscode'])
|
|
||||||
|
|
||||||
# yield if last_dedup_time is diff, otherwise skip
|
|
||||||
if curr_token != last_token:
|
|
||||||
last_token = curr_token
|
|
||||||
yield cdx
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# sort CDXCaptureResult by closest to timestamp
|
|
||||||
def cdx_sort_closest(closest, cdx_iter, limit = 10):
|
|
||||||
"""
|
|
||||||
>>> test_cdx(closest_to = '20140126200826', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
|
|
||||||
20140126200826
|
|
||||||
20140126200816
|
|
||||||
20140126200805
|
|
||||||
20140126200912
|
|
||||||
20140126200738
|
|
||||||
20140126200930
|
|
||||||
20140126200718
|
|
||||||
20140126200706
|
|
||||||
20140126200654
|
|
||||||
20140126200625
|
|
||||||
|
|
||||||
>>> test_cdx(closest_to = '20140126201306', key = 'org,iana)/dnssec', resolve_revisits = True, sources = [test_dir + 'dupes.cdx', test_dir + 'iana.cdx'])
|
|
||||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
|
||||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
|
||||||
|
|
||||||
|
|
||||||
>>> test_cdx(closest_to = '20140126201307', key = 'org,iana)/dnssec', resolve_revisits = True)
|
|
||||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
|
||||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
|
||||||
|
|
||||||
# equal dist prefer earlier
|
|
||||||
>>> test_cdx(closest_to = '20140126200700', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
|
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
|
|
||||||
|
|
||||||
>>> test_cdx(closest_to = '20140126200659', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
|
|
||||||
20140126200654
|
|
||||||
20140126200706
|
|
||||||
|
|
||||||
>>> test_cdx(closest_to = '20140126200701', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
|
|
||||||
20140126200706
|
|
||||||
20140126200654
|
|
||||||
|
|
||||||
"""
|
|
||||||
closest_cdx = []
|
|
||||||
|
|
||||||
closest_sec = utils.timestamp_to_sec(closest)
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
|
||||||
sec = utils.timestamp_to_sec(cdx['timestamp'])
|
|
||||||
key = abs(closest_sec - sec)
|
|
||||||
|
|
||||||
# create tuple to sort by key
|
|
||||||
bisect.insort(closest_cdx, (key, cdx))
|
|
||||||
|
|
||||||
if len(closest_cdx) == limit:
|
|
||||||
# assuming cdx in ascending order and keys have started increasing
|
|
||||||
if key > closest_cdx[-1]:
|
|
||||||
break
|
|
||||||
|
|
||||||
if len(closest_cdx) > limit:
|
|
||||||
closest_cdx.pop()
|
|
||||||
|
|
||||||
|
|
||||||
return itertools.imap(lambda x: x[1], closest_cdx)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# resolve revisits
|
|
||||||
|
|
||||||
# Fields to append from cdx original to revisit
|
|
||||||
ORIG_TUPLE = ['length', 'offset', 'filename']
|
|
||||||
|
|
||||||
def cdx_resolve_revisits(cdx_iter):
|
|
||||||
"""
|
|
||||||
>>> test_cdx('org,iana)/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
|
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
|
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
|
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
|
|
||||||
|
|
||||||
>>> test_cdx('org,iana)/domains/root/db', resolve_revisits = True)
|
|
||||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
|
|
||||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
originals = {}
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
|
||||||
is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
|
|
||||||
|
|
||||||
digest = cdx['digest']
|
|
||||||
|
|
||||||
original_cdx = originals.get(digest)
|
|
||||||
|
|
||||||
if not original_cdx and not is_revisit:
|
|
||||||
originals[digest] = cdx
|
|
||||||
|
|
||||||
|
|
||||||
if original_cdx and is_revisit:
|
|
||||||
fill_orig = lambda field: original_cdx[field]
|
|
||||||
# Transfer mimetype and statuscode
|
|
||||||
cdx['mimetype'] = original_cdx['mimetype']
|
|
||||||
cdx['statuscode'] = original_cdx['statuscode']
|
|
||||||
else:
|
|
||||||
fill_orig = lambda field: '-'
|
|
||||||
|
|
||||||
# Always add either the original or empty '- - -'
|
|
||||||
for field in ORIG_TUPLE:
|
|
||||||
cdx['orig.' + field] = fill_orig(field)
|
|
||||||
|
|
||||||
yield cdx
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
import utils
|
|
||||||
if __name__ == "__main__" or utils.enable_doctests():
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
test_dir = utils.test_data_dir() + 'cdx/'
|
|
||||||
|
|
||||||
def test_cdx(key, match_func = binsearch.iter_exact, sources = [test_dir + 'iana.cdx'], **kwparams):
|
|
||||||
for x in cdx_serve(key, kwparams, sources, match_func):
|
|
||||||
sys.stdout.write(x)
|
|
||||||
|
|
||||||
|
|
||||||
import doctest
|
|
||||||
doctest.testmod()
|
|
||||||
|
|
||||||
|
|
0
pywb/cdxserver/__init__.py
Normal file
0
pywb/cdxserver/__init__.py
Normal file
42
pywb/cdxserver/cdxapp.py
Normal file
42
pywb/cdxserver/cdxapp.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
from cdxserver import CDXServer
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
# Default cdx directory: sample_archive/cdx/ located two levels above the
# real (symlink-resolved) directory of this module.
test_cdx_dir = (
    os.path.dirname(os.path.realpath(__file__)) +
    '/../../sample_archive/cdx/'
)
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def main(config=None):
    """
    Create a WSGI application which answers cdx queries.

    config -- handed as-is to CDXServer; when falsy, a one-element list
              containing test_cdx_dir is used instead

    As a side effect, logging is configured at DEBUG level.

    Returns the WSGI callable. A successful query is answered with
    '200 OK' and the text/plain response lines; any exception is answered
    with '400 Error' and the exception message, and its traceback is
    printed.
    """
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)

    if not config:
        config = [test_cdx_dir]

    cdxserver = CDXServer(config)

    def application(env, start_response):
        try:
            # Consume the (possibly lazy) response *before* sending the
            # status: a failure while iterating must not be reported after
            # '200 OK' has already been passed to start_response, as a
            # second start_response call without exc_info is not allowed.
            response = list(cdxserver.load_cdx_from_request(env))
            start_response('200 OK', [('Content-Type', 'text/plain')])

        except Exception as exc:
            import traceback
            # format_exc() formats the exception currently being handled;
            # its only parameter is a frame limit, not the exception.
            err_details = traceback.format_exc()
            start_response('400 Error', [('Content-Type', 'text/plain')])
            response = [str(exc)]
            print(err_details)

        return response

    return application
|
||||||
|
|
||||||
|
|
||||||
|
# When imported (rather than run as a script), build the module-level
# WSGI callable with the default configuration. Running this file
# directly does nothing.
if __name__ != "__main__":
    application = main()
|
||||||
|
|
||||||
|
|
57
pywb/cdxserver/cdxobject.py
Normal file
57
pywb/cdxserver/cdxobject.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
from collections import OrderedDict
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDXObject(OrderedDict):
    """
    A single cdx line parsed into an ordered mapping of
    field name -> string value.

    The field layout is chosen from CDX_FORMATS purely by the number of
    space-separated fields in the line. The original line text is cached
    and returned by str() until any field is assigned, after which the
    text is rebuilt from the current field values.
    """

    CDX_FORMATS = [
        # Public CDX Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],

        # CDX 11 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

        # CDX 9 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
         "orig.length","orig.offset","orig.filename"],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
         "orig.length","orig.offset","orig.filename"]
        ]

    def __init__(self, cdxline):
        """
        Parse one cdx line.

        :param cdxline: space-separated cdx line, trailing whitespace
                        (including the newline) is ignored
        :raises Exception: if the field count matches no known format
        """
        OrderedDict.__init__(self)

        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        # Every known format has a distinct number of fields, so the
        # first length match is the only possible match.
        for cdxformat in self.CDX_FORMATS:
            if len(cdxformat) == len(fields):
                break
        else:
            raise Exception('unknown {0}-field cdx format'.format(len(fields)))

        for header, field in zip(cdxformat, fields):
            self[header] = field

        # must be set last: each assignment above resets the cached line
        self.cdxline = cdxline

    def __setitem__(self, key, value):
        OrderedDict.__setitem__(self, key, value)

        # force regen on next __str__ call
        self.cdxline = None

    def __str__(self):
        """Return the cdx line: cached text if unmodified, else rebuilt."""
        if self.cdxline:
            return self.cdxline

        return ' '.join(self.values())
|
||||||
|
|
||||||
|
|
228
pywb/cdxserver/cdxops.py
Normal file
228
pywb/cdxserver/cdxops.py
Normal file
@ -0,0 +1,228 @@
|
|||||||
|
from cdxobject import CDXObject
|
||||||
|
|
||||||
|
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
|
||||||
|
|
||||||
|
import timeutils
|
||||||
|
import bisect
|
||||||
|
import itertools
|
||||||
|
import re
|
||||||
|
|
||||||
|
from heapq import merge
|
||||||
|
from collections import deque
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def cdx_text_out(cdx, fields):
    """
    Render one cdx record as a line of text (without a newline).

    :param cdx: mapping of cdx field name -> value
    :param fields: comma-separated list of field names to output, in
                   order; if empty or None, str(cdx) is returned instead
    """
    if fields:
        return ' '.join(cdx[name] for name in fields.split(','))

    return str(cdx)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def cdx_load(sources, params):
    """
    Load cdx records from all sources and run them through the
    processing steps selected in params, in this fixed order:

      resolve_revisits -> filter -> collapse_time -> reverse ->
      closest_to -> limit

    :param sources: cdx source objects, passed on to load_cdx_streams()
    :param params: dict of query options; the keys read here are
                   'resolve_revisits', 'filter', 'collapse_time',
                   'limit' (default 1000000), 'reverse', 'closest_to',
                   'output' and 'fields'
    :return: an iterable of cdx objects when params['output'] is 'raw',
             otherwise a generator of newline-terminated text lines
    """
    records = make_cdx_iter(load_cdx_streams(sources, params))

    if params.get('resolve_revisits', False):
        records = cdx_resolve_revisits(records)

    filter_specs = params.get('filter', None)
    if filter_specs:
        records = cdx_filter(records, filter_specs)

    collapse_len = params.get('collapse_time', None)
    if collapse_len:
        records = cdx_collapse_time_status(records, collapse_len)

    # the limit is shared by the reverse / closest steps and the final cut
    max_results = int(params.get('limit', 1000000))

    if params.get('reverse', False):
        records = cdx_reverse(records, max_results)

    target_timestamp = params.get('closest_to', None)
    if target_timestamp:
        records = cdx_sort_closest(target_timestamp, records, max_results)

    if max_results:
        records = cdx_limit(records, max_results)

    # output raw cdx objects
    if params.get('output') == 'raw':
        return records

    def as_text_lines(fields):
        for cdx in records:
            yield cdx_text_out(cdx, fields) + '\n'

    return as_text_lines(params.get('fields'))
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# load and source merge cdx streams
|
||||||
|
def load_cdx_streams(sources, params):
    """
    Query every source and combine the results into one stream.

    :param sources: sequence of objects providing load_cdx(params)
    :param params: query params passed through to each source
    :return: the single source's own result if there is exactly one
             source, otherwise a heapq.merge() of all the results
    """
    # Optimize: no need to merge if just one input
    if len(sources) == 1:
        only_source = sources[0]
        return only_source.load_cdx(params)

    streams = [source.load_cdx(params) for source in sources]
    return merge(*streams)
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# convert text cdx stream to CDXObject
|
||||||
|
def make_cdx_iter(text_iter):
    """
    Lazily wrap each text line of text_iter in a CDXObject.

    :param text_iter: iterable of cdx text lines
    :return: iterator of CDXObject, one per input line
    """
    return (CDXObject(line) for line in text_iter)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# limit cdx to at most limit
|
||||||
|
def cdx_limit(cdx_iter, limit):
    """
    Yield at most ``limit`` items from cdx_iter.

    Exactly ``limit`` items (or fewer, if the input is shorter) are
    pulled from cdx_iter: no additional record is read from the
    underlying source once the limit has been reached.

    :param cdx_iter: iterable of cdx records
    :param limit: maximum number of records to yield (non-negative int)
    """
    for cdx in itertools.islice(cdx_iter, limit):
        yield cdx
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# reverse cdx
|
||||||
|
def cdx_reverse(cdx_iter, limit):
    """
    Return the last ``limit`` items of cdx_iter in reverse order.

    The input is consumed completely, but never more than ``limit``
    items are held in memory at once.

    :param cdx_iter: iterable of cdx records
    :param limit: maximum number of records to keep
    :return: a list when limit is 1, otherwise a deque; both are in
             last-to-first order
    """
    # optimize for single last: a one-slot deque retains only the final
    # item, whatever its value (including falsy ones)
    if limit == 1:
        return list(deque(cdx_iter, maxlen = 1))

    reverse_cdxs = deque(maxlen = limit)

    # extendleft() prepends item by item, which reverses the order; once
    # full, the bounded deque drops the oldest items from the right
    reverse_cdxs.extendleft(cdx_iter)

    return reverse_cdxs
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# filter cdx by regex if each filter is field:regex form,
|
||||||
|
# apply filter to cdx[field]
|
||||||
|
def cdx_filter(cdx_iter, filter_strings):
    """
    Yield only the cdx records which satisfy every filter.

    Each filter string has the form ``[!][field:]regex``:
      - with ``field:``, the regex is applied to cdx[field];
        without it, the regex is applied to str(cdx)
      - a leading ``!`` inverts the filter
    The regex is applied with re.match(), i.e. anchored at the start of
    the value only.

    :param cdx_iter: iterable of cdx records
    :param filter_strings: a single filter string or a list of them
    """
    # Support single strings as well
    if isinstance(filter_strings, str):
        filter_strings = [filter_strings]

    class Filter:
        def __init__(self, string):
            # invert filter
            self.invert = string.startswith('!')
            if self.invert:
                string = string[1:]

            parts = string.split(':', 1)
            if len(parts) == 1:
                # no field set, apply filter to entire cdx
                self.field = ''
            else:
                # apply filter to cdx[field]
                self.field = parts[0]
                string = parts[1]

            self.regex = re.compile(string)

        def __call__(self, cdx):
            val = cdx[self.field] if self.field else str(cdx)
            matched = self.regex.match(val) is not None
            return matched ^ self.invert

    # Must be a real list: the filters are re-applied to every record,
    # so a one-shot iterator would be exhausted after the first record.
    filters = [Filter(string) for string in filter_strings]

    for cdx in cdx_iter:
        if all(accept(cdx) for accept in filters):
            yield cdx
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# collapse by timestamp and status code
|
||||||
|
def cdx_collapse_time_status(cdx_iter, timelen = 10):
    """
    Collapse runs of consecutive records which share the same timestamp
    prefix and status code, yielding only the first record of each run.

    :param cdx_iter: iterable of cdx records with 'timestamp' and
                     'statuscode' fields
    :param timelen: number of leading timestamp characters to compare
                    (int, or anything int() accepts)
    """
    prefix_len = int(timelen)

    def collapse_key(cdx):
        return (cdx['timestamp'][:prefix_len], cdx['statuscode'])

    # groupby() starts a new group whenever the key changes between
    # adjacent records; only the head of each group is emitted
    for _, run in itertools.groupby(cdx_iter, collapse_key):
        yield next(run)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# sort CDXCaptureResult by closest to timestamp
|
||||||
|
def cdx_sort_closest(closest, cdx_iter, limit = 10):
    """
    Return up to ``limit`` cdx records ordered by how close their
    timestamp is to ``closest`` (nearest first).

    Records are ranked by the absolute difference in seconds only; records
    at the same distance keep their input order. The whole input is
    scanned, so the result does not depend on the order of cdx_iter.

    :param closest: target timestamp string
    :param cdx_iter: iterable of cdx records with a 'timestamp' field
    :param limit: maximum number of records to return
    :return: iterator over the selected cdx records
    """
    # Parallel lists: closest_keys[i] is the distance of closest_cdx[i].
    # Keeping the distances separate means the cdx objects themselves are
    # never compared with each other (or with a distance).
    closest_keys = []
    closest_cdx = []

    closest_sec = timeutils.timestamp_to_sec(closest)

    for cdx in cdx_iter:
        sec = timeutils.timestamp_to_sec(cdx['timestamp'])
        key = abs(closest_sec - sec)

        pos = bisect.bisect_right(closest_keys, key)

        # already holding `limit` records which are at least as close
        if pos >= limit:
            continue

        closest_keys.insert(pos, key)
        closest_cdx.insert(pos, cdx)

        if len(closest_cdx) > limit:
            closest_keys.pop()
            closest_cdx.pop()

    return iter(closest_cdx)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# resolve revisits
|
||||||
|
|
||||||
|
# Fields to append from cdx original to revisit
|
||||||
|
ORIG_TUPLE = ['length', 'offset', 'filename']

def cdx_resolve_revisits(cdx_iter):
    """
    Annotate every cdx record with 'orig.length', 'orig.offset' and
    'orig.filename' fields.

    A record counts as a revisit if its mimetype is 'warc/revisit' or its
    filename is '-'. The first non-revisit record seen for each digest is
    remembered as the original for that digest. A revisit with a known
    original receives the original's mimetype and statuscode and the
    original's length/offset/filename in the orig.* fields; every other
    record gets '-' in the orig.* fields.

    :param cdx_iter: iterable of mutable cdx records
    """
    seen_originals = {}

    for cdx in cdx_iter:
        is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')

        digest = cdx['digest']
        known_original = seen_originals.get(digest)

        # first regular capture with this digest becomes the original
        if not (known_original or is_revisit):
            seen_originals[digest] = cdx

        resolved = known_original if (known_original and is_revisit) else None

        if resolved is not None:
            # Transfer mimetype and statuscode
            cdx['mimetype'] = resolved['mimetype']
            cdx['statuscode'] = resolved['statuscode']

        # Always add either the original or empty '- - -'
        for field in ORIG_TUPLE:
            if resolved is not None:
                cdx['orig.' + field] = resolved[field]
            else:
                cdx['orig.' + field] = '-'

        yield cdx
|
||||||
|
|
||||||
|
|
160
pywb/cdxserver/cdxserver.py
Normal file
160
pywb/cdxserver/cdxserver.py
Normal file
@ -0,0 +1,160 @@
|
|||||||
|
import surt
|
||||||
|
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
|
||||||
|
from cdxops import cdx_load
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDXFile:
    """
    A cdx source backed by a single cdx file, queried through the
    binsearch helpers (FileReader, iter_exact, iter_prefix).
    """

    def __init__(self, filename):
        self.filename = filename

    def load_cdx(self, params):
        """
        Look up params['key'] in the file.

        :param params: dict; 'match_type' of 'prefix' selects a prefix
                       search, anything else an exact search
        :return: whatever the selected binsearch iterator returns
        """
        reader = FileReader(self.filename)

        is_prefix_query = (params.get('match_type') == 'prefix')
        search = iter_prefix if is_prefix_query else iter_exact

        return search(reader, params.get('key'))

    def __str__(self):
        return 'CDX File - ' + self.filename
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDXException(Exception):
    """
    Error raised for an invalid cdx query.

    The optional ``url`` attribute holds the url the query was made for.
    """

    def __init__(self, msg, url = None):
        super(CDXException, self).__init__(msg)
        self.url = url

    def status(self):
        """Return the HTTP status line associated with this error."""
        return '400 Bad Request'
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDXServer:
    """
    Top-level cdx server object which maintains a list of cdx sources,
    responds to queries and dispatches to the cdx ops for processing
    """

    def __init__(self, sources, surt_ordered = True):
        """
        :param sources: list of paths; a directory contributes every
                        supported file directly inside it, any other
                        path is treated as a single cdx source
        :param surt_ordered: if False, query keys are converted back
                             from surt form before the lookup
        """
        self.sources = []
        self.surt_ordered = surt_ordered
        logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

        for src in sources:
            if os.path.isdir(src):
                # join() works whether or not src ends with a separator
                for name in os.listdir(src):
                    self.add_cdx_loader(os.path.join(src, name))
            else:
                self.add_cdx_loader(src)

        if len(self.sources) == 0:
            # not inside an exception handler, so there is no traceback
            # to report: log a plain error
            logging.error('No CDX Sources Found!')

    def add_cdx_loader(self, filename):
        """Register filename as a cdx source, if its type is supported."""
        source = self.create_cdx_loader(filename)
        if not source:
            return

        logging.debug('Adding CDX Source: ' + str(source))
        self.sources.append(source)

    @staticmethod
    def create_cdx_loader(filename):
        """Return a cdx source for filename, or None if unsupported."""
        if filename.endswith('.cdx'):
            return CDXFile(filename)

        #TODO: support zipnum
        #elif filename.endswith('.summary')
        #    return ZipNumCDXSource(filename)
        #elif filename.startswith('redis://')
        #    return RedisCDXSource(filename)
        return None

    def load_cdx(self, **params):
        """
        Run a cdx query.

        :param params: query options; 'url' is required, the remaining
                       options are passed through to cdx_load()
        :return: the result of cdx_load() for the configured sources
        :raises CDXException: if 'url' is missing or cannot be
                              converted to a surt key
        """
        # canonicalize to surt (canonicalization is part of surt conversion)
        try:
            url = params['url']
        except KeyError:
            raise CDXException('The url= param must be specified to query the cdx server')

        try:
            key = surt.surt(url)
        except Exception:
            raise CDXException('Invalid url: %s' % url, url)

        # if not surt, unsurt the surt to get canonicalized non-surt url
        if not self.surt_ordered:
            key = unsurt(key)

        params['key'] = key

        return cdx_load(self.sources, params)

    def load_cdx_from_request(self, env):
        """
        Run a cdx query described by the query string of a WSGI request.

        :param env: WSGI environ; QUERY_STRING holds the query params
                    and is treated as empty when absent
        :return: the result of load_cdx(); output defaults to 'text'
        """
        # use url= param to get actual url
        query = urlparse.parse_qs(env.get('QUERY_STRING', ''))

        # parse_qs produces arrays for single values
        # cdxreader expects singleton params for all except filters, so convert here
        # use first value of the list
        params = {}
        for name, val in query.items():
            params[name] = val if name == 'filter' else val[0]

        # applied only after flattening, so the default stays the full
        # string 'text' rather than being reduced to its first character
        params.setdefault('output', 'text')

        return self.load_cdx(**params)

    def __str__(self):
        return 'load cdx indexes from ' + str(self.sources)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def unsurt(surt):
    """
    Turn the host part of a surt key back into dotted host order, leaving
    everything from the first ')/' onwards untouched. A string without
    ')/' is returned unchanged.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com)/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
    """

    host_end = surt.find(')/')

    # May not be a valid surt
    if host_end < 0:
        return surt

    labels = surt[:host_end].split(',')
    return '.'.join(reversed(labels)) + surt[host_end:]
|
||||||
|
|
||||||
|
|
103
pywb/cdxserver/timeutils.py
Normal file
103
pywb/cdxserver/timeutils.py
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
import re
|
||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
import calendar
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# str <-> datetime conversion
|
||||||
|
#=================================================================
|
||||||
|
|
||||||
|
# Splits a date string on every non-digit character
# (raw string, so that \d is never read as a string escape)
DATE_TIMESPLIT = re.compile(r'[^\d]')

# strftime/strptime layout of a full 14-digit timestamp
TIMESTAMP_14 = '%Y%m%d%H%M%S'

# Template supplying the missing digits of a partial timestamp
PAD_STAMP_END = '29991231235959'


def iso_date_to_datetime(string):
    """
    Convert an ISO-style date string to a datetime.

    The string is split on every non-digit character and the resulting
    numbers are passed, in order, to datetime.datetime().

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """

    nums = DATE_TIMESPLIT.split(string)

    # a trailing separator (such as 'Z') leaves an empty last element
    if nums[-1] == '':
        nums = nums[:-1]

    dt = datetime.datetime(*map(int, nums))
    return dt


def datetime_to_timestamp(dt):
    """
    Format a datetime as a 14-digit timestamp string.

    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """

    return dt.strftime(TIMESTAMP_14)


def iso_date_to_timestamp(string):
    """
    Convert an ISO-style date string to a 14-digit timestamp string.

    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """

    return datetime_to_timestamp(iso_date_to_datetime(string))


# default pad is end of range for compatibility
def pad_timestamp(string, pad_str = PAD_STAMP_END):
    """
    Extend a partial timestamp to the length of pad_str by appending the
    tail of pad_str. A string at least as long as pad_str is returned
    unchanged.

    >>> pad_timestamp('20')
    '20991231235959'

    >>> pad_timestamp('2014')
    '20141231235959'

    >>> pad_timestamp('20141011')
    '20141011235959'

    >>> pad_timestamp('201410110010')
    '20141011001059'
    """

    str_len = len(string)
    pad_len = len(pad_str)

    return string if str_len >= pad_len else string + pad_str[str_len:]


def timestamp_to_datetime(string):
    """
    Parse a (possibly partial) timestamp string.

    Note that, despite the name, the result is a time.struct_time as
    returned by time.strptime(), not a datetime.datetime.

    >>> timestamp_to_datetime('20131226095010')
    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)

    >>> timestamp_to_datetime('2014')
    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
    """

    # Default pad to end of range for compatibility
    return time.strptime(pad_timestamp(string), TIMESTAMP_14)


def timestamp_to_sec(string):
    """
    Convert a (possibly partial) timestamp string to seconds since the
    epoch, using calendar.timegm() on the parsed time.

    >>> timestamp_to_sec('20131226095010')
    1388051410

    >>> timestamp_to_sec('2014')
    1420070399
    """

    return calendar.timegm(timestamp_to_datetime(string))
|
||||||
|
|
||||||
|
|
||||||
|
# When this module is executed directly as a script, run the doctests
# embedded in the docstrings above.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
|
@ -66,28 +66,12 @@ class WBHandler(BaseHandler):
|
|||||||
# CDX-Server Handler -- pass all params to cdx server
|
# CDX-Server Handler -- pass all params to cdx server
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXHandler(BaseHandler):
|
class CDXHandler(BaseHandler):
|
||||||
def __init__(self, cdx_reader, view = None):
|
def __init__(self, cdx_server, view = None):
|
||||||
self.cdx_reader = cdx_reader
|
self.cdx_server = cdx_server
|
||||||
self.view = view if view else views.TextCapturesView()
|
self.view = view if view else views.TextCapturesView()
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
def __call__(self, wbrequest):
|
||||||
#url = wbrequest.wb_url.url
|
cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env)
|
||||||
|
|
||||||
# use url= param to get actual url
|
|
||||||
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
|
|
||||||
|
|
||||||
# parse_qs produces arrays for single values
|
|
||||||
# cdxreader expects singleton params for all except filters, so convert here
|
|
||||||
# use first value of the list
|
|
||||||
for name, val in params.iteritems():
|
|
||||||
if name != 'filter':
|
|
||||||
params[name] = val[0]
|
|
||||||
|
|
||||||
url = params.get('url')
|
|
||||||
if not url:
|
|
||||||
raise WbException('Must specify a url= param to query cdx server')
|
|
||||||
|
|
||||||
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
|
|
||||||
|
|
||||||
return self.view.render_response(wbrequest, cdx_lines)
|
return self.view.render_response(wbrequest, cdx_lines)
|
||||||
|
|
||||||
@ -97,7 +81,7 @@ class CDXHandler(BaseHandler):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'CDX Server: ' + str(self.cdx_reader)
|
return 'CDX Server: ' + str(self.cdx_server)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -1,15 +1,13 @@
|
|||||||
import urllib
|
import urllib
|
||||||
import urllib2
|
import urllib2
|
||||||
import wbexceptions
|
import wbexceptions
|
||||||
import itertools
|
|
||||||
import wbrequestresponse
|
import wbrequestresponse
|
||||||
import surt
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
import binsearch
|
from cdxserver.cdxserver import CDXServer, CDXException
|
||||||
import cdxserve
|
from cdxserver.cdxobject import CDXObject
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class IndexReader:
|
class IndexReader:
|
||||||
@ -26,7 +24,13 @@ class IndexReader:
|
|||||||
if wbrequest.custom_params:
|
if wbrequest.custom_params:
|
||||||
params.update(wbrequest.custom_params)
|
params.update(wbrequest.custom_params)
|
||||||
|
|
||||||
cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)
|
#params['url'] = wburl.url
|
||||||
|
output = 'raw' if parsed_cdx else 'text'
|
||||||
|
|
||||||
|
try:
|
||||||
|
cdxlines = self.load_cdx(url = wburl.url, output = output, **params)
|
||||||
|
except CDXException:
|
||||||
|
raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url)
|
||||||
|
|
||||||
cdxlines = utils.peek_iter(cdxlines)
|
cdxlines = utils.peek_iter(cdxlines)
|
||||||
|
|
||||||
@ -53,7 +57,7 @@ class IndexReader:
|
|||||||
# for now, list implies local sources
|
# for now, list implies local sources
|
||||||
if isinstance(paths, list):
|
if isinstance(paths, list):
|
||||||
if len(paths) > 1:
|
if len(paths) > 1:
|
||||||
return LocalCDXServer(paths, surt_ordered)
|
return EmbeddedCDXServer(paths, surt_ordered)
|
||||||
else:
|
else:
|
||||||
# treat as non-list
|
# treat as non-list
|
||||||
paths = paths[0]
|
paths = paths[0]
|
||||||
@ -66,66 +70,13 @@ class IndexReader:
|
|||||||
cookie = config.get('cookie', None)
|
cookie = config.get('cookie', None)
|
||||||
return RemoteCDXServer(uri, cookie = cookie)
|
return RemoteCDXServer(uri, cookie = cookie)
|
||||||
else:
|
else:
|
||||||
return LocalCDXServer([uri], surt_ordered)
|
return EmbeddedCDXServer([uri], surt_ordered)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class LocalCDXServer(IndexReader):
|
class EmbeddedCDXServer(CDXServer, IndexReader):
|
||||||
"""
|
|
||||||
>>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1)
|
|
||||||
>>> pprint(x.next().items())
|
|
||||||
[('urlkey', 'com,example)/'),
|
|
||||||
('timestamp', '20140127171200'),
|
|
||||||
('original', 'http://example.com'),
|
|
||||||
('mimetype', 'text/html'),
|
|
||||||
('statuscode', '200'),
|
|
||||||
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
|
|
||||||
('redirect', '-'),
|
|
||||||
('robotflags', '-'),
|
|
||||||
('length', '1046'),
|
|
||||||
('offset', '334'),
|
|
||||||
('filename', 'dupes.warc.gz')]
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, sources, surt_ordered = True):
|
|
||||||
self.sources = []
|
|
||||||
self.surt_ordered = surt_ordered
|
|
||||||
logging.info('CDX Surt-Ordered? ' + str(surt_ordered))
|
|
||||||
|
|
||||||
for src in sources:
|
|
||||||
if os.path.isdir(src):
|
|
||||||
for file in os.listdir(src):
|
|
||||||
if file.endswith('.cdx'):
|
|
||||||
full = src + file
|
|
||||||
logging.info('Adding CDX: ' + full)
|
|
||||||
self.sources.append(full)
|
|
||||||
else:
|
|
||||||
logging.info('Adding CDX: ' + src)
|
|
||||||
self.sources.append(src)
|
|
||||||
|
|
||||||
|
|
||||||
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
|
|
||||||
# canonicalize to surt (canonicalization is part of surt conversion)
|
|
||||||
try:
|
|
||||||
key = surt.surt(url)
|
|
||||||
except Exception as e:
|
|
||||||
raise wbexceptions.BadUrlException('Bad Request Url: ' + url)
|
|
||||||
|
|
||||||
# if not surt, unsurt the surt to get canonicalized non-surt url
|
|
||||||
if not self.surt_ordered:
|
|
||||||
key = utils.unsurt(key)
|
|
||||||
|
|
||||||
match_func = binsearch.iter_exact
|
|
||||||
|
|
||||||
params.update(**kwvalues)
|
|
||||||
params['output'] = 'raw' if parsed_cdx else 'text'
|
|
||||||
|
|
||||||
return cdxserve.cdx_serve(key, params, self.sources, match_func)
|
|
||||||
|
|
||||||
|
|
||||||
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
|
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
|
||||||
|
|
||||||
if wburl.type == wburl.URL_QUERY:
|
if wburl.type == wburl.URL_QUERY:
|
||||||
@ -198,7 +149,7 @@ class RemoteCDXServer(IndexReader):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
if parsed_cdx:
|
if parsed_cdx:
|
||||||
return (CDXCaptureResult(cdx) for cdx in response)
|
return (CDXObject(cdx) for cdx in response)
|
||||||
else:
|
else:
|
||||||
return iter(response)
|
return iter(response)
|
||||||
|
|
||||||
@ -238,62 +189,6 @@ class RemoteCDXServer(IndexReader):
|
|||||||
return 'server cdx from ' + self.server_url
|
return 'server cdx from ' + self.server_url
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class CDXCaptureResult(OrderedDict):
|
|
||||||
CDX_FORMATS = [
|
|
||||||
# Public CDX Format
|
|
||||||
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
|
|
||||||
|
|
||||||
# CDX 11 Format
|
|
||||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
|
|
||||||
|
|
||||||
# CDX 9 Format
|
|
||||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
|
|
||||||
|
|
||||||
# CDX 11 Format + 3 revisit resolve fields
|
|
||||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
|
|
||||||
"orig.length","orig.offset","orig.filename"],
|
|
||||||
|
|
||||||
# CDX 9 Format + 3 revisit resolve fields
|
|
||||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
|
|
||||||
"orig.length","orig.offset","orig.filename"]
|
|
||||||
]
|
|
||||||
|
|
||||||
def __init__(self, cdxline):
|
|
||||||
OrderedDict.__init__(self)
|
|
||||||
|
|
||||||
cdxline = cdxline.rstrip()
|
|
||||||
fields = cdxline.split(' ')
|
|
||||||
|
|
||||||
cdxformat = None
|
|
||||||
for i in CDXCaptureResult.CDX_FORMATS:
|
|
||||||
if len(i) == len(fields):
|
|
||||||
cdxformat = i
|
|
||||||
|
|
||||||
if not cdxformat:
|
|
||||||
raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
|
|
||||||
|
|
||||||
for header, field in itertools.izip(cdxformat, fields):
|
|
||||||
self[header] = field
|
|
||||||
|
|
||||||
self.cdxline = cdxline
|
|
||||||
|
|
||||||
def __setitem__(self, key, value):
|
|
||||||
OrderedDict.__setitem__(self, key, value)
|
|
||||||
|
|
||||||
# force regen on next __str__ call
|
|
||||||
self.cdxline = None
|
|
||||||
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
if self.cdxline:
|
|
||||||
return self.cdxline
|
|
||||||
|
|
||||||
li = itertools.imap(lambda (n, val): val, self.items())
|
|
||||||
return ' '.join(li)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Testing
|
# Testing
|
||||||
|
|
||||||
import utils
|
import utils
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import redis
|
import redis
|
||||||
import binsearch
|
import binsearch.binsearch
|
||||||
|
|
||||||
import urlparse
|
import urlparse
|
||||||
import os
|
import os
|
||||||
@ -46,10 +46,10 @@ class RedisResolver:
|
|||||||
class PathIndexResolver:
|
class PathIndexResolver:
|
||||||
def __init__(self, pathindex_file):
|
def __init__(self, pathindex_file):
|
||||||
self.pathindex_file = pathindex_file
|
self.pathindex_file = pathindex_file
|
||||||
self.reader = binsearch.FileReader(pathindex_file)
|
self.reader = binsearch.binsearch.FileReader(pathindex_file)
|
||||||
|
|
||||||
def __call__(self, filename):
|
def __call__(self, filename):
|
||||||
result = binsearch.iter_exact(self.reader, filename, '\t')
|
result = binsearch.binsearch.iter_exact(self.reader, filename, '\t')
|
||||||
|
|
||||||
def gen_list(result):
|
def gen_list(result):
|
||||||
for pathline in result:
|
for pathline in result:
|
||||||
|
@ -43,100 +43,6 @@ class PerfTimer:
|
|||||||
self.perfdict[self.name] = str(self.end - self.start)
|
self.perfdict[self.name] = str(self.end - self.start)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# str <-> datetime conversion
|
|
||||||
#=================================================================
|
|
||||||
|
|
||||||
DATE_TIMESPLIT = re.compile('[^\d]')
|
|
||||||
|
|
||||||
TIMESTAMP_14 = '%Y%m%d%H%M%S'
|
|
||||||
|
|
||||||
PAD_STAMP_END = '29991231235959'
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def iso_date_to_datetime(string):
|
|
||||||
"""
|
|
||||||
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
|
|
||||||
datetime.datetime(2013, 12, 26, 10, 11, 12)
|
|
||||||
|
|
||||||
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
|
|
||||||
datetime.datetime(2013, 12, 26, 10, 11, 12)
|
|
||||||
"""
|
|
||||||
|
|
||||||
nums = DATE_TIMESPLIT.split(string)
|
|
||||||
if nums[-1] == '':
|
|
||||||
nums = nums[:-1]
|
|
||||||
|
|
||||||
dt = datetime.datetime(*map(int, nums))
|
|
||||||
return dt
|
|
||||||
|
|
||||||
def datetime_to_timestamp(dt):
|
|
||||||
"""
|
|
||||||
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
|
|
||||||
'20131226101112'
|
|
||||||
"""
|
|
||||||
|
|
||||||
return dt.strftime(TIMESTAMP_14)
|
|
||||||
|
|
||||||
def iso_date_to_timestamp(string):
|
|
||||||
"""
|
|
||||||
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
|
|
||||||
'20131226101112'
|
|
||||||
|
|
||||||
>>> iso_date_to_timestamp('2013-12-26T10:11:12')
|
|
||||||
'20131226101112'
|
|
||||||
"""
|
|
||||||
|
|
||||||
return datetime_to_timestamp(iso_date_to_datetime(string))
|
|
||||||
|
|
||||||
|
|
||||||
# default pad is end of range for compatibility
|
|
||||||
def pad_timestamp(string, pad_str = PAD_STAMP_END):
|
|
||||||
"""
|
|
||||||
>>> pad_timestamp('20')
|
|
||||||
'20991231235959'
|
|
||||||
|
|
||||||
>>> pad_timestamp('2014')
|
|
||||||
'20141231235959'
|
|
||||||
|
|
||||||
>>> pad_timestamp('20141011')
|
|
||||||
'20141011235959'
|
|
||||||
|
|
||||||
>>> pad_timestamp('201410110010')
|
|
||||||
'20141011001059'
|
|
||||||
"""
|
|
||||||
|
|
||||||
str_len = len(string)
|
|
||||||
pad_len = len(pad_str)
|
|
||||||
|
|
||||||
return string if str_len >= pad_len else string + pad_str[str_len:]
|
|
||||||
|
|
||||||
|
|
||||||
def timestamp_to_datetime(string):
    """
    Parse a (possibly partial) timestamp string with ``time.strptime``.

    Note that, despite the name, the result is a ``time.struct_time``
    and not a ``datetime.datetime``.

    >>> timestamp_to_datetime('20131226095010')
    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)

    >>> timestamp_to_datetime('2014')
    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
    """

    # Partial timestamps are padded to the end of their range by default,
    # for compatibility
    padded = pad_timestamp(string)
    return time.strptime(padded, TIMESTAMP_14)
|
|
||||||
|
|
||||||
|
|
||||||
def timestamp_to_sec(string):
    """
    Convert a (possibly partial) timestamp string to integer seconds,
    by feeding the parsed time tuple to ``calendar.timegm``.

    >>> timestamp_to_sec('20131226095010')
    1388051410

    >>> timestamp_to_sec('2014')
    1420070399
    """

    time_tuple = timestamp_to_datetime(string)
    return calendar.timegm(time_tuple)
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
|
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
|
||||||
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
import indexreader
|
import cdxserver.timeutils as timeutils
|
||||||
import utils
|
|
||||||
import wbrequestresponse
|
import wbrequestresponse
|
||||||
import wbexceptions
|
import wbexceptions
|
||||||
import time
|
|
||||||
import urlparse
|
import urlparse
|
||||||
|
import time
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
from itertools import imap
|
from itertools import imap
|
||||||
@ -58,7 +58,7 @@ class J2TemplateView:
|
|||||||
# Filters
|
# Filters
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
|
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
|
||||||
value = utils.timestamp_to_datetime(value)
|
value = timeutils.timestamp_to_datetime(value)
|
||||||
return time.strftime(format, value)
|
return time.strftime(format, value)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -2,6 +2,7 @@ import utils
|
|||||||
import wbexceptions
|
import wbexceptions
|
||||||
|
|
||||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||||
|
from cdxserver.cdxserver import CDXException
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import importlib
|
import importlib
|
||||||
@ -33,7 +34,7 @@ def create_wb_app(wb_router):
|
|||||||
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
|
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
|
||||||
response = handle_exception(env, wb_router.error_view, e, False)
|
response = handle_exception(env, wb_router.error_view, e, False)
|
||||||
|
|
||||||
except wbexceptions.WbException as wbe:
|
except (wbexceptions.WbException, CDXException) as wbe:
|
||||||
response = handle_exception(env, wb_router.error_view, wbe, False)
|
response = handle_exception(env, wb_router.error_view, wbe, False)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
4
setup.py
4
setup.py
@ -11,8 +11,8 @@ setuptools.setup(name='pywb',
|
|||||||
author_email='ilya@archive.org',
|
author_email='ilya@archive.org',
|
||||||
long_description=open('README.md').read(),
|
long_description=open('README.md').read(),
|
||||||
license='GPL',
|
license='GPL',
|
||||||
packages=['pywb'],
|
packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
|
||||||
provides=['pywb'],
|
provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
|
||||||
package_data={'pywb': ['ui/*', 'static/*']},
|
package_data={'pywb': ['ui/*', 'static/*']},
|
||||||
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
|
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
|
||||||
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
43
tests/test_binsearch.py
Normal file
43
tests/test_binsearch.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
import os
|
||||||
|
from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader
|
||||||
|
|
||||||
|
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
|
||||||
|
|
||||||
|
def binsearch_cdx_test(key, iter_func):
    """
    Print every line that ``iter_func`` yields for ``key`` from the
    sample ``iana.cdx`` index.

    # Prefix Search
    >>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root', iter_exact)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/', iter_exact)
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz

    # Exact Search
    >>> binsearch_cdx_test('org,iaana)/', iter_exact)
    >>> binsearch_cdx_test('org,ibna)/', iter_exact)

    >>> binsearch_cdx_test('org,iana)/time-zones', iter_exact)
    org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
    """

    reader = FileReader(test_cdx_dir + 'iana.cdx')
    matches = iter_func(reader, key)

    # One matching index line per output line
    for entry in matches:
        print(entry)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
||||||
|
|
||||||
|
|
149
tests/test_cdxserve.py
Normal file
149
tests/test_cdxserve.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader
|
||||||
|
from ..pywb.cdxserver.cdxserver import CDXServer
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pprint
|
||||||
|
|
||||||
|
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
|
||||||
|
|
||||||
|
def cdx_ops_test(url, sources = None, **kwparams):
    """
    Query a CDXServer built over ``sources`` for ``url`` and write each
    result to stdout in 'text' output form.

    ``sources`` defaults to the sample ``iana.cdx`` index; any extra
    keyword arguments are passed through to ``load_cdx``.

    # Merge Sort Multiple CDX Sources
    >>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
    org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
    org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz


    # Limit CDX Stream
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz


    # Reverse CDX Stream
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz

    >>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
    org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz

    # No matching results
    >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)


    # Filter cdx
    >>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
    org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
    org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
    org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
    org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
    org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz


    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz


    # Collapse by timestamp
    # unresolved revisits, different statuscode results in an extra repeat
    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz

    # resolved revisits
    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz


    # Sort by closest timestamp + field select output
    >>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
    20140126200826
    20140126200816
    20140126200805
    20140126200912
    20140126200738
    20140126200930
    20140126200718
    20140126200706
    20140126200654
    20140126200625

    >>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -


    >>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -

    # equal dist prefer earlier
    >>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz

    >>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200654
    20140126200706

    >>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200706
    20140126200654


    # Resolve Revisits
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz

    >>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -


    # CDX Server init
    >>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
    >>> pprint.pprint(x.next().items())
    [('urlkey', 'com,example)/'),
     ('timestamp', '20140127171200'),
     ('original', 'http://example.com'),
     ('mimetype', 'text/html'),
     ('statuscode', '200'),
     ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
     ('redirect', '-'),
     ('robotflags', '-'),
     ('length', '1046'),
     ('offset', '334'),
     ('filename', 'dupes.warc.gz')]

    """

    # Build the default per call rather than sharing one list object
    # (a mutable default) across every invocation
    if sources is None:
        sources = [test_cdx_dir + 'iana.cdx']

    kwparams['url'] = url
    kwparams['output'] = 'text'

    server = CDXServer(sources)
    results = server.load_cdx(**kwparams)

    # Each result is written as-is, without adding a newline
    for cdx_line in results:
        sys.stdout.write(cdx_line)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
||||||
|
|
||||||
|
|
@ -1,14 +1,14 @@
|
|||||||
import webtest
|
import webtest
|
||||||
import pywb.pywb_init
|
from ..pywb.pywb_init import pywb_config
|
||||||
from pywb.indexreader import CDXCaptureResult
|
from ..pywb.wbapp import create_wb_app
|
||||||
|
from ..pywb.cdxserver.cdxobject import CDXObject
|
||||||
|
|
||||||
class TestWb:
|
class TestWb:
|
||||||
TEST_CONFIG = 'test_config.yaml'
|
TEST_CONFIG = 'test_config.yaml'
|
||||||
|
|
||||||
def setup(self):
|
def setup(self):
|
||||||
import pywb.wbapp
|
|
||||||
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
||||||
self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config(self.TEST_CONFIG))
|
self.app = create_wb_app(pywb_config(self.TEST_CONFIG))
|
||||||
self.testapp = webtest.TestApp(self.app)
|
self.testapp = webtest.TestApp(self.app)
|
||||||
|
|
||||||
def _assert_basic_html(self, resp):
|
def _assert_basic_html(self, resp):
|
||||||
@ -144,8 +144,8 @@ class TestWb:
|
|||||||
# combine collapsing, reversing and revisit resolving
|
# combine collapsing, reversing and revisit resolving
|
||||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')
|
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')
|
||||||
|
|
||||||
# convert back to CDXCaptureResult
|
# convert back to CDXObject
|
||||||
cdxs = map(CDXCaptureResult, resp.body.rstrip().split('\n'))
|
cdxs = map(CDXObject, resp.body.rstrip().split('\n'))
|
||||||
assert len(cdxs) == 3, len(cdxs)
|
assert len(cdxs) == 3, len(cdxs)
|
||||||
|
|
||||||
# verify timestamps
|
# verify timestamps
|
Loading…
x
Reference in New Issue
Block a user