diff --git a/.travis.yml b/.travis.yml index 618a7426..81d946f7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,4 +6,5 @@ install: - "python setup.py -q install" # command to run tests #script: nosetests --with-doctest -script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py +#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py +script: py.test -v --doctest-module ./tests/*.py ./pywb/ diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py index 86e35149..fd690b34 100644 --- a/pywb/archiveloader.py +++ b/pywb/archiveloader.py @@ -13,6 +13,9 @@ from wbrequestresponse import StatusAndHeaders #================================================================= class HttpLoader: + """ + Load content over http with range request and optional signature + """ def __init__(self, hmac = None, hmac_duration = 30): self.hmac = hmac self.hmac_duration = hmac_duration @@ -38,6 +41,8 @@ class HttpLoader: #================================================================= class FileLoader: """ + Load content from local file-system + # Ensure attempt to read more than 100 bytes, only reads 100 bytes >>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read('400')) 100 diff --git a/pywb/binsearch.py b/pywb/binsearch.py deleted file mode 100644 index 563a1e32..00000000 --- a/pywb/binsearch.py +++ /dev/null @@ -1,147 +0,0 @@ -from collections import deque -import os -import itertools - -class FileReader: - def __init__(self, filename): - self.fh = open(filename, 'rb') - self.filename = filename - self.size = os.path.getsize(filename) - - def getsize(self): - return self.size - - def readline(self): - return self.fh.readline() - - def seek(self, offset): - return self.fh.seek(offset) - - def close(self): - return self.fh.close() - - -def binsearch_offset(reader, key, compare_func = cmp, block_size = 8192): - min = 0 - max = reader.getsize() / block_size - - while (max - min > 1): - mid = min + ((max - min) / 2) - reader.seek(mid * block_size) - - if mid > 0: - reader.readline() # skip partial line - - line = reader.readline() - - if compare_func(key, line) > 0: - min = mid - else: - max = mid - - return (min * block_size) - - -def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192): - min = binsearch_offset(reader, key, compare_func, block_size) - - reader.seek(min) - - if min > 0: - reader.readline() # skip partial line - - if prev_size > 1: - prev_deque = deque(maxlen = prev_size) - - line = None - - while True: - line = reader.readline() - if not line: - break - if compare_func(line, key) >= 0: - break - - if prev_size == 1: - prev = line - elif prev_size > 1: - prev_deque.append(line) - - def gen_iter(line): - if prev_size == 1: - yield prev.rstrip() - elif prev_size > 1: - for i in prev_deque: - yield i.rstrip() - - while line: - yield line.rstrip() - line = reader.readline() - - return gen_iter(line) - - -# Iterate over prefix matches -def iter_prefix(reader, key): - """ - >>> print_test_cdx('org,iana)/domains/root', iter_prefix) - org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz - org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz - """ - - lines = search(reader, key) - return itertools.takewhile(lambda line: line.startswith(key), lines) - - -def iter_exact(reader, key, tok = ' '): - """ - >>> print_test_cdx('org,iana)/domains/root', iter_exact) - org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz - - >>> print_test_cdx('org,iana)/', iter_exact) - org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz - - >>> print_test_cdx('org,iana)/domains/root/db', iter_exact) - org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - >>> print_test_cdx('org,iaana)/', iter_exact) - >>> print_test_cdx('org,ibna)/', iter_exact) - - >>> print_test_cdx('org,iana)/time-zones', iter_exact) - org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz - """ - - lines = search(reader, key) - - def check_key(line): - line_key = line.split(tok, 1)[0] - return line_key == key - - return itertools.takewhile(check_key, lines) - - -import utils -if __name__ == "__main__" or utils.enable_doctests(): - - def create_test_cdx(test_file): - path = utils.test_data_dir() + 'cdx/' + test_file - return FileReader(path) - - test_cdx = create_test_cdx('iana.cdx') - - def print_test_cdx(key, iter_func, filename = None): - cdx = test_cdx if not filename else create_test_cdx(filename) - for line in iter_func(cdx, key): - print line - - #cdx.close() - - import doctest - doctest.testmod() - - - - diff --git a/pywb/binsearch/__init__.py b/pywb/binsearch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/binsearch/binsearch.py b/pywb/binsearch/binsearch.py new file mode 100644 index 00000000..2d7646d9 --- /dev/null +++ b/pywb/binsearch/binsearch.py @@ -0,0 +1,123 @@ +from collections import deque +import os +import itertools + +#================================================================= +# Binary Search over a text file +#================================================================= +class FileReader: + """ + A very simple file-like object wrapper that knows it's size + getsize() method returns the filesize + """ + def __init__(self, filename): + self.fh = open(filename, 'rb') + self.filename = filename + self.size = os.path.getsize(filename) + + def getsize(self): + return self.size + + def readline(self): + return self.fh.readline() + + def seek(self, offset): + return self.fh.seek(offset) + + def close(self): + return self.fh.close() + + +#================================================================= +def binsearch_offset(reader, key, compare_func=cmp, block_size=8192): + """ + Find offset of the full line which matches a given 'key' using binary search + If key is not found, the offset is of the line after the key + + File is subdivided into block_size (default 8192) sized blocks + Optional compare_func may be specified + """ + min = 0 + max = reader.getsize() / block_size + + while (max - min > 1): + mid = min + ((max - min) / 2) + reader.seek(mid * block_size) + + if mid > 0: + reader.readline() # skip partial line + + line = reader.readline() + + if compare_func(key, line) > 0: + min = mid + else: + max = mid + + return (min * block_size) + + +def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192): + """ + Perform a binsearch for a specified key down to block_size (8192) sized blocks, + followed by linear search within the block to find first matching line. + + When performing linear search, keep track of up to N previous lines before + first matching line. + """ + min = binsearch_offset(reader, key, compare_func, block_size) + + reader.seek(min) + + if min > 0: + reader.readline() # skip partial line + + if prev_size > 1: + prev_deque = deque(maxlen = prev_size) + + line = None + + while True: + line = reader.readline() + if not line: + break + if compare_func(line, key) >= 0: + break + + if prev_size == 1: + prev = line + elif prev_size > 1: + prev_deque.append(line) + + def gen_iter(line): + if prev_size == 1: + yield prev.rstrip() + elif prev_size > 1: + for i in prev_deque: + yield i.rstrip() + + while line: + yield line.rstrip() + line = reader.readline() + + return gen_iter(line) + + +# Iterate over prefix matches +def iter_prefix(reader, key): + """ + Creates an iterator which iterates over prefix matches for a key in a sorted text file + A line matches as long as it starts with key + """ + + return itertools.takewhile(lambda line: line.startswith(key), search(reader, key)) + + +def iter_exact(reader, key, token=' '): + """ + Create an iterator which iterates over exact matches for a key in a sorted text file + Key is terminated by a token (default ' ') + """ + + return iter_prefix(reader, key + token) + diff --git a/pywb/cdxserve.py b/pywb/cdxserve.py deleted file mode 100644 index 9deadb49..00000000 --- a/pywb/cdxserve.py +++ /dev/null @@ -1,358 +0,0 @@ -import binsearch -import indexreader -import bisect -import itertools -import re - -from heapq import merge -from collections import deque - - - -#================================================================= -def cdx_text_out(cdx, fields): - if not fields: - return str(cdx) - else: - return ' '.join(map(lambda x: cdx[x], fields.split(','))) - - -#================================================================= -def cdx_serve(key, params, sources, match_func = binsearch.iter_exact): - cdx_iter = merge_sort_streams(sources, key, match_func) - - cdx_iter = make_cdx_iter(cdx_iter) - - resolve_revisits = params.get('resolve_revisits', False) - if resolve_revisits: - cdx_iter = cdx_resolve_revisits(cdx_iter) - - filters = params.get('filter', None) - if filters: - cdx_iter = cdx_filter(cdx_iter, filters) - - collapse_time = params.get('collapse_time', None) - if collapse_time: - cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) - - limit = int(params.get('limit', 1000000)) - - reverse = params.get('reverse', False) - if reverse: - cdx_iter = cdx_reverse(cdx_iter, limit) - - closest_to = params.get('closest_to', None) - if closest_to: - cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) - - if limit: - cdx_iter = cdx_limit(cdx_iter, limit) - - # output raw cdx objects - if params.get('output') == 'raw': - return cdx_iter - - def write_cdx(fields): - for cdx in cdx_iter: - yield cdx_text_out(cdx, fields) + '\n' - - return write_cdx(params.get('fields')) - - -#================================================================= -# merge multiple cdx streams -def merge_sort_streams(sources, key, iter_func): - """ - >>> test_cdx(key = 'org,iana)/', sources = [test_dir + 'dupes.cdx', test_dir + 'iana.cdx']) - org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz - org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz - org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz - """ - - def load_src(source): - source = binsearch.FileReader(source) - source = iter_func(source, key) - return source - - # Optimize: no need to merge if just one input - if len(sources) == 1: - return load_src(sources[0]) - - source_iters = map(load_src, sources) - merged_stream = merge(*(source_iters)) - return merged_stream - -#================================================================= -# convert text cdx stream to CDXCaptureResult -def make_cdx_iter(text_iter): - return itertools.imap(lambda line: indexreader.CDXCaptureResult(line), text_iter) - - -#================================================================= -# limit cdx to at most limit -def cdx_limit(cdx_iter, limit): - """ - >>> test_cdx('org,iana)/_css/2013.1/fonts/opensans-bold.ttf', limit = 3) - org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz - org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz - org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz - - """ - - for cdx, _ in itertools.izip(cdx_iter, xrange(limit)): - yield cdx - - -#================================================================= -# reverse cdx -def cdx_reverse(cdx_iter, limit): - """ - >>> test_cdx('org,iana)/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3) - org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz - org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz - org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz - - >>> test_cdx('org,iana)/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1) - org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz - - # no match, single result - >>> test_cdx('org,iana)/dont_have_this', reverse = True, resolve_revisits = True, limit = 1) - """ - - # optimize for single last - if limit == 1: - last = None - - for cdx in cdx_iter: - last = cdx - - return [last] if last else [] - - reverse_cdxs = deque(maxlen = limit) - - for cdx in cdx_iter: - reverse_cdxs.appendleft(cdx) - - return reverse_cdxs - - - #================================================================= -# filter cdx by regex if each filter is field:regex form, -# apply filter to cdx[field] -def cdx_filter(cdx_iter, filter_strings): - """ - >>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filter = ['mimetype:text/html']) - org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz - org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz - org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz - org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz - org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz - org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz - org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz - - - >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', filter = 'statuscode:200') - org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - """ - - # Support single strings as well - if isinstance(filter_strings, str): - filter_strings = [filter_strings] - - filters = [] - - class Filter: - def __init__(self, string): - # invert filter - self.invert = string.startswith('!') - if self.invert: - string = string[1:] - - parts = string.split(':', 1) - # no field set, apply filter to entire cdx - if len(parts) == 1: - self.field = '' - else: - # apply filter to cdx[field] - self.field = parts[0] - string = parts[1] - - self.regex = re.compile(string) - - def __call__(self, cdx): - val = cdx[self.field] if self.field else str(cdx) - matched = self.regex.match(val) is not None - return matched ^ self.invert - - filters = map(Filter, filter_strings) - - for cdx in cdx_iter: - if all (x(cdx) for x in filters): - yield cdx - - - -#================================================================= -# collapse by timestamp and status code -def cdx_collapse_time_status(cdx_iter, timelen = 10): - """ - # unresolved revisits, different statuscode results in an extra repeat - >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = 11) - org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz - org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz - - # resolved revisits - >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True) - org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - - org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz - - """ - - timelen = int(timelen) - - last_token = None - - for cdx in cdx_iter: - curr_token = (cdx['timestamp'][:timelen], cdx['statuscode']) - - # yield if last_dedup_time is diff, otherwise skip - if curr_token != last_token: - last_token = curr_token - yield cdx - - - -#================================================================= -# sort CDXCaptureResult by closest to timestamp -def cdx_sort_closest(closest, cdx_iter, limit = 10): - """ - >>> test_cdx(closest_to = '20140126200826', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10) - 20140126200826 - 20140126200816 - 20140126200805 - 20140126200912 - 20140126200738 - 20140126200930 - 20140126200718 - 20140126200706 - 20140126200654 - 20140126200625 - - >>> test_cdx(closest_to = '20140126201306', key = 'org,iana)/dnssec', resolve_revisits = True, sources = [test_dir + 'dupes.cdx', test_dir + 'iana.cdx']) - org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - - org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - - - - >>> test_cdx(closest_to = '20140126201307', key = 'org,iana)/dnssec', resolve_revisits = True) - org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - - org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - - - # equal dist prefer earlier - >>> test_cdx(closest_to = '20140126200700', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2) - org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz - org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz - - >>> test_cdx(closest_to = '20140126200659', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp') - 20140126200654 - 20140126200706 - - >>> test_cdx(closest_to = '20140126200701', key = 'org,iana)/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp') - 20140126200706 - 20140126200654 - - """ - closest_cdx = [] - - closest_sec = utils.timestamp_to_sec(closest) - - for cdx in cdx_iter: - sec = utils.timestamp_to_sec(cdx['timestamp']) - key = abs(closest_sec - sec) - - # create tuple to sort by key - bisect.insort(closest_cdx, (key, cdx)) - - if len(closest_cdx) == limit: - # assuming cdx in ascending order and keys have started increasing - if key > closest_cdx[-1]: - break - - if len(closest_cdx) > limit: - closest_cdx.pop() - - - return itertools.imap(lambda x: x[1], closest_cdx) - - - -#================================================================= -# resolve revisits - -# Fields to append from cdx original to revisit -ORIG_TUPLE = ['length', 'offset', 'filename'] - -def cdx_resolve_revisits(cdx_iter): - """ - >>> test_cdx('org,iana)/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True) - org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - - - org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz - org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz - org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz - org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz - - >>> test_cdx('org,iana)/domains/root/db', resolve_revisits = True) - org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - - """ - - - originals = {} - - for cdx in cdx_iter: - is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-') - - digest = cdx['digest'] - - original_cdx = originals.get(digest) - - if not original_cdx and not is_revisit: - originals[digest] = cdx - - - if original_cdx and is_revisit: - fill_orig = lambda field: original_cdx[field] - # Transfer mimetype and statuscode - cdx['mimetype'] = original_cdx['mimetype'] - cdx['statuscode'] = original_cdx['statuscode'] - else: - fill_orig = lambda field: '-' - - # Always add either the original or empty '- - -' - for field in ORIG_TUPLE: - cdx['orig.' + field] = fill_orig(field) - - yield cdx - - - - - -import utils -if __name__ == "__main__" or utils.enable_doctests(): - import os - import sys - - test_dir = utils.test_data_dir() + 'cdx/' - - def test_cdx(key, match_func = binsearch.iter_exact, sources = [test_dir + 'iana.cdx'], **kwparams): - for x in cdx_serve(key, kwparams, sources, match_func): - sys.stdout.write(x) - - - import doctest - doctest.testmod() - - diff --git a/pywb/cdxserver/__init__.py b/pywb/cdxserver/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/cdxserver/cdxapp.py b/pywb/cdxserver/cdxapp.py new file mode 100644 index 00000000..15488582 --- /dev/null +++ b/pywb/cdxserver/cdxapp.py @@ -0,0 +1,42 @@ +from cdxserver import CDXServer +import logging +import os + + +test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../sample_archive/cdx/' + +#================================================================= +def main(config = None): + logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) + + if not config: + config = [test_cdx_dir] + + cdxserver = CDXServer(config) + + def application(env, start_response): + try: + response = cdxserver.load_cdx_from_request(env) + start_response('200 OK', [('Content-Type', 'text/plain')]) + + response = list(response) + + except Exception as exc: + import traceback + err_details = traceback.format_exc(exc) + start_response('400 Error', [('Content-Type', 'text/plain')]) + response = [str(exc)] + print err_details + + return response + + + return application + + +if __name__ == "__main__": + pass +else: + application = main() + + diff --git a/pywb/cdxserver/cdxobject.py b/pywb/cdxserver/cdxobject.py new file mode 100644 index 00000000..804e3492 --- /dev/null +++ b/pywb/cdxserver/cdxobject.py @@ -0,0 +1,57 @@ +from collections import OrderedDict +import itertools + +#================================================================= +class CDXObject(OrderedDict): + CDX_FORMATS = [ + # Public CDX Format + ["urlkey","timestamp","original","mimetype","statuscode","digest","length"], + + # CDX 11 Format + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], + + # CDX 9 Format + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"], + + # CDX 11 Format + 3 revisit resolve fields + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename", + "orig.length","orig.offset","orig.filename"], + + # CDX 9 Format + 3 revisit resolve fields + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename", + "orig.length","orig.offset","orig.filename"] + ] + + def __init__(self, cdxline): + OrderedDict.__init__(self) + + cdxline = cdxline.rstrip() + fields = cdxline.split(' ') + + cdxformat = None + for i in self.CDX_FORMATS: + if len(i) == len(fields): + cdxformat = i + + if not cdxformat: + raise Exception('unknown {0}-field cdx format'.format(len(fields))) + + for header, field in itertools.izip(cdxformat, fields): + self[header] = field + + self.cdxline = cdxline + + def __setitem__(self, key, value): + OrderedDict.__setitem__(self, key, value) + + # force regen on next __str__ call + self.cdxline = None + + def __str__(self): + if self.cdxline: + return self.cdxline + + li = itertools.imap(lambda (n, val): val, self.items()) + return ' '.join(li) + + diff --git a/pywb/cdxserver/cdxops.py b/pywb/cdxserver/cdxops.py new file mode 100644 index 00000000..28d94a07 --- /dev/null +++ b/pywb/cdxserver/cdxops.py @@ -0,0 +1,228 @@ +from cdxobject import CDXObject + +from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader + +import timeutils +import bisect +import itertools +import re + +from heapq import merge +from collections import deque + + + +#================================================================= +def cdx_text_out(cdx, fields): + if not fields: + return str(cdx) + else: + return ' '.join(map(lambda x: cdx[x], fields.split(','))) + + +#================================================================= +def cdx_load(sources, params): + cdx_iter = load_cdx_streams(sources, params) + + cdx_iter = make_cdx_iter(cdx_iter) + + resolve_revisits = params.get('resolve_revisits', False) + if resolve_revisits: + cdx_iter = cdx_resolve_revisits(cdx_iter) + + filters = params.get('filter', None) + if filters: + cdx_iter = cdx_filter(cdx_iter, filters) + + collapse_time = params.get('collapse_time', None) + if collapse_time: + cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) + + limit = int(params.get('limit', 1000000)) + + reverse = params.get('reverse', False) + if reverse: + cdx_iter = cdx_reverse(cdx_iter, limit) + + closest_to = params.get('closest_to', None) + if closest_to: + cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) + + if limit: + cdx_iter = cdx_limit(cdx_iter, limit) + + # output raw cdx objects + if params.get('output') == 'raw': + return cdx_iter + + def write_cdx(fields): + for cdx in cdx_iter: + yield cdx_text_out(cdx, fields) + '\n' + + return write_cdx(params.get('fields')) + + +#================================================================= +# load and source merge cdx streams +def load_cdx_streams(sources, params): + # Optimize: no need to merge if just one input + if len(sources) == 1: + return sources[0].load_cdx(params) + + source_iters = map(lambda src: src.load_cdx(params), sources) + merged_stream = merge(*(source_iters)) + return merged_stream + +#================================================================= +# convert text cdx stream to CDXObject +def make_cdx_iter(text_iter): + return itertools.imap(lambda line: CDXObject(line), text_iter) + + +#================================================================= +# limit cdx to at most limit +def cdx_limit(cdx_iter, limit): + for cdx, _ in itertools.izip(cdx_iter, xrange(limit)): + yield cdx + + +#================================================================= +# reverse cdx +def cdx_reverse(cdx_iter, limit): + # optimize for single last + if limit == 1: + last = None + + for cdx in cdx_iter: + last = cdx + + return [last] if last else [] + + reverse_cdxs = deque(maxlen = limit) + + for cdx in cdx_iter: + reverse_cdxs.appendleft(cdx) + + return reverse_cdxs + + + #================================================================= +# filter cdx by regex if each filter is field:regex form, +# apply filter to cdx[field] +def cdx_filter(cdx_iter, filter_strings): + # Support single strings as well + if isinstance(filter_strings, str): + filter_strings = [filter_strings] + + filters = [] + + class Filter: + def __init__(self, string): + # invert filter + self.invert = string.startswith('!') + if self.invert: + string = string[1:] + + parts = string.split(':', 1) + # no field set, apply filter to entire cdx + if len(parts) == 1: + self.field = '' + else: + # apply filter to cdx[field] + self.field = parts[0] + string = parts[1] + + self.regex = re.compile(string) + + def __call__(self, cdx): + val = cdx[self.field] if self.field else str(cdx) + matched = self.regex.match(val) is not None + return matched ^ self.invert + + filters = map(Filter, filter_strings) + + for cdx in cdx_iter: + if all (x(cdx) for x in filters): + yield cdx + + + +#================================================================= +# collapse by timestamp and status code +def cdx_collapse_time_status(cdx_iter, timelen = 10): + timelen = int(timelen) + + last_token = None + + for cdx in cdx_iter: + curr_token = (cdx['timestamp'][:timelen], cdx['statuscode']) + + # yield if last_dedup_time is diff, otherwise skip + if curr_token != last_token: + last_token = curr_token + yield cdx + + + +#================================================================= +# sort CDXCaptureResult by closest to timestamp +def cdx_sort_closest(closest, cdx_iter, limit = 10): + closest_cdx = [] + + closest_sec = timeutils.timestamp_to_sec(closest) + + for cdx in cdx_iter: + sec = timeutils.timestamp_to_sec(cdx['timestamp']) + key = abs(closest_sec - sec) + + # create tuple to sort by key + bisect.insort(closest_cdx, (key, cdx)) + + if len(closest_cdx) == limit: + # assuming cdx in ascending order and keys have started increasing + if key > closest_cdx[-1]: + break + + if len(closest_cdx) > limit: + closest_cdx.pop() + + + return itertools.imap(lambda x: x[1], closest_cdx) + + + +#================================================================= +# resolve revisits + +# Fields to append from cdx original to revisit +ORIG_TUPLE = ['length', 'offset', 'filename'] + +def cdx_resolve_revisits(cdx_iter): + originals = {} + + for cdx in cdx_iter: + is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-') + + digest = cdx['digest'] + + original_cdx = originals.get(digest) + + if not original_cdx and not is_revisit: + originals[digest] = cdx + + + if original_cdx and is_revisit: + fill_orig = lambda field: original_cdx[field] + # Transfer mimetype and statuscode + cdx['mimetype'] = original_cdx['mimetype'] + cdx['statuscode'] = original_cdx['statuscode'] + else: + fill_orig = lambda field: '-' + + # Always add either the original or empty '- - -' + for field in ORIG_TUPLE: + cdx['orig.' + field] = fill_orig(field) + + yield cdx + + diff --git a/pywb/cdxserver/cdxserver.py b/pywb/cdxserver/cdxserver.py new file mode 100644 index 00000000..82697167 --- /dev/null +++ b/pywb/cdxserver/cdxserver.py @@ -0,0 +1,160 @@ +import surt +from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader +from cdxops import cdx_load + +import itertools +import logging +import os +import urlparse + + +#================================================================= +class CDXFile: + def __init__(self, filename): + self.filename = filename + + def load_cdx(self, params): + source = FileReader(self.filename) + + match_type = params.get('match_type') + + if match_type == 'prefix': + iter_func = iter_prefix + else: + iter_func = iter_exact + + key = params.get('key') + + return iter_func(source, key) + + def __str__(self): + return 'CDX File - ' + self.filename + +#================================================================= +class CDXException(Exception): + def __init__(self, msg, url = None): + Exception.__init__(self, msg) + self.url = url + + def status(self): + return '400 Bad Request' + + +#================================================================= +class CDXServer: + """ + Top-level cdx server object which maintains a list of cdx sources, + responds to queries and dispatches to the cdx ops for processing + """ + + def __init__(self, sources, surt_ordered = True): + self.sources = [] + self.surt_ordered = surt_ordered + logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) + + for src in sources: + if os.path.isdir(src): + for file in os.listdir(src): + self.add_cdx_loader(src + file) + else: + self.add_cdx_loader(src) + + if len(self.sources) == 0: + logging.exception('No CDX Sources Found!') + + def add_cdx_loader(self, filename): + source = self.create_cdx_loader(filename) + if not source: + return + + logging.debug('Adding CDX Source: ' + str(source)) + self.sources.append(source) + + @staticmethod + def create_cdx_loader(filename): + if filename.endswith('.cdx'): + return CDXFile(filename) + return None + #TODO: support zipnum + #elif filename.endswith('.summary') + # return ZipNumCDXSource(filename) + #elif filename.startswith('redis://') + # return RedisCDXSource(filename) + + + def load_cdx(self, **params): + # canonicalize to surt (canonicalization is part of surt conversion) + try: + url = params['url'] + except KeyError: + raise CDXException('The url= param must be specified to query the cdx server') + + try: + key = surt.surt(url) + except Exception as e: + raise CDXException('Invalid url: ', url) + + # if not surt, unsurt the surt to get canonicalized non-surt url + if not self.surt_ordered: + key = unsurt(key) + + params['key'] = key + + return cdx_load(self.sources, params) + + + def load_cdx_from_request(self, env): + #url = wbrequest.wb_url.url + + # use url= param to get actual url + params = urlparse.parse_qs(env['QUERY_STRING']) + + if not 'output' in params: + params['output'] = 'text' + + # parse_qs produces arrays for single values + # cdxreader expects singleton params for all except filters, so convert here + # use first value of the list + for name, val in params.iteritems(): + if name != 'filter': + params[name] = val[0] + + cdx_lines = self.load_cdx(**params) + return cdx_lines + + + + def __str__(self): + return 'load cdx indexes from ' + str(self.sources) + + + +#================================================================= +def unsurt(surt): + """ + # Simple surt + >>> unsurt('com,example)/') + 'example.com)/' + + # Broken surt + >>> unsurt('com,example)') + 'com,example)' + + # Long surt + >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/') + 'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/' + """ + + try: + index = surt.index(')/') + parts = surt[0:index].split(',') + parts.reverse() + host = '.'.join(parts) + host += surt[index:] + return host + + except ValueError: + # May not be a valid surt + return surt + + diff --git a/pywb/cdxserver/timeutils.py b/pywb/cdxserver/timeutils.py new file mode 100644 index 00000000..23ed3bec --- /dev/null +++ b/pywb/cdxserver/timeutils.py @@ -0,0 +1,103 @@ +import re +import time +import datetime +import calendar + +#================================================================= +# str <-> datetime conversion +#================================================================= + +DATE_TIMESPLIT = re.compile('[^\d]') + +TIMESTAMP_14 = '%Y%m%d%H%M%S' + +PAD_STAMP_END = '29991231235959' + + + +def iso_date_to_datetime(string): + """ + >>> iso_date_to_datetime('2013-12-26T10:11:12Z') + datetime.datetime(2013, 12, 26, 10, 11, 12) + + >>> iso_date_to_datetime('2013-12-26T10:11:12Z') + datetime.datetime(2013, 12, 26, 10, 11, 12) + """ + + nums = DATE_TIMESPLIT.split(string) + if nums[-1] == '': + nums = nums[:-1] + + dt = datetime.datetime(*map(int, nums)) + return dt + +def datetime_to_timestamp(dt): + """ + >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12)) + '20131226101112' + """ + + return dt.strftime(TIMESTAMP_14) + +def iso_date_to_timestamp(string): + """ + >>> iso_date_to_timestamp('2013-12-26T10:11:12Z') + '20131226101112' + + >>> iso_date_to_timestamp('2013-12-26T10:11:12') + '20131226101112' + """ + + return datetime_to_timestamp(iso_date_to_datetime(string)) + + +# default pad is end of range for compatibility +def pad_timestamp(string, pad_str = PAD_STAMP_END): + """ + >>> pad_timestamp('20') + '20991231235959' + + >>> pad_timestamp('2014') + '20141231235959' + + >>> pad_timestamp('20141011') + '20141011235959' + + >>> pad_timestamp('201410110010') + '20141011001059' + """ + + str_len = len(string) + pad_len = len(pad_str) + + return string if str_len >= pad_len else string + pad_str[str_len:] + + +def timestamp_to_datetime(string): + """ + >>> timestamp_to_datetime('20131226095010') + time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1) + + >>> timestamp_to_datetime('2014') + time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1) + """ + + # Default pad to end of range for comptability + return time.strptime(pad_timestamp(string), TIMESTAMP_14) + + +def timestamp_to_sec(string): + """ + >>> timestamp_to_sec('20131226095010') + 1388051410 + + >>> timestamp_to_sec('2014') + 1420070399 + """ + + return calendar.timegm(timestamp_to_datetime(string)) + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/handlers.py b/pywb/handlers.py index c2c7b949..81314ea3 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -66,28 +66,12 @@ class WBHandler(BaseHandler): # CDX-Server Handler -- pass all params to cdx server #================================================================= class CDXHandler(BaseHandler): - def __init__(self, cdx_reader, view = None): - self.cdx_reader = cdx_reader + def __init__(self, cdx_server, view = None): + self.cdx_server = cdx_server self.view = view if view else views.TextCapturesView() def __call__(self, wbrequest): - #url = wbrequest.wb_url.url - - # use url= param to get actual url - params = urlparse.parse_qs(wbrequest.env['QUERY_STRING']) - - # parse_qs produces arrays for single values - # cdxreader expects singleton params for all except filters, so convert here - # use first value of the list - for name, val in params.iteritems(): - if name != 'filter': - params[name] = val[0] - - url = params.get('url') - if not url: - raise WbException('Must specify a url= param to query cdx server') - - cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False) + cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env) return self.view.render_response(wbrequest, cdx_lines) @@ -97,7 +81,7 @@ class CDXHandler(BaseHandler): return None def __str__(self): - return 'CDX Server: ' + str(self.cdx_reader) + return 'CDX Server: ' + str(self.cdx_server) #================================================================= diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 580e5705..ce4c295b 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -1,15 +1,13 @@ import urllib import urllib2 import wbexceptions -import itertools import wbrequestresponse -import surt from collections import OrderedDict -import binsearch -import cdxserve +from cdxserver.cdxserver import CDXServer, CDXException +from cdxserver.cdxobject import CDXObject + import logging -import os #================================================================= class IndexReader: @@ -26,7 +24,13 @@ class IndexReader: if wbrequest.custom_params: params.update(wbrequest.custom_params) - cdxlines = self.load_cdx(wburl.url, params, parsed_cdx) + #params['url'] = wburl.url + output = 'raw' if parsed_cdx else 'text' + + try: + cdxlines = self.load_cdx(url = wburl.url, output = output, **params) + except CDXException: + raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url) cdxlines = utils.peek_iter(cdxlines) @@ -53,7 +57,7 @@ class IndexReader: # for now, list implies local sources if isinstance(paths, list): if len(paths) > 1: - return LocalCDXServer(paths, surt_ordered) + return EmbeddedCDXServer(paths, surt_ordered) else: # treat as non-list paths = paths[0] @@ -66,66 +70,13 @@ class IndexReader: cookie = config.get('cookie', None) return RemoteCDXServer(uri, cookie = cookie) else: - return LocalCDXServer([uri], surt_ordered) + return EmbeddedCDXServer([uri], surt_ordered) #================================================================= -class LocalCDXServer(IndexReader): - """ - >>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1) - >>> pprint(x.next().items()) - [('urlkey', 'com,example)/'), - ('timestamp', '20140127171200'), - ('original', 'http://example.com'), - ('mimetype', 'text/html'), - ('statuscode', '200'), - ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'), - ('redirect', '-'), - ('robotflags', '-'), - ('length', '1046'), - ('offset', '334'), - ('filename', 'dupes.warc.gz')] - - """ - - def __init__(self, sources, surt_ordered = True): - self.sources = [] - self.surt_ordered = surt_ordered - logging.info('CDX Surt-Ordered? ' + str(surt_ordered)) - - for src in sources: - if os.path.isdir(src): - for file in os.listdir(src): - if file.endswith('.cdx'): - full = src + file - logging.info('Adding CDX: ' + full) - self.sources.append(full) - else: - logging.info('Adding CDX: ' + src) - self.sources.append(src) - - - def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues): - # canonicalize to surt (canonicalization is part of surt conversion) - try: - key = surt.surt(url) - except Exception as e: - raise wbexceptions.BadUrlException('Bad Request Url: ' + url) - - # if not surt, unsurt the surt to get canonicalized non-surt url - if not self.surt_ordered: - key = utils.unsurt(key) - - match_func = binsearch.iter_exact - - params.update(**kwvalues) - params['output'] = 'raw' if parsed_cdx else 'text' - - return cdxserve.cdx_serve(key, params, self.sources, match_func) - - +class EmbeddedCDXServer(CDXServer, IndexReader): def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10): if wburl.type == wburl.URL_QUERY: @@ -198,7 +149,7 @@ class RemoteCDXServer(IndexReader): raise if parsed_cdx: - return (CDXCaptureResult(cdx) for cdx in response) + return (CDXObject(cdx) for cdx in response) else: return iter(response) @@ -238,62 +189,6 @@ class RemoteCDXServer(IndexReader): return 'server cdx from ' + self.server_url -#================================================================= -class CDXCaptureResult(OrderedDict): - CDX_FORMATS = [ - # Public CDX Format - ["urlkey","timestamp","original","mimetype","statuscode","digest","length"], - - # CDX 11 Format - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], - - # CDX 9 Format - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"], - - # CDX 11 Format + 3 revisit resolve fields - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename", - "orig.length","orig.offset","orig.filename"], - - # CDX 9 Format + 3 revisit resolve fields - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename", - "orig.length","orig.offset","orig.filename"] - ] - - def __init__(self, cdxline): - OrderedDict.__init__(self) - - cdxline = cdxline.rstrip() - fields = cdxline.split(' ') - - cdxformat = None - for i in CDXCaptureResult.CDX_FORMATS: - if len(i) == len(fields): - cdxformat = i - - if not cdxformat: - raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields))) - - for header, field in itertools.izip(cdxformat, fields): - self[header] = field - - self.cdxline = cdxline - - def __setitem__(self, key, value): - OrderedDict.__setitem__(self, key, value) - - # force regen on next __str__ call - self.cdxline = None - - - def __str__(self): - if self.cdxline: - return self.cdxline - - li = itertools.imap(lambda (n, val): val, self.items()) - return ' '.join(li) - - - # Testing import utils diff --git a/pywb/replay_resolvers.py b/pywb/replay_resolvers.py index 306675f8..45354599 100644 --- a/pywb/replay_resolvers.py +++ b/pywb/replay_resolvers.py @@ -1,5 +1,5 @@ import redis -import binsearch +import binsearch.binsearch import urlparse import os @@ -46,10 +46,10 @@ class RedisResolver: class PathIndexResolver: def __init__(self, pathindex_file): self.pathindex_file = pathindex_file - self.reader = binsearch.FileReader(pathindex_file) + self.reader = binsearch.binsearch.FileReader(pathindex_file) def __call__(self, filename): - result = binsearch.iter_exact(self.reader, filename, '\t') + result = binsearch.binsearch.iter_exact(self.reader, filename, '\t') def gen_list(result): for pathline in result: diff --git a/pywb/utils.py b/pywb/utils.py index 934dd818..fee5d931 100644 --- a/pywb/utils.py +++ b/pywb/utils.py @@ -43,100 +43,6 @@ class PerfTimer: self.perfdict[self.name] = str(self.end - self.start) -#================================================================= -# str <-> datetime conversion -#================================================================= - -DATE_TIMESPLIT = re.compile('[^\d]') - -TIMESTAMP_14 = '%Y%m%d%H%M%S' - -PAD_STAMP_END = '29991231235959' - - - -def iso_date_to_datetime(string): - """ - >>> iso_date_to_datetime('2013-12-26T10:11:12Z') - datetime.datetime(2013, 12, 26, 10, 11, 12) - - >>> iso_date_to_datetime('2013-12-26T10:11:12Z') - datetime.datetime(2013, 12, 26, 10, 11, 12) - """ - - nums = DATE_TIMESPLIT.split(string) - if nums[-1] == '': - nums = nums[:-1] - - dt = datetime.datetime(*map(int, nums)) - return dt - -def datetime_to_timestamp(dt): - """ - >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12)) - '20131226101112' - """ - - return dt.strftime(TIMESTAMP_14) - -def iso_date_to_timestamp(string): - """ - >>> iso_date_to_timestamp('2013-12-26T10:11:12Z') - '20131226101112' - - >>> iso_date_to_timestamp('2013-12-26T10:11:12') - '20131226101112' - """ - - return datetime_to_timestamp(iso_date_to_datetime(string)) - - -# default pad is end of range for compatibility -def pad_timestamp(string, pad_str = PAD_STAMP_END): - """ - >>> pad_timestamp('20') - '20991231235959' - - >>> pad_timestamp('2014') - '20141231235959' - - >>> pad_timestamp('20141011') - '20141011235959' - - >>> pad_timestamp('201410110010') - '20141011001059' - """ - - str_len = len(string) - pad_len = len(pad_str) - - return string if str_len >= pad_len else string + pad_str[str_len:] - - -def timestamp_to_datetime(string): - """ - >>> timestamp_to_datetime('20131226095010') - time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1) - - >>> timestamp_to_datetime('2014') - time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1) - """ - - # Default pad to end of range for comptability - return time.strptime(pad_timestamp(string), TIMESTAMP_14) - - -def timestamp_to_sec(string): - """ - >>> timestamp_to_sec('20131226095010') - 1388051410 - - >>> timestamp_to_sec('2014') - 1420070399 - """ - - return calendar.timegm(timestamp_to_datetime(string)) - #================================================================= # adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters # allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 diff --git a/pywb/views.py b/pywb/views.py index d4360e25..9e3f0a96 100644 --- a/pywb/views.py +++ b/pywb/views.py @@ -1,9 +1,9 @@ -import indexreader -import utils +import cdxserver.timeutils as timeutils + import wbrequestresponse import wbexceptions -import time import urlparse +import time from os import path from itertools import imap @@ -58,7 +58,7 @@ class J2TemplateView: # Filters @staticmethod def format_ts(value, format='%a, %b %d %Y %H:%M:%S'): - value = utils.timestamp_to_datetime(value) + value = timeutils.timestamp_to_datetime(value) return time.strftime(format, value) @staticmethod diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 44b85a0f..e0bedef6 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -2,6 +2,7 @@ import utils import wbexceptions from wbrequestresponse import WbResponse, StatusAndHeaders +from cdxserver.cdxserver import CDXException import os import importlib @@ -33,7 +34,7 @@ def create_wb_app(wb_router): except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e: response = handle_exception(env, wb_router.error_view, e, False) - except wbexceptions.WbException as wbe: + except (wbexceptions.WbException, CDXException) as wbe: response = handle_exception(env, wb_router.error_view, wbe, False) except Exception as e: diff --git a/setup.py b/setup.py index 18698a6b..fe23f1e7 100755 --- a/setup.py +++ b/setup.py @@ -11,8 +11,8 @@ setuptools.setup(name='pywb', author_email='ilya@archive.org', long_description=open('README.md').read(), license='GPL', - packages=['pywb'], - provides=['pywb'], + packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'], + provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'], package_data={'pywb': ['ui/*', 'static/*']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_binsearch.py b/tests/test_binsearch.py new file mode 100644 index 00000000..20f50ea4 --- /dev/null +++ b/tests/test_binsearch.py @@ -0,0 +1,43 @@ +import os +from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader + +test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/' + +def binsearch_cdx_test(key, iter_func): + """ + # Prefix Search + >>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix) + org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz + org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz + org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz + org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + + >>> binsearch_cdx_test('org,iana)/domains/root', iter_exact) + org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz + + >>> binsearch_cdx_test('org,iana)/', iter_exact) + org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz + + >>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact) + org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz + org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz + + # Exact Search + >>> binsearch_cdx_test('org,iaana)/', iter_exact) + >>> binsearch_cdx_test('org,ibna)/', iter_exact) + + >>> binsearch_cdx_test('org,iana)/time-zones', iter_exact) + org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz + """ + + cdx = FileReader(test_cdx_dir + 'iana.cdx') + + for line in iter_func(cdx, key): + print line + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/tests/test_cdxserve.py b/tests/test_cdxserve.py new file mode 100644 index 00000000..77812bc4 --- /dev/null +++ b/tests/test_cdxserve.py @@ -0,0 +1,149 @@ +from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader +from ..pywb.cdxserver.cdxserver import CDXServer +import os +import sys +import pprint + +test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/' + +def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): + """ + # Merge Sort Multipe CDX Sources + >>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) + org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz + org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz + org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz + + + # Limit CDX Stream + >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3) + org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz + org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz + org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz + + + # Reverse CDX Stream + >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3) + org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz + org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz + org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz + + >>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1) + org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz + + # No matching results + >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2) + + + # Filter cdx + >>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html']) + org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz + org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz + org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz + org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz + org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz + org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz + org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz + org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz + org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + + + >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200') + org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz + + + # Collapse by timestamp + # unresolved revisits, different statuscode results in an extra repeat + >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11) + org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz + org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz + org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz + + # resolved revisits + >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True) + org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - + org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz + + + # Sort by closest timestamp + field select output + >>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10) + 20140126200826 + 20140126200816 + 20140126200805 + 20140126200912 + 20140126200738 + 20140126200930 + 20140126200718 + 20140126200706 + 20140126200654 + 20140126200625 + + >>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) + org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - + org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - + + + >>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True) + org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - + org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - + + # equal dist prefer earlier + >>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2) + org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz + org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz + + >>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp') + 20140126200654 + 20140126200706 + + >>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp') + 20140126200706 + 20140126200654 + + + # Resolve Revisits + >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True) + org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - - + org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz + org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz + org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz + org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz + + >>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True) + org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - + org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - + + + # CDX Server init + >>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw') + >>> pprint.pprint(x.next().items()) + [('urlkey', 'com,example)/'), + ('timestamp', '20140127171200'), + ('original', 'http://example.com'), + ('mimetype', 'text/html'), + ('statuscode', '200'), + ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'), + ('redirect', '-'), + ('robotflags', '-'), + ('length', '1046'), + ('offset', '334'), + ('filename', 'dupes.warc.gz')] + + """ + + kwparams['url'] = url + kwparams['output'] = 'text' + + server = CDXServer(sources) + results = server.load_cdx(**kwparams) + + for x in results: + sys.stdout.write(x) + + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/run-tests.py b/tests/test_integration.py similarity index 95% rename from run-tests.py rename to tests/test_integration.py index 28282013..abd104c5 100644 --- a/run-tests.py +++ b/tests/test_integration.py @@ -1,14 +1,14 @@ import webtest -import pywb.pywb_init -from pywb.indexreader import CDXCaptureResult +from ..pywb.pywb_init import pywb_config +from ..pywb.wbapp import create_wb_app +from ..pywb.cdxserver.cdxobject import CDXObject class TestWb: TEST_CONFIG = 'test_config.yaml' def setup(self): - import pywb.wbapp #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) - self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config(self.TEST_CONFIG)) + self.app = create_wb_app(pywb_config(self.TEST_CONFIG)) self.testapp = webtest.TestApp(self.app) def _assert_basic_html(self, resp): @@ -144,8 +144,8 @@ class TestWb: # combine collapsing, reversing and revisit resolving resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true') - # convert back to CDXCaptureResult - cdxs = map(CDXCaptureResult, resp.body.rstrip().split('\n')) + # convert back to CDXObject + cdxs = map(CDXObject, resp.body.rstrip().split('\n')) assert len(cdxs) == 3, len(cdxs) # verify timestamps