diff --git a/pywb/__init__.py b/pywb/__init__.py index 89aedebc..d5993903 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.11.1' +__version__ = '1.0b' DEFAULT_CONFIG = 'pywb/default_config.yaml' diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index d5989052..2efca76c 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -2,13 +2,14 @@ import yaml import re import logging import pkg_resources -import urlparse + +from six.moves.urllib.parse import urlsplit from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.canonicalize import unsurt, UrlCanonicalizer -from query import CDXQuery +from pywb.cdx.query import CDXQuery #================================================================= @@ -102,7 +103,7 @@ class FuzzyQuery: url = url[:inx + len(repl)] if matched_rule.match_type == 'domain': - host = urlparse.urlsplit(url).netloc + host = urlsplit(url).netloc # remove the subdomain url = host.split('.', 1)[1] diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 29803c30..9d1b6f38 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -3,10 +3,11 @@ try: # pragma: no cover except ImportError: # pragma: no cover from ordereddict import OrderedDict -import itertools +import six +from six.moves import zip -from urllib import urlencode, quote -from urlparse import parse_qs +from six.moves.urllib.parse import urlencode, quote +from six.moves.urllib.parse import parse_qs from pywb.utils.wbexception import WbException @@ -101,7 +102,7 @@ class CDXObject(OrderedDict): 'f': FILENAME } - def __init__(self, cdxline=''): + def __init__(self, cdxline=b''): OrderedDict.__init__(self) cdxline = cdxline.rstrip() @@ -112,28 +113,28 @@ class CDXObject(OrderedDict): self.cdxline = cdxline return - fields = cdxline.split(' ' , 2) + fields = cdxline.split(b' ' , 2) # Check for CDX JSON - if fields[-1].startswith('{'): - self[URLKEY] = fields[0] - self[TIMESTAMP] = fields[1] - json_fields = json_decode(fields[-1]) - for n, v in json_fields.iteritems(): + if fields[-1].startswith(b'{'): + self[URLKEY] = fields[0].decode('utf-8') + self[TIMESTAMP] = fields[1].decode('utf-8') + json_fields = json_decode(fields[-1].decode('utf-8')) + for n, v in six.iteritems(json_fields): n = self.CDX_ALT_FIELDS.get(n, n) try: - self[n] = str(v) + v.encode('ascii') except UnicodeEncodeError: - v = v.encode('utf-8') - parts = v.split('//', 1) - v = parts[0] + '//' + quote(parts[1]) - self[n] = v + parts = v.encode('utf-8').split(b'//', 1) + v = parts[0].decode('utf-8') + '//' + quote(parts[1]) + + self[n] = v self.cdxline = cdxline self._from_json = True return - more_fields = fields.pop().split(' ') + more_fields = fields.pop().split(b' ') fields.extend(more_fields) cdxformat = None @@ -145,8 +146,8 @@ class CDXObject(OrderedDict): msg = 'unknown {0}-field cdx format'.format(len(fields)) raise CDXException(msg) - for header, field in itertools.izip(cdxformat, fields): - self[header] = field + for header, field in zip(cdxformat, fields): + self[header] = field.decode('utf-8') self.cdxline = cdxline @@ -204,13 +205,14 @@ class CDXObject(OrderedDict): def __str__(self): if self.cdxline: - return self.cdxline + return self.cdxline.decode('utf-8') if not self._from_json: - return ' '.join(val for n, val in self.iteritems()) + return ' '.join(val for n, val in six.iteritems(self)) else: return json_encode(self) + #================================================================= class IDXObject(OrderedDict): @@ -221,14 +223,14 @@ class IDXObject(OrderedDict): OrderedDict.__init__(self) idxline = idxline.rstrip() - fields = idxline.split('\t') + fields = idxline.split(b'\t') if len(fields) < self.NUM_REQ_FIELDS: msg = 'invalid idx format: {0} fields found, {1} required' raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS)) - for header, field in itertools.izip(self.FORMAT, fields): - self[header] = field + for header, field in zip(self.FORMAT, fields): + self[header] = field.decode('utf-8') self['offset'] = int(self['offset']) self['length'] = int(self['length']) @@ -250,4 +252,4 @@ class IDXObject(OrderedDict): return json_encode(self) + '\n' def __str__(self): - return self.idxline + return self.idxline.decode('utf-8') diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 84b1289a..790e5ac9 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -1,15 +1,18 @@ -from cdxobject import CDXObject, IDXObject -from cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST -from cdxobject import OFFSET, LENGTH, FILENAME +from pywb.cdx.cdxobject import CDXObject, IDXObject +from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST +from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME -from query import CDXQuery +from pywb.cdx.query import CDXQuery from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp from pywb.utils.timeutils import PAD_14_DOWN, PAD_14_UP import bisect -import itertools + +from six.moves import zip, range, map import re + + from heapq import merge from collections import deque @@ -127,7 +130,7 @@ def cdx_limit(cdx_iter, limit): """ # for cdx, _ in itertools.izip(cdx_iter, xrange(limit)): # yield cdx - return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit))) + return (cdx for cdx, _ in zip(cdx_iter, range(limit))) #================================================================= @@ -221,7 +224,7 @@ def cdx_filter(cdx_iter, filter_strings): def regex(self, val): return self.regex.match(val) is not None - filters = map(Filter, filter_strings) + filters = list(map(Filter, filter_strings)) for cdx in cdx_iter: if all(x(cdx) for x in filters): @@ -273,7 +276,7 @@ def cdx_sort_closest(closest, cdx_iter, limit=10): sort CDXCaptureResult by closest to timestamp. """ closest_cdx = [] - + closest_keys = [] closest_sec = timestamp_to_sec(closest) for cdx in cdx_iter: @@ -281,19 +284,26 @@ def cdx_sort_closest(closest, cdx_iter, limit=10): key = abs(closest_sec - sec) # create tuple to sort by key - bisect.insort(closest_cdx, (key, cdx)) + #bisect.insort(closest_cdx, (key, cdx)) + + i = bisect.bisect_right(closest_keys, key) + closest_keys.insert(i, key) + closest_cdx.insert(i, cdx) if len(closest_cdx) == limit: # assuming cdx in ascending order and keys have started increasing - if key > closest_cdx[-1]: + if key > closest_keys[-1]: break if len(closest_cdx) > limit: closest_cdx.pop() - for cdx in itertools.imap(lambda x: x[1], closest_cdx): + for cdx in closest_cdx: yield cdx + #for cdx in map(lambda x: x[1], closest_cdx): + # yield cdx + #================================================================= # resolve revisits diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 5da0d621..a9644283 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,19 +1,18 @@ from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from pywb.utils.wbexception import NotFoundException -from cdxops import cdx_load -from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource -from zipnum import ZipNumCluster -from cdxobject import CDXObject, CDXException -from query import CDXQuery -from cdxdomainspecific import load_domain_specific_cdx_rules +from pywb.cdx.cdxops import cdx_load +from pywb.cdx.cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource +from pywb.cdx.zipnum import ZipNumCluster +from pywb.cdx.cdxobject import CDXObject, CDXException +from pywb.cdx.query import CDXQuery +from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules from pywb.utils.loaders import is_http from itertools import chain import logging import os -import urlparse #================================================================= diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 9bad27ea..b5245dbb 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -3,11 +3,11 @@ from pywb.utils.binsearch import iter_range from pywb.utils.wbexception import AccessException, NotFoundException from pywb.utils.wbexception import BadRequestException, WbException -from query import CDXQuery +from pywb.cdx.query import CDXQuery -import urllib -import urllib2 -import itertools +from six.moves.urllib.request import urlopen, Request +from six.moves.urllib.error import HTTPError +from six.moves import map #================================================================= @@ -33,7 +33,8 @@ class CDXFile(CDXSource): @staticmethod def _do_load_file(filename, query): with open(filename, 'rb') as source: - gen = iter_range(source, query.key, query.end_key) + gen = iter_range(source, query.key.encode('utf-8'), + query.end_key.encode('utf-8')) for line in gen: yield line @@ -65,14 +66,14 @@ class RemoteCDXSource(CDXSource): urlparams = remote_query.urlencode() try: - request = urllib2.Request(self.remote_url + '?' + urlparams) + request = Request(self.remote_url + '?' + urlparams) if self.cookie: request.add_header('Cookie', self.cookie) - response = urllib2.urlopen(request) + response = urlopen(request) - except urllib2.HTTPError as e: + except HTTPError as e: if e.code == 403: raise AccessException('Access Denied') elif e.code == 404: @@ -95,14 +96,14 @@ class RemoteCDXSource(CDXSource): #================================================================= class RedisCDXSource(CDXSource): - DEFAULT_KEY_PREFIX = 'c:' + DEFAULT_KEY_PREFIX = b'c:' def __init__(self, redis_url, config=None): import redis parts = redis_url.split('/') if len(parts) > 4: - self.cdx_key = parts[4] + self.cdx_key = parts[4].encode('utf-8') redis_url = 'redis://' + parts[2] + '/' + parts[3] else: self.cdx_key = None @@ -126,7 +127,7 @@ class RedisCDXSource(CDXSource): if self.cdx_key: return self.load_sorted_range(query, self.cdx_key) else: - return self.load_single_key(query.key) + return self.load_single_key(query.key.encode('utf-8')) def load_sorted_range(self, query, cdx_key): cdx_list = self.redis.zrangebylex(cdx_key, @@ -137,12 +138,12 @@ class RedisCDXSource(CDXSource): def load_single_key(self, key): # ensure only url/surt is part of key - key = key.split(' ')[0] + key = key.split(b' ')[0] cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1) # key is not part of list, so prepend to each line - key += ' ' - cdx_list = itertools.imap(lambda x: key + x, cdx_list) + key += b' ' + cdx_list = map(lambda x: key + x, cdx_list) return cdx_list def __str__(self): diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py index 71dcaa69..dd6144c6 100644 --- a/pywb/cdx/query.py +++ b/pywb/cdx/query.py @@ -1,5 +1,5 @@ -from urllib import urlencode -from cdxobject import CDXException +from six.moves.urllib.parse import urlencode +from pywb.cdx.cdxobject import CDXException #================================================================= diff --git a/pywb/cdx/test/test_cdxobject.py b/pywb/cdx/test/test_cdxobject.py index a2e73cbe..277b5912 100644 --- a/pywb/cdx/test/test_cdxobject.py +++ b/pywb/cdx/test/test_cdxobject.py @@ -5,17 +5,17 @@ from pywb.cdx.cdxobject import CDXObject, IDXObject, CDXException from pytest import raises def test_empty_cdxobject(): - x = CDXObject('') + x = CDXObject(b'') assert len(x) == 0 def test_invalid_cdx_format(): with raises(CDXException): - x = CDXObject('a b c') + x = CDXObject(b'a b c') def _make_line(fields): line = ' '.join(['-'] * fields) - x = CDXObject(line) + x = CDXObject(line.encode('utf-8')) assert len(x) == fields assert str(x) == line @@ -29,13 +29,13 @@ def test_valid_cdx_formats(): _make_line(14) def test_unicode_url(): - x = CDXObject('com,example,cafe)/ 123 {"url": "http://example.com/café/path"}') + x = CDXObject(u'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'.encode('utf-8')) assert x['urlkey'] == 'com,example,cafe)/' assert x['timestamp'] == '123' assert x['url'] == 'http://example.com/caf%C3%A9/path' def test_invalid_idx_format(): with raises(CDXException): - x = IDXObject('a b c') + x = IDXObject(b'a b c') diff --git a/pywb/cdx/test/test_cdxops.py b/pywb/cdx/test/test_cdxops.py index 86592c37..8c550ece 100644 --- a/pywb/cdx/test/test_cdxops.py +++ b/pywb/cdx/test/test_cdxops.py @@ -31,17 +31,17 @@ com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYA com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz ->>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012') +>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012') # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): NotFoundException: No Captures found for: http://example.com/ # No matching results ->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) +>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): NotFoundException: No Captures found for: http://iana.org/dont_have_this # No matching -- limit=1 ->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) +>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): NotFoundException: No Captures found for: http://iana.org/dont_have_this @@ -69,7 +69,7 @@ org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/ org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz # Filter -- no such field, no matches ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200') +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200') # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css @@ -163,50 +163,66 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - -# Resolve Revisit -- cdxj minimal -- output also json ->>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True) -{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"} -{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"} - -# Resolve Revisit -- cdxj minimal -- output also json ->>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True) -{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"} -{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"} - - - - """ #================================================================= from pywb.cdx.cdxserver import CDXServer import os import sys +import six from pywb import get_test_dir test_cdx_dir = get_test_dir() + 'cdx/' -def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): +def cdx_ops_test_data(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): kwparams['url'] = url if not 'output' in kwparams: kwparams['output'] = 'cdxobject' - fields = kwparams.get('fields') - if fields: - fields = fields.split(',') server = CDXServer(sources) results = server.load_cdx(**kwparams) + return list(results) + + +def cdx_ops_test(*args, **kwargs): + results = cdx_ops_test_data(*args, **kwargs) + + fields = kwargs.get('fields') + if fields: + fields = fields.split(',') for x in results: if not isinstance(x, str): l = x.to_text(fields).replace('\t', ' ') else: l = x + sys.stdout.write(l) + +def test_cdxj_resolve_revisit(): + # Resolve Revisit -- cdxj minimal -- output also json + results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True) + assert(len(results) == 2) + assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}) + + assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}) + + + +def test_cdxj_resolve_revisit_2(): + # Resolve Revisit -- cdxj minimal -- output also json + results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True) + assert(len(results) == 2) + assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}) + + assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}) + + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/cdx/test/test_cdxserver.py b/pywb/cdx/test/test_cdxserver.py index c4a43996..23772ced 100644 --- a/pywb/cdx/test/test_cdxserver.py +++ b/pywb/cdx/test/test_cdxserver.py @@ -6,11 +6,14 @@ from pywb.utils.dsrules import DEFAULT_RULES_FILE from pywb.utils.wbexception import AccessException, NotFoundException from pywb.utils.wbexception import BadRequestException, WbException -from urllib2 import HTTPError +from six.moves.urllib.error import HTTPError from mock import patch from pytest import raises import webtest +import unittest + +import six from pywb import get_test_dir @@ -41,7 +44,7 @@ def setup_module(self): def mock_urlopen(req): resp = testapp.get(req.get_full_url()) - return resp.body.split('\n') + return resp.body.split(b'\n') def mock_urlopen_err(err): def make_err(req): @@ -58,45 +61,44 @@ def mock_urlopen_fuzzy(req): resp = testapp.get(req.get_full_url(), status=status) if status == 200: - return resp.body.split('\n') + return resp.body.split(b'\n') else: raise mock_urlopen_err(404)(req) -@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen) +@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen) def assert_cdx_match(server): x = server.load_cdx(url='example.com', limit=2, output='cdxobject') - x.next() - assert x.next().items() == CDX_RESULT - + x = list(x) + assert(list(x[1].items()) == CDX_RESULT) def assert_cdx_fuzzy_match(server, mock=mock_urlopen): - with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock): + with patch('pywb.cdx.cdxsource.urlopen', mock): x = server.load_cdx(url='http://example.com?_=123', limit=2, output='cdxobject', allowFuzzy=True) - x.next() - assert x.next().items() == CDX_RESULT + x = list(x) + assert(list(x[1].items()) == CDX_RESULT) -@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(404)) +@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(404)) def assert_404(server): server.load_cdx(url='http://notfound.example.com') -@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(403)) +@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(403)) def assert_403(server): server.load_cdx(url='http://notfound.example.com') -@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(400)) +@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(400)) def assert_400(server): server.load_cdx(url='http://notfound.example.com') -@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(502)) +@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(502)) def assert_502(server): server.load_cdx(url='http://notfound.example.com') @@ -131,7 +133,7 @@ def test_fuzzy_match(): def test_fuzzy_no_match_1(): # no match, no fuzzy - with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen): server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) with raises(NotFoundException): server.load_cdx(url='http://notfound.example.com/', @@ -141,7 +143,7 @@ def test_fuzzy_no_match_1(): def test_fuzzy_no_match_2(): # fuzzy rule, but no actual match - with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen): server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) with raises(NotFoundException): server.load_cdx(url='http://notfound.example.com/?_=1234', @@ -153,7 +155,7 @@ def test_fuzzy_no_match_2(): def test2_fuzzy_no_match_3(): # special fuzzy rule, matches prefix test.example.example., # but doesn't match rule regex - with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen): + with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen): server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE) with raises(NotFoundException): server.load_cdx(url='http://test.example.example/', diff --git a/pywb/cdx/test/test_lazy_ops.py b/pywb/cdx/test/test_lazy_ops.py index fa579d0d..5fdad2a4 100644 --- a/pywb/cdx/test/test_lazy_ops.py +++ b/pywb/cdx/test/test_lazy_ops.py @@ -4,6 +4,8 @@ from pywb.cdx.query import CDXQuery from pytest import raises +import six + KEY = 'com,example)/' #================================================================ @@ -30,7 +32,7 @@ def lazy_cdx_load(**params): # exception happens on first access attempt with raises(AccessException): - cdx_iter.next() + six.next(cdx_iter) def test_no_process(): diff --git a/pywb/cdx/test/test_redis_source.py b/pywb/cdx/test/test_redis_source.py index a52411dd..b0da56fc 100644 --- a/pywb/cdx/test/test_redis_source.py +++ b/pywb/cdx/test/test_redis_source.py @@ -35,13 +35,13 @@ def zadd_cdx(source, cdx, key): source.redis.zadd(key, 0, cdx) return - parts = cdx.split(' ', 2) + parts = cdx.split(b' ', 2) key = parts[0] timestamp = parts[1] - rest = timestamp + ' ' + parts[2] + rest = timestamp + b' ' + parts[2] - score = timestamp_to_sec(timestamp) + score = timestamp_to_sec(timestamp.decode('utf-8')) source.redis.zadd(source.key_prefix + key, score, rest) diff --git a/pywb/cdx/test/test_zipnum.py b/pywb/cdx/test/test_zipnum.py index 4b0336ae..a2ebb547 100644 --- a/pywb/cdx/test/test_zipnum.py +++ b/pywb/cdx/test/test_zipnum.py @@ -22,29 +22,6 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz -# Pages -- default page size ->>> zip_ops_test(url='http://iana.org/domains/example', matchType='exact', showNumPages=True) -{"blocks": 1, "pages": 1, "pageSize": 10} - ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True) -{"blocks": 38, "pages": 4, "pageSize": 10} - -# set page size ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True) -{"blocks": 38, "pages": 10, "pageSize": 4} - -# set page size -- alt domain query ->>> zip_ops_test(url='*.iana.org', pageSize='4', showNumPages=True) -{"blocks": 38, "pages": 10, "pageSize": 4} - -# page size for non-existent, but secondary index match ->>> zip_ops_test(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True) -{"blocks": 0, "pages": 0, "pageSize": 4} - -# page size for non-existent, no secondary index match ->>> zip_ops_test(url='*.foo.bar', showNumPages=True) -{"blocks": 0, "pages": 0, "pageSize": 10} - # first page >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0) com,example)/ 20140127171200 zipnum 0 275 1 @@ -116,16 +93,16 @@ org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz # invalid page ->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) +>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): CDXException: Page 10 invalid: First Page is 0, Last Page is 9 ->>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True) +>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): NotFoundException: No Captures found for: http://aaa.aaa/ ->>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True) +>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): NotFoundException: No Captures found for: http://aaa.aaa/ (domain query) @@ -133,34 +110,26 @@ NotFoundException: No Captures found for: http://aaa.aaa/ (domain query) >>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True) org,iana)/time-zones 20140126200737 zipnum 9631 166 38 -# read cdx to find 0 pages ->>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showNumPages=True) -{"blocks": 0, "pages": 0, "pageSize": 10} - # read cdx to find no captures ->>> zip_ops_test(url='http://aaa.zz/', matchType='domain') +>>> zip_ops_test(url='http://aaa.zz/', matchType='domain') # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): NotFoundException: No Captures found for: http://aaa.zz/ (domain query) # Invalid .idx filesor or missing loc ->>> zip_test_err(url='http://example.com/', matchType='exact') +>>> zip_test_err(url='http://example.com/', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): Exception: No Locations Found for: foo ->>> zip_test_err(url='http://iana.org/x', matchType='exact') -Traceback (most recent call last): -IOError: [Errno 2] No such file or directory: './sample_archive/invalid' - ->>> zip_test_err(url='http://example.zz/x', matchType='exact') +>>> zip_test_err(url='http://example.zz/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): Exception: No Locations Found for: foo2 """ -from test_cdxops import cdx_ops_test +from test_cdxops import cdx_ops_test, cdx_ops_test_data from pywb import get_test_dir from pywb.cdx.cdxserver import CDXServer @@ -170,9 +139,15 @@ import tempfile import os import json +import pytest + test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx' +def zip_ops_test_data(url, **kwargs): + sources = test_zipnum + return json.loads(cdx_ops_test_data(url, sources, **kwargs)[0]) + def zip_ops_test(url, **kwargs): sources = test_zipnum cdx_ops_test(url, sources, **kwargs) @@ -220,6 +195,50 @@ def test_zip_prefix_load(): +def test_blocks_def_page_size(): + # Pages -- default page size + res = zip_ops_test_data(url='http://iana.org/domains/example', matchType='exact', showNumPages=True) + assert(res == {"blocks": 1, "pages": 1, "pageSize": 10}) + +def test_blocks_def_size_2(): + res = zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', showNumPages=True) + assert(res == {"blocks": 38, "pages": 4, "pageSize": 10}) + +def test_blocks_set_page_size(): + # set page size + res = zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True) + assert(res == {"blocks": 38, "pages": 10, "pageSize": 4}) + +def test_blocks_alt_q(): + # set page size -- alt domain query + res = zip_ops_test_data(url='*.iana.org', pageSize='4', showNumPages=True) + assert(res == {"blocks": 38, "pages": 10, "pageSize": 4}) + +def test_blocks_secondary_match(): + # page size for non-existent, but secondary index match + res = zip_ops_test_data(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True) + assert(res == {"blocks": 0, "pages": 0, "pageSize": 4}) + +def test_blocks_no_match(): + # page size for non-existent, no secondary index match + res = zip_ops_test_data(url='*.foo.bar', showNumPages=True) + assert(res == {"blocks": 0, "pages": 0, "pageSize": 10}) + +def test_blocks_zero_pages(): + # read cdx to find 0 pages + res = zip_ops_test_data(url='http://aaa.zz/', matchType='domain', showNumPages=True) + assert(res == {"blocks": 0, "pages": 0, "pageSize": 10}) + + +# Errors + +def test_err_file_not_found(): + with pytest.raises(IOError): + zip_test_err(url='http://iana.org/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL + + + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index d0b832d2..94b31f8f 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -5,9 +5,12 @@ import logging from io import BytesIO import datetime import json +import six -from cdxsource import CDXSource -from cdxobject import IDXObject, CDXException +from six.moves import map + +from pywb.cdx.cdxsource import CDXSource +from pywb.cdx.cdxobject import IDXObject, CDXException from pywb.utils.loaders import BlockLoader, read_last_line from pywb.utils.bufferedreaders import gzip_decompressor @@ -52,7 +55,7 @@ class LocMapResolver(object): self.loc_mtime = new_mtime logging.debug('Loading loc from: ' + self.loc_filename) - with open(self.loc_filename, 'rb') as fh: + with open(self.loc_filename, 'r') as fh: for line in fh: parts = line.rstrip().split('\t') self.loc_map[parts[0]] = parts[1:] @@ -170,25 +173,28 @@ class ZipNumCluster(CDXSource): last_line = None + start_key = query.key.encode('utf-8') + end_key = query.end_key.encode('utf-8') + # Get End - end_iter = search(reader, query.end_key, prev_size=1) + end_iter = search(reader, end_key, prev_size=1) try: - end_line = end_iter.next() + end_line = six.next(end_iter) except StopIteration: last_line = read_last_line(reader) end_line = last_line # Get Start first_iter = iter_range(reader, - query.key, - query.end_key, + start_key, + end_key, prev_size=1) try: - first_line = first_iter.next() + first_line = six.next(first_iter) except StopIteration: - if end_line == last_line and query.key >= last_line: + if end_line == last_line and start_key >= last_line: first_line = last_line else: reader.close() @@ -204,7 +210,7 @@ class ZipNumCluster(CDXSource): try: blocks = end['lineno'] - first['lineno'] - total_pages = blocks / pagesize + 1 + total_pages = int(blocks / pagesize) + 1 except: blocks = -1 total_pages = 1 @@ -215,8 +221,8 @@ class ZipNumCluster(CDXSource): if blocks == 0: try: block_cdx_iter = self.idx_to_cdx([first_line], query) - block = block_cdx_iter.next() - cdx = block.next() + block = six.next(block_cdx_iter) + cdx = six.next(block) except StopIteration: total_pages = 0 blocks = -1 @@ -250,12 +256,12 @@ class ZipNumCluster(CDXSource): def search_by_line_num(self, reader, line): # pragma: no cover def line_cmp(line1, line2): - line1_no = int(line1.rsplit('\t', 1)[-1]) - line2_no = int(line2.rsplit('\t', 1)[-1]) + line1_no = int(line1.rsplit(b'\t', 1)[-1]) + line2_no = int(line2.rsplit(b'\t', 1)[-1]) return cmp(line1_no, line2_no) line_iter = search(reader, line, compare_func=line_cmp) - yield line_iter.next() + yield six.next(line_iter) def idx_to_cdx(self, idx_iter, query): blocks = None @@ -304,7 +310,8 @@ class ZipNumCluster(CDXSource): last_traceback = sys.exc_info()[2] if last_exc: - raise last_exc, None, last_traceback + six.reraise(Exception, last_exc, last_traceback) + #raise last_exc else: raise Exception('No Locations Found for: ' + blocks.part) @@ -326,13 +333,13 @@ class ZipNumCluster(CDXSource): for line in BytesIO(buff): yield line - iter_ = itertools.chain(*itertools.imap(decompress_block, ranges)) + iter_ = itertools.chain(*map(decompress_block, ranges)) # start bound - iter_ = linearsearch(iter_, query.key) + iter_ = linearsearch(iter_, query.key.encode('utf-8')) # end bound - end = query.end_key + end = query.end_key.encode('utf-8') iter_ = itertools.takewhile(lambda line: line < end, iter_) return iter_ diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 487fa596..866a2b8d 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -1,10 +1,10 @@ -import urlparse +from six.moves.urllib.parse import urlsplit, urlunsplit, quote + import re -from urllib import quote from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.wburl import WbUrl -from wbrequestresponse import WbRequest, WbResponse +from pywb.framework.wbrequestresponse import WbRequest, WbResponse #================================================================= @@ -182,7 +182,7 @@ class ReferRedirect: return None # get referrer path name - ref_split = urlparse.urlsplit(referrer) + ref_split = urlsplit(referrer) # require that referrer starts with current Host, if any curr_host = env.get('HTTP_HOST') @@ -236,10 +236,10 @@ class ReferRedirect: ref_request.wb_url.url = new_wb_url.url return ref_route.handler(ref_request) - final_url = urlparse.urlunsplit((ref_split.scheme, - ref_split.netloc, - rewritten_url, - '', - '')) + final_url = urlunsplit((ref_split.scheme, + ref_split.netloc, + rewritten_url, + '', + '')) return WbResponse.redir_response(final_url, status='302 Temp Redirect') diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py index 6be91979..8c72b374 100644 --- a/pywb/framework/memento.py +++ b/pywb/framework/memento.py @@ -2,7 +2,7 @@ from pywb.utils.wbexception import BadRequestException from pywb.utils.timeutils import http_date_to_timestamp from pywb.utils.timeutils import timestamp_to_http_date -from wbrequestresponse import WbRequest, WbResponse +from pywb.framework.wbrequestresponse import WbRequest, WbResponse from pywb.rewrite.wburl import WbUrl LINK_FORMAT = 'application/link-format' diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index e2c96012..439f52a4 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -3,7 +3,7 @@ from __future__ import absolute_import from pywb.framework.wbrequestresponse import WbResponse, WbRequest from pywb.framework.archivalrouter import ArchivalRouter -import urlparse +from six.moves.urllib.parse import urlsplit import base64 import socket @@ -164,7 +164,7 @@ class ProxyRouter(object): url = env['REL_REQUEST_URI'] else: - parts = urlparse.urlsplit(env['REL_REQUEST_URI']) + parts = urlsplit(env['REL_REQUEST_URI']) hostport = parts.netloc.split(':', 1) env['pywb.proxy_host'] = hostport[0] env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else '' diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index 221e9d86..401c03e9 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -1,13 +1,14 @@ -from wbrequestresponse import WbResponse +from pywb.framework.wbrequestresponse import WbResponse from pywb.utils.loaders import extract_client_cookie from pywb.utils.wbexception import WbException from pywb.utils.statusandheaders import StatusAndHeaders from pywb.rewrite.wburl import WbUrl -from cache import create_cache -from basehandlers import WbUrlHandler +from pywb.framework.cache import create_cache +from pywb.framework.basehandlers import WbUrlHandler + +from six.moves.urllib.parse import parse_qs, urlsplit -import urlparse import base64 import os import json @@ -130,7 +131,7 @@ class IPCacheResolver(BaseCollResolver): ip = env['REMOTE_ADDR'] qs = env.get('pywb.proxy_query') if qs: - res = urlparse.parse_qs(qs) + res = parse_qs(qs) if 'ip' in res: ip = res['ip'][0] @@ -145,7 +146,7 @@ class IPCacheResolver(BaseCollResolver): qs = env.get('pywb.proxy_query') if qs: - res = urlparse.parse_qs(qs) + res = parse_qs(qs) if 'ip' in res: ip = res['ip'][0] @@ -223,7 +224,7 @@ class CookieResolver(BaseCollResolver): def handle_magic_page(self, env): request_url = env['REL_REQUEST_URI'] - parts = urlparse.urlsplit(request_url) + parts = urlsplit(request_url) server_name = env['pywb.proxy_host'] path_url = parts.path[1:] @@ -309,7 +310,7 @@ class CookieResolver(BaseCollResolver): if '://' not in path_url: path_url = 'http://' + path_url - path_parts = urlparse.urlsplit(path_url) + path_parts = urlsplit(path_url) new_url = path_parts.path[1:] if path_parts.query: diff --git a/pywb/framework/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py index 3912edc4..abcaafc7 100644 --- a/pywb/framework/test/test_archivalrouter.py +++ b/pywb/framework/test/test_archivalrouter.py @@ -94,8 +94,10 @@ False from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter from pywb.framework.basehandlers import BaseHandler, WbUrlHandler + import pprint -import urlparse + +from six.moves.urllib.parse import urlsplit def _test_route_req(route, env, abs_path=False): matcher, coll = route.is_handling(env['REL_REQUEST_URI']) diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index 1f1f5a39..2209fa3b 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -87,7 +87,7 @@ def print_req_from_uri(request_uri, env={}, use_abs_prefix=False): response = req_from_uri(request_uri, env, use_abs_prefix) varlist = vars(response) the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')) - print the_dict + print(the_dict) def req_from_uri(request_uri, env={}, use_abs_prefix=False): diff --git a/pywb/framework/test/test_wsgi_wrapper.py b/pywb/framework/test/test_wsgi_wrapper.py index 3433acad..e8246405 100644 --- a/pywb/framework/test/test_wsgi_wrapper.py +++ b/pywb/framework/test/test_wsgi_wrapper.py @@ -41,7 +41,7 @@ def test_err_app(): resp = testapp.get('/abc', expect_errors=True) assert resp.status_int == 500 - assert '500 Internal Server Error Error: Test Unexpected Error' in resp.body + assert b'500 Internal Server Error Error: Test Unexpected Error' in resp.body def test_custom_err_app(): the_app = init_app(initer(TestCustomErrApp), load_yaml=False) @@ -50,7 +50,7 @@ def test_custom_err_app(): resp = testapp.get('/abc', expect_errors=True) assert resp.status_int == 403 - assert '403 Access Denied Error: Forbidden Test' in resp.body + assert b'403 Access Denied Error: Forbidden Test' in resp.body diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 9dae514e..4220220e 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -1,7 +1,7 @@ from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.loaders import load_yaml_config -from wbrequestresponse import WbResponse, StatusAndHeaders +from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders import os @@ -92,12 +92,13 @@ class WSGIApp(object): else: err_url = None - err_msg = exc.message + if len(exc.args): + err_msg = exc.args[0] if print_trace: import traceback - err_details = traceback.format_exc(exc) - print err_details + err_details = traceback.format_exc() + print(err_details) else: logging.info(err_msg) err_details = None diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index f5c3c570..67ef088e 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -1,4 +1,4 @@ -from Cookie import SimpleCookie, CookieError +from six.moves.http_cookies import SimpleCookie, CookieError #================================================================= diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 50629514..3f485684 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -1,16 +1,14 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import sys import re -from HTMLParser import HTMLParser, HTMLParseError -from urlparse import urljoin, urlsplit, urlunsplit +from six.moves.html_parser import HTMLParser +from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit -from url_rewriter import UrlRewriter -from regex_rewriters import JSRewriter, CSSRewriter -import cgi +from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter #================================================================= @@ -411,7 +409,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def feed(self, string): try: HTMLParser.feed(self, string) - except HTMLParseError: # pragma: no cover + except Exception: # pragma: no cover # only raised in 2.6 self.out.write(string) @@ -429,7 +427,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): try: HTMLParser.close(self) - except HTMLParseError: # pragma: no cover + except Exception: # pragma: no cover # only raised in 2.6 pass diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 40c7653e..e690dada 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -1,8 +1,6 @@ import re -import sys -import itertools -from url_rewriter import UrlRewriter +from pywb.rewrite.url_rewriter import UrlRewriter #================================================================= diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 990e6a3c..1e6e7b1b 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -7,16 +7,16 @@ import re from chardet.universaldetector import UniversalDetector from io import BytesIO -from header_rewriter import RewrittenStatusAndHeaders +from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders -from rewriterules import RewriteRules +from pywb.rewrite.rewriterules import RewriteRules from pywb.utils.dsrules import RuleSet from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader -from regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter +from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter #================================================================= @@ -288,7 +288,7 @@ class RewriteContent: def _decode_buff(buff, stream, encoding): # pragma: no coverage try: buff = buff.decode(encoding) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry for i in range(3): buff += stream.read(1) diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index b3e2d464..fb339d4d 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -8,7 +8,7 @@ import mimetypes import logging import os -from urlparse import urlsplit +from six.moves.urllib.parse import urlsplit from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url from pywb.utils.loaders import extract_client_cookie @@ -16,7 +16,7 @@ from pywb.utils.timeutils import timestamp_now from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.canonicalize import canonicalize -from rewrite_content import RewriteContent +from pywb.rewrite.rewrite_content import RewriteContent #================================================================= diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index d2641516..d8b2d280 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -1,13 +1,12 @@ from pywb.utils.dsrules import BaseRule -from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter -from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter -from regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter +from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter +from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter +from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter -from header_rewriter import HeaderRewriter -from html_rewriter import HTMLRewriter +from pywb.rewrite.header_rewriter import HeaderRewriter +from pywb.rewrite.html_rewriter import HTMLRewriter -import itertools import re diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index db0ace2f..140c2d45 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -1,7 +1,7 @@ -import urlparse +from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit -from wburl import WbUrl -from cookie_rewriter import get_cookie_rewriter +from pywb.rewrite.wburl import WbUrl +from pywb.rewrite.cookie_rewriter import get_cookie_rewriter #================================================================= @@ -119,11 +119,11 @@ class UrlRewriter(object): @staticmethod def urljoin(orig_url, url): - new_url = urlparse.urljoin(orig_url, url) + new_url = urljoin(orig_url, url) if '../' not in new_url: return new_url - parts = urlparse.urlsplit(new_url) + parts = urlsplit(new_url) scheme, netloc, path, query, frag = parts path_parts = path.split('/') @@ -147,7 +147,7 @@ class UrlRewriter(object): parts = (scheme, netloc, path, query, frag) - new_url = urlparse.urlunsplit(parts) + new_url = urlunsplit(parts) return new_url diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 0bb8ce5d..5c4c876a 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -39,8 +39,11 @@ wayback url format. """ import re -import urllib -import urlparse +import six + +from six.moves.urllib.parse import urlsplit, urlunsplit +from six.moves.urllib.parse import quote_plus, quote, unquote_plus + #================================================================= class BaseWbUrl(object): @@ -105,7 +108,7 @@ class WbUrl(BaseWbUrl): if 'xn--' not in url: return url - parts = urlparse.urlsplit(url) + parts = urlsplit(url) domain = parts.netloc try: domain = domain.decode('idna') @@ -114,9 +117,9 @@ class WbUrl(BaseWbUrl): # likely already encoded, so use as is pass - domain = urllib.quote(domain)#, safe=r':\/') + domain = quote(domain)#, safe=r':\/') - return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4])) + return urlunsplit((parts[0], domain, parts[2], parts[3], parts[4])) @staticmethod @@ -131,7 +134,7 @@ class WbUrl(BaseWbUrl): """ parts = WbUrl.FIRST_PATH.split(url, 1) - scheme_dom = urllib.unquote_plus(parts[0]) + scheme_dom = unquote_plus(parts[0]) if isinstance(scheme_dom, str): if scheme_dom == parts[0]: @@ -155,7 +158,7 @@ class WbUrl(BaseWbUrl): if len(parts) > 1: if isinstance(parts[1], unicode): - url += '/' + urllib.quote(parts[1].encode('utf-8')) + url += '/' + quote(parts[1].encode('utf-8')) else: url += '/' + parts[1] @@ -168,7 +171,7 @@ class WbUrl(BaseWbUrl): if isinstance(orig_url, unicode): orig_url = orig_url.encode('utf-8') - orig_url = urllib.quote(orig_url) + orig_url = quote(orig_url) self._original_url = orig_url @@ -259,7 +262,7 @@ class WbUrl(BaseWbUrl): rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?' self.url = re.sub(rex_query, '=', self.url) - rex_query = '=(' + urllib.quote_plus(prefix) + '.*?)((?:https?%3A)?%2F%2F[^&]+)' + rex_query = '=(' + quote_plus(prefix) + '.*?)((?:https?%3A)?%2F%2F[^&]+)' self.url = re.sub(rex_query, '=\\2', self.url) return self.url diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 2a81b8d2..ea901aef 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -45,6 +45,17 @@ def load_yaml_config(config_file): return config +#================================================================= +def to_native_str(value, encoding='iso-8859-1'): + if isinstance(value, str): + return value + + if six.PY3 and isinstance(value, six.binary_type): + return value.decode(encoding) + elif six.PY2 and isinstance(value, six.text_type): + return value.encode(encoding) + + #================================================================= def extract_post_query(method, mime, length, stream, buffered_stream=None): """ @@ -77,7 +88,7 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None): if not buff: break - post_query += buff + post_query += to_native_str(buff) if buffered_stream: buffered_stream.write(post_query) diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 4327398c..b7be3c88 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -5,8 +5,12 @@ Representation and parsing of HTTP-style status + headers import pprint from copy import copy from six.moves import range +import six +from pywb.utils.loaders import to_native_str +WRAP_WIDTH = 80 + #================================================================= class StatusAndHeaders(object): """ @@ -112,7 +116,7 @@ class StatusAndHeaders(object): return self def __repr__(self): - headers_str = pprint.pformat(self.headers, indent=2) + headers_str = pprint.pformat(self.headers, indent=2, width=WRAP_WIDTH) return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \ headers = {2})".format(self.protocol, self.statusline, headers_str) @@ -145,9 +149,15 @@ class StatusAndHeadersParser(object): support continuation headers starting with space or tab """ + + def readline(): + return to_native_str(stream.readline()) + # status line w newlines intact if full_statusline is None: - full_statusline = stream.readline() + full_statusline = readline() + else: + full_statusline = to_native_str(full_statusline) statusline, total_read = _strip_count(full_statusline, 0) @@ -173,7 +183,7 @@ class StatusAndHeadersParser(object): else: protocol_status = statusline.split(' ', 1) - line, total_read = _strip_count(stream.readline(), total_read) + line, total_read = _strip_count(readline(), total_read) while line: result = line.split(':', 1) if len(result) == 2: @@ -183,14 +193,14 @@ class StatusAndHeadersParser(object): name = result[0] value = None - next_line, total_read = _strip_count(stream.readline(), + next_line, total_read = _strip_count(readline(), total_read) # append continuation lines, if any while next_line and next_line.startswith((' ', '\t')): if value is not None: value += next_line - next_line, total_read = _strip_count(stream.readline(), + next_line, total_read = _strip_count(readline(), total_read) if value is not None: diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 5df3bb03..2bb9f4ce 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -3,7 +3,7 @@ from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.canonicalize import canonicalize from pywb.utils.loaders import extract_post_query, append_post_query -from recordloader import ArcWarcRecordLoader +from pywb.warc.recordloader import ArcWarcRecordLoader import hashlib import base64 @@ -66,7 +66,10 @@ class ArchiveIterator(object): self.member_info = None self.no_record_parse = no_record_parse - def iter_records(self, block_size=16384): + def __iter__(self): + return self + + def __call__(self, block_size=16384): """ iterate over each record """ @@ -152,10 +155,10 @@ class ArchiveIterator(object): stripped = line.rstrip() - if stripped == '' or first_line: + if len(stripped) == 0 or first_line: empty_size += len(line) - if stripped != '': + if len(stripped) != 0: # if first line is not blank, # likely content-length was invalid, display warning err_offset = self.fh.tell() - self.reader.rem_length() - empty_size @@ -290,7 +293,7 @@ class ArchiveIndexEntryMixin(object): #================================================================= -class DefaultRecordIter(object): +class DefaultRecordParser(object): def __init__(self, **options): self.options = options self.entry_cache = {} @@ -329,14 +332,14 @@ class DefaultRecordIter(object): def end_payload(self, entry): if self.digester: - entry['digest'] = base64.b32encode(self.digester.digest()) + entry['digest'] = base64.b32encode(self.digester.digest()).decode('ascii') self.entry = None def create_payload_buffer(self, entry): return None - def create_record_iter(self, arcv_iter): + def create_record_iter(self, raw_iter): append_post = self.options.get('append_post') include_all = self.options.get('include_all') block_size = self.options.get('block_size', 16384) @@ -347,7 +350,7 @@ class DefaultRecordIter(object): raise Exception('Sorry, minimal index option and ' + 'append POST options can not be used together') - for record in arcv_iter.iter_records(block_size): + for record in raw_iter(block_size): entry = None if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): @@ -394,9 +397,9 @@ class DefaultRecordIter(object): entry.record = record self.begin_payload(compute_digest, entry) - arcv_iter.read_to_end(record, self.handle_payload) + raw_iter.read_to_end(record, self.handle_payload) - entry.set_rec_info(*arcv_iter.member_info) + entry.set_rec_info(*raw_iter.member_info) self.end_payload(entry) yield entry @@ -536,8 +539,15 @@ class DefaultRecordIter(object): yield entry + def open(self, filename): + with open(filename, 'rb') as fh: + for entry in self(fh): + yield entry + class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict): pass class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict): pass + + diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 122df7f4..ab981804 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -29,15 +29,18 @@ except ImportError: # pragma: no cover from argparse import ArgumentParser, RawTextHelpFormatter from bisect import insort -from io import BytesIO +from six import StringIO -from archiveiterator import DefaultRecordIter +from pywb.warc.archiveiterator import DefaultRecordParser +import codecs +import six #================================================================= class BaseCDXWriter(object): def __init__(self, out): - self.out = out + self.out = codecs.getwriter('utf-8')(out) + #self.out = out def __enter__(self): self._write_header() @@ -69,7 +72,7 @@ class CDXJ(object): outdict = OrderedDict() - for n, v in entry.iteritems(): + for n, v in six.iteritems(entry): if n in ('urlkey', 'timestamp'): continue @@ -145,7 +148,7 @@ class SortedCDXWriter(BaseCDXWriter): return res def write(self, entry, filename): - self.out = BytesIO() + self.out = StringIO() super(SortedCDXWriter, self).write(entry, filename) line = self.out.getvalue() if line: @@ -175,7 +178,7 @@ def iter_file_or_dir(inputs, recursive=True, rel_root=None): if not rel_root: filename = os.path.basename(input_) else: - filename = _resolve_rel_path(input_, rel_root) + filename = _resolve_rel_path(input_, rel_root) yield input_, filename @@ -268,7 +271,7 @@ def write_multi_cdx_index(output, inputs, **options): outfile = open(output, 'wb') writer_cls = get_cdx_writer_cls(options) - record_iter = DefaultRecordIter(**options) + record_iter = DefaultRecordParser(**options) with writer_cls(outfile) as writer: for fullpath, filename in iter_file_or_dir(inputs, @@ -285,13 +288,12 @@ def write_multi_cdx_index(output, inputs, **options): #================================================================= def write_cdx_index(outfile, infile, filename, **options): - if type(filename) is unicode: - filename = filename.encode(sys.getfilesystemencoding()) + #filename = filename.encode(sys.getfilesystemencoding()) writer_cls = get_cdx_writer_cls(options) with writer_cls(outfile) as writer: - entry_iter = DefaultRecordIter(**options)(infile) + entry_iter = DefaultRecordParser(**options)(infile) for entry in entry_iter: writer.write(entry, filename) diff --git a/pywb/warc/pathresolvers.py b/pywb/warc/pathresolvers.py index 1baf3ed6..eb36a065 100644 --- a/pywb/warc/pathresolvers.py +++ b/pywb/warc/pathresolvers.py @@ -1,9 +1,11 @@ import redis from pywb.utils.binsearch import iter_exact +from pywb.utils.loaders import to_native_str + +from six.moves.urllib.parse import urlsplit +from six.moves.urllib.request import url2pathname -import urlparse -import urllib import os import logging @@ -49,7 +51,7 @@ class RedisResolver: def __call__(self, filename): redis_val = self.redis.hget(self.key_prefix + filename, 'path') - return [redis_val] if redis_val else [] + return [to_native_str(redis_val)] if redis_val else [] def __repr__(self): return "RedisResolver('{0}')".format(self.redis_url) @@ -62,12 +64,12 @@ class PathIndexResolver: def __call__(self, filename): with open(self.pathindex_file, 'rb') as reader: - result = iter_exact(reader, filename, '\t') + result = iter_exact(reader, filename.encode('utf-8'), b'\t') for pathline in result: - paths = pathline.split('\t')[1:] + paths = pathline.split(b'\t')[1:] for path in paths: - yield path + yield to_native_str(path) def __repr__(self): # pragma: no cover return "PathIndexResolver('{0}')".format(self.pathindex_file) @@ -84,7 +86,7 @@ def make_best_resolver(param): path = param arg = None - url_parts = urlparse.urlsplit(path) + url_parts = urlsplit(path) if url_parts.scheme == 'redis': logging.debug('Adding Redis Index: ' + path) @@ -92,7 +94,7 @@ def make_best_resolver(param): if url_parts.scheme == 'file': path = url_parts.path - path = urllib.url2pathname(path) + path = url2pathname(path) if os.path.isfile(path): logging.debug('Adding Path Index: ' + path) @@ -106,7 +108,7 @@ def make_best_resolver(param): #================================================================= def make_best_resolvers(paths): - if hasattr(paths, '__iter__'): - return map(make_best_resolver, paths) + if isinstance(paths, list) or isinstance(paths, set): + return list(map(make_best_resolver, paths)) else: return [make_best_resolver(paths)] diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 9fd21772..37fd17eb 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -1,5 +1,3 @@ -import itertools -import urlparse import collections from pywb.utils.statusandheaders import StatusAndHeaders @@ -7,10 +5,14 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.loaders import BlockLoader, LimitReader +from pywb.utils.loaders import to_native_str from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.wbexception import WbException +from six.moves import zip +import six + #================================================================= ArcWarcRecord = collections.namedtuple('ArcWarcRecord', @@ -34,7 +36,7 @@ class ArchiveLoadFailed(WbException): #================================================================= -class ArcWarcRecordLoader: +class ArcWarcRecordLoader(object): # Standard ARC v1.0 headers # TODO: support ARC v2.0 also? ARC_HEADERS = ["uri", "ip-address", "archive-date", @@ -73,7 +75,7 @@ class ArcWarcRecordLoader: except: length = -1 - stream = self.loader.load(url, long(offset), length) + stream = self.loader.load(url, int(offset), length) decomp_type = 'gzip' # Create decompressing stream @@ -200,16 +202,21 @@ class ArcWarcRecordLoader: #================================================================= -class ARCHeadersParser: +class ARCHeadersParser(object): def __init__(self, headernames): self.headernames = headernames def parse(self, stream, headerline=None): total_read = 0 + def readline(): + return to_native_str(stream.readline()) + # if headerline passed in, use that if headerline is None: - headerline = stream.readline() + headerline = readline() + else: + headerline = to_native_str(headerline) header_len = len(headerline) @@ -222,8 +229,8 @@ class ARCHeadersParser: # if arc header, consume next two lines if headerline.startswith('filedesc://'): - version = stream.readline() # skip version - spec = stream.readline() # skip header spec, use preset one + version = readline() # skip version + spec = readline() # skip header spec, use preset one total_read += len(version) total_read += len(spec) @@ -236,7 +243,7 @@ class ARCHeadersParser: headers = [] - for name, value in itertools.izip(headernames, parts): + for name, value in zip(headernames, parts): headers.append((name, value)) return StatusAndHeaders(statusline='', diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 9053eb06..38ffec36 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -1,8 +1,10 @@ from pywb.utils.timeutils import iso_date_to_timestamp -from recordloader import ArcWarcRecordLoader, ArchiveLoadFailed -from pathresolvers import make_best_resolvers +from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed +from pywb.warc.pathresolvers import make_best_resolvers from pywb.utils.wbexception import NotFoundException +import six + #================================================================= class ResolvingLoader(object): @@ -104,6 +106,9 @@ class ResolvingLoader(object): for resolver in self.path_resolvers: possible_paths = resolver(filename) + #import sys + #sys.stderr.write(str(possible_paths)) + if possible_paths: for path in possible_paths: any_found = True @@ -125,7 +130,8 @@ class ResolvingLoader(object): else: msg = 'Archive File Not Found' - raise ArchiveLoadFailed(msg, filename), None, last_traceback + #raise ArchiveLoadFailed(msg, filename), None, last_traceback + six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback) def _load_different_url_payload(self, cdx, headers_record, failed_files, cdx_loader): diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index 1c8f4554..556a5c3a 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -48,9 +48,9 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ >>> print_cdx_index('example-wget-1-14.warc.gz') CDX N b a m s k r M S V g com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz +org,gnu)/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz +org,gnu)/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz +org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz # wget warc, includes metadata and request @@ -58,9 +58,9 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/ CDX N b a m s k r M S V g com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz +org,gnu)/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz +org,gnu)/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz +org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz # wpull warc, includes metadata by default >>> print_cdx_index('example-wpull.warc.gz') @@ -127,7 +127,7 @@ com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 3207 example-extra.warc com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 5910 example-extra.warc ->>> print_cdx_index('example-extra.warc', verify_http=True) +>>> print_cdx_index('example-extra.warc', verify_http=True) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0', 'HTTP/1.1'] - Found: HTTPX/1.1 200 OK @@ -178,7 +178,7 @@ urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX Total: 210 # test writing to temp dir, also use unicode filename ->>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz')) +>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz') example.cdx com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz @@ -223,7 +223,7 @@ def cdx_index(warc, **options): return buff.getvalue() def print_cdx_index(*args, **kwargs): - sys.stdout.write(cdx_index(*args, **kwargs)) + sys.stdout.write(cdx_index(*args, **kwargs).decode('utf-8')) def assert_cdx_match(cdx, warc, sort=False): assert read_fully(cdx) == cdx_index(warc, sort=sort) @@ -239,11 +239,11 @@ def cli_lines(cmds): sys.stdout = buff main(cmds) sys.stdout = orig - lines = buff.getvalue().rstrip().split('\n') + lines = buff.getvalue().rstrip().split(b'\n') # print first, last, num lines - print(lines[1]) - print(lines[-1]) + print(lines[1].decode('utf-8')) + print(lines[-1].decode('utf-8')) print('Total: ' + str(len(lines))) def cli_lines_with_dir(input_): @@ -256,10 +256,10 @@ def cli_lines_with_dir(input_): filename = cdx_filename(os.path.basename(input_)) - print filename + print(filename) with open(os.path.join(tmp_dir, filename), 'rb') as fh: - lines = fh.read(8192).rstrip().split('\n') + lines = fh.read(8192).rstrip().split(b'\n') finally: try: @@ -273,8 +273,8 @@ def cli_lines_with_dir(input_): return # print first, last, num lines - print (lines[1]) - print (lines[-1]) + print(lines[1].decode('utf-8')) + print(lines[-1].decode('utf-8')) print('Total: ' + str(len(lines))) @@ -284,18 +284,18 @@ def test_non_chunked_gzip_err(): def parse_cdxj(string): - lines = string.split('\n') - if lines[0] == '': + lines = string.split(b'\n') + if lines[0] == b'': lines = lines[1:] - cdxlist = map(CDXObject, lines) - return map(dict, cdxlist) + cdxlist = list(map(CDXObject, lines)) + return list(map(dict, cdxlist)) def test_cdxj_warc_minimal(): # cdxj minimal res = cdx_index('example.warc.gz', minimal=True, cdxj=True) - assert parse_cdxj(res) == parse_cdxj(""" + assert parse_cdxj(res) == parse_cdxj(b""" com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"} @@ -306,7 +306,7 @@ def test_cdxj_warc_all(): # warc.gz -- parse all -- CDXJ res = cdx_index('example.warc.gz', include_all=True, cdxj=True) - assert parse_cdxj(res) == parse_cdxj(""" + assert parse_cdxj(res) == parse_cdxj(b""" com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"} com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} @@ -317,14 +317,14 @@ org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/ex def test_cdxj_arc(): # arc.gz -- json res = cdx_index('example.arc.gz', cdxj=True) - assert parse_cdxj(res) == parse_cdxj(""" + assert parse_cdxj(res) == parse_cdxj(b""" com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"} """) def test_cdxj_arc_minimal(): # arc.gz -- minimal + json res = cdx_index('example.arc.gz', cdxj=True, minimal=True) - assert parse_cdxj(res) == parse_cdxj(""" + assert parse_cdxj(res) == parse_cdxj(b""" com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"} """) diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index 9a8da66c..2d7d65dc 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -37,8 +37,7 @@ Test loading different types of records from a variety of formats ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'), ('WARC-Target-URI', 'http://example.com?example=1'), ('WARC-Warcinfo-ID', ''), - ( 'WARC-Profile', - 'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'), + ('WARC-Profile', 'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'), ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'), ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]), StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), @@ -66,17 +65,13 @@ Test loading different types of records from a variety of formats ('WARC-Target-URI', 'http://example.com?example=1'), ('WARC-Warcinfo-ID', '')]), StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'), - ( 'Accept', - 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'), + ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'), ('Accept-Language', 'en-US,en;q=0.8'), - ( 'User-Agent', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'), + ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \ +Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'), ('Host', 'example.com')])) -StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])) - - # Test of record loading based on cdx line # Print parsed http headers + 2 lines of content # ============================================================================== @@ -233,7 +228,7 @@ failed_files=failed_files) Exception: ArchiveLoadFailed # ensure failed_files being filled ->>> failed_files +>>> print_strs(failed_files) ['x-not-found-x.warc.gz'] >>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 x-not-found-x.warc.gz',\ @@ -295,12 +290,15 @@ Exception: ArchiveLoadFailed import os import sys import pprint +import six from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed from pywb.warc.pathresolvers import make_best_resolvers from pywb.warc.resolvingloader import ResolvingLoader from pywb.cdx.cdxobject import CDXObject +import pywb.utils.statusandheaders + from pywb import get_test_dir #============================================================================== @@ -319,7 +317,7 @@ URL_AGNOSTIC_REVISIT_NO_DIGEST_CDX = 'com,example)/ 20130729195151 http://test@e warc/revisit - - - - \ 591 355 example-url-agnostic-revisit.warc.gz' -BAD_ORIG_CDX = 'org,iana,example)/ 20130702195401 http://example.iana.org/ \ +BAD_ORIG_CDX = b'org,iana,example)/ 20130702195401 http://example.iana.org/ \ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ 1001 353 someunknown.warc.gz' @@ -332,8 +330,10 @@ def load_test_archive(test_file, offset, length): archive = testloader.load(path, offset, length) + pywb.utils.statusandheaders.WRAP_WIDTH = 160 + pprint.pprint(((archive.format, archive.rec_type), - archive.rec_headers, archive.status_headers)) + archive.rec_headers, archive.status_headers), indent=1, width=160) #============================================================================== @@ -345,25 +345,25 @@ def load_orig_bad_cdx(_): #============================================================================== def load_orig_cdx(_): return [CDXObject(BAD_ORIG_CDX), - CDXObject(URL_AGNOSTIC_ORIG_CDX)] + CDXObject(URL_AGNOSTIC_ORIG_CDX.encode('utf-8'))] #============================================================================== def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False, failed_files=None): resolve_loader = ResolvingLoader(test_warc_dir) - cdx = CDXObject(cdx) + cdx = CDXObject(cdx.encode('utf-8')) try: (headers, stream) = resolve_loader(cdx, failed_files, revisit_func) - print headers - sys.stdout.write(stream.readline()) - sys.stdout.write(stream.readline()) + print(headers) + sys.stdout.write(stream.readline().decode('utf-8')) + sys.stdout.write(stream.readline().decode('utf-8')) except ArchiveLoadFailed as e: if reraise: raise else: - print 'Exception: ' + e.__class__.__name__ + print('Exception: ' + e.__class__.__name__) #============================================================================== @@ -371,7 +371,14 @@ def parse_stream_error(**params): try: return ArcWarcRecordLoader().parse_record_stream(**params) except Exception as e: - print 'Exception: ' + e.__class__.__name__ + print('Exception: ' + e.__class__.__name__) + + +#============================================================================== +def print_strs(strings): + return list(map(lambda string: string.encode('utf-8') if six.PY2 else string, strings)) + + if __name__ == "__main__": diff --git a/pywb/warc/test/test_pathresolvers.py b/pywb/warc/test/test_pathresolvers.py index d00f3348..fd8fabbb 100644 --- a/pywb/warc/test/test_pathresolvers.py +++ b/pywb/warc/test/test_pathresolvers.py @@ -47,7 +47,7 @@ RedisResolver('redis://myhost.example.com:1234/1') # make_best_resolvers >>> r = make_best_resolvers(['http://example.com/warcs/',\ 'redis://example.com:1234/1']) ->>> map(lambda x: x.__class__.__name__, r) +>>> list(map(lambda x: x.__class__.__name__, r)) ['PrefixResolver', 'RedisResolver'] """ diff --git a/pywb/webapp/cdx_api_handler.py b/pywb/webapp/cdx_api_handler.py index 3ec21433..980c16d3 100644 --- a/pywb/webapp/cdx_api_handler.py +++ b/pywb/webapp/cdx_api_handler.py @@ -3,9 +3,10 @@ from pywb.cdx.cdxserver import create_cdx_server from pywb.framework.basehandlers import BaseHandler from pywb.framework.wbrequestresponse import WbResponse -from query_handler import QueryHandler +from pywb.webapp.query_handler import QueryHandler -from urlparse import parse_qs +from six.moves.urllib.parse import parse_qs +import six #================================================================= @@ -22,7 +23,11 @@ class CDXAPIHandler(BaseHandler): cdx_iter = self.index_handler.load_cdx(wbrequest, params) - return WbResponse.text_stream(cdx_iter) + def to_utf8(): + for cdx in cdx_iter: + yield cdx.encode('utf-8') + + return WbResponse.text_stream(to_utf8()) @staticmethod def extract_params_from_wsgi_env(env): @@ -35,7 +40,7 @@ class CDXAPIHandler(BaseHandler): # cdx processing expects singleton params for all params, # except filters, so convert here # use first value of the list - for name, val in params.iteritems(): + for name, val in six.iteritems(params): if name != 'filter': params[name] = val[0] diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 2678a78b..161709d4 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -15,8 +15,8 @@ from pywb.framework.wbrequestresponse import WbResponse from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader -from views import J2TemplateView, init_view -from replay_views import ReplayView +from pywb.webapp.views import J2TemplateView, init_view +from pywb.webapp.replay_views import ReplayView from pywb.framework.memento import MementoResponse from pywb.utils.timeutils import datetime_to_timestamp diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index 47770677..88564eef 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -4,8 +4,8 @@ from pywb.framework.cache import create_cache from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.wburl import WbUrl -from handlers import StaticHandler, SearchPageWbUrlHandler -from views import HeadInsertView +from pywb.webapp.handlers import StaticHandler, SearchPageWbUrlHandler +from pywb.webapp.views import HeadInsertView from pywb.utils.wbexception import WbException @@ -60,7 +60,7 @@ class RewriteHandler(SearchPageWbUrlHandler): except Exception as exc: import traceback err_details = traceback.format_exc(exc) - print err_details + print(err_details) url = wbrequest.wb_url.url msg = 'Could not load the url from the live web: ' + url diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 4a8c7cca..804653be 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -6,21 +6,22 @@ from pywb.framework.wbrequestresponse import WbRequest from pywb.framework.memento import MementoRequest from pywb.framework.basehandlers import BaseHandler -from views import J2TemplateView -from views import J2HtmlCapturesView, init_view +from pywb.webapp.views import J2TemplateView +from pywb.webapp.views import J2HtmlCapturesView, init_view -from live_rewrite_handler import RewriteHandler +from pywb.webapp.live_rewrite_handler import RewriteHandler -from query_handler import QueryHandler -from handlers import WBHandler -from handlers import StaticHandler -from handlers import DebugEchoHandler, DebugEchoEnvHandler -from cdx_api_handler import CDXAPIHandler +from pywb.webapp.query_handler import QueryHandler +from pywb.webapp.handlers import WBHandler +from pywb.webapp.handlers import StaticHandler +from pywb.webapp.handlers import DebugEchoHandler, DebugEchoEnvHandler +from pywb.webapp.cdx_api_handler import CDXAPIHandler from pywb import DEFAULT_CONFIG import os import logging +import six #================================================================= @@ -130,7 +131,7 @@ def create_cdx_server_app(passed_config): routes = [] - for name, value in collections.iteritems(): + for name, value in six.iteritems(collections): route_config = init_route_config(value, config) query_handler = init_collection(route_config) @@ -234,7 +235,7 @@ class DirectoryCollsLoader(object): # Check all templates template_files = self.config.get('paths')['template_files'] - for tname, tfile in template_files.iteritems(): + for tname, tfile in six.iteritems(template_files): if tname in coll_config: # Already set coll_config[tname] = self._norm_path(root_dir, coll_config[tname]) @@ -288,10 +289,10 @@ def create_wb_router(passed_config=None): jinja_env.globals.update(config.get('template_globals', {})) - for static_name, static_path in static_routes.iteritems(): + for static_name, static_path in six.iteritems(static_routes): routes.append(Route(static_name, StaticHandler(static_path))) - for name, value in collections.iteritems(): + for name, value in six.iteritems(collections): if isinstance(value, BaseHandler): handler_dict[name] = value new_route = Route(name, value, config=config) diff --git a/pywb/webapp/query_handler.py b/pywb/webapp/query_handler.py index e9ac1420..69d3bc58 100644 --- a/pywb/webapp/query_handler.py +++ b/pywb/webapp/query_handler.py @@ -1,12 +1,9 @@ -import urllib -import urllib2 - from pywb.utils.dsrules import DEFAULT_RULES_FILE from pywb.perms.perms_filter import make_perms_cdx_filter from pywb.framework.wbrequestresponse import WbResponse from pywb.cdx.cdxserver import create_cdx_server -from views import MementoTimemapView +from pywb.webapp.views import MementoTimemapView #================================================================= diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index d6637141..d3771c68 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -2,7 +2,7 @@ import re import logging from io import BytesIO -from urlparse import urlsplit +from six.moves.urllib.parse import urlsplit from itertools import chain from pywb.utils.statusandheaders import StatusAndHeaders @@ -16,9 +16,9 @@ from pywb.framework.memento import MementoResponse from pywb.rewrite.rewrite_content import RewriteContent from pywb.warc.recordloader import ArchiveLoadFailed -from views import HeadInsertView +from pywb.webapp.views import HeadInsertView -from rangecache import range_cache +from pywb.webapp.rangecache import range_cache #================================================================= diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 45b59968..c52a49ab 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -2,13 +2,12 @@ from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import make_timemap, LINK_FORMAT -import urlparse -import urllib +from six.moves.urllib.parse import urlsplit + import logging import json import os -from itertools import imap from jinja2 import Environment from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader @@ -48,7 +47,7 @@ def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): @template_filter('urlsplit') def get_urlsplit(url): - split = urlparse.urlsplit(url) + split = urlsplit(url) return split