1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

more python 3 support work -- pywb.cdx, pywb.warc tests succeed

most relative imports replaced with absolute
This commit is contained in:
Ilya Kreymer 2016-02-18 21:26:40 -08:00
parent b7008920de
commit bd841b91a9
47 changed files with 475 additions and 357 deletions

View File

@ -1,4 +1,4 @@
__version__ = '0.11.1'
__version__ = '1.0b'
DEFAULT_CONFIG = 'pywb/default_config.yaml'

View File

@ -2,13 +2,14 @@ import yaml
import re
import logging
import pkg_resources
import urlparse
from six.moves.urllib.parse import urlsplit
from pywb.utils.dsrules import BaseRule, RuleSet
from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from query import CDXQuery
from pywb.cdx.query import CDXQuery
#=================================================================
@ -102,7 +103,7 @@ class FuzzyQuery:
url = url[:inx + len(repl)]
if matched_rule.match_type == 'domain':
host = urlparse.urlsplit(url).netloc
host = urlsplit(url).netloc
# remove the subdomain
url = host.split('.', 1)[1]

View File

@ -3,10 +3,11 @@ try: # pragma: no cover
except ImportError: # pragma: no cover
from ordereddict import OrderedDict
import itertools
import six
from six.moves import zip
from urllib import urlencode, quote
from urlparse import parse_qs
from six.moves.urllib.parse import urlencode, quote
from six.moves.urllib.parse import parse_qs
from pywb.utils.wbexception import WbException
@ -101,7 +102,7 @@ class CDXObject(OrderedDict):
'f': FILENAME
}
def __init__(self, cdxline=''):
def __init__(self, cdxline=b''):
OrderedDict.__init__(self)
cdxline = cdxline.rstrip()
@ -112,28 +113,28 @@ class CDXObject(OrderedDict):
self.cdxline = cdxline
return
fields = cdxline.split(' ' , 2)
fields = cdxline.split(b' ' , 2)
# Check for CDX JSON
if fields[-1].startswith('{'):
self[URLKEY] = fields[0]
self[TIMESTAMP] = fields[1]
json_fields = json_decode(fields[-1])
for n, v in json_fields.iteritems():
if fields[-1].startswith(b'{'):
self[URLKEY] = fields[0].decode('utf-8')
self[TIMESTAMP] = fields[1].decode('utf-8')
json_fields = json_decode(fields[-1].decode('utf-8'))
for n, v in six.iteritems(json_fields):
n = self.CDX_ALT_FIELDS.get(n, n)
try:
self[n] = str(v)
v.encode('ascii')
except UnicodeEncodeError:
v = v.encode('utf-8')
parts = v.split('//', 1)
v = parts[0] + '//' + quote(parts[1])
self[n] = v
parts = v.encode('utf-8').split(b'//', 1)
v = parts[0].decode('utf-8') + '//' + quote(parts[1])
self[n] = v
self.cdxline = cdxline
self._from_json = True
return
more_fields = fields.pop().split(' ')
more_fields = fields.pop().split(b' ')
fields.extend(more_fields)
cdxformat = None
@ -145,8 +146,8 @@ class CDXObject(OrderedDict):
msg = 'unknown {0}-field cdx format'.format(len(fields))
raise CDXException(msg)
for header, field in itertools.izip(cdxformat, fields):
self[header] = field
for header, field in zip(cdxformat, fields):
self[header] = field.decode('utf-8')
self.cdxline = cdxline
@ -204,13 +205,14 @@ class CDXObject(OrderedDict):
def __str__(self):
if self.cdxline:
return self.cdxline
return self.cdxline.decode('utf-8')
if not self._from_json:
return ' '.join(val for n, val in self.iteritems())
return ' '.join(val for n, val in six.iteritems(self))
else:
return json_encode(self)
#=================================================================
class IDXObject(OrderedDict):
@ -221,14 +223,14 @@ class IDXObject(OrderedDict):
OrderedDict.__init__(self)
idxline = idxline.rstrip()
fields = idxline.split('\t')
fields = idxline.split(b'\t')
if len(fields) < self.NUM_REQ_FIELDS:
msg = 'invalid idx format: {0} fields found, {1} required'
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
for header, field in itertools.izip(self.FORMAT, fields):
self[header] = field
for header, field in zip(self.FORMAT, fields):
self[header] = field.decode('utf-8')
self['offset'] = int(self['offset'])
self['length'] = int(self['length'])
@ -250,4 +252,4 @@ class IDXObject(OrderedDict):
return json_encode(self) + '\n'
def __str__(self):
return self.idxline
return self.idxline.decode('utf-8')

View File

@ -1,15 +1,18 @@
from cdxobject import CDXObject, IDXObject
from cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
from cdxobject import OFFSET, LENGTH, FILENAME
from pywb.cdx.cdxobject import CDXObject, IDXObject
from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME
from query import CDXQuery
from pywb.cdx.query import CDXQuery
from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp
from pywb.utils.timeutils import PAD_14_DOWN, PAD_14_UP
import bisect
import itertools
from six.moves import zip, range, map
import re
from heapq import merge
from collections import deque
@ -127,7 +130,7 @@ def cdx_limit(cdx_iter, limit):
"""
# for cdx, _ in itertools.izip(cdx_iter, xrange(limit)):
# yield cdx
return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit)))
return (cdx for cdx, _ in zip(cdx_iter, range(limit)))
#=================================================================
@ -221,7 +224,7 @@ def cdx_filter(cdx_iter, filter_strings):
def regex(self, val):
return self.regex.match(val) is not None
filters = map(Filter, filter_strings)
filters = list(map(Filter, filter_strings))
for cdx in cdx_iter:
if all(x(cdx) for x in filters):
@ -273,7 +276,7 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
sort CDXCaptureResult by closest to timestamp.
"""
closest_cdx = []
closest_keys = []
closest_sec = timestamp_to_sec(closest)
for cdx in cdx_iter:
@ -281,19 +284,26 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
key = abs(closest_sec - sec)
# create tuple to sort by key
bisect.insort(closest_cdx, (key, cdx))
#bisect.insort(closest_cdx, (key, cdx))
i = bisect.bisect_right(closest_keys, key)
closest_keys.insert(i, key)
closest_cdx.insert(i, cdx)
if len(closest_cdx) == limit:
# assuming cdx in ascending order and keys have started increasing
if key > closest_cdx[-1]:
if key > closest_keys[-1]:
break
if len(closest_cdx) > limit:
closest_cdx.pop()
for cdx in itertools.imap(lambda x: x[1], closest_cdx):
for cdx in closest_cdx:
yield cdx
#for cdx in map(lambda x: x[1], closest_cdx):
# yield cdx
#=================================================================
# resolve revisits

View File

@ -1,19 +1,18 @@
from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
from pywb.utils.wbexception import NotFoundException
from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster
from cdxobject import CDXObject, CDXException
from query import CDXQuery
from cdxdomainspecific import load_domain_specific_cdx_rules
from pywb.cdx.cdxops import cdx_load
from pywb.cdx.cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from pywb.cdx.zipnum import ZipNumCluster
from pywb.cdx.cdxobject import CDXObject, CDXException
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
from pywb.utils.loaders import is_http
from itertools import chain
import logging
import os
import urlparse
#=================================================================

View File

@ -3,11 +3,11 @@ from pywb.utils.binsearch import iter_range
from pywb.utils.wbexception import AccessException, NotFoundException
from pywb.utils.wbexception import BadRequestException, WbException
from query import CDXQuery
from pywb.cdx.query import CDXQuery
import urllib
import urllib2
import itertools
from six.moves.urllib.request import urlopen, Request
from six.moves.urllib.error import HTTPError
from six.moves import map
#=================================================================
@ -33,7 +33,8 @@ class CDXFile(CDXSource):
@staticmethod
def _do_load_file(filename, query):
with open(filename, 'rb') as source:
gen = iter_range(source, query.key, query.end_key)
gen = iter_range(source, query.key.encode('utf-8'),
query.end_key.encode('utf-8'))
for line in gen:
yield line
@ -65,14 +66,14 @@ class RemoteCDXSource(CDXSource):
urlparams = remote_query.urlencode()
try:
request = urllib2.Request(self.remote_url + '?' + urlparams)
request = Request(self.remote_url + '?' + urlparams)
if self.cookie:
request.add_header('Cookie', self.cookie)
response = urllib2.urlopen(request)
response = urlopen(request)
except urllib2.HTTPError as e:
except HTTPError as e:
if e.code == 403:
raise AccessException('Access Denied')
elif e.code == 404:
@ -95,14 +96,14 @@ class RemoteCDXSource(CDXSource):
#=================================================================
class RedisCDXSource(CDXSource):
DEFAULT_KEY_PREFIX = 'c:'
DEFAULT_KEY_PREFIX = b'c:'
def __init__(self, redis_url, config=None):
import redis
parts = redis_url.split('/')
if len(parts) > 4:
self.cdx_key = parts[4]
self.cdx_key = parts[4].encode('utf-8')
redis_url = 'redis://' + parts[2] + '/' + parts[3]
else:
self.cdx_key = None
@ -126,7 +127,7 @@ class RedisCDXSource(CDXSource):
if self.cdx_key:
return self.load_sorted_range(query, self.cdx_key)
else:
return self.load_single_key(query.key)
return self.load_single_key(query.key.encode('utf-8'))
def load_sorted_range(self, query, cdx_key):
cdx_list = self.redis.zrangebylex(cdx_key,
@ -137,12 +138,12 @@ class RedisCDXSource(CDXSource):
def load_single_key(self, key):
# ensure only url/surt is part of key
key = key.split(' ')[0]
key = key.split(b' ')[0]
cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
# key is not part of list, so prepend to each line
key += ' '
cdx_list = itertools.imap(lambda x: key + x, cdx_list)
key += b' '
cdx_list = map(lambda x: key + x, cdx_list)
return cdx_list
def __str__(self):

View File

@ -1,5 +1,5 @@
from urllib import urlencode
from cdxobject import CDXException
from six.moves.urllib.parse import urlencode
from pywb.cdx.cdxobject import CDXException
#=================================================================

View File

@ -5,17 +5,17 @@ from pywb.cdx.cdxobject import CDXObject, IDXObject, CDXException
from pytest import raises
def test_empty_cdxobject():
x = CDXObject('')
x = CDXObject(b'')
assert len(x) == 0
def test_invalid_cdx_format():
with raises(CDXException):
x = CDXObject('a b c')
x = CDXObject(b'a b c')
def _make_line(fields):
line = ' '.join(['-'] * fields)
x = CDXObject(line)
x = CDXObject(line.encode('utf-8'))
assert len(x) == fields
assert str(x) == line
@ -29,13 +29,13 @@ def test_valid_cdx_formats():
_make_line(14)
def test_unicode_url():
x = CDXObject('com,example,cafe)/ 123 {"url": "http://example.com/café/path"}')
x = CDXObject(u'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'.encode('utf-8'))
assert x['urlkey'] == 'com,example,cafe)/'
assert x['timestamp'] == '123'
assert x['url'] == 'http://example.com/caf%C3%A9/path'
def test_invalid_idx_format():
with raises(CDXException):
x = IDXObject('a b c')
x = IDXObject(b'a b c')

View File

@ -31,17 +31,17 @@ com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYA
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012')
>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://example.com/
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# No matching -- limit=1
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this
@ -69,7 +69,7 @@ org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter -- no such field, no matches
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200')
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css
@ -163,50 +163,66 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# Resolve Revisit -- cdxj minimal -- output also json
>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}
# Resolve Revisit -- cdxj minimal -- output also json
>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True)
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
"""
#=================================================================
from pywb.cdx.cdxserver import CDXServer
import os
import sys
import six
from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
def cdx_ops_test_data(url, sources=None, **kwparams):
    """Run a CDX query against *sources* and return the results as a list.

    :param url: the url to query for
    :param sources: list of cdx sources (defaults to the sample iana cdx file)
    :param kwparams: additional query params passed through to load_cdx()
    :return: list of results (CDXObject instances with the default output)
    """
    # use a None sentinel instead of a shared mutable default argument
    if sources is None:
        sources = [test_cdx_dir + 'iana.cdx']
    kwparams['url'] = url
    if 'output' not in kwparams:
        kwparams['output'] = 'cdxobject'
    # NOTE(review): the original also split kwparams['fields'] here, but the
    # result was never used -- field formatting is done in cdx_ops_test()
    server = CDXServer(sources)
    return list(server.load_cdx(**kwparams))
def cdx_ops_test(*args, **kwargs):
    """Run a CDX query and write each result line to stdout (doctest helper)."""
    results = cdx_ops_test_data(*args, **kwargs)
    field_list = kwargs.get('fields')
    if field_list:
        field_list = field_list.split(',')
    for entry in results:
        if isinstance(entry, str):
            text = entry
        else:
            text = entry.to_text(field_list).replace('\t', ' ')
        sys.stdout.write(text)
def test_cdxj_resolve_revisit():
    """Resolve Revisit -- cdxj minimal -- output also json."""
    revisits = cdx_ops_test_data(url='http://example.com/?example=1',
                                 sources=[get_test_dir() + 'cdxj/example.cdxj'],
                                 resolveRevisits=True)
    assert len(revisits) == 2
    original = {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
    revisit = {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}
    assert dict(revisits[0]) == original
    assert dict(revisits[1]) == revisit
def test_cdxj_resolve_revisit_2():
    """Resolve Revisit -- cdxj minimal, no digest -- output also json."""
    revisits = cdx_ops_test_data(url='http://example.com/?example=1',
                                 sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'],
                                 resolveRevisits=True)
    assert len(revisits) == 2
    original = {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
    revisit = {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
    assert dict(revisits[0]) == original
    assert dict(revisits[1]) == revisit
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -6,11 +6,14 @@ from pywb.utils.dsrules import DEFAULT_RULES_FILE
from pywb.utils.wbexception import AccessException, NotFoundException
from pywb.utils.wbexception import BadRequestException, WbException
from urllib2 import HTTPError
from six.moves.urllib.error import HTTPError
from mock import patch
from pytest import raises
import webtest
import unittest
import six
from pywb import get_test_dir
@ -41,7 +44,7 @@ def setup_module(self):
def mock_urlopen(req):
resp = testapp.get(req.get_full_url())
return resp.body.split('\n')
return resp.body.split(b'\n')
def mock_urlopen_err(err):
def make_err(req):
@ -58,45 +61,44 @@ def mock_urlopen_fuzzy(req):
resp = testapp.get(req.get_full_url(), status=status)
if status == 200:
return resp.body.split('\n')
return resp.body.split(b'\n')
else:
raise mock_urlopen_err(404)(req)
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen)
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen)
def assert_cdx_match(server):
x = server.load_cdx(url='example.com',
limit=2,
output='cdxobject')
x.next()
assert x.next().items() == CDX_RESULT
x = list(x)
assert(list(x[1].items()) == CDX_RESULT)
def assert_cdx_fuzzy_match(server, mock=mock_urlopen):
with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock):
with patch('pywb.cdx.cdxsource.urlopen', mock):
x = server.load_cdx(url='http://example.com?_=123',
limit=2,
output='cdxobject',
allowFuzzy=True)
x.next()
assert x.next().items() == CDX_RESULT
x = list(x)
assert(list(x[1].items()) == CDX_RESULT)
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(404))
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(404))
def assert_404(server):
server.load_cdx(url='http://notfound.example.com')
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(403))
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(403))
def assert_403(server):
server.load_cdx(url='http://notfound.example.com')
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(400))
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(400))
def assert_400(server):
server.load_cdx(url='http://notfound.example.com')
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(502))
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(502))
def assert_502(server):
server.load_cdx(url='http://notfound.example.com')
@ -131,7 +133,7 @@ def test_fuzzy_match():
def test_fuzzy_no_match_1():
# no match, no fuzzy
with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
with raises(NotFoundException):
server.load_cdx(url='http://notfound.example.com/',
@ -141,7 +143,7 @@ def test_fuzzy_no_match_1():
def test_fuzzy_no_match_2():
# fuzzy rule, but no actual match
with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
with raises(NotFoundException):
server.load_cdx(url='http://notfound.example.com/?_=1234',
@ -153,7 +155,7 @@ def test_fuzzy_no_match_2():
def test2_fuzzy_no_match_3():
# special fuzzy rule, matches prefix test.example.example.,
# but doesn't match rule regex
with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
with raises(NotFoundException):
server.load_cdx(url='http://test.example.example/',

View File

@ -4,6 +4,8 @@ from pywb.cdx.query import CDXQuery
from pytest import raises
import six
KEY = 'com,example)/'
#================================================================
@ -30,7 +32,7 @@ def lazy_cdx_load(**params):
# exception happens on first access attempt
with raises(AccessException):
cdx_iter.next()
six.next(cdx_iter)
def test_no_process():

View File

@ -35,13 +35,13 @@ def zadd_cdx(source, cdx, key):
source.redis.zadd(key, 0, cdx)
return
parts = cdx.split(' ', 2)
parts = cdx.split(b' ', 2)
key = parts[0]
timestamp = parts[1]
rest = timestamp + ' ' + parts[2]
rest = timestamp + b' ' + parts[2]
score = timestamp_to_sec(timestamp)
score = timestamp_to_sec(timestamp.decode('utf-8'))
source.redis.zadd(source.key_prefix + key, score, rest)

View File

@ -22,29 +22,6 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# Pages -- default page size
>>> zip_ops_test(url='http://iana.org/domains/example', matchType='exact', showNumPages=True)
{"blocks": 1, "pages": 1, "pageSize": 10}
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True)
{"blocks": 38, "pages": 4, "pageSize": 10}
# set page size
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True)
{"blocks": 38, "pages": 10, "pageSize": 4}
# set page size -- alt domain query
>>> zip_ops_test(url='*.iana.org', pageSize='4', showNumPages=True)
{"blocks": 38, "pages": 10, "pageSize": 4}
# page size for non-existent, but secondary index match
>>> zip_ops_test(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True)
{"blocks": 0, "pages": 0, "pageSize": 4}
# page size for non-existent, no secondary index match
>>> zip_ops_test(url='*.foo.bar', showNumPages=True)
{"blocks": 0, "pages": 0, "pageSize": 10}
# first page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
com,example)/ 20140127171200 zipnum 0 275 1
@ -116,16 +93,16 @@ org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# invalid page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10)
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
CDXException: Page 10 invalid: First Page is 0, Last Page is 9
>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True)
>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.aaa/
>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True)
>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.aaa/ (domain query)
@ -133,34 +110,26 @@ NotFoundException: No Captures found for: http://aaa.aaa/ (domain query)
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
org,iana)/time-zones 20140126200737 zipnum 9631 166 38
# read cdx to find 0 pages
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showNumPages=True)
{"blocks": 0, "pages": 0, "pageSize": 10}
# read cdx to find no captures
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain')
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.zz/ (domain query)
# Invalid .idx files or missing loc
>>> zip_test_err(url='http://example.com/', matchType='exact')
>>> zip_test_err(url='http://example.com/', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
Exception: No Locations Found for: foo
>>> zip_test_err(url='http://iana.org/x', matchType='exact')
Traceback (most recent call last):
IOError: [Errno 2] No such file or directory: './sample_archive/invalid'
>>> zip_test_err(url='http://example.zz/x', matchType='exact')
>>> zip_test_err(url='http://example.zz/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
Exception: No Locations Found for: foo2
"""
from test_cdxops import cdx_ops_test
from test_cdxops import cdx_ops_test, cdx_ops_test_data
from pywb import get_test_dir
from pywb.cdx.cdxserver import CDXServer
@ -170,9 +139,15 @@ import tempfile
import os
import json
import pytest
test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'
def zip_ops_test_data(url, **kwargs):
    """Query the zipnum sample index and parse the first result line as JSON."""
    first_line = cdx_ops_test_data(url, test_zipnum, **kwargs)[0]
    return json.loads(first_line)
def zip_ops_test(url, **kwargs):
    """Run a doctest-style query against the zipnum sample index."""
    cdx_ops_test(url, test_zipnum, **kwargs)
@ -220,6 +195,50 @@ def test_zip_prefix_load():
def test_blocks_def_page_size():
    """Pages -- default page size."""
    expected = {"blocks": 1, "pages": 1, "pageSize": 10}
    assert zip_ops_test_data(url='http://iana.org/domains/example', matchType='exact', showNumPages=True) == expected
def test_blocks_def_size_2():
    """Pages -- default page size, domain query."""
    expected = {"blocks": 38, "pages": 4, "pageSize": 10}
    assert zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', showNumPages=True) == expected
def test_blocks_set_page_size():
    """set page size"""
    expected = {"blocks": 38, "pages": 10, "pageSize": 4}
    assert zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True) == expected
def test_blocks_alt_q():
    """set page size -- alt domain query"""
    expected = {"blocks": 38, "pages": 10, "pageSize": 4}
    assert zip_ops_test_data(url='*.iana.org', pageSize='4', showNumPages=True) == expected
def test_blocks_secondary_match():
    """page size for non-existent url, but with a secondary index match"""
    expected = {"blocks": 0, "pages": 0, "pageSize": 4}
    assert zip_ops_test_data(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True) == expected
def test_blocks_no_match():
    """page size for non-existent url, no secondary index match"""
    expected = {"blocks": 0, "pages": 0, "pageSize": 10}
    assert zip_ops_test_data(url='*.foo.bar', showNumPages=True) == expected
def test_blocks_zero_pages():
    """read cdx to find 0 pages"""
    expected = {"blocks": 0, "pages": 0, "pageSize": 10}
    assert zip_ops_test_data(url='http://aaa.zz/', matchType='domain', showNumPages=True) == expected
# Errors
def test_err_file_not_found():
    """A query whose .loc entry points at a missing archive file raises IOError."""
    with pytest.raises(IOError):
        # removed stray '# doctest: +IGNORE_EXCEPTION_DETAIL' -- that directive
        # only applies inside doctests, not in a pytest function body
        zip_test_err(url='http://iana.org/x', matchType='exact')
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -5,9 +5,12 @@ import logging
from io import BytesIO
import datetime
import json
import six
from cdxsource import CDXSource
from cdxobject import IDXObject, CDXException
from six.moves import map
from pywb.cdx.cdxsource import CDXSource
from pywb.cdx.cdxobject import IDXObject, CDXException
from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor
@ -52,7 +55,7 @@ class LocMapResolver(object):
self.loc_mtime = new_mtime
logging.debug('Loading loc from: ' + self.loc_filename)
with open(self.loc_filename, 'rb') as fh:
with open(self.loc_filename, 'r') as fh:
for line in fh:
parts = line.rstrip().split('\t')
self.loc_map[parts[0]] = parts[1:]
@ -170,25 +173,28 @@ class ZipNumCluster(CDXSource):
last_line = None
start_key = query.key.encode('utf-8')
end_key = query.end_key.encode('utf-8')
# Get End
end_iter = search(reader, query.end_key, prev_size=1)
end_iter = search(reader, end_key, prev_size=1)
try:
end_line = end_iter.next()
end_line = six.next(end_iter)
except StopIteration:
last_line = read_last_line(reader)
end_line = last_line
# Get Start
first_iter = iter_range(reader,
query.key,
query.end_key,
start_key,
end_key,
prev_size=1)
try:
first_line = first_iter.next()
first_line = six.next(first_iter)
except StopIteration:
if end_line == last_line and query.key >= last_line:
if end_line == last_line and start_key >= last_line:
first_line = last_line
else:
reader.close()
@ -204,7 +210,7 @@ class ZipNumCluster(CDXSource):
try:
blocks = end['lineno'] - first['lineno']
total_pages = blocks / pagesize + 1
total_pages = int(blocks / pagesize) + 1
except:
blocks = -1
total_pages = 1
@ -215,8 +221,8 @@ class ZipNumCluster(CDXSource):
if blocks == 0:
try:
block_cdx_iter = self.idx_to_cdx([first_line], query)
block = block_cdx_iter.next()
cdx = block.next()
block = six.next(block_cdx_iter)
cdx = six.next(block)
except StopIteration:
total_pages = 0
blocks = -1
@ -250,12 +256,12 @@ class ZipNumCluster(CDXSource):
def search_by_line_num(self, reader, line): # pragma: no cover
def line_cmp(line1, line2):
line1_no = int(line1.rsplit('\t', 1)[-1])
line2_no = int(line2.rsplit('\t', 1)[-1])
line1_no = int(line1.rsplit(b'\t', 1)[-1])
line2_no = int(line2.rsplit(b'\t', 1)[-1])
return cmp(line1_no, line2_no)
line_iter = search(reader, line, compare_func=line_cmp)
yield line_iter.next()
yield six.next(line_iter)
def idx_to_cdx(self, idx_iter, query):
blocks = None
@ -304,7 +310,8 @@ class ZipNumCluster(CDXSource):
last_traceback = sys.exc_info()[2]
if last_exc:
raise last_exc, None, last_traceback
six.reraise(Exception, last_exc, last_traceback)
#raise last_exc
else:
raise Exception('No Locations Found for: ' + blocks.part)
@ -326,13 +333,13 @@ class ZipNumCluster(CDXSource):
for line in BytesIO(buff):
yield line
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
iter_ = itertools.chain(*map(decompress_block, ranges))
# start bound
iter_ = linearsearch(iter_, query.key)
iter_ = linearsearch(iter_, query.key.encode('utf-8'))
# end bound
end = query.end_key
end = query.end_key.encode('utf-8')
iter_ = itertools.takewhile(lambda line: line < end, iter_)
return iter_

View File

@ -1,10 +1,10 @@
import urlparse
from six.moves.urllib.parse import urlsplit, urlunsplit, quote
import re
from urllib import quote
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
from wbrequestresponse import WbRequest, WbResponse
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
#=================================================================
@ -182,7 +182,7 @@ class ReferRedirect:
return None
# get referrer path name
ref_split = urlparse.urlsplit(referrer)
ref_split = urlsplit(referrer)
# require that referrer starts with current Host, if any
curr_host = env.get('HTTP_HOST')
@ -236,10 +236,10 @@ class ReferRedirect:
ref_request.wb_url.url = new_wb_url.url
return ref_route.handler(ref_request)
final_url = urlparse.urlunsplit((ref_split.scheme,
ref_split.netloc,
rewritten_url,
'',
''))
final_url = urlunsplit((ref_split.scheme,
ref_split.netloc,
rewritten_url,
'',
''))
return WbResponse.redir_response(final_url, status='302 Temp Redirect')

View File

@ -2,7 +2,7 @@ from pywb.utils.wbexception import BadRequestException
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_http_date
from wbrequestresponse import WbRequest, WbResponse
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.wburl import WbUrl
LINK_FORMAT = 'application/link-format'

View File

@ -3,7 +3,7 @@ from __future__ import absolute_import
from pywb.framework.wbrequestresponse import WbResponse, WbRequest
from pywb.framework.archivalrouter import ArchivalRouter
import urlparse
from six.moves.urllib.parse import urlsplit
import base64
import socket
@ -164,7 +164,7 @@ class ProxyRouter(object):
url = env['REL_REQUEST_URI']
else:
parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
parts = urlsplit(env['REL_REQUEST_URI'])
hostport = parts.netloc.split(':', 1)
env['pywb.proxy_host'] = hostport[0]
env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''

View File

@ -1,13 +1,14 @@
from wbrequestresponse import WbResponse
from pywb.framework.wbrequestresponse import WbResponse
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.wbexception import WbException
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl
from cache import create_cache
from basehandlers import WbUrlHandler
from pywb.framework.cache import create_cache
from pywb.framework.basehandlers import WbUrlHandler
from six.moves.urllib.parse import parse_qs, urlsplit
import urlparse
import base64
import os
import json
@ -130,7 +131,7 @@ class IPCacheResolver(BaseCollResolver):
ip = env['REMOTE_ADDR']
qs = env.get('pywb.proxy_query')
if qs:
res = urlparse.parse_qs(qs)
res = parse_qs(qs)
if 'ip' in res:
ip = res['ip'][0]
@ -145,7 +146,7 @@ class IPCacheResolver(BaseCollResolver):
qs = env.get('pywb.proxy_query')
if qs:
res = urlparse.parse_qs(qs)
res = parse_qs(qs)
if 'ip' in res:
ip = res['ip'][0]
@ -223,7 +224,7 @@ class CookieResolver(BaseCollResolver):
def handle_magic_page(self, env):
request_url = env['REL_REQUEST_URI']
parts = urlparse.urlsplit(request_url)
parts = urlsplit(request_url)
server_name = env['pywb.proxy_host']
path_url = parts.path[1:]
@ -309,7 +310,7 @@ class CookieResolver(BaseCollResolver):
if '://' not in path_url:
path_url = 'http://' + path_url
path_parts = urlparse.urlsplit(path_url)
path_parts = urlsplit(path_url)
new_url = path_parts.path[1:]
if path_parts.query:

View File

@ -94,8 +94,10 @@ False
from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
import pprint
import urlparse
from six.moves.urllib.parse import urlsplit
def _test_route_req(route, env, abs_path=False):
matcher, coll = route.is_handling(env['REL_REQUEST_URI'])

View File

@ -87,7 +87,7 @@ def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
response = req_from_uri(request_uri, env, use_abs_prefix)
varlist = vars(response)
the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll'))
print the_dict
print(the_dict)
def req_from_uri(request_uri, env={}, use_abs_prefix=False):

View File

@ -41,7 +41,7 @@ def test_err_app():
resp = testapp.get('/abc', expect_errors=True)
assert resp.status_int == 500
assert '500 Internal Server Error Error: Test Unexpected Error' in resp.body
assert b'500 Internal Server Error Error: Test Unexpected Error' in resp.body
def test_custom_err_app():
the_app = init_app(initer(TestCustomErrApp), load_yaml=False)
@ -50,7 +50,7 @@ def test_custom_err_app():
resp = testapp.get('/abc', expect_errors=True)
assert resp.status_int == 403
assert '403 Access Denied Error: Forbidden Test' in resp.body
assert b'403 Access Denied Error: Forbidden Test' in resp.body

View File

@ -1,7 +1,7 @@
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import load_yaml_config
from wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders
import os
@ -92,12 +92,13 @@ class WSGIApp(object):
else:
err_url = None
err_msg = exc.message
if len(exc.args):
err_msg = exc.args[0]
if print_trace:
import traceback
err_details = traceback.format_exc(exc)
print err_details
err_details = traceback.format_exc()
print(err_details)
else:
logging.info(err_msg)
err_details = None

View File

@ -1,4 +1,4 @@
from Cookie import SimpleCookie, CookieError
from six.moves.http_cookies import SimpleCookie, CookieError
#=================================================================

View File

@ -1,16 +1,14 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
from HTMLParser import HTMLParser, HTMLParseError
from urlparse import urljoin, urlsplit, urlunsplit
from six.moves.html_parser import HTMLParser
from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
from url_rewriter import UrlRewriter
from regex_rewriters import JSRewriter, CSSRewriter
import cgi
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
#=================================================================
@ -411,7 +409,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def feed(self, string):
try:
HTMLParser.feed(self, string)
except HTMLParseError: # pragma: no cover
except Exception: # pragma: no cover
# only raised in 2.6
self.out.write(string)
@ -429,7 +427,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
try:
HTMLParser.close(self)
except HTMLParseError: # pragma: no cover
except Exception: # pragma: no cover
# only raised in 2.6
pass

View File

@ -1,8 +1,6 @@
import re
import sys
import itertools
from url_rewriter import UrlRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
#=================================================================

View File

@ -7,16 +7,16 @@ import re
from chardet.universaldetector import UniversalDetector
from io import BytesIO
from header_rewriter import RewrittenStatusAndHeaders
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
from rewriterules import RewriteRules
from pywb.rewrite.rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
from regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
#=================================================================
@ -288,7 +288,7 @@ class RewriteContent:
def _decode_buff(buff, stream, encoding): # pragma: no coverage
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
except UnicodeDecodeError as e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)

View File

@ -8,7 +8,7 @@ import mimetypes
import logging
import os
from urlparse import urlsplit
from six.moves.urllib.parse import urlsplit
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
from pywb.utils.loaders import extract_client_cookie
@ -16,7 +16,7 @@ from pywb.utils.timeutils import timestamp_now
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
from rewrite_content import RewriteContent
from pywb.rewrite.rewrite_content import RewriteContent
#=================================================================

View File

@ -1,13 +1,12 @@
from pywb.utils.dsrules import BaseRule
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
from header_rewriter import HeaderRewriter
from html_rewriter import HTMLRewriter
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
import itertools
import re

View File

@ -1,7 +1,7 @@
import urlparse
from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
from wburl import WbUrl
from cookie_rewriter import get_cookie_rewriter
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.cookie_rewriter import get_cookie_rewriter
#=================================================================
@ -119,11 +119,11 @@ class UrlRewriter(object):
@staticmethod
def urljoin(orig_url, url):
new_url = urlparse.urljoin(orig_url, url)
new_url = urljoin(orig_url, url)
if '../' not in new_url:
return new_url
parts = urlparse.urlsplit(new_url)
parts = urlsplit(new_url)
scheme, netloc, path, query, frag = parts
path_parts = path.split('/')
@ -147,7 +147,7 @@ class UrlRewriter(object):
parts = (scheme, netloc, path, query, frag)
new_url = urlparse.urlunsplit(parts)
new_url = urlunsplit(parts)
return new_url

View File

@ -39,8 +39,11 @@ wayback url format.
"""
import re
import urllib
import urlparse
import six
from six.moves.urllib.parse import urlsplit, urlunsplit
from six.moves.urllib.parse import quote_plus, quote, unquote_plus
#=================================================================
class BaseWbUrl(object):
@ -105,7 +108,7 @@ class WbUrl(BaseWbUrl):
if 'xn--' not in url:
return url
parts = urlparse.urlsplit(url)
parts = urlsplit(url)
domain = parts.netloc
try:
domain = domain.decode('idna')
@ -114,9 +117,9 @@ class WbUrl(BaseWbUrl):
# likely already encoded, so use as is
pass
domain = urllib.quote(domain)#, safe=r':\/')
domain = quote(domain)#, safe=r':\/')
return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))
return urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))
@staticmethod
@ -131,7 +134,7 @@ class WbUrl(BaseWbUrl):
"""
parts = WbUrl.FIRST_PATH.split(url, 1)
scheme_dom = urllib.unquote_plus(parts[0])
scheme_dom = unquote_plus(parts[0])
if isinstance(scheme_dom, str):
if scheme_dom == parts[0]:
@ -155,7 +158,7 @@ class WbUrl(BaseWbUrl):
if len(parts) > 1:
if isinstance(parts[1], unicode):
url += '/' + urllib.quote(parts[1].encode('utf-8'))
url += '/' + quote(parts[1].encode('utf-8'))
else:
url += '/' + parts[1]
@ -168,7 +171,7 @@ class WbUrl(BaseWbUrl):
if isinstance(orig_url, unicode):
orig_url = orig_url.encode('utf-8')
orig_url = urllib.quote(orig_url)
orig_url = quote(orig_url)
self._original_url = orig_url
@ -259,7 +262,7 @@ class WbUrl(BaseWbUrl):
rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?'
self.url = re.sub(rex_query, '=', self.url)
rex_query = '=(' + urllib.quote_plus(prefix) + '.*?)((?:https?%3A)?%2F%2F[^&]+)'
rex_query = '=(' + quote_plus(prefix) + '.*?)((?:https?%3A)?%2F%2F[^&]+)'
self.url = re.sub(rex_query, '=\\2', self.url)
return self.url

View File

@ -45,6 +45,17 @@ def load_yaml_config(config_file):
return config
#=================================================================
def to_native_str(value, encoding='iso-8859-1'):
if isinstance(value, str):
return value
if six.PY3 and isinstance(value, six.binary_type):
return value.decode(encoding)
elif six.PY2 and isinstance(value, six.text_type):
return value.encode(encoding)
#=================================================================
def extract_post_query(method, mime, length, stream, buffered_stream=None):
"""
@ -77,7 +88,7 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
if not buff:
break
post_query += buff
post_query += to_native_str(buff)
if buffered_stream:
buffered_stream.write(post_query)

View File

@ -5,8 +5,12 @@ Representation and parsing of HTTP-style status + headers
import pprint
from copy import copy
from six.moves import range
import six
from pywb.utils.loaders import to_native_str
WRAP_WIDTH = 80
#=================================================================
class StatusAndHeaders(object):
"""
@ -112,7 +116,7 @@ class StatusAndHeaders(object):
return self
def __repr__(self):
headers_str = pprint.pformat(self.headers, indent=2)
headers_str = pprint.pformat(self.headers, indent=2, width=WRAP_WIDTH)
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(self.protocol, self.statusline, headers_str)
@ -145,9 +149,15 @@ class StatusAndHeadersParser(object):
support continuation headers starting with space or tab
"""
def readline():
return to_native_str(stream.readline())
# status line w newlines intact
if full_statusline is None:
full_statusline = stream.readline()
full_statusline = readline()
else:
full_statusline = to_native_str(full_statusline)
statusline, total_read = _strip_count(full_statusline, 0)
@ -173,7 +183,7 @@ class StatusAndHeadersParser(object):
else:
protocol_status = statusline.split(' ', 1)
line, total_read = _strip_count(stream.readline(), total_read)
line, total_read = _strip_count(readline(), total_read)
while line:
result = line.split(':', 1)
if len(result) == 2:
@ -183,14 +193,14 @@ class StatusAndHeadersParser(object):
name = result[0]
value = None
next_line, total_read = _strip_count(stream.readline(),
next_line, total_read = _strip_count(readline(),
total_read)
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
if value is not None:
value += next_line
next_line, total_read = _strip_count(stream.readline(),
next_line, total_read = _strip_count(readline(),
total_read)
if value is not None:

View File

@ -3,7 +3,7 @@ from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_post_query, append_post_query
from recordloader import ArcWarcRecordLoader
from pywb.warc.recordloader import ArcWarcRecordLoader
import hashlib
import base64
@ -66,7 +66,10 @@ class ArchiveIterator(object):
self.member_info = None
self.no_record_parse = no_record_parse
def iter_records(self, block_size=16384):
def __iter__(self):
return self
def __call__(self, block_size=16384):
""" iterate over each record
"""
@ -152,10 +155,10 @@ class ArchiveIterator(object):
stripped = line.rstrip()
if stripped == '' or first_line:
if len(stripped) == 0 or first_line:
empty_size += len(line)
if stripped != '':
if len(stripped) != 0:
# if first line is not blank,
# likely content-length was invalid, display warning
err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
@ -290,7 +293,7 @@ class ArchiveIndexEntryMixin(object):
#=================================================================
class DefaultRecordIter(object):
class DefaultRecordParser(object):
def __init__(self, **options):
self.options = options
self.entry_cache = {}
@ -329,14 +332,14 @@ class DefaultRecordIter(object):
def end_payload(self, entry):
if self.digester:
entry['digest'] = base64.b32encode(self.digester.digest())
entry['digest'] = base64.b32encode(self.digester.digest()).decode('ascii')
self.entry = None
def create_payload_buffer(self, entry):
return None
def create_record_iter(self, arcv_iter):
def create_record_iter(self, raw_iter):
append_post = self.options.get('append_post')
include_all = self.options.get('include_all')
block_size = self.options.get('block_size', 16384)
@ -347,7 +350,7 @@ class DefaultRecordIter(object):
raise Exception('Sorry, minimal index option and ' +
'append POST options can not be used together')
for record in arcv_iter.iter_records(block_size):
for record in raw_iter(block_size):
entry = None
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
@ -394,9 +397,9 @@ class DefaultRecordIter(object):
entry.record = record
self.begin_payload(compute_digest, entry)
arcv_iter.read_to_end(record, self.handle_payload)
raw_iter.read_to_end(record, self.handle_payload)
entry.set_rec_info(*arcv_iter.member_info)
entry.set_rec_info(*raw_iter.member_info)
self.end_payload(entry)
yield entry
@ -536,8 +539,15 @@ class DefaultRecordIter(object):
yield entry
def open(self, filename):
with open(filename, 'rb') as fh:
for entry in self(fh):
yield entry
class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict):
pass
class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict):
pass

View File

@ -29,15 +29,18 @@ except ImportError: # pragma: no cover
from argparse import ArgumentParser, RawTextHelpFormatter
from bisect import insort
from io import BytesIO
from six import StringIO
from archiveiterator import DefaultRecordIter
from pywb.warc.archiveiterator import DefaultRecordParser
import codecs
import six
#=================================================================
class BaseCDXWriter(object):
def __init__(self, out):
self.out = out
self.out = codecs.getwriter('utf-8')(out)
#self.out = out
def __enter__(self):
self._write_header()
@ -69,7 +72,7 @@ class CDXJ(object):
outdict = OrderedDict()
for n, v in entry.iteritems():
for n, v in six.iteritems(entry):
if n in ('urlkey', 'timestamp'):
continue
@ -145,7 +148,7 @@ class SortedCDXWriter(BaseCDXWriter):
return res
def write(self, entry, filename):
self.out = BytesIO()
self.out = StringIO()
super(SortedCDXWriter, self).write(entry, filename)
line = self.out.getvalue()
if line:
@ -175,7 +178,7 @@ def iter_file_or_dir(inputs, recursive=True, rel_root=None):
if not rel_root:
filename = os.path.basename(input_)
else:
filename = _resolve_rel_path(input_, rel_root)
filename = _resolve_rel_path(input_, rel_root)
yield input_, filename
@ -268,7 +271,7 @@ def write_multi_cdx_index(output, inputs, **options):
outfile = open(output, 'wb')
writer_cls = get_cdx_writer_cls(options)
record_iter = DefaultRecordIter(**options)
record_iter = DefaultRecordParser(**options)
with writer_cls(outfile) as writer:
for fullpath, filename in iter_file_or_dir(inputs,
@ -285,13 +288,12 @@ def write_multi_cdx_index(output, inputs, **options):
#=================================================================
def write_cdx_index(outfile, infile, filename, **options):
if type(filename) is unicode:
filename = filename.encode(sys.getfilesystemencoding())
#filename = filename.encode(sys.getfilesystemencoding())
writer_cls = get_cdx_writer_cls(options)
with writer_cls(outfile) as writer:
entry_iter = DefaultRecordIter(**options)(infile)
entry_iter = DefaultRecordParser(**options)(infile)
for entry in entry_iter:
writer.write(entry, filename)

View File

@ -1,9 +1,11 @@
import redis
from pywb.utils.binsearch import iter_exact
from pywb.utils.loaders import to_native_str
from six.moves.urllib.parse import urlsplit
from six.moves.urllib.request import url2pathname
import urlparse
import urllib
import os
import logging
@ -49,7 +51,7 @@ class RedisResolver:
def __call__(self, filename):
redis_val = self.redis.hget(self.key_prefix + filename, 'path')
return [redis_val] if redis_val else []
return [to_native_str(redis_val)] if redis_val else []
def __repr__(self):
return "RedisResolver('{0}')".format(self.redis_url)
@ -62,12 +64,12 @@ class PathIndexResolver:
def __call__(self, filename):
with open(self.pathindex_file, 'rb') as reader:
result = iter_exact(reader, filename, '\t')
result = iter_exact(reader, filename.encode('utf-8'), b'\t')
for pathline in result:
paths = pathline.split('\t')[1:]
paths = pathline.split(b'\t')[1:]
for path in paths:
yield path
yield to_native_str(path)
def __repr__(self): # pragma: no cover
return "PathIndexResolver('{0}')".format(self.pathindex_file)
@ -84,7 +86,7 @@ def make_best_resolver(param):
path = param
arg = None
url_parts = urlparse.urlsplit(path)
url_parts = urlsplit(path)
if url_parts.scheme == 'redis':
logging.debug('Adding Redis Index: ' + path)
@ -92,7 +94,7 @@ def make_best_resolver(param):
if url_parts.scheme == 'file':
path = url_parts.path
path = urllib.url2pathname(path)
path = url2pathname(path)
if os.path.isfile(path):
logging.debug('Adding Path Index: ' + path)
@ -106,7 +108,7 @@ def make_best_resolver(param):
#=================================================================
def make_best_resolvers(paths):
if hasattr(paths, '__iter__'):
return map(make_best_resolver, paths)
if isinstance(paths, list) or isinstance(paths, set):
return list(map(make_best_resolver, paths))
else:
return [make_best_resolver(paths)]

View File

@ -1,5 +1,3 @@
import itertools
import urlparse
import collections
from pywb.utils.statusandheaders import StatusAndHeaders
@ -7,10 +5,14 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader, LimitReader
from pywb.utils.loaders import to_native_str
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
from six.moves import zip
import six
#=================================================================
ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
@ -34,7 +36,7 @@ class ArchiveLoadFailed(WbException):
#=================================================================
class ArcWarcRecordLoader:
class ArcWarcRecordLoader(object):
# Standard ARC v1.0 headers
# TODO: support ARC v2.0 also?
ARC_HEADERS = ["uri", "ip-address", "archive-date",
@ -73,7 +75,7 @@ class ArcWarcRecordLoader:
except:
length = -1
stream = self.loader.load(url, long(offset), length)
stream = self.loader.load(url, int(offset), length)
decomp_type = 'gzip'
# Create decompressing stream
@ -200,16 +202,21 @@ class ArcWarcRecordLoader:
#=================================================================
class ARCHeadersParser:
class ARCHeadersParser(object):
def __init__(self, headernames):
self.headernames = headernames
def parse(self, stream, headerline=None):
total_read = 0
def readline():
return to_native_str(stream.readline())
# if headerline passed in, use that
if headerline is None:
headerline = stream.readline()
headerline = readline()
else:
headerline = to_native_str(headerline)
header_len = len(headerline)
@ -222,8 +229,8 @@ class ARCHeadersParser:
# if arc header, consume next two lines
if headerline.startswith('filedesc://'):
version = stream.readline() # skip version
spec = stream.readline() # skip header spec, use preset one
version = readline() # skip version
spec = readline() # skip header spec, use preset one
total_read += len(version)
total_read += len(spec)
@ -236,7 +243,7 @@ class ARCHeadersParser:
headers = []
for name, value in itertools.izip(headernames, parts):
for name, value in zip(headernames, parts):
headers.append((name, value))
return StatusAndHeaders(statusline='',

View File

@ -1,8 +1,10 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pathresolvers import make_best_resolvers
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.pathresolvers import make_best_resolvers
from pywb.utils.wbexception import NotFoundException
import six
#=================================================================
class ResolvingLoader(object):
@ -104,6 +106,9 @@ class ResolvingLoader(object):
for resolver in self.path_resolvers:
possible_paths = resolver(filename)
#import sys
#sys.stderr.write(str(possible_paths))
if possible_paths:
for path in possible_paths:
any_found = True
@ -125,7 +130,8 @@ class ResolvingLoader(object):
else:
msg = 'Archive File Not Found'
raise ArchiveLoadFailed(msg, filename), None, last_traceback
#raise ArchiveLoadFailed(msg, filename), None, last_traceback
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback)
def _load_different_url_payload(self, cdx, headers_record,
failed_files, cdx_loader):

View File

@ -48,9 +48,9 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
>>> print_cdx_index('example-wget-1-14.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# wget warc, includes metadata and request
@ -58,9 +58,9 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# wpull warc, includes metadata by default
>>> print_cdx_index('example-wpull.warc.gz')
@ -127,7 +127,7 @@ com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit
com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 3207 example-extra.warc
com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 5910 example-extra.warc
>>> print_cdx_index('example-extra.warc', verify_http=True)
>>> print_cdx_index('example-extra.warc', verify_http=True) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0', 'HTTP/1.1'] - Found: HTTPX/1.1 200 OK
@ -178,7 +178,7 @@ urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX
Total: 210
# test writing to temp dir, also use unicode filename
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
example.cdx
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
@ -223,7 +223,7 @@ def cdx_index(warc, **options):
return buff.getvalue()
def print_cdx_index(*args, **kwargs):
sys.stdout.write(cdx_index(*args, **kwargs))
sys.stdout.write(cdx_index(*args, **kwargs).decode('utf-8'))
def assert_cdx_match(cdx, warc, sort=False):
assert read_fully(cdx) == cdx_index(warc, sort=sort)
@ -239,11 +239,11 @@ def cli_lines(cmds):
sys.stdout = buff
main(cmds)
sys.stdout = orig
lines = buff.getvalue().rstrip().split('\n')
lines = buff.getvalue().rstrip().split(b'\n')
# print first, last, num lines
print(lines[1])
print(lines[-1])
print(lines[1].decode('utf-8'))
print(lines[-1].decode('utf-8'))
print('Total: ' + str(len(lines)))
def cli_lines_with_dir(input_):
@ -256,10 +256,10 @@ def cli_lines_with_dir(input_):
filename = cdx_filename(os.path.basename(input_))
print filename
print(filename)
with open(os.path.join(tmp_dir, filename), 'rb') as fh:
lines = fh.read(8192).rstrip().split('\n')
lines = fh.read(8192).rstrip().split(b'\n')
finally:
try:
@ -273,8 +273,8 @@ def cli_lines_with_dir(input_):
return
# print first, last, num lines
print (lines[1])
print (lines[-1])
print(lines[1].decode('utf-8'))
print(lines[-1].decode('utf-8'))
print('Total: ' + str(len(lines)))
@ -284,18 +284,18 @@ def test_non_chunked_gzip_err():
def parse_cdxj(string):
lines = string.split('\n')
if lines[0] == '':
lines = string.split(b'\n')
if lines[0] == b'':
lines = lines[1:]
cdxlist = map(CDXObject, lines)
return map(dict, cdxlist)
cdxlist = list(map(CDXObject, lines))
return list(map(dict, cdxlist))
def test_cdxj_warc_minimal():
# cdxj minimal
res = cdx_index('example.warc.gz', minimal=True, cdxj=True)
assert parse_cdxj(res) == parse_cdxj("""
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
@ -306,7 +306,7 @@ def test_cdxj_warc_all():
# warc.gz -- parse all -- CDXJ
res = cdx_index('example.warc.gz', include_all=True, cdxj=True)
assert parse_cdxj(res) == parse_cdxj("""
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
@ -317,14 +317,14 @@ org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/ex
def test_cdxj_arc():
# arc.gz -- json
res = cdx_index('example.arc.gz', cdxj=True)
assert parse_cdxj(res) == parse_cdxj("""
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
def test_cdxj_arc_minimal():
# arc.gz -- minimal + json
res = cdx_index('example.arc.gz', cdxj=True, minimal=True)
assert parse_cdxj(res) == parse_cdxj("""
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")

View File

@ -37,8 +37,7 @@ Test loading different types of records from a variety of formats
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
( 'WARC-Profile',
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Profile', 'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
@ -66,17 +65,13 @@ Test loading different types of records from a variety of formats
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'),
( 'Accept',
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
('Accept-Language', 'en-US,en;q=0.8'),
( 'User-Agent',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'),
('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'),
('Host', 'example.com')]))
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []))
# Test of record loading based on cdx line
# Print parsed http headers + 2 lines of content
# ==============================================================================
@ -233,7 +228,7 @@ failed_files=failed_files)
Exception: ArchiveLoadFailed
# ensure failed_files being filled
>>> failed_files
>>> print_strs(failed_files)
['x-not-found-x.warc.gz']
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 x-not-found-x.warc.gz',\
@ -295,12 +290,15 @@ Exception: ArchiveLoadFailed
import os
import sys
import pprint
import six
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.pathresolvers import make_best_resolvers
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.cdx.cdxobject import CDXObject
import pywb.utils.statusandheaders
from pywb import get_test_dir
#==============================================================================
@ -319,7 +317,7 @@ URL_AGNOSTIC_REVISIT_NO_DIGEST_CDX = 'com,example)/ 20130729195151 http://test@e
warc/revisit - - - - \
591 355 example-url-agnostic-revisit.warc.gz'
BAD_ORIG_CDX = 'org,iana,example)/ 20130702195401 http://example.iana.org/ \
BAD_ORIG_CDX = b'org,iana,example)/ 20130702195401 http://example.iana.org/ \
text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
1001 353 someunknown.warc.gz'
@ -332,8 +330,10 @@ def load_test_archive(test_file, offset, length):
archive = testloader.load(path, offset, length)
pywb.utils.statusandheaders.WRAP_WIDTH = 160
pprint.pprint(((archive.format, archive.rec_type),
archive.rec_headers, archive.status_headers))
archive.rec_headers, archive.status_headers), indent=1, width=160)
#==============================================================================
@ -345,25 +345,25 @@ def load_orig_bad_cdx(_):
#==============================================================================
def load_orig_cdx(_):
return [CDXObject(BAD_ORIG_CDX),
CDXObject(URL_AGNOSTIC_ORIG_CDX)]
CDXObject(URL_AGNOSTIC_ORIG_CDX.encode('utf-8'))]
#==============================================================================
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
failed_files=None):
resolve_loader = ResolvingLoader(test_warc_dir)
cdx = CDXObject(cdx)
cdx = CDXObject(cdx.encode('utf-8'))
try:
(headers, stream) = resolve_loader(cdx, failed_files, revisit_func)
print headers
sys.stdout.write(stream.readline())
sys.stdout.write(stream.readline())
print(headers)
sys.stdout.write(stream.readline().decode('utf-8'))
sys.stdout.write(stream.readline().decode('utf-8'))
except ArchiveLoadFailed as e:
if reraise:
raise
else:
print 'Exception: ' + e.__class__.__name__
print('Exception: ' + e.__class__.__name__)
#==============================================================================
@ -371,7 +371,14 @@ def parse_stream_error(**params):
try:
return ArcWarcRecordLoader().parse_record_stream(**params)
except Exception as e:
print 'Exception: ' + e.__class__.__name__
print('Exception: ' + e.__class__.__name__)
#==============================================================================
def print_strs(strings):
    """Normalize a list of strings for doctest comparison across Python versions.

    On Python 2 each item is UTF-8 encoded (so doctest output shows plain
    ``str``/bytes); on Python 3 the items are returned unchanged as ``str``.
    Always returns a new list.
    """
    if six.PY2:
        return [string.encode('utf-8') for string in strings]
    return list(strings)
if __name__ == "__main__":

View File

@ -47,7 +47,7 @@ RedisResolver('redis://myhost.example.com:1234/1')
# make_best_resolvers
>>> r = make_best_resolvers(['http://example.com/warcs/',\
'redis://example.com:1234/1'])
>>> map(lambda x: x.__class__.__name__, r)
>>> list(map(lambda x: x.__class__.__name__, r))
['PrefixResolver', 'RedisResolver']
"""

View File

@ -3,9 +3,10 @@ from pywb.cdx.cdxserver import create_cdx_server
from pywb.framework.basehandlers import BaseHandler
from pywb.framework.wbrequestresponse import WbResponse
from query_handler import QueryHandler
from pywb.webapp.query_handler import QueryHandler
from urlparse import parse_qs
from six.moves.urllib.parse import parse_qs
import six
#=================================================================
@ -22,7 +23,11 @@ class CDXAPIHandler(BaseHandler):
cdx_iter = self.index_handler.load_cdx(wbrequest, params)
return WbResponse.text_stream(cdx_iter)
def to_utf8():
for cdx in cdx_iter:
yield cdx.encode('utf-8')
return WbResponse.text_stream(to_utf8())
@staticmethod
def extract_params_from_wsgi_env(env):
@ -35,7 +40,7 @@ class CDXAPIHandler(BaseHandler):
# cdx processing expects singleton params for all params,
# except filters, so convert here
# use first value of the list
for name, val in params.iteritems():
for name, val in six.iteritems(params):
if name != 'filter':
params[name] = val[0]

View File

@ -15,8 +15,8 @@ from pywb.framework.wbrequestresponse import WbResponse
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from views import J2TemplateView, init_view
from replay_views import ReplayView
from pywb.webapp.views import J2TemplateView, init_view
from pywb.webapp.replay_views import ReplayView
from pywb.framework.memento import MementoResponse
from pywb.utils.timeutils import datetime_to_timestamp

View File

@ -4,8 +4,8 @@ from pywb.framework.cache import create_cache
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from handlers import StaticHandler, SearchPageWbUrlHandler
from views import HeadInsertView
from pywb.webapp.handlers import StaticHandler, SearchPageWbUrlHandler
from pywb.webapp.views import HeadInsertView
from pywb.utils.wbexception import WbException
@ -60,7 +60,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
except Exception as exc:
import traceback
err_details = traceback.format_exc(exc)
print err_details
print(err_details)
url = wbrequest.wb_url.url
msg = 'Could not load the url from the live web: ' + url

View File

@ -6,21 +6,22 @@ from pywb.framework.wbrequestresponse import WbRequest
from pywb.framework.memento import MementoRequest
from pywb.framework.basehandlers import BaseHandler
from views import J2TemplateView
from views import J2HtmlCapturesView, init_view
from pywb.webapp.views import J2TemplateView
from pywb.webapp.views import J2HtmlCapturesView, init_view
from live_rewrite_handler import RewriteHandler
from pywb.webapp.live_rewrite_handler import RewriteHandler
from query_handler import QueryHandler
from handlers import WBHandler
from handlers import StaticHandler
from handlers import DebugEchoHandler, DebugEchoEnvHandler
from cdx_api_handler import CDXAPIHandler
from pywb.webapp.query_handler import QueryHandler
from pywb.webapp.handlers import WBHandler
from pywb.webapp.handlers import StaticHandler
from pywb.webapp.handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.webapp.cdx_api_handler import CDXAPIHandler
from pywb import DEFAULT_CONFIG
import os
import logging
import six
#=================================================================
@ -130,7 +131,7 @@ def create_cdx_server_app(passed_config):
routes = []
for name, value in collections.iteritems():
for name, value in six.iteritems(collections):
route_config = init_route_config(value, config)
query_handler = init_collection(route_config)
@ -234,7 +235,7 @@ class DirectoryCollsLoader(object):
# Check all templates
template_files = self.config.get('paths')['template_files']
for tname, tfile in template_files.iteritems():
for tname, tfile in six.iteritems(template_files):
if tname in coll_config:
# Already set
coll_config[tname] = self._norm_path(root_dir, coll_config[tname])
@ -288,10 +289,10 @@ def create_wb_router(passed_config=None):
jinja_env.globals.update(config.get('template_globals', {}))
for static_name, static_path in static_routes.iteritems():
for static_name, static_path in six.iteritems(static_routes):
routes.append(Route(static_name, StaticHandler(static_path)))
for name, value in collections.iteritems():
for name, value in six.iteritems(collections):
if isinstance(value, BaseHandler):
handler_dict[name] = value
new_route = Route(name, value, config=config)

View File

@ -1,12 +1,9 @@
import urllib
import urllib2
from pywb.utils.dsrules import DEFAULT_RULES_FILE
from pywb.perms.perms_filter import make_perms_cdx_filter
from pywb.framework.wbrequestresponse import WbResponse
from pywb.cdx.cdxserver import create_cdx_server
from views import MementoTimemapView
from pywb.webapp.views import MementoTimemapView
#=================================================================

View File

@ -2,7 +2,7 @@ import re
import logging
from io import BytesIO
from urlparse import urlsplit
from six.moves.urllib.parse import urlsplit
from itertools import chain
from pywb.utils.statusandheaders import StatusAndHeaders
@ -16,9 +16,9 @@ from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.warc.recordloader import ArchiveLoadFailed
from views import HeadInsertView
from pywb.webapp.views import HeadInsertView
from rangecache import range_cache
from pywb.webapp.rangecache import range_cache
#=================================================================

View File

@ -2,13 +2,12 @@ from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT
import urlparse
import urllib
from six.moves.urllib.parse import urlsplit
import logging
import json
import os
from itertools import imap
from jinja2 import Environment
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
@ -48,7 +47,7 @@ def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
@template_filter('urlsplit')
def get_urlsplit(url):
split = urlparse.urlsplit(url)
split = urlsplit(url)
return split