Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-24 06:59:52 +01:00)
more python 3 support work -- pywb.cdx, pywb.warc tests succeed
most relative imports replaced with absolute
parent b7008920de
commit bd841b91a9
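
As an aside, here is a minimal sketch (not part of this commit; the helper name parse_cdx_fields is hypothetical, and it assumes the `six` library is installed) of the py2/py3 compatibility idioms the diff below applies throughout -- absolute imports, six.moves for relocated stdlib modules, and explicit bytes/str handling when parsing CDX lines:

    from __future__ import absolute_import, print_function

    import six
    from six.moves import zip
    from six.moves.urllib.parse import urlsplit

    def parse_cdx_fields(line, headers):
        # CDX lines are read from disk as bytes; split on a bytes separator
        # and decode each field so lookups return text on both py2 and py3.
        fields = line.rstrip().split(b' ')
        return dict((h, f.decode('utf-8')) for h, f in zip(headers, fields))

    rec = parse_cdx_fields(b'com,example)/ 20140127171200', ['urlkey', 'timestamp'])
    print(rec['timestamp'], urlsplit('http://example.com/path').netloc)
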
@@ -1,4 +1,4 @@
-__version__ = '0.11.1'
+__version__ = '1.0b'
 
 DEFAULT_CONFIG = 'pywb/default_config.yaml'
 
@@ -2,13 +2,14 @@ import yaml
 import re
 import logging
 import pkg_resources
-import urlparse
+from six.moves.urllib.parse import urlsplit
 
 from pywb.utils.dsrules import BaseRule, RuleSet
 
 from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
 
-from query import CDXQuery
+from pywb.cdx.query import CDXQuery
 
 
 #=================================================================
@@ -102,7 +103,7 @@ class FuzzyQuery:
             url = url[:inx + len(repl)]
 
         if matched_rule.match_type == 'domain':
-            host = urlparse.urlsplit(url).netloc
+            host = urlsplit(url).netloc
             # remove the subdomain
             url = host.split('.', 1)[1]
 
@@ -3,10 +3,11 @@ try: # pragma: no cover
 except ImportError: # pragma: no cover
     from ordereddict import OrderedDict
 
-import itertools
+import six
+from six.moves import zip
 
-from urllib import urlencode, quote
-from urlparse import parse_qs
+from six.moves.urllib.parse import urlencode, quote
+from six.moves.urllib.parse import parse_qs
 
 from pywb.utils.wbexception import WbException
 
@@ -101,7 +102,7 @@ class CDXObject(OrderedDict):
         'f': FILENAME
     }
 
-    def __init__(self, cdxline=''):
+    def __init__(self, cdxline=b''):
        OrderedDict.__init__(self)
 
         cdxline = cdxline.rstrip()
@@ -112,28 +113,28 @@ class CDXObject(OrderedDict):
             self.cdxline = cdxline
             return
 
-        fields = cdxline.split(' ' , 2)
+        fields = cdxline.split(b' ' , 2)
         # Check for CDX JSON
-        if fields[-1].startswith('{'):
-            self[URLKEY] = fields[0]
-            self[TIMESTAMP] = fields[1]
-            json_fields = json_decode(fields[-1])
-            for n, v in json_fields.iteritems():
+        if fields[-1].startswith(b'{'):
+            self[URLKEY] = fields[0].decode('utf-8')
+            self[TIMESTAMP] = fields[1].decode('utf-8')
+            json_fields = json_decode(fields[-1].decode('utf-8'))
+            for n, v in six.iteritems(json_fields):
                 n = self.CDX_ALT_FIELDS.get(n, n)
 
                 try:
-                    self[n] = str(v)
+                    v.encode('ascii')
                 except UnicodeEncodeError:
-                    v = v.encode('utf-8')
-                    parts = v.split('//', 1)
-                    v = parts[0] + '//' + quote(parts[1])
+                    parts = v.encode('utf-8').split(b'//', 1)
+                    v = parts[0].decode('utf-8') + '//' + quote(parts[1])
                 self[n] = v
 
             self.cdxline = cdxline
             self._from_json = True
             return
 
-        more_fields = fields.pop().split(' ')
+        more_fields = fields.pop().split(b' ')
         fields.extend(more_fields)
 
         cdxformat = None
@@ -145,8 +146,8 @@ class CDXObject(OrderedDict):
             msg = 'unknown {0}-field cdx format'.format(len(fields))
             raise CDXException(msg)
 
-        for header, field in itertools.izip(cdxformat, fields):
-            self[header] = field
+        for header, field in zip(cdxformat, fields):
+            self[header] = field.decode('utf-8')
 
         self.cdxline = cdxline
 
@@ -204,13 +205,14 @@ class CDXObject(OrderedDict):
 
     def __str__(self):
         if self.cdxline:
-            return self.cdxline
+            return self.cdxline.decode('utf-8')
 
         if not self._from_json:
-            return ' '.join(val for n, val in self.iteritems())
+            return ' '.join(val for n, val in six.iteritems(self))
         else:
             return json_encode(self)
 
 
 #=================================================================
 class IDXObject(OrderedDict):
 
@@ -221,14 +223,14 @@ class IDXObject(OrderedDict):
         OrderedDict.__init__(self)
 
         idxline = idxline.rstrip()
-        fields = idxline.split('\t')
+        fields = idxline.split(b'\t')
 
         if len(fields) < self.NUM_REQ_FIELDS:
             msg = 'invalid idx format: {0} fields found, {1} required'
             raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
 
-        for header, field in itertools.izip(self.FORMAT, fields):
-            self[header] = field
+        for header, field in zip(self.FORMAT, fields):
+            self[header] = field.decode('utf-8')
 
         self['offset'] = int(self['offset'])
         self['length'] = int(self['length'])
@@ -250,4 +252,4 @@ class IDXObject(OrderedDict):
         return json_encode(self) + '\n'
 
     def __str__(self):
-        return self.idxline
+        return self.idxline.decode('utf-8')
@@ -1,15 +1,18 @@
-from cdxobject import CDXObject, IDXObject
-from cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
-from cdxobject import OFFSET, LENGTH, FILENAME
+from pywb.cdx.cdxobject import CDXObject, IDXObject
+from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
+from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME
 
-from query import CDXQuery
+from pywb.cdx.query import CDXQuery
 from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp
 from pywb.utils.timeutils import PAD_14_DOWN, PAD_14_UP
 
 import bisect
-import itertools
+from six.moves import zip, range, map
 import re
 
 
 
 from heapq import merge
 from collections import deque
 
@@ -127,7 +130,7 @@ def cdx_limit(cdx_iter, limit):
     """
     # for cdx, _ in itertools.izip(cdx_iter, xrange(limit)):
     # yield cdx
-    return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit)))
+    return (cdx for cdx, _ in zip(cdx_iter, range(limit)))
 
 
 #=================================================================
@@ -221,7 +224,7 @@ def cdx_filter(cdx_iter, filter_strings):
         def regex(self, val):
             return self.regex.match(val) is not None
 
-    filters = map(Filter, filter_strings)
+    filters = list(map(Filter, filter_strings))
 
     for cdx in cdx_iter:
         if all(x(cdx) for x in filters):
@@ -273,7 +276,7 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
     sort CDXCaptureResult by closest to timestamp.
     """
     closest_cdx = []
+    closest_keys = []
     closest_sec = timestamp_to_sec(closest)
 
     for cdx in cdx_iter:
@@ -281,19 +284,26 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
         key = abs(closest_sec - sec)
 
         # create tuple to sort by key
-        bisect.insort(closest_cdx, (key, cdx))
+        #bisect.insort(closest_cdx, (key, cdx))
 
+        i = bisect.bisect_right(closest_keys, key)
+        closest_keys.insert(i, key)
+        closest_cdx.insert(i, cdx)
+
         if len(closest_cdx) == limit:
             # assuming cdx in ascending order and keys have started increasing
-            if key > closest_cdx[-1]:
+            if key > closest_keys[-1]:
                 break
 
         if len(closest_cdx) > limit:
             closest_cdx.pop()
 
-    for cdx in itertools.imap(lambda x: x[1], closest_cdx):
+    for cdx in closest_cdx:
         yield cdx
 
+    #for cdx in map(lambda x: x[1], closest_cdx):
+    # yield cdx
 
 
 #=================================================================
 # resolve revisits
@@ -1,19 +1,18 @@
 from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
 from pywb.utils.wbexception import NotFoundException
 
-from cdxops import cdx_load
-from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
-from zipnum import ZipNumCluster
-from cdxobject import CDXObject, CDXException
-from query import CDXQuery
-from cdxdomainspecific import load_domain_specific_cdx_rules
+from pywb.cdx.cdxops import cdx_load
+from pywb.cdx.cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
+from pywb.cdx.zipnum import ZipNumCluster
+from pywb.cdx.cdxobject import CDXObject, CDXException
+from pywb.cdx.query import CDXQuery
+from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
 
 from pywb.utils.loaders import is_http
 
 from itertools import chain
 import logging
 import os
-import urlparse
 
 
 #=================================================================
@@ -3,11 +3,11 @@ from pywb.utils.binsearch import iter_range
 from pywb.utils.wbexception import AccessException, NotFoundException
 from pywb.utils.wbexception import BadRequestException, WbException
 
-from query import CDXQuery
+from pywb.cdx.query import CDXQuery
 
-import urllib
-import urllib2
-import itertools
+from six.moves.urllib.request import urlopen, Request
+from six.moves.urllib.error import HTTPError
+from six.moves import map
 
 
 #=================================================================
@@ -33,7 +33,8 @@ class CDXFile(CDXSource):
     @staticmethod
     def _do_load_file(filename, query):
         with open(filename, 'rb') as source:
-            gen = iter_range(source, query.key, query.end_key)
+            gen = iter_range(source, query.key.encode('utf-8'),
+                             query.end_key.encode('utf-8'))
             for line in gen:
                 yield line
 
@@ -65,14 +66,14 @@ class RemoteCDXSource(CDXSource):
         urlparams = remote_query.urlencode()
 
         try:
-            request = urllib2.Request(self.remote_url + '?' + urlparams)
+            request = Request(self.remote_url + '?' + urlparams)
 
             if self.cookie:
                 request.add_header('Cookie', self.cookie)
 
-            response = urllib2.urlopen(request)
+            response = urlopen(request)
 
-        except urllib2.HTTPError as e:
+        except HTTPError as e:
             if e.code == 403:
                 raise AccessException('Access Denied')
             elif e.code == 404:
@@ -95,14 +96,14 @@ class RemoteCDXSource(CDXSource):
 
 #=================================================================
 class RedisCDXSource(CDXSource):
-    DEFAULT_KEY_PREFIX = 'c:'
+    DEFAULT_KEY_PREFIX = b'c:'
 
     def __init__(self, redis_url, config=None):
         import redis
 
         parts = redis_url.split('/')
         if len(parts) > 4:
-            self.cdx_key = parts[4]
+            self.cdx_key = parts[4].encode('utf-8')
             redis_url = 'redis://' + parts[2] + '/' + parts[3]
         else:
             self.cdx_key = None
@@ -126,7 +127,7 @@ class RedisCDXSource(CDXSource):
         if self.cdx_key:
             return self.load_sorted_range(query, self.cdx_key)
         else:
-            return self.load_single_key(query.key)
+            return self.load_single_key(query.key.encode('utf-8'))
 
     def load_sorted_range(self, query, cdx_key):
         cdx_list = self.redis.zrangebylex(cdx_key,
@@ -137,12 +138,12 @@ class RedisCDXSource(CDXSource):
 
     def load_single_key(self, key):
         # ensure only url/surt is part of key
-        key = key.split(' ')[0]
+        key = key.split(b' ')[0]
         cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
 
         # key is not part of list, so prepend to each line
-        key += ' '
-        cdx_list = itertools.imap(lambda x: key + x, cdx_list)
+        key += b' '
+        cdx_list = map(lambda x: key + x, cdx_list)
         return cdx_list
 
     def __str__(self):
@@ -1,5 +1,5 @@
-from urllib import urlencode
-from cdxobject import CDXException
+from six.moves.urllib.parse import urlencode
+from pywb.cdx.cdxobject import CDXException
 
 
 #=================================================================
@@ -5,17 +5,17 @@ from pywb.cdx.cdxobject import CDXObject, IDXObject, CDXException
 from pytest import raises
 
 def test_empty_cdxobject():
-    x = CDXObject('')
+    x = CDXObject(b'')
     assert len(x) == 0
 
 def test_invalid_cdx_format():
     with raises(CDXException):
-        x = CDXObject('a b c')
+        x = CDXObject(b'a b c')
 
 
 def _make_line(fields):
     line = ' '.join(['-'] * fields)
-    x = CDXObject(line)
+    x = CDXObject(line.encode('utf-8'))
     assert len(x) == fields
     assert str(x) == line
 
@@ -29,13 +29,13 @@ def test_valid_cdx_formats():
     _make_line(14)
 
 def test_unicode_url():
-    x = CDXObject('com,example,cafe)/ 123 {"url": "http://example.com/café/path"}')
+    x = CDXObject(u'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'.encode('utf-8'))
     assert x['urlkey'] == 'com,example,cafe)/'
     assert x['timestamp'] == '123'
     assert x['url'] == 'http://example.com/caf%C3%A9/path'
 
 def test_invalid_idx_format():
     with raises(CDXException):
-        x = IDXObject('a b c')
+        x = IDXObject(b'a b c')
 
 
@@ -31,17 +31,17 @@ com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYA
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
 
->>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012')
+>>> cdx_ops_test('http://example.com/', sources = [test_cdx_dir], to='2012') # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 NotFoundException: No Captures found for: http://example.com/
 
 # No matching results
->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
+>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 NotFoundException: No Captures found for: http://iana.org/dont_have_this
 
 # No matching -- limit=1
->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
+>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 NotFoundException: No Captures found for: http://iana.org/dont_have_this
 
@@ -69,7 +69,7 @@ org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
 
 # Filter -- no such field, no matches
->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200')
+>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200') # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css
 
@@ -163,50 +163,66 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_
 org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
 org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
 
-# Resolve Revisit -- cdxj minimal -- output also json
->>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
-{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
-{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}
-
-# Resolve Revisit -- cdxj minimal -- output also json
->>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True)
-{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
-{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
-
 
 """
 
 #=================================================================
 from pywb.cdx.cdxserver import CDXServer
 import os
 import sys
+import six
 
 from pywb import get_test_dir
 
 test_cdx_dir = get_test_dir() + 'cdx/'
 
 
-def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
+def cdx_ops_test_data(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
     kwparams['url'] = url
     if not 'output' in kwparams:
         kwparams['output'] = 'cdxobject'
-    fields = kwparams.get('fields')
-    if fields:
-        fields = fields.split(',')
 
     server = CDXServer(sources)
     results = server.load_cdx(**kwparams)
+    return list(results)
+
+
+def cdx_ops_test(*args, **kwargs):
+    results = cdx_ops_test_data(*args, **kwargs)
+
+    fields = kwargs.get('fields')
+    if fields:
+        fields = fields.split(',')
 
     for x in results:
         if not isinstance(x, str):
             l = x.to_text(fields).replace('\t', ' ')
         else:
             l = x
 
         sys.stdout.write(l)
 
 
+def test_cdxj_resolve_revisit():
+    # Resolve Revisit -- cdxj minimal -- output also json
+    results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
+    assert(len(results) == 2)
+    assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})
+
+    assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"})
+
+
+def test_cdxj_resolve_revisit_2():
+    # Resolve Revisit -- cdxj minimal -- output also json
+    results = cdx_ops_test_data(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True)
+    assert(len(results) == 2)
+    assert(dict(results[0]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})
+
+    assert(dict(results[1]) == {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"})
+
+
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
@@ -6,11 +6,14 @@ from pywb.utils.dsrules import DEFAULT_RULES_FILE
 from pywb.utils.wbexception import AccessException, NotFoundException
 from pywb.utils.wbexception import BadRequestException, WbException
 
-from urllib2 import HTTPError
+from six.moves.urllib.error import HTTPError
 
 from mock import patch
 from pytest import raises
 import webtest
+import unittest
 
+import six
 
 from pywb import get_test_dir
 
@@ -41,7 +44,7 @@ def setup_module(self):
 
 def mock_urlopen(req):
     resp = testapp.get(req.get_full_url())
-    return resp.body.split('\n')
+    return resp.body.split(b'\n')
 
 def mock_urlopen_err(err):
     def make_err(req):
@@ -58,45 +61,44 @@ def mock_urlopen_fuzzy(req):
     resp = testapp.get(req.get_full_url(), status=status)
 
     if status == 200:
-        return resp.body.split('\n')
+        return resp.body.split(b'\n')
     else:
         raise mock_urlopen_err(404)(req)
 
-@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen)
+@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen)
 def assert_cdx_match(server):
     x = server.load_cdx(url='example.com',
                         limit=2,
                         output='cdxobject')
-    x.next()
-    assert x.next().items() == CDX_RESULT
+    x = list(x)
+    assert(list(x[1].items()) == CDX_RESULT)
 
 
 def assert_cdx_fuzzy_match(server, mock=mock_urlopen):
-    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock):
+    with patch('pywb.cdx.cdxsource.urlopen', mock):
         x = server.load_cdx(url='http://example.com?_=123',
                             limit=2,
                             output='cdxobject',
                             allowFuzzy=True)
-    x.next()
-    assert x.next().items() == CDX_RESULT
+    x = list(x)
+    assert(list(x[1].items()) == CDX_RESULT)
 
 
-@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(404))
+@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(404))
 def assert_404(server):
     server.load_cdx(url='http://notfound.example.com')
 
 
-@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(403))
+@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(403))
 def assert_403(server):
     server.load_cdx(url='http://notfound.example.com')
 
 
-@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(400))
+@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(400))
 def assert_400(server):
     server.load_cdx(url='http://notfound.example.com')
 
 
-@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(502))
+@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(502))
 def assert_502(server):
     server.load_cdx(url='http://notfound.example.com')
 
@@ -131,7 +133,7 @@ def test_fuzzy_match():
 
 def test_fuzzy_no_match_1():
     # no match, no fuzzy
-    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
         server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
         with raises(NotFoundException):
             server.load_cdx(url='http://notfound.example.com/',
@@ -141,7 +143,7 @@ def test_fuzzy_no_match_1():
 
 def test_fuzzy_no_match_2():
     # fuzzy rule, but no actual match
-    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
         server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
         with raises(NotFoundException):
             server.load_cdx(url='http://notfound.example.com/?_=1234',
@@ -153,7 +155,7 @@ def test_fuzzy_no_match_2():
 def test2_fuzzy_no_match_3():
     # special fuzzy rule, matches prefix test.example.example.,
     # but doesn't match rule regex
-    with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen):
+    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
         server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
         with raises(NotFoundException):
             server.load_cdx(url='http://test.example.example/',
@@ -4,6 +4,8 @@ from pywb.cdx.query import CDXQuery
 
 from pytest import raises
 
+import six
+
 KEY = 'com,example)/'
 
 #================================================================
@@ -30,7 +32,7 @@ def lazy_cdx_load(**params):
 
     # exception happens on first access attempt
     with raises(AccessException):
-        cdx_iter.next()
+        six.next(cdx_iter)
 
 
 def test_no_process():
@@ -35,13 +35,13 @@ def zadd_cdx(source, cdx, key):
         source.redis.zadd(key, 0, cdx)
         return
 
-    parts = cdx.split(' ', 2)
+    parts = cdx.split(b' ', 2)
 
     key = parts[0]
     timestamp = parts[1]
-    rest = timestamp + ' ' + parts[2]
+    rest = timestamp + b' ' + parts[2]
 
-    score = timestamp_to_sec(timestamp)
+    score = timestamp_to_sec(timestamp.decode('utf-8'))
     source.redis.zadd(source.key_prefix + key, score, rest)
 
 
@@ -22,29 +22,6 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
 org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
 org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
 
-# Pages -- default page size
->>> zip_ops_test(url='http://iana.org/domains/example', matchType='exact', showNumPages=True)
-{"blocks": 1, "pages": 1, "pageSize": 10}
-
->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True)
-{"blocks": 38, "pages": 4, "pageSize": 10}
-
-# set page size
->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True)
-{"blocks": 38, "pages": 10, "pageSize": 4}
-
-# set page size -- alt domain query
->>> zip_ops_test(url='*.iana.org', pageSize='4', showNumPages=True)
-{"blocks": 38, "pages": 10, "pageSize": 4}
-
-# page size for non-existent, but secondary index match
->>> zip_ops_test(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True)
-{"blocks": 0, "pages": 0, "pageSize": 4}
-
-# page size for non-existent, no secondary index match
->>> zip_ops_test(url='*.foo.bar', showNumPages=True)
-{"blocks": 0, "pages": 0, "pageSize": 10}
-
 # first page
 >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
 com,example)/ 20140127171200 zipnum 0 275 1
@@ -116,16 +93,16 @@ org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html
 org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
 
 # invalid page
->>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10)
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 CDXException: Page 10 invalid: First Page is 0, Last Page is 9
 
 
->>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True)
+>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 NotFoundException: No Captures found for: http://aaa.aaa/
 
->>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True)
+>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 NotFoundException: No Captures found for: http://aaa.aaa/ (domain query)
 
@@ -133,34 +110,26 @@ NotFoundException: No Captures found for: http://aaa.aaa/ (domain query)
 >>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
 org,iana)/time-zones 20140126200737 zipnum 9631 166 38
 
-# read cdx to find 0 pages
->>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showNumPages=True)
-{"blocks": 0, "pages": 0, "pageSize": 10}
-
 # read cdx to find no captures
->>> zip_ops_test(url='http://aaa.zz/', matchType='domain')
+>>> zip_ops_test(url='http://aaa.zz/', matchType='domain') # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 NotFoundException: No Captures found for: http://aaa.zz/ (domain query)
 
 # Invalid .idx filesor or missing loc
 
->>> zip_test_err(url='http://example.com/', matchType='exact')
+>>> zip_test_err(url='http://example.com/', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 Exception: No Locations Found for: foo
 
 
->>> zip_test_err(url='http://iana.org/x', matchType='exact')
-Traceback (most recent call last):
-IOError: [Errno 2] No such file or directory: './sample_archive/invalid'
-
->>> zip_test_err(url='http://example.zz/x', matchType='exact')
+>>> zip_test_err(url='http://example.zz/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 Exception: No Locations Found for: foo2
 
 """
 
-from test_cdxops import cdx_ops_test
+from test_cdxops import cdx_ops_test, cdx_ops_test_data
 from pywb import get_test_dir
 from pywb.cdx.cdxserver import CDXServer
 
@@ -170,9 +139,15 @@ import tempfile
 import os
 import json
 
+import pytest
+
 
 test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'
 
+def zip_ops_test_data(url, **kwargs):
+    sources = test_zipnum
+    return json.loads(cdx_ops_test_data(url, sources, **kwargs)[0])
+
 def zip_ops_test(url, **kwargs):
     sources = test_zipnum
     cdx_ops_test(url, sources, **kwargs)
@@ -220,6 +195,50 @@ def test_zip_prefix_load():
 
 
 
+def test_blocks_def_page_size():
+    # Pages -- default page size
+    res = zip_ops_test_data(url='http://iana.org/domains/example', matchType='exact', showNumPages=True)
+    assert(res == {"blocks": 1, "pages": 1, "pageSize": 10})
+
+def test_blocks_def_size_2():
+    res = zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', showNumPages=True)
+    assert(res == {"blocks": 38, "pages": 4, "pageSize": 10})
+
+def test_blocks_set_page_size():
+    # set page size
+    res = zip_ops_test_data(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True)
+    assert(res == {"blocks": 38, "pages": 10, "pageSize": 4})
+
+def test_blocks_alt_q():
+    # set page size -- alt domain query
+    res = zip_ops_test_data(url='*.iana.org', pageSize='4', showNumPages=True)
+    assert(res == {"blocks": 38, "pages": 10, "pageSize": 4})
+
+def test_blocks_secondary_match():
+    # page size for non-existent, but secondary index match
+    res = zip_ops_test_data(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True)
+    assert(res == {"blocks": 0, "pages": 0, "pageSize": 4})
+
+def test_blocks_no_match():
+    # page size for non-existent, no secondary index match
+    res = zip_ops_test_data(url='*.foo.bar', showNumPages=True)
+    assert(res == {"blocks": 0, "pages": 0, "pageSize": 10})
+
+def test_blocks_zero_pages():
+    # read cdx to find 0 pages
+    res = zip_ops_test_data(url='http://aaa.zz/', matchType='domain', showNumPages=True)
+    assert(res == {"blocks": 0, "pages": 0, "pageSize": 10})
+
+
+# Errors
+
+def test_err_file_not_found():
+    with pytest.raises(IOError):
+        zip_test_err(url='http://iana.org/x', matchType='exact') # doctest: +IGNORE_EXCEPTION_DETAIL
+
+
+
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
@@ -5,9 +5,12 @@ import logging
 from io import BytesIO
 import datetime
 import json
+import six
 
-from cdxsource import CDXSource
-from cdxobject import IDXObject, CDXException
+from six.moves import map
+
+from pywb.cdx.cdxsource import CDXSource
+from pywb.cdx.cdxobject import IDXObject, CDXException
 
 from pywb.utils.loaders import BlockLoader, read_last_line
 from pywb.utils.bufferedreaders import gzip_decompressor
@@ -52,7 +55,7 @@ class LocMapResolver(object):
         self.loc_mtime = new_mtime
 
         logging.debug('Loading loc from: ' + self.loc_filename)
-        with open(self.loc_filename, 'rb') as fh:
+        with open(self.loc_filename, 'r') as fh:
             for line in fh:
                 parts = line.rstrip().split('\t')
                 self.loc_map[parts[0]] = parts[1:]
@@ -170,25 +173,28 @@ class ZipNumCluster(CDXSource):
 
         last_line = None
 
+        start_key = query.key.encode('utf-8')
+        end_key = query.end_key.encode('utf-8')
+
         # Get End
-        end_iter = search(reader, query.end_key, prev_size=1)
+        end_iter = search(reader, end_key, prev_size=1)
 
         try:
-            end_line = end_iter.next()
+            end_line = six.next(end_iter)
         except StopIteration:
             last_line = read_last_line(reader)
             end_line = last_line
 
         # Get Start
         first_iter = iter_range(reader,
-                                query.key,
-                                query.end_key,
+                                start_key,
+                                end_key,
                                 prev_size=1)
 
         try:
-            first_line = first_iter.next()
+            first_line = six.next(first_iter)
         except StopIteration:
-            if end_line == last_line and query.key >= last_line:
+            if end_line == last_line and start_key >= last_line:
                 first_line = last_line
             else:
                 reader.close()
@@ -204,7 +210,7 @@ class ZipNumCluster(CDXSource):
 
         try:
             blocks = end['lineno'] - first['lineno']
-            total_pages = blocks / pagesize + 1
+            total_pages = int(blocks / pagesize) + 1
         except:
             blocks = -1
             total_pages = 1
@@ -215,8 +221,8 @@ class ZipNumCluster(CDXSource):
         if blocks == 0:
             try:
                 block_cdx_iter = self.idx_to_cdx([first_line], query)
-                block = block_cdx_iter.next()
-                cdx = block.next()
+                block = six.next(block_cdx_iter)
+                cdx = six.next(block)
             except StopIteration:
                 total_pages = 0
                 blocks = -1
@@ -250,12 +256,12 @@ class ZipNumCluster(CDXSource):
 
     def search_by_line_num(self, reader, line):  # pragma: no cover
         def line_cmp(line1, line2):
-            line1_no = int(line1.rsplit('\t', 1)[-1])
-            line2_no = int(line2.rsplit('\t', 1)[-1])
+            line1_no = int(line1.rsplit(b'\t', 1)[-1])
+            line2_no = int(line2.rsplit(b'\t', 1)[-1])
             return cmp(line1_no, line2_no)
 
         line_iter = search(reader, line, compare_func=line_cmp)
-        yield line_iter.next()
+        yield six.next(line_iter)
 
     def idx_to_cdx(self, idx_iter, query):
         blocks = None
@@ -304,7 +310,8 @@ class ZipNumCluster(CDXSource):
                 last_traceback = sys.exc_info()[2]
 
         if last_exc:
-            raise last_exc, None, last_traceback
+            six.reraise(Exception, last_exc, last_traceback)
+            #raise last_exc
         else:
            raise Exception('No Locations Found for: ' + blocks.part)
 
@@ -326,13 +333,13 @@ class ZipNumCluster(CDXSource):
             for line in BytesIO(buff):
                 yield line
 
-        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
+        iter_ = itertools.chain(*map(decompress_block, ranges))
 
         # start bound
-        iter_ = linearsearch(iter_, query.key)
+        iter_ = linearsearch(iter_, query.key.encode('utf-8'))
 
         # end bound
-        end = query.end_key
+        end = query.end_key.encode('utf-8')
         iter_ = itertools.takewhile(lambda line: line < end, iter_)
         return iter_
 
@@ -1,10 +1,10 @@
-import urlparse
+from six.moves.urllib.parse import urlsplit, urlunsplit, quote
 
 import re
-from urllib import quote
 
 from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.wburl import WbUrl
-from wbrequestresponse import WbRequest, WbResponse
+from pywb.framework.wbrequestresponse import WbRequest, WbResponse
 
 
 #=================================================================
@@ -182,7 +182,7 @@ class ReferRedirect:
             return None
 
         # get referrer path name
-        ref_split = urlparse.urlsplit(referrer)
+        ref_split = urlsplit(referrer)
 
         # require that referrer starts with current Host, if any
         curr_host = env.get('HTTP_HOST')
@@ -236,10 +236,10 @@ class ReferRedirect:
             ref_request.wb_url.url = new_wb_url.url
             return ref_route.handler(ref_request)
 
-        final_url = urlparse.urlunsplit((ref_split.scheme,
+        final_url = urlunsplit((ref_split.scheme,
                                 ref_split.netloc,
                                 rewritten_url,
                                 '',
                                 ''))
 
         return WbResponse.redir_response(final_url, status='302 Temp Redirect')
@@ -2,7 +2,7 @@ from pywb.utils.wbexception import BadRequestException
 from pywb.utils.timeutils import http_date_to_timestamp
 from pywb.utils.timeutils import timestamp_to_http_date
 
-from wbrequestresponse import WbRequest, WbResponse
+from pywb.framework.wbrequestresponse import WbRequest, WbResponse
 from pywb.rewrite.wburl import WbUrl
 
 LINK_FORMAT = 'application/link-format'
@@ -3,7 +3,7 @@ from __future__ import absolute_import
 from pywb.framework.wbrequestresponse import WbResponse, WbRequest
 from pywb.framework.archivalrouter import ArchivalRouter
 
-import urlparse
+from six.moves.urllib.parse import urlsplit
 import base64
 
 import socket
@@ -164,7 +164,7 @@ class ProxyRouter(object):
 
             url = env['REL_REQUEST_URI']
         else:
-            parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
+            parts = urlsplit(env['REL_REQUEST_URI'])
             hostport = parts.netloc.split(':', 1)
             env['pywb.proxy_host'] = hostport[0]
             env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''
@@ -1,13 +1,14 @@
-from wbrequestresponse import WbResponse
+from pywb.framework.wbrequestresponse import WbResponse
 from pywb.utils.loaders import extract_client_cookie
 from pywb.utils.wbexception import WbException
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.rewrite.wburl import WbUrl
 
-from cache import create_cache
-from basehandlers import WbUrlHandler
+from pywb.framework.cache import create_cache
+from pywb.framework.basehandlers import WbUrlHandler
 
+from six.moves.urllib.parse import parse_qs, urlsplit
 
-import urlparse
 import base64
 import os
 import json
@@ -130,7 +131,7 @@ class IPCacheResolver(BaseCollResolver):
         ip = env['REMOTE_ADDR']
         qs = env.get('pywb.proxy_query')
         if qs:
-            res = urlparse.parse_qs(qs)
+            res = parse_qs(qs)
 
             if 'ip' in res:
                 ip = res['ip'][0]
@@ -145,7 +146,7 @@ class IPCacheResolver(BaseCollResolver):
         qs = env.get('pywb.proxy_query')
 
         if qs:
-            res = urlparse.parse_qs(qs)
+            res = parse_qs(qs)
 
             if 'ip' in res:
                 ip = res['ip'][0]
@@ -223,7 +224,7 @@ class CookieResolver(BaseCollResolver):
 
     def handle_magic_page(self, env):
         request_url = env['REL_REQUEST_URI']
-        parts = urlparse.urlsplit(request_url)
+        parts = urlsplit(request_url)
         server_name = env['pywb.proxy_host']
 
         path_url = parts.path[1:]
@@ -309,7 +310,7 @@ class CookieResolver(BaseCollResolver):
         if '://' not in path_url:
             path_url = 'http://' + path_url
 
-        path_parts = urlparse.urlsplit(path_url)
+        path_parts = urlsplit(path_url)
 
         new_url = path_parts.path[1:]
         if path_parts.query:
@@ -94,8 +94,10 @@ False

 from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter
 from pywb.framework.basehandlers import BaseHandler, WbUrlHandler

 import pprint
-import urlparse
+
+from six.moves.urllib.parse import urlsplit

 def _test_route_req(route, env, abs_path=False):
     matcher, coll = route.is_handling(env['REL_REQUEST_URI'])
@@ -87,7 +87,7 @@ def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
     response = req_from_uri(request_uri, env, use_abs_prefix)
     varlist = vars(response)
     the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll'))
-    print the_dict
+    print(the_dict)


 def req_from_uri(request_uri, env={}, use_abs_prefix=False):
@@ -41,7 +41,7 @@ def test_err_app():
     resp = testapp.get('/abc', expect_errors=True)

     assert resp.status_int == 500
-    assert '500 Internal Server Error Error: Test Unexpected Error' in resp.body
+    assert b'500 Internal Server Error Error: Test Unexpected Error' in resp.body

 def test_custom_err_app():
     the_app = init_app(initer(TestCustomErrApp), load_yaml=False)
@@ -50,7 +50,7 @@ def test_custom_err_app():
     resp = testapp.get('/abc', expect_errors=True)

     assert resp.status_int == 403
-    assert '403 Access Denied Error: Forbidden Test' in resp.body
+    assert b'403 Access Denied Error: Forbidden Test' in resp.body

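
Porting note (a sketch, not from the patch): under Python 3 a WSGI response body is bytes, so substring assertions need bytes literals on both sides -- which is why the tests above gain b'' prefixes:

    # resp.body is bytes on Python 3; mixing str and bytes raises TypeError
    body = b'500 Internal Server Error'
    assert b'500' in body    # works on Python 2 and 3
    # assert '500' in body   # TypeError on Python 3
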
@@ -1,7 +1,7 @@
 from pywb.utils.wbexception import WbException, NotFoundException
 from pywb.utils.loaders import load_yaml_config

-from wbrequestresponse import WbResponse, StatusAndHeaders
+from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders


 import os
@@ -92,12 +92,13 @@ class WSGIApp(object):
         else:
             err_url = None

-        err_msg = exc.message
+        if len(exc.args):
+            err_msg = exc.args[0]

         if print_trace:
             import traceback
-            err_details = traceback.format_exc(exc)
-            print err_details
+            err_details = traceback.format_exc()
+            print(err_details)
         else:
             logging.info(err_msg)
             err_details = None
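
Porting note (illustrative, not from the patch): BaseException.message was removed in Python 3, so the error handler above switches to the args tuple, which is populated on both versions:

    # exc.args holds the positional arguments the exception was raised with
    try:
        raise ValueError('something failed')
    except ValueError as exc:
        err_msg = exc.args[0] if exc.args else ''
        print(err_msg)   # 'something failed'
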
@@ -1,4 +1,4 @@
-from Cookie import SimpleCookie, CookieError
+from six.moves.http_cookies import SimpleCookie, CookieError


 #=================================================================
@@ -1,16 +1,14 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

-import sys
 import re

-from HTMLParser import HTMLParser, HTMLParseError
-from urlparse import urljoin, urlsplit, urlunsplit
+from six.moves.html_parser import HTMLParser
+from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit

-from url_rewriter import UrlRewriter
-from regex_rewriters import JSRewriter, CSSRewriter

-import cgi
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter


 #=================================================================
@@ -411,7 +409,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
     def feed(self, string):
         try:
             HTMLParser.feed(self, string)
-        except HTMLParseError:  # pragma: no cover
+        except Exception:  # pragma: no cover
             # only raised in 2.6
             self.out.write(string)

@@ -429,7 +427,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):

         try:
             HTMLParser.close(self)
-        except HTMLParseError:  # pragma: no cover
+        except Exception:  # pragma: no cover
             # only raised in 2.6
             pass

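
Porting note (a sketch under the stated assumption, not from the patch): HTMLParseError exists only on Python 2 and is effectively only raised by the 2.6 parser, so importing it would break on Python 3; catching the broad Exception keeps a single code path:

    from six.moves.html_parser import HTMLParser

    parser = HTMLParser()
    try:
        parser.feed('<p>partial <b>markup')
    except Exception:
        pass  # fall back to passing the input through unmodified
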
@@ -1,8 +1,6 @@
 import re
-import sys
-import itertools

-from url_rewriter import UrlRewriter
+from pywb.rewrite.url_rewriter import UrlRewriter


 #=================================================================
@@ -7,16 +7,16 @@ import re
 from chardet.universaldetector import UniversalDetector
 from io import BytesIO

-from header_rewriter import RewrittenStatusAndHeaders
+from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders

-from rewriterules import RewriteRules
+from pywb.rewrite.rewriterules import RewriteRules

 from pywb.utils.dsrules import RuleSet
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.bufferedreaders import DecompressingBufferedReader
 from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader

-from regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
+from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter


 #=================================================================
@@ -288,7 +288,7 @@ class RewriteContent:
     def _decode_buff(buff, stream, encoding):  # pragma: no coverage
         try:
             buff = buff.decode(encoding)
-        except UnicodeDecodeError, e:
+        except UnicodeDecodeError as e:
             # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
             for i in range(3):
                 buff += stream.read(1)
@@ -8,7 +8,7 @@ import mimetypes
 import logging
 import os

-from urlparse import urlsplit
+from six.moves.urllib.parse import urlsplit

 from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
 from pywb.utils.loaders import extract_client_cookie
@@ -16,7 +16,7 @@ from pywb.utils.timeutils import timestamp_now
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.canonicalize import canonicalize

-from rewrite_content import RewriteContent
+from pywb.rewrite.rewrite_content import RewriteContent


 #=================================================================
@@ -1,13 +1,12 @@
 from pywb.utils.dsrules import BaseRule

-from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
-from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
-from regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
+from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
+from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
+from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter

-from header_rewriter import HeaderRewriter
-from html_rewriter import HTMLRewriter
+from pywb.rewrite.header_rewriter import HeaderRewriter
+from pywb.rewrite.html_rewriter import HTMLRewriter

-import itertools
 import re

@@ -1,7 +1,7 @@
-import urlparse
+from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit

-from wburl import WbUrl
-from cookie_rewriter import get_cookie_rewriter
+from pywb.rewrite.wburl import WbUrl
+from pywb.rewrite.cookie_rewriter import get_cookie_rewriter


 #=================================================================
@@ -119,11 +119,11 @@ class UrlRewriter(object):

     @staticmethod
     def urljoin(orig_url, url):
-        new_url = urlparse.urljoin(orig_url, url)
+        new_url = urljoin(orig_url, url)
         if '../' not in new_url:
             return new_url

-        parts = urlparse.urlsplit(new_url)
+        parts = urlsplit(new_url)
         scheme, netloc, path, query, frag = parts

         path_parts = path.split('/')
@@ -147,7 +147,7 @@ class UrlRewriter(object):

         parts = (scheme, netloc, path, query, frag)

-        new_url = urlparse.urlunsplit(parts)
+        new_url = urlunsplit(parts)
         return new_url

@@ -39,8 +39,11 @@ wayback url format.
 """

 import re
-import urllib
-import urlparse
+import six
+
+from six.moves.urllib.parse import urlsplit, urlunsplit
+from six.moves.urllib.parse import quote_plus, quote, unquote_plus


 #=================================================================
 class BaseWbUrl(object):
@@ -105,7 +108,7 @@ class WbUrl(BaseWbUrl):
         if 'xn--' not in url:
             return url

-        parts = urlparse.urlsplit(url)
+        parts = urlsplit(url)
         domain = parts.netloc
         try:
             domain = domain.decode('idna')
@@ -114,9 +117,9 @@ class WbUrl(BaseWbUrl):
             # likely already encoded, so use as is
             pass

-        domain = urllib.quote(domain)#, safe=r':\/')
+        domain = quote(domain)#, safe=r':\/')

-        return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))
+        return urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))


     @staticmethod
@@ -131,7 +134,7 @@ class WbUrl(BaseWbUrl):
         """
         parts = WbUrl.FIRST_PATH.split(url, 1)

-        scheme_dom = urllib.unquote_plus(parts[0])
+        scheme_dom = unquote_plus(parts[0])

         if isinstance(scheme_dom, str):
             if scheme_dom == parts[0]:
@@ -155,7 +158,7 @@ class WbUrl(BaseWbUrl):

         if len(parts) > 1:
             if isinstance(parts[1], unicode):
-                url += '/' + urllib.quote(parts[1].encode('utf-8'))
+                url += '/' + quote(parts[1].encode('utf-8'))
             else:
                 url += '/' + parts[1]

@@ -168,7 +171,7 @@ class WbUrl(BaseWbUrl):

         if isinstance(orig_url, unicode):
             orig_url = orig_url.encode('utf-8')
-            orig_url = urllib.quote(orig_url)
+            orig_url = quote(orig_url)

         self._original_url = orig_url

@@ -259,7 +262,7 @@ class WbUrl(BaseWbUrl):
         rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?'
         self.url = re.sub(rex_query, '=', self.url)

-        rex_query = '=(' + urllib.quote_plus(prefix) + '.*?)((?:https?%3A)?%2F%2F[^&]+)'
+        rex_query = '=(' + quote_plus(prefix) + '.*?)((?:https?%3A)?%2F%2F[^&]+)'
         self.url = re.sub(rex_query, '=\\2', self.url)

         return self.url
@@ -45,6 +45,17 @@ def load_yaml_config(config_file):
     return config


+#=================================================================
+def to_native_str(value, encoding='iso-8859-1'):
+    if isinstance(value, str):
+        return value
+
+    if six.PY3 and isinstance(value, six.binary_type):
+        return value.decode(encoding)
+    elif six.PY2 and isinstance(value, six.text_type):
+        return value.encode(encoding)
+
+
 #=================================================================
 def extract_post_query(method, mime, length, stream, buffered_stream=None):
     """
@@ -77,7 +88,7 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
         if not buff:
             break

-        post_query += buff
+        post_query += to_native_str(buff)

     if buffered_stream:
         buffered_stream.write(post_query)
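
Usage note (a sketch assuming the to_native_str definition added above): the helper normalizes a value to the native str type of the running interpreter, so parsing code can treat record data uniformly:

    from pywb.utils.loaders import to_native_str

    # bytes become text on Python 3, text becomes bytes on Python 2,
    # and a native str passes through unchanged on either version
    print(to_native_str(b'HTTP/1.1 200 OK'))   # 'HTTP/1.1 200 OK' on Python 3
    print(to_native_str('already native'))     # unchanged
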
@@ -5,8 +5,12 @@ Representation and parsing of HTTP-style status + headers
 import pprint
 from copy import copy
 from six.moves import range
+import six
+from pywb.utils.loaders import to_native_str
+
+WRAP_WIDTH = 80

 #=================================================================
 class StatusAndHeaders(object):
     """
@@ -112,7 +116,7 @@ class StatusAndHeaders(object):
         return self

     def __repr__(self):
-        headers_str = pprint.pformat(self.headers, indent=2)
+        headers_str = pprint.pformat(self.headers, indent=2, width=WRAP_WIDTH)
         return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
 headers = {2})".format(self.protocol, self.statusline, headers_str)

@@ -145,9 +149,15 @@ class StatusAndHeadersParser(object):

         support continuation headers starting with space or tab
         """
+
+        def readline():
+            return to_native_str(stream.readline())
+
         # status line w newlines intact
         if full_statusline is None:
-            full_statusline = stream.readline()
+            full_statusline = readline()
+        else:
+            full_statusline = to_native_str(full_statusline)

         statusline, total_read = _strip_count(full_statusline, 0)

@@ -173,7 +183,7 @@ class StatusAndHeadersParser(object):
         else:
             protocol_status = statusline.split(' ', 1)

-        line, total_read = _strip_count(stream.readline(), total_read)
+        line, total_read = _strip_count(readline(), total_read)
         while line:
             result = line.split(':', 1)
             if len(result) == 2:
@@ -183,14 +193,14 @@ class StatusAndHeadersParser(object):
                 name = result[0]
                 value = None

-                next_line, total_read = _strip_count(stream.readline(),
+                next_line, total_read = _strip_count(readline(),
                                                      total_read)

                 # append continuation lines, if any
                 while next_line and next_line.startswith((' ', '\t')):
                     if value is not None:
                         value += next_line
-                    next_line, total_read = _strip_count(stream.readline(),
+                    next_line, total_read = _strip_count(readline(),
                                                          total_read)

                 if value is not None:
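
Porting note (a sketch, assuming the parser API shown in this file): the readline() wrapper decodes each raw line through to_native_str, so the parser can consume the byte stream of an HTTP record directly on Python 3:

    from io import BytesIO
    from pywb.utils.statusandheaders import StatusAndHeadersParser

    # a bytes stream, as it would come from a WARC record payload
    stream = BytesIO(b'HTTP/1.1 200 OK\r\nContent-Length: 0\r\n\r\n')
    parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
    print(parser.parse(stream).statusline)   # '200 OK'
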
@@ -3,7 +3,7 @@ from pywb.utils.bufferedreaders import DecompressingBufferedReader
 from pywb.utils.canonicalize import canonicalize
 from pywb.utils.loaders import extract_post_query, append_post_query

-from recordloader import ArcWarcRecordLoader
+from pywb.warc.recordloader import ArcWarcRecordLoader

 import hashlib
 import base64
@@ -66,7 +66,10 @@ class ArchiveIterator(object):
         self.member_info = None
         self.no_record_parse = no_record_parse

-    def iter_records(self, block_size=16384):
+    def __iter__(self):
+        return self
+
+    def __call__(self, block_size=16384):
         """ iterate over each record
         """

@@ -152,10 +155,10 @@ class ArchiveIterator(object):

         stripped = line.rstrip()

-        if stripped == '' or first_line:
+        if len(stripped) == 0 or first_line:
             empty_size += len(line)

-            if stripped != '':
+            if len(stripped) != 0:
                 # if first line is not blank,
                 # likely content-length was invalid, display warning
                 err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
@@ -290,7 +293,7 @@ class ArchiveIndexEntryMixin(object):


 #=================================================================
-class DefaultRecordIter(object):
+class DefaultRecordParser(object):
     def __init__(self, **options):
         self.options = options
         self.entry_cache = {}
@@ -329,14 +332,14 @@ class DefaultRecordIter(object):

     def end_payload(self, entry):
         if self.digester:
-            entry['digest'] = base64.b32encode(self.digester.digest())
+            entry['digest'] = base64.b32encode(self.digester.digest()).decode('ascii')

         self.entry = None

     def create_payload_buffer(self, entry):
         return None

-    def create_record_iter(self, arcv_iter):
+    def create_record_iter(self, raw_iter):
         append_post = self.options.get('append_post')
         include_all = self.options.get('include_all')
         block_size = self.options.get('block_size', 16384)
@@ -347,7 +350,7 @@ class DefaultRecordIter(object):
             raise Exception('Sorry, minimal index option and ' +
                             'append POST options can not be used together')

-        for record in arcv_iter.iter_records(block_size):
+        for record in raw_iter(block_size):
             entry = None

             if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
@@ -394,9 +397,9 @@ class DefaultRecordIter(object):
             entry.record = record

             self.begin_payload(compute_digest, entry)
-            arcv_iter.read_to_end(record, self.handle_payload)
+            raw_iter.read_to_end(record, self.handle_payload)

-            entry.set_rec_info(*arcv_iter.member_info)
+            entry.set_rec_info(*raw_iter.member_info)
             self.end_payload(entry)

             yield entry
@@ -536,8 +539,15 @@ class DefaultRecordIter(object):

             yield entry

+    def open(self, filename):
+        with open(filename, 'rb') as fh:
+            for entry in self(fh):
+                yield entry
+
+
 class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict):
     pass

 class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict):
     pass

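
Usage note (a sketch under the API shown above; the file name and option values are hypothetical): after the rename, the parser is invoked on an open stream via __call__, or pointed at a path with the new open() helper:

    from pywb.warc.archiveiterator import DefaultRecordParser

    parser = DefaultRecordParser(include_all=False, cdxj=True)
    # open() wraps the file and yields one index entry per record
    for entry in parser.open('example.warc.gz'):
        print(entry['urlkey'], entry['timestamp'])
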
@@ -29,15 +29,18 @@ except ImportError:  # pragma: no cover
 from argparse import ArgumentParser, RawTextHelpFormatter
 from bisect import insort

-from io import BytesIO
+from six import StringIO

-from archiveiterator import DefaultRecordIter
+from pywb.warc.archiveiterator import DefaultRecordParser
+
+import codecs
+import six


 #=================================================================
 class BaseCDXWriter(object):
     def __init__(self, out):
-        self.out = out
+        self.out = codecs.getwriter('utf-8')(out)
+        #self.out = out

     def __enter__(self):
         self._write_header()
@@ -69,7 +72,7 @@ class CDXJ(object):

         outdict = OrderedDict()

-        for n, v in entry.iteritems():
+        for n, v in six.iteritems(entry):
             if n in ('urlkey', 'timestamp'):
                 continue

@@ -145,7 +148,7 @@ class SortedCDXWriter(BaseCDXWriter):
         return res

     def write(self, entry, filename):
-        self.out = BytesIO()
+        self.out = StringIO()
         super(SortedCDXWriter, self).write(entry, filename)
         line = self.out.getvalue()
         if line:
@@ -175,7 +178,7 @@ def iter_file_or_dir(inputs, recursive=True, rel_root=None):
         if not rel_root:
             filename = os.path.basename(input_)
         else:
             filename = _resolve_rel_path(input_, rel_root)

         yield input_, filename

@@ -268,7 +271,7 @@ def write_multi_cdx_index(output, inputs, **options):
         outfile = open(output, 'wb')

     writer_cls = get_cdx_writer_cls(options)
-    record_iter = DefaultRecordIter(**options)
+    record_iter = DefaultRecordParser(**options)

     with writer_cls(outfile) as writer:
         for fullpath, filename in iter_file_or_dir(inputs,
@@ -285,13 +288,12 @@ def write_multi_cdx_index(output, inputs, **options):

 #=================================================================
 def write_cdx_index(outfile, infile, filename, **options):
-    if type(filename) is unicode:
-        filename = filename.encode(sys.getfilesystemencoding())
+    #filename = filename.encode(sys.getfilesystemencoding())

     writer_cls = get_cdx_writer_cls(options)

     with writer_cls(outfile) as writer:
-        entry_iter = DefaultRecordIter(**options)(infile)
+        entry_iter = DefaultRecordParser(**options)(infile)

         for entry in entry_iter:
             writer.write(entry, filename)
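
Porting note (illustrative, not from the patch): codecs.getwriter, used in BaseCDXWriter above, wraps a binary file object so text written to it is encoded on the way out -- the writers keep producing str while the underlying output stays bytes:

    import codecs
    from io import BytesIO

    raw = BytesIO()
    out = codecs.getwriter('utf-8')(raw)
    out.write(u'com,example)/ 20140216050221 ...\n')
    print(raw.getvalue())   # b'com,example)/ 20140216050221 ...\n'
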
@@ -1,9 +1,11 @@
 import redis

 from pywb.utils.binsearch import iter_exact
+from pywb.utils.loaders import to_native_str
+
+from six.moves.urllib.parse import urlsplit
+from six.moves.urllib.request import url2pathname

-import urlparse
-import urllib
 import os
 import logging

@@ -49,7 +51,7 @@ class RedisResolver:

     def __call__(self, filename):
         redis_val = self.redis.hget(self.key_prefix + filename, 'path')
-        return [redis_val] if redis_val else []
+        return [to_native_str(redis_val)] if redis_val else []

     def __repr__(self):
         return "RedisResolver('{0}')".format(self.redis_url)
@@ -62,12 +64,12 @@ class PathIndexResolver:

     def __call__(self, filename):
         with open(self.pathindex_file, 'rb') as reader:
-            result = iter_exact(reader, filename, '\t')
+            result = iter_exact(reader, filename.encode('utf-8'), b'\t')

             for pathline in result:
-                paths = pathline.split('\t')[1:]
+                paths = pathline.split(b'\t')[1:]
                 for path in paths:
-                    yield path
+                    yield to_native_str(path)

     def __repr__(self):  # pragma: no cover
         return "PathIndexResolver('{0}')".format(self.pathindex_file)
@@ -84,7 +86,7 @@ def make_best_resolver(param):
         path = param
         arg = None

-    url_parts = urlparse.urlsplit(path)
+    url_parts = urlsplit(path)

     if url_parts.scheme == 'redis':
         logging.debug('Adding Redis Index: ' + path)
@@ -92,7 +94,7 @@ def make_best_resolver(param):

     if url_parts.scheme == 'file':
         path = url_parts.path
-        path = urllib.url2pathname(path)
+        path = url2pathname(path)

         if os.path.isfile(path):
             logging.debug('Adding Path Index: ' + path)
@@ -106,7 +108,7 @@ def make_best_resolver(param):

 #=================================================================
 def make_best_resolvers(paths):
-    if hasattr(paths, '__iter__'):
-        return map(make_best_resolver, paths)
+    if isinstance(paths, list) or isinstance(paths, set):
+        return list(map(make_best_resolver, paths))
     else:
         return [make_best_resolver(paths)]
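
Porting note (a sketch, not from the patch): the list(map(...)) change matters because map() returns a lazy iterator on Python 3, so callers that index, take len(), or iterate more than once need the result materialized. (The hasattr '__iter__' test also had to go: on Python 3 strings are iterable, so the explicit list/set check is used instead.)

    resolvers = map(str.strip, [' a ', ' b '])
    resolvers = list(resolvers)   # materialize once, reuse safely
    print(resolvers)              # ['a', 'b']
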
@@ -1,5 +1,3 @@
-import itertools
-import urlparse
 import collections

 from pywb.utils.statusandheaders import StatusAndHeaders
@@ -7,10 +5,14 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser
 from pywb.utils.statusandheaders import StatusAndHeadersParserException

 from pywb.utils.loaders import BlockLoader, LimitReader
+from pywb.utils.loaders import to_native_str
 from pywb.utils.bufferedreaders import DecompressingBufferedReader

 from pywb.utils.wbexception import WbException

+from six.moves import zip
+import six
+

 #=================================================================
 ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
@@ -34,7 +36,7 @@ class ArchiveLoadFailed(WbException):


 #=================================================================
-class ArcWarcRecordLoader:
+class ArcWarcRecordLoader(object):
     # Standard ARC v1.0 headers
     # TODO: support ARC v2.0 also?
     ARC_HEADERS = ["uri", "ip-address", "archive-date",
@@ -73,7 +75,7 @@ class ArcWarcRecordLoader:
         except:
             length = -1

-        stream = self.loader.load(url, long(offset), length)
+        stream = self.loader.load(url, int(offset), length)
         decomp_type = 'gzip'

         # Create decompressing stream
@@ -200,16 +202,21 @@ class ArcWarcRecordLoader:


 #=================================================================
-class ARCHeadersParser:
+class ARCHeadersParser(object):
     def __init__(self, headernames):
         self.headernames = headernames

     def parse(self, stream, headerline=None):
         total_read = 0

+        def readline():
+            return to_native_str(stream.readline())
+
         # if headerline passed in, use that
         if headerline is None:
-            headerline = stream.readline()
+            headerline = readline()
+        else:
+            headerline = to_native_str(headerline)

         header_len = len(headerline)

@@ -222,8 +229,8 @@ class ARCHeadersParser:

         # if arc header, consume next two lines
         if headerline.startswith('filedesc://'):
-            version = stream.readline()  # skip version
-            spec = stream.readline()  # skip header spec, use preset one
+            version = readline()  # skip version
+            spec = readline()  # skip header spec, use preset one
             total_read += len(version)
             total_read += len(spec)

@@ -236,7 +243,7 @@ class ARCHeadersParser:

         headers = []

-        for name, value in itertools.izip(headernames, parts):
+        for name, value in zip(headernames, parts):
             headers.append((name, value))

         return StatusAndHeaders(statusline='',
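
Porting note (illustrative, not from the patch): two more Python 3 removals are handled above -- itertools.izip and the long type are gone; six.moves.zip and the built-in int cover both versions:

    from six.moves import zip

    names = ['uri', 'ip-address']
    parts = ['http://example.com/', '127.0.0.1']
    print(list(zip(names, parts)))   # lazy izip on Py2, built-in zip on Py3
    print(int('353'))                # offsets parse with int() everywhere
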
@@ -1,8 +1,10 @@
 from pywb.utils.timeutils import iso_date_to_timestamp
-from recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
-from pathresolvers import make_best_resolvers
+from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
+from pywb.warc.pathresolvers import make_best_resolvers
 from pywb.utils.wbexception import NotFoundException

+import six
+

 #=================================================================
 class ResolvingLoader(object):
@@ -104,6 +106,9 @@ class ResolvingLoader(object):
         for resolver in self.path_resolvers:
             possible_paths = resolver(filename)

+            #import sys
+            #sys.stderr.write(str(possible_paths))
+
             if possible_paths:
                 for path in possible_paths:
                     any_found = True
@@ -125,7 +130,8 @@ class ResolvingLoader(object):
         else:
             msg = 'Archive File Not Found'

-        raise ArchiveLoadFailed(msg, filename), None, last_traceback
+        #raise ArchiveLoadFailed(msg, filename), None, last_traceback
+        six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback)

     def _load_different_url_payload(self, cdx, headers_record,
                                     failed_files, cdx_loader):
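
Porting note (a sketch, not from the patch): the three-argument raise statement is Python 2 only syntax; six.reraise(tp, value, tb) re-raises an exception with an explicit traceback on both versions:

    import six
    import sys

    try:
        1 / 0
    except ZeroDivisionError:
        last_traceback = sys.exc_info()[2]
        try:
            # re-raise a wrapped error while preserving the original traceback
            six.reraise(RuntimeError, RuntimeError('wrapped'), last_traceback)
        except RuntimeError as e:
            print(e)   # 'wrapped'
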
@@ -48,9 +48,9 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
 >>> print_cdx_index('example-wget-1-14.warc.gz')
 CDX N b a m s k r M S V g
 com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
-metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
-metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
-metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
+org,gnu)/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
+org,gnu)/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
+org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz


 # wget warc, includes metadata and request
@@ -58,9 +58,9 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/
 CDX N b a m s k r M S V g
 com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
 com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
-metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
-metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
-metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
+org,gnu)/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
+org,gnu)/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
+org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz

 # wpull warc, includes metadata by default
 >>> print_cdx_index('example-wpull.warc.gz')
@@ -127,7 +127,7 @@ com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit
 com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 3207 example-extra.warc
 com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 5910 example-extra.warc

->>> print_cdx_index('example-extra.warc', verify_http=True)
+>>> print_cdx_index('example-extra.warc', verify_http=True)  # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0', 'HTTP/1.1'] - Found: HTTPX/1.1 200 OK

@@ -178,7 +178,7 @@ urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX
 Total: 210

 # test writing to temp dir, also use unicode filename
->>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
+>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
 example.cdx
 com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
@@ -223,7 +223,7 @@ def cdx_index(warc, **options):
     return buff.getvalue()

 def print_cdx_index(*args, **kwargs):
-    sys.stdout.write(cdx_index(*args, **kwargs))
+    sys.stdout.write(cdx_index(*args, **kwargs).decode('utf-8'))

 def assert_cdx_match(cdx, warc, sort=False):
     assert read_fully(cdx) == cdx_index(warc, sort=sort)
@@ -239,11 +239,11 @@ def cli_lines(cmds):
     sys.stdout = buff
     main(cmds)
     sys.stdout = orig
-    lines = buff.getvalue().rstrip().split('\n')
+    lines = buff.getvalue().rstrip().split(b'\n')

     # print first, last, num lines
-    print(lines[1])
-    print(lines[-1])
+    print(lines[1].decode('utf-8'))
+    print(lines[-1].decode('utf-8'))
     print('Total: ' + str(len(lines)))

 def cli_lines_with_dir(input_):
@@ -256,10 +256,10 @@ def cli_lines_with_dir(input_):

     filename = cdx_filename(os.path.basename(input_))

-    print filename
+    print(filename)

     with open(os.path.join(tmp_dir, filename), 'rb') as fh:
-        lines = fh.read(8192).rstrip().split('\n')
+        lines = fh.read(8192).rstrip().split(b'\n')

     finally:
         try:
@@ -273,8 +273,8 @@ def cli_lines_with_dir(input_):
         return

     # print first, last, num lines
-    print (lines[1])
-    print (lines[-1])
+    print(lines[1].decode('utf-8'))
+    print(lines[-1].decode('utf-8'))
     print('Total: ' + str(len(lines)))


@@ -284,18 +284,18 @@ def test_non_chunked_gzip_err():


 def parse_cdxj(string):
-    lines = string.split('\n')
-    if lines[0] == '':
+    lines = string.split(b'\n')
+    if lines[0] == b'':
         lines = lines[1:]
-    cdxlist = map(CDXObject, lines)
-    return map(dict, cdxlist)
+    cdxlist = list(map(CDXObject, lines))
+    return list(map(dict, cdxlist))


 def test_cdxj_warc_minimal():
     # cdxj minimal
     res = cdx_index('example.warc.gz', minimal=True, cdxj=True)

-    assert parse_cdxj(res) == parse_cdxj("""
+    assert parse_cdxj(res) == parse_cdxj(b"""
 com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
 com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
 org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
@@ -306,7 +306,7 @@ def test_cdxj_warc_all():
     # warc.gz -- parse all -- CDXJ
     res = cdx_index('example.warc.gz', include_all=True, cdxj=True)

-    assert parse_cdxj(res) == parse_cdxj("""
+    assert parse_cdxj(res) == parse_cdxj(b"""
 com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
 com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
 com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
@@ -317,14 +317,14 @@ org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/ex
 def test_cdxj_arc():
     # arc.gz -- json
     res = cdx_index('example.arc.gz', cdxj=True)
-    assert parse_cdxj(res) == parse_cdxj("""
+    assert parse_cdxj(res) == parse_cdxj(b"""
 com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
 """)

 def test_cdxj_arc_minimal():
     # arc.gz -- minimal + json
     res = cdx_index('example.arc.gz', cdxj=True, minimal=True)
-    assert parse_cdxj(res) == parse_cdxj("""
+    assert parse_cdxj(res) == parse_cdxj(b"""
 com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
 """)

@ -37,8 +37,7 @@ Test loading different types of records from a variety of formats
|
|||||||
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
|
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
|
||||||
('WARC-Target-URI', 'http://example.com?example=1'),
|
('WARC-Target-URI', 'http://example.com?example=1'),
|
||||||
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
|
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
|
||||||
( 'WARC-Profile',
|
('WARC-Profile', 'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
|
||||||
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
|
|
||||||
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
|
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
|
||||||
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
|
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
|
||||||
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
||||||
@ -66,17 +65,13 @@ Test loading different types of records from a variety of formats
|
|||||||
('WARC-Target-URI', 'http://example.com?example=1'),
|
('WARC-Target-URI', 'http://example.com?example=1'),
|
||||||
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
|
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
|
||||||
StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'),
|
StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'),
|
||||||
( 'Accept',
|
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
|
||||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
|
|
||||||
('Accept-Language', 'en-US,en;q=0.8'),
|
('Accept-Language', 'en-US,en;q=0.8'),
|
||||||
( 'User-Agent',
|
('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
|
||||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'),
|
Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'),
|
||||||
('Host', 'example.com')]))
|
('Host', 'example.com')]))
|
||||||
|
|
||||||
|
|
||||||
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []))
|
|
||||||
|
|
||||||
|
|
||||||
# Test of record loading based on cdx line
|
# Test of record loading based on cdx line
|
||||||
# Print parsed http headers + 2 lines of content
|
# Print parsed http headers + 2 lines of content
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
@ -233,7 +228,7 @@ failed_files=failed_files)
|
|||||||
Exception: ArchiveLoadFailed
|
Exception: ArchiveLoadFailed
|
||||||
|
|
||||||
# ensure failed_files being filled
|
# ensure failed_files being filled
|
||||||
>>> failed_files
|
>>> print_strs(failed_files)
|
||||||
['x-not-found-x.warc.gz']
|
['x-not-found-x.warc.gz']
|
||||||
|
|
||||||
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 x-not-found-x.warc.gz',\
|
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 x-not-found-x.warc.gz',\
|
||||||
@ -295,12 +290,15 @@ Exception: ArchiveLoadFailed
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import pprint
|
import pprint
|
||||||
|
import six
|
||||||
|
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
|
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
|
||||||
from pywb.warc.pathresolvers import make_best_resolvers
|
from pywb.warc.pathresolvers import make_best_resolvers
|
||||||
from pywb.warc.resolvingloader import ResolvingLoader
|
from pywb.warc.resolvingloader import ResolvingLoader
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
|
|
||||||
|
import pywb.utils.statusandheaders
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
#==============================================================================
|
#==============================================================================
|
||||||
@ -319,7 +317,7 @@ URL_AGNOSTIC_REVISIT_NO_DIGEST_CDX = 'com,example)/ 20130729195151 http://test@e
|
|||||||
warc/revisit - - - - \
|
warc/revisit - - - - \
|
||||||
591 355 example-url-agnostic-revisit.warc.gz'
|
591 355 example-url-agnostic-revisit.warc.gz'
|
||||||
|
|
||||||
BAD_ORIG_CDX = 'org,iana,example)/ 20130702195401 http://example.iana.org/ \
|
BAD_ORIG_CDX = b'org,iana,example)/ 20130702195401 http://example.iana.org/ \
|
||||||
text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
||||||
1001 353 someunknown.warc.gz'
|
1001 353 someunknown.warc.gz'
|
||||||
|
|
||||||
@@ -332,8 +330,10 @@ def load_test_archive(test_file, offset, length):
 
     archive = testloader.load(path, offset, length)
 
+    pywb.utils.statusandheaders.WRAP_WIDTH = 160
+
     pprint.pprint(((archive.format, archive.rec_type),
-                   archive.rec_headers, archive.status_headers))
+                   archive.rec_headers, archive.status_headers), indent=1, width=160)
 
 
 #==============================================================================
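Pinning WRAP_WIDTH and passing indent/width to pprint makes the doctest output deterministic: the default 80-column wrapping splits long header tuples at different points across versions. A small standalone illustration (data is made up):

import pprint

# With width=160 the long tuple stays on one line; the default width=80
# would wrap it and change the expected doctest output.
headers = [('User-Agent', 'Mozilla/5.0 ' + 'x' * 100)]
pprint.pprint(headers, indent=1, width=160)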
@@ -345,25 +345,25 @@ def load_orig_bad_cdx(_):
 #==============================================================================
 def load_orig_cdx(_):
     return [CDXObject(BAD_ORIG_CDX),
-            CDXObject(URL_AGNOSTIC_ORIG_CDX)]
+            CDXObject(URL_AGNOSTIC_ORIG_CDX.encode('utf-8'))]
 
 
 #==============================================================================
 def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                        failed_files=None):
     resolve_loader = ResolvingLoader(test_warc_dir)
-    cdx = CDXObject(cdx)
+    cdx = CDXObject(cdx.encode('utf-8'))
 
     try:
         (headers, stream) = resolve_loader(cdx, failed_files, revisit_func)
-        print headers
-        sys.stdout.write(stream.readline())
-        sys.stdout.write(stream.readline())
+        print(headers)
+        sys.stdout.write(stream.readline().decode('utf-8'))
+        sys.stdout.write(stream.readline().decode('utf-8'))
     except ArchiveLoadFailed as e:
         if reraise:
             raise
         else:
-            print 'Exception: ' + e.__class__.__name__
+            print('Exception: ' + e.__class__.__name__)
 
 
 #==============================================================================
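The readline().decode('utf-8') calls exist because the record stream yields bytes under Python 3, while sys.stdout.write() expects native str. A minimal stand-in for the stream (BytesIO is an assumption here; the real object is a WARC record stream opened in binary mode):

import io
import sys

stream = io.BytesIO(b'HTTP/1.0 200 OK\nContent-Type: text/html\n')
sys.stdout.write(stream.readline().decode('utf-8'))  # bytes -> str before writing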
@@ -371,7 +371,14 @@ def parse_stream_error(**params):
     try:
         return ArcWarcRecordLoader().parse_record_stream(**params)
     except Exception as e:
-        print 'Exception: ' + e.__class__.__name__
+        print('Exception: ' + e.__class__.__name__)
 
 
+#==============================================================================
+def print_strs(strings):
+    return list(map(lambda string: string.encode('utf-8') if six.PY2 else string, strings))
+
+
 
 
 if __name__ == "__main__":
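print_strs papers over the one repr difference these doctests would otherwise hit: a list of unicode strings prints as [u'...'] on Python 2 but ['...'] on Python 3. Encoding to native str on py2 and passing through on py3 yields one stable form; roughly:

import six

def print_strs(strings):
    # py2: unicode -> str, so the list repr has no u'' prefixes;
    # py3: already str, so the values pass through unchanged.
    return list(map(lambda s: s.encode('utf-8') if six.PY2 else s, strings))

print(print_strs([u'x-not-found-x.warc.gz']))  # ['x-not-found-x.warc.gz'] on both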
@@ -47,7 +47,7 @@ RedisResolver('redis://myhost.example.com:1234/1')
 # make_best_resolvers
 >>> r = make_best_resolvers(['http://example.com/warcs/',\
         'redis://example.com:1234/1'])
->>> map(lambda x: x.__class__.__name__, r)
+>>> list(map(lambda x: x.__class__.__name__, r))
 ['PrefixResolver', 'RedisResolver']
 """
 
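The list() wrapper is needed because map() is lazy on Python 3: its repr is '<map object at 0x...>', which can never match the doctest's expected list. For instance:

# py2: map() returns a list; py3: an iterator with an unstable repr.
names = map(lambda x: x.__class__.__name__, [dict(), list()])
print(list(names))   # ['dict', 'list'] on both majors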
@@ -3,9 +3,10 @@ from pywb.cdx.cdxserver import create_cdx_server
 from pywb.framework.basehandlers import BaseHandler
 from pywb.framework.wbrequestresponse import WbResponse
 
-from query_handler import QueryHandler
+from pywb.webapp.query_handler import QueryHandler
 
-from urlparse import parse_qs
+from six.moves.urllib.parse import parse_qs
+import six
 
 
 #=================================================================
@@ -22,7 +23,11 @@ class CDXAPIHandler(BaseHandler):
 
         cdx_iter = self.index_handler.load_cdx(wbrequest, params)
 
-        return WbResponse.text_stream(cdx_iter)
+        def to_utf8():
+            for cdx in cdx_iter:
+                yield cdx.encode('utf-8')
+
+        return WbResponse.text_stream(to_utf8())
 
     @staticmethod
     def extract_params_from_wsgi_env(env):
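The to_utf8 generator is the usual bridge for streaming responses: WSGI requires byte-string chunks, but on Python 3 the CDX iterator yields text. A self-contained sketch of the same idea (names here are illustrative):

def to_utf8(text_iter):
    for line in text_iter:
        yield line.encode('utf-8')   # WSGI bodies must be bytes

body = to_utf8(['com,example)/ 20140216050221 http://example.com/\n'])
print(b''.join(body))

Encoding inside a generator keeps the response lazy, so a large CDX query is still streamed line by line rather than buffered in memory.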
@@ -35,7 +40,7 @@ class CDXAPIHandler(BaseHandler):
         # cdx processing expects singleton params for all params,
         # except filters, so convert here
         # use first value of the list
-        for name, val in params.iteritems():
+        for name, val in six.iteritems(params):
             if name != 'filter':
                 params[name] = val[0]
 
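six.iteritems(d) calls d.iteritems() on Python 2 and d.items() on Python 3, keeping py2's no-copy iteration while still running where iteritems() no longer exists. The same singleton-collapsing logic, runnable standalone:

import six

params = {'url': ['http://example.com/'], 'filter': ['status:200']}
for name, val in six.iteritems(params):
    if name != 'filter':
        params[name] = val[0]      # unwrap singletons, keep filters as lists
print(params['url'])   # http://example.com/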
@@ -15,8 +15,8 @@ from pywb.framework.wbrequestresponse import WbResponse
 from pywb.warc.recordloader import ArcWarcRecordLoader
 from pywb.warc.resolvingloader import ResolvingLoader
 
-from views import J2TemplateView, init_view
-from replay_views import ReplayView
+from pywb.webapp.views import J2TemplateView, init_view
+from pywb.webapp.replay_views import ReplayView
 from pywb.framework.memento import MementoResponse
 from pywb.utils.timeutils import datetime_to_timestamp
 
@@ -4,8 +4,8 @@ from pywb.framework.cache import create_cache
 from pywb.rewrite.rewrite_live import LiveRewriter
 from pywb.rewrite.wburl import WbUrl
 
-from handlers import StaticHandler, SearchPageWbUrlHandler
-from views import HeadInsertView
+from pywb.webapp.handlers import StaticHandler, SearchPageWbUrlHandler
+from pywb.webapp.views import HeadInsertView
 
 from pywb.utils.wbexception import WbException
 
@@ -60,7 +60,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
         except Exception as exc:
             import traceback
             err_details = traceback.format_exc(exc)
-            print err_details
+            print(err_details)
 
             url = wbrequest.wb_url.url
             msg = 'Could not load the url from the live web: ' + url
@@ -6,21 +6,22 @@ from pywb.framework.wbrequestresponse import WbRequest
 from pywb.framework.memento import MementoRequest
 from pywb.framework.basehandlers import BaseHandler
 
-from views import J2TemplateView
-from views import J2HtmlCapturesView, init_view
+from pywb.webapp.views import J2TemplateView
+from pywb.webapp.views import J2HtmlCapturesView, init_view
 
-from live_rewrite_handler import RewriteHandler
+from pywb.webapp.live_rewrite_handler import RewriteHandler
 
-from query_handler import QueryHandler
-from handlers import WBHandler
-from handlers import StaticHandler
-from handlers import DebugEchoHandler, DebugEchoEnvHandler
-from cdx_api_handler import CDXAPIHandler
+from pywb.webapp.query_handler import QueryHandler
+from pywb.webapp.handlers import WBHandler
+from pywb.webapp.handlers import StaticHandler
+from pywb.webapp.handlers import DebugEchoHandler, DebugEchoEnvHandler
+from pywb.webapp.cdx_api_handler import CDXAPIHandler
 
 from pywb import DEFAULT_CONFIG
 
 import os
 import logging
+import six
 
 
 #=================================================================
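Every import rewrite in this hunk is the same fix: Python 3 dropped implicit relative imports, so 'from views import ...' no longer resolves inside the pywb.webapp package. The fully qualified path works on both majors; on py2, the future import in the sketch below (not from the commit) enforces the same semantics early:

from __future__ import absolute_import

# py2-only implicit relative import -- fails under py3:
#   from views import J2TemplateView
# portable absolute form used throughout this commit:
from pywb.webapp.views import J2TemplateView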
@@ -130,7 +131,7 @@ def create_cdx_server_app(passed_config):
 
     routes = []
 
-    for name, value in collections.iteritems():
+    for name, value in six.iteritems(collections):
         route_config = init_route_config(value, config)
         query_handler = init_collection(route_config)
 
@@ -234,7 +235,7 @@ class DirectoryCollsLoader(object):
 
         # Check all templates
         template_files = self.config.get('paths')['template_files']
-        for tname, tfile in template_files.iteritems():
+        for tname, tfile in six.iteritems(template_files):
             if tname in coll_config:
                 # Already set
                 coll_config[tname] = self._norm_path(root_dir, coll_config[tname])
@@ -288,10 +289,10 @@ def create_wb_router(passed_config=None):
 
     jinja_env.globals.update(config.get('template_globals', {}))
 
-    for static_name, static_path in static_routes.iteritems():
+    for static_name, static_path in six.iteritems(static_routes):
         routes.append(Route(static_name, StaticHandler(static_path)))
 
-    for name, value in collections.iteritems():
+    for name, value in six.iteritems(collections):
         if isinstance(value, BaseHandler):
             handler_dict[name] = value
             new_route = Route(name, value, config=config)
@@ -1,12 +1,9 @@
-import urllib
-import urllib2
-
 from pywb.utils.dsrules import DEFAULT_RULES_FILE
 
 from pywb.perms.perms_filter import make_perms_cdx_filter
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.cdx.cdxserver import create_cdx_server
-from views import MementoTimemapView
+from pywb.webapp.views import MementoTimemapView
 
 
 #=================================================================
@@ -2,7 +2,7 @@ import re
 import logging
 
 from io import BytesIO
-from urlparse import urlsplit
+from six.moves.urllib.parse import urlsplit
 from itertools import chain
 
 from pywb.utils.statusandheaders import StatusAndHeaders
@@ -16,9 +16,9 @@ from pywb.framework.memento import MementoResponse
 from pywb.rewrite.rewrite_content import RewriteContent
 from pywb.warc.recordloader import ArchiveLoadFailed
 
-from views import HeadInsertView
+from pywb.webapp.views import HeadInsertView
 
-from rangecache import range_cache
+from pywb.webapp.rangecache import range_cache
 
 
 #=================================================================
@@ -2,13 +2,12 @@ from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import make_timemap, LINK_FORMAT
 
-import urlparse
-import urllib
+from six.moves.urllib.parse import urlsplit
 import logging
 import json
 import os
 
-from itertools import imap
 from jinja2 import Environment
 from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
 
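Dropping 'from itertools import imap' is safe because the builtin map() is already lazy on Python 3. Code that wants a lazy map spelled identically on both majors can take it from six.moves, for example:

from six.moves import map  # itertools.imap on py2, builtin map on py3

lazy = map(str.upper, ['prefix', 'redis'])  # nothing computed yet
print(list(lazy))                           # ['PREFIX', 'REDIS']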
@@ -48,7 +47,7 @@ def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
 
 @template_filter('urlsplit')
 def get_urlsplit(url):
-    split = urlparse.urlsplit(url)
+    split = urlsplit(url)
     return split
 
 