mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge branch 'develop' for 0.7.5
This commit is contained in:
commit
c935aa5ec9
4
.gitattributes
vendored
Normal file
4
.gitattributes
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
*.arc -text
|
||||||
|
*.warc -text
|
||||||
|
*.cdx -text
|
||||||
|
*.gz -text
|
13
CHANGES.rst
13
CHANGES.rst
@ -1,3 +1,16 @@
|
|||||||
|
pywb 0.7.5 changelist
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
* Cross platform fixes to support Windows -- all tests pass on Linux, OS X and Windows now. Improved cross-platform support includes:
|
||||||
|
- read all files as binary to avoid line ending issues
|
||||||
|
- properly convert url <-> file
|
||||||
|
- avoid platform-dependent APIs
|
||||||
|
|
||||||
|
* Change any unhandled exceptions to result in a 500 error, instead of 400.
|
||||||
|
|
||||||
|
* More comprehensive client-side ``src`` attribute rewriting (via wombat.js), additional server-side HTML tag rewriting.
|
||||||
|
|
||||||
|
|
||||||
pywb 0.7.2 changelist
|
pywb 0.7.2 changelist
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
PyWb 0.7.2
|
PyWb 0.7.5
|
||||||
==========
|
==========
|
||||||
|
|
||||||
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
|
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
|
||||||
@ -13,7 +13,7 @@ pywb is a python implementation of web archival replay tools, sometimes also kno
|
|||||||
pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
|
pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
|
||||||
The replay system is designed to accurately replay complex dynamic sites, including video and audio content.
|
The replay system is designed to accurately replay complex dynamic sites, including video and audio content.
|
||||||
|
|
||||||
pywb can be used as a traditional web application or an HTTP or HTTPS proxy server.
|
pywb can be used as a traditional web application or an HTTP or HTTPS proxy server, and has been tested on Linux, OS X and Windows platforms.
|
||||||
|
|
||||||
pywb is also fully compliant with the `Memento <http://mementoweb.org/>`_ protocol (`RFC-7089 <http://tools.ietf.org/html/rfc7089>`_).
|
pywb is also fully compliant with the `Memento <http://mementoweb.org/>`_ protocol (`RFC-7089 <http://tools.ietf.org/html/rfc7089>`_).
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ class CDXFile(CDXSource):
|
|||||||
def load_cdx(self, query):
|
def load_cdx(self, query):
|
||||||
def do_open():
|
def do_open():
|
||||||
try:
|
try:
|
||||||
source = open(self.filename)
|
source = open(self.filename, 'rb')
|
||||||
gen = iter_range(source, query.key, query.end_key)
|
gen = iter_range(source, query.key, query.end_key)
|
||||||
for line in gen:
|
for line in gen:
|
||||||
yield line
|
yield line
|
||||||
|
@ -26,7 +26,7 @@ test_cdx_dir = get_test_dir() + 'cdx/'
|
|||||||
|
|
||||||
def load_cdx_into_redis(source, filename, key=None):
|
def load_cdx_into_redis(source, filename, key=None):
|
||||||
# load a cdx into mock redis
|
# load a cdx into mock redis
|
||||||
with open(test_cdx_dir + filename) as fh:
|
with open(test_cdx_dir + filename, 'rb') as fh:
|
||||||
for line in fh:
|
for line in fh:
|
||||||
zadd_cdx(source, line, key)
|
zadd_cdx(source, line, key)
|
||||||
|
|
||||||
|
@ -84,7 +84,7 @@ class ZipNumCluster(CDXSource):
|
|||||||
self.loc_mtime = new_mtime
|
self.loc_mtime = new_mtime
|
||||||
|
|
||||||
logging.debug('Loading loc from: ' + self.loc_filename)
|
logging.debug('Loading loc from: ' + self.loc_filename)
|
||||||
with open(self.loc_filename) as fh:
|
with open(self.loc_filename, 'rb') as fh:
|
||||||
for line in fh:
|
for line in fh:
|
||||||
parts = line.rstrip().split('\t')
|
parts = line.rstrip().split('\t')
|
||||||
self.loc_map[parts[0]] = parts[1:]
|
self.loc_map[parts[0]] = parts[1:]
|
||||||
@ -112,7 +112,7 @@ class ZipNumCluster(CDXSource):
|
|||||||
def load_cdx(self, query):
|
def load_cdx(self, query):
|
||||||
self.load_loc()
|
self.load_loc()
|
||||||
|
|
||||||
reader = open(self.summary)
|
reader = open(self.summary, 'rb')
|
||||||
|
|
||||||
idx_iter = iter_range(reader,
|
idx_iter = iter_range(reader,
|
||||||
query.key,
|
query.key,
|
||||||
|
@ -13,8 +13,8 @@ from argparse import ArgumentParser
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Duration of 100 years
|
# Duration of 10 years
|
||||||
CERT_DURATION = 100 * 365 * 24 * 60 * 60
|
CERT_DURATION = 10 * 365 * 24 * 60 * 60
|
||||||
|
|
||||||
CERTS_DIR = './ca/certs/'
|
CERTS_DIR = './ca/certs/'
|
||||||
|
|
||||||
|
@ -334,7 +334,7 @@ class ProxyRouter(object):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
buff = ''
|
buff = ''
|
||||||
with open(self.ca.ca_file) as fh:
|
with open(self.ca.ca_file, 'rb') as fh:
|
||||||
buff = fh.read()
|
buff = fh.read()
|
||||||
|
|
||||||
content_type = 'application/x-x509-ca-cert'
|
content_type = 'application/x-x509-ca-cert'
|
||||||
|
@ -5,8 +5,8 @@ import shutil
|
|||||||
|
|
||||||
from pywb.framework.certauth import main, CertificateAuthority
|
from pywb.framework.certauth import main, CertificateAuthority
|
||||||
|
|
||||||
TEST_CA_DIR = './pywb/framework/test/pywb_test_ca_certs'
|
TEST_CA_DIR = os.path.join('.', 'pywb', 'framework', 'test', 'pywb_test_ca_certs')
|
||||||
TEST_CA_ROOT = './pywb/framework/test/pywb_test_ca.pem'
|
TEST_CA_ROOT = os.path.join('.', 'pywb', 'framework', 'test', 'pywb_test_ca.pem')
|
||||||
|
|
||||||
def setup_module():
|
def setup_module():
|
||||||
openssl_support = pytest.importorskip("OpenSSL")
|
openssl_support = pytest.importorskip("OpenSSL")
|
||||||
|
@ -14,7 +14,7 @@ class TestOkApp:
|
|||||||
|
|
||||||
class TestErrApp:
|
class TestErrApp:
|
||||||
def __call__(self, env):
|
def __call__(self, env):
|
||||||
raise Exception('Test Error')
|
raise Exception('Test Unexpected Error')
|
||||||
|
|
||||||
class TestCustomErrApp:
|
class TestCustomErrApp:
|
||||||
def __call__(self, env):
|
def __call__(self, env):
|
||||||
@ -41,8 +41,8 @@ def test_err_app():
|
|||||||
testapp = webtest.TestApp(the_app)
|
testapp = webtest.TestApp(the_app)
|
||||||
resp = testapp.get('/abc', expect_errors=True)
|
resp = testapp.get('/abc', expect_errors=True)
|
||||||
|
|
||||||
assert resp.status_int == 400
|
assert resp.status_int == 500
|
||||||
assert '400 Bad Request Error: Test Error' in resp.body
|
assert '500 Internal Server Error Error: Test Unexpected Error' in resp.body
|
||||||
|
|
||||||
def test_custom_err_app():
|
def test_custom_err_app():
|
||||||
the_app = init_app(initer(TestCustomErrApp), load_yaml=False)
|
the_app = init_app(initer(TestCustomErrApp), load_yaml=False)
|
||||||
|
@ -118,7 +118,7 @@ class WSGIApp(object):
|
|||||||
if hasattr(exc, 'status'):
|
if hasattr(exc, 'status'):
|
||||||
status = exc.status()
|
status = exc.status()
|
||||||
else:
|
else:
|
||||||
status = '400 Bad Request'
|
status = '500 Internal Server Error'
|
||||||
|
|
||||||
if hasattr(exc, 'url'):
|
if hasattr(exc, 'url'):
|
||||||
err_url = exc.url
|
err_url = exc.url
|
||||||
|
@ -30,6 +30,8 @@ class HTMLRewriterMixin(object):
|
|||||||
'base': {'href': defmod},
|
'base': {'href': defmod},
|
||||||
'blockquote': {'cite': defmod},
|
'blockquote': {'cite': defmod},
|
||||||
'body': {'background': 'im_'},
|
'body': {'background': 'im_'},
|
||||||
|
'button': {'formaction': defmod},
|
||||||
|
'command': {'icon': 'im_'},
|
||||||
'del': {'cite': defmod},
|
'del': {'cite': defmod},
|
||||||
'embed': {'src': 'oe_'},
|
'embed': {'src': 'oe_'},
|
||||||
'head': {'': defmod}, # for head rewriting
|
'head': {'': defmod}, # for head rewriting
|
||||||
@ -37,7 +39,8 @@ class HTMLRewriterMixin(object):
|
|||||||
'img': {'src': 'im_',
|
'img': {'src': 'im_',
|
||||||
'srcset': 'im_'},
|
'srcset': 'im_'},
|
||||||
'ins': {'cite': defmod},
|
'ins': {'cite': defmod},
|
||||||
'input': {'src': 'im_'},
|
'input': {'src': 'im_',
|
||||||
|
'formaction': defmod},
|
||||||
'form': {'action': defmod},
|
'form': {'action': defmod},
|
||||||
'frame': {'src': 'fr_'},
|
'frame': {'src': 'fr_'},
|
||||||
'link': {'href': 'oe_'},
|
'link': {'href': 'oe_'},
|
||||||
@ -49,7 +52,8 @@ class HTMLRewriterMixin(object):
|
|||||||
'ref': {'href': 'oe_'},
|
'ref': {'href': 'oe_'},
|
||||||
'script': {'src': 'js_'},
|
'script': {'src': 'js_'},
|
||||||
'source': {'src': 'oe_'},
|
'source': {'src': 'oe_'},
|
||||||
'video': {'src': 'oe_'},
|
'video': {'src': 'oe_',
|
||||||
|
'poster': 'im_'},
|
||||||
|
|
||||||
'div': {'data-src': defmod,
|
'div': {'data-src': defmod,
|
||||||
'data-uri': defmod},
|
'data-uri': defmod},
|
||||||
|
@ -6,10 +6,11 @@ import requests
|
|||||||
import datetime
|
import datetime
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
from urlparse import urlsplit
|
from urlparse import urlsplit
|
||||||
|
|
||||||
from pywb.utils.loaders import is_http, LimitReader, BlockLoader
|
from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
from pywb.utils.timeutils import datetime_to_timestamp
|
from pywb.utils.timeutils import datetime_to_timestamp
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
@ -180,11 +181,18 @@ class LiveRewriter(object):
|
|||||||
if url.startswith('//'):
|
if url.startswith('//'):
|
||||||
url = 'http:' + url
|
url = 'http:' + url
|
||||||
|
|
||||||
|
if is_http(url):
|
||||||
|
is_remote = True
|
||||||
|
else:
|
||||||
|
is_remote = False
|
||||||
|
if not url.startswith('file:'):
|
||||||
|
url = to_file_url(url)
|
||||||
|
|
||||||
# explicit urlkey may be passed in (say for testing)
|
# explicit urlkey may be passed in (say for testing)
|
||||||
if not urlkey:
|
if not urlkey:
|
||||||
urlkey = canonicalize(url)
|
urlkey = canonicalize(url)
|
||||||
|
|
||||||
if is_http(url):
|
if is_remote:
|
||||||
(status_headers, stream) = self.fetch_http(url, urlkey, env,
|
(status_headers, stream) = self.fetch_http(url, urlkey, env,
|
||||||
req_headers,
|
req_headers,
|
||||||
follow_redirects,
|
follow_redirects,
|
||||||
|
@ -103,12 +103,17 @@
|
|||||||
'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b¶m2=http://test.example.com'
|
'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b¶m2=http://test.example.com'
|
||||||
|
|
||||||
# HttpsUrlRewriter tests
|
# HttpsUrlRewriter tests
|
||||||
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
|
>>> httpsrewriter = HttpsUrlRewriter('http://example.com/', None)
|
||||||
|
>>> httpsrewriter.rewrite('https://example.com/abc')
|
||||||
'http://example.com/abc'
|
'http://example.com/abc'
|
||||||
|
|
||||||
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('http://example.com/abc')
|
>>> httpsrewriter.rewrite('http://example.com/abc')
|
||||||
'http://example.com/abc'
|
'http://example.com/abc'
|
||||||
|
|
||||||
|
# rebase is identity
|
||||||
|
>>> httpsrewriter.rebase_rewriter('https://example.com/') == httpsrewriter
|
||||||
|
True
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -105,6 +105,8 @@ _WBWombat = (function() {
|
|||||||
"http:/" + prefix, "https:/" + prefix];
|
"http:/" + prefix, "https:/" + prefix];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var SRC_TAGS = ["IMG", "SCRIPT", "VIDEO", "AUDIO", "SOURCE", "EMBED", "INPUT"];
|
||||||
|
|
||||||
//============================================
|
//============================================
|
||||||
function rewrite_url_(url) {
|
function rewrite_url_(url) {
|
||||||
// If undefined, just return it
|
// If undefined, just return it
|
||||||
@ -692,12 +694,9 @@ _WBWombat = (function() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
override_attr(created, "src");
|
override_attr(created, "src");
|
||||||
} else if (created.tagName == "IMG" || created.tagName == "VIDEO" || created.tagName == "AUDIO") {
|
} else if (created.tagName && starts_with(created.tagName, SRC_TAGS)) {
|
||||||
override_attr(created, "src");
|
override_attr(created, "src");
|
||||||
}
|
}
|
||||||
// } else if (created.tagName == "A") {
|
|
||||||
// override_attr(created, "href");
|
|
||||||
// }
|
|
||||||
|
|
||||||
return created;
|
return created;
|
||||||
}
|
}
|
||||||
|
@ -46,9 +46,6 @@ class BufferedReader(object):
|
|||||||
self.buff_size = 0
|
self.buff_size = 0
|
||||||
|
|
||||||
def set_decomp(self, decomp_type):
|
def set_decomp(self, decomp_type):
|
||||||
if self.num_read > 0:
|
|
||||||
raise Exception('Attempting to change decompression mid-stream')
|
|
||||||
|
|
||||||
self._init_decomp(decomp_type)
|
self._init_decomp(decomp_type)
|
||||||
|
|
||||||
def _init_decomp(self, decomp_type):
|
def _init_decomp(self, decomp_type):
|
||||||
|
@ -7,6 +7,7 @@ import os
|
|||||||
import hmac
|
import hmac
|
||||||
import urllib
|
import urllib
|
||||||
import urllib2
|
import urllib2
|
||||||
|
import urlparse
|
||||||
import time
|
import time
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
from io import open
|
from io import open
|
||||||
@ -17,6 +18,15 @@ def is_http(filename):
|
|||||||
return filename.startswith(('http://', 'https://'))
|
return filename.startswith(('http://', 'https://'))
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def to_file_url(filename):
|
||||||
|
""" Convert a filename to a file:// url
|
||||||
|
"""
|
||||||
|
url = os.path.abspath(filename)
|
||||||
|
url = urlparse.urljoin('file:', urllib.pathname2url(url))
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def load_yaml_config(config_file):
|
def load_yaml_config(config_file):
|
||||||
import yaml
|
import yaml
|
||||||
@ -39,12 +49,12 @@ def extract_post_query(method, mime, length, stream):
|
|||||||
not mime.lower().startswith('application/x-www-form-urlencoded'))):
|
not mime.lower().startswith('application/x-www-form-urlencoded'))):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not length or length == '0':
|
|
||||||
return None
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
length = int(length)
|
length = int(length)
|
||||||
except ValueError:
|
except (ValueError, TypeError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
if length <= 0:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
#todo: encoding issues?
|
#todo: encoding issues?
|
||||||
@ -129,9 +139,10 @@ class BlockLoader(object):
|
|||||||
# if starting with . or /, can only be a file path..
|
# if starting with . or /, can only be a file path..
|
||||||
file_only = url.startswith(('/', '.'))
|
file_only = url.startswith(('/', '.'))
|
||||||
|
|
||||||
|
# convert to filename
|
||||||
if url.startswith('file://'):
|
if url.startswith('file://'):
|
||||||
url = url[len('file://'):]
|
|
||||||
file_only = True
|
file_only = True
|
||||||
|
url = urllib.url2pathname(url[len('file://'):])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# first, try as file
|
# first, try as file
|
||||||
|
@ -66,12 +66,12 @@ from pywb import get_test_dir
|
|||||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||||
|
|
||||||
def print_binsearch_results(key, iter_func):
|
def print_binsearch_results(key, iter_func):
|
||||||
with open(test_cdx_dir + 'iana.cdx') as cdx:
|
with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
|
||||||
for line in iter_func(cdx, key):
|
for line in iter_func(cdx, key):
|
||||||
print line
|
print line
|
||||||
|
|
||||||
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
|
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
|
||||||
with open(test_cdx_dir + 'iana.cdx') as cdx:
|
with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
|
||||||
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
|
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
|
||||||
print line
|
print line
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ True
|
|||||||
100
|
100
|
||||||
|
|
||||||
# no length specified, read full amount requested
|
# no length specified, read full amount requested
|
||||||
>>> len(BlockLoader().load('file://' + test_cdx_dir + 'example.cdx', 0, -1).read(400))
|
>>> len(BlockLoader().load(to_file_url(test_cdx_dir + 'example.cdx'), 0, -1).read(400))
|
||||||
400
|
400
|
||||||
|
|
||||||
# HMAC Cookie Maker
|
# HMAC Cookie Maker
|
||||||
@ -56,14 +56,41 @@ True
|
|||||||
>>> extract_client_cookie(dict(HTTP_COOKIE='x'), 'x')
|
>>> extract_client_cookie(dict(HTTP_COOKIE='x'), 'x')
|
||||||
|
|
||||||
>>> extract_client_cookie({}, 'y')
|
>>> extract_client_cookie({}, 'y')
|
||||||
|
|
||||||
|
|
||||||
|
# extract_post_query tests
|
||||||
|
|
||||||
|
# correct POST data
|
||||||
|
>>> post_data = 'foo=bar&dir=%2Fbaz'
|
||||||
|
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
||||||
|
'foo=bar&dir=/baz'
|
||||||
|
|
||||||
|
# unsupported method
|
||||||
|
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
||||||
|
|
||||||
|
# unsupported type
|
||||||
|
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
|
||||||
|
|
||||||
|
# invalid length
|
||||||
|
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
|
||||||
|
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data))
|
||||||
|
|
||||||
|
# length too short
|
||||||
|
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data))
|
||||||
|
'foo=bar&dir=%2'
|
||||||
|
|
||||||
|
# length too long
|
||||||
|
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
|
||||||
|
'foo=bar&dir=/baz'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
|
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||||
from pywb.utils.loaders import LimitReader, extract_client_cookie
|
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
@ -82,7 +109,6 @@ def seek_read_full(seekable_reader, offset):
|
|||||||
return seekable_reader.readline()
|
return seekable_reader.readline()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
@ -6,8 +6,9 @@ class WbException(Exception):
|
|||||||
Exception.__init__(self, msg)
|
Exception.__init__(self, msg)
|
||||||
self.url = url
|
self.url = url
|
||||||
|
|
||||||
def status(self):
|
# Default Error Code
|
||||||
return '500 Internal Server Error'
|
# def status(self):
|
||||||
|
# return '500 Internal Server Error'
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -115,8 +115,8 @@ def write_multi_cdx_index(output, inputs, **options):
|
|||||||
outpath = cdx_filename(filename)
|
outpath = cdx_filename(filename)
|
||||||
outpath = os.path.join(output, outpath)
|
outpath = os.path.join(output, outpath)
|
||||||
|
|
||||||
with open(outpath, 'w') as outfile:
|
with open(outpath, 'wb') as outfile:
|
||||||
with open(fullpath, 'r') as infile:
|
with open(fullpath, 'rb') as infile:
|
||||||
write_cdx_index(outfile, infile, filename, **options)
|
write_cdx_index(outfile, infile, filename, **options)
|
||||||
|
|
||||||
# write to one cdx file
|
# write to one cdx file
|
||||||
@ -124,7 +124,7 @@ def write_multi_cdx_index(output, inputs, **options):
|
|||||||
if output == '-':
|
if output == '-':
|
||||||
outfile = sys.stdout
|
outfile = sys.stdout
|
||||||
else:
|
else:
|
||||||
outfile = open(output, 'w')
|
outfile = open(output, 'wb')
|
||||||
|
|
||||||
if options.get('sort'):
|
if options.get('sort'):
|
||||||
writer_cls = SortedCDXWriter
|
writer_cls = SortedCDXWriter
|
||||||
@ -133,7 +133,7 @@ def write_multi_cdx_index(output, inputs, **options):
|
|||||||
|
|
||||||
with writer_cls(outfile, options.get('cdx09')) as writer:
|
with writer_cls(outfile, options.get('cdx09')) as writer:
|
||||||
for fullpath, filename in iter_file_or_dir(inputs):
|
for fullpath, filename in iter_file_or_dir(inputs):
|
||||||
with open(fullpath, 'r') as infile:
|
with open(fullpath, 'rb') as infile:
|
||||||
entry_iter = create_index_iter(infile, **options)
|
entry_iter = create_index_iter(infile, **options)
|
||||||
|
|
||||||
for entry in entry_iter:
|
for entry in entry_iter:
|
||||||
|
@ -3,6 +3,7 @@ import redis
|
|||||||
from pywb.utils.binsearch import iter_exact
|
from pywb.utils.binsearch import iter_exact
|
||||||
|
|
||||||
import urlparse
|
import urlparse
|
||||||
|
import urllib
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
@ -56,7 +57,7 @@ class RedisResolver:
|
|||||||
class PathIndexResolver:
|
class PathIndexResolver:
|
||||||
def __init__(self, pathindex_file):
|
def __init__(self, pathindex_file):
|
||||||
self.pathindex_file = pathindex_file
|
self.pathindex_file = pathindex_file
|
||||||
self.reader = open(pathindex_file)
|
self.reader = open(pathindex_file, 'rb')
|
||||||
|
|
||||||
def __call__(self, filename):
|
def __call__(self, filename):
|
||||||
result = iter_exact(self.reader, filename, '\t')
|
result = iter_exact(self.reader, filename, '\t')
|
||||||
@ -92,6 +93,7 @@ def make_best_resolver(param):
|
|||||||
|
|
||||||
if url_parts.scheme == 'file':
|
if url_parts.scheme == 'file':
|
||||||
path = url_parts.path
|
path = url_parts.path
|
||||||
|
path = urllib.url2pathname(path)
|
||||||
|
|
||||||
if os.path.isfile(path):
|
if os.path.isfile(path):
|
||||||
logging.debug('Adding Path Index: ' + path)
|
logging.debug('Adding Path Index: ' + path)
|
||||||
|
@ -160,7 +160,7 @@ TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
|||||||
TEST_WARC_DIR = get_test_dir() + 'warcs/'
|
TEST_WARC_DIR = get_test_dir() + 'warcs/'
|
||||||
|
|
||||||
def read_fully(cdx):
|
def read_fully(cdx):
|
||||||
with open(TEST_CDX_DIR + cdx) as fh:
|
with open(TEST_CDX_DIR + cdx, 'rb') as fh:
|
||||||
curr = BytesIO()
|
curr = BytesIO()
|
||||||
while True:
|
while True:
|
||||||
b = fh.read()
|
b = fh.read()
|
||||||
@ -172,7 +172,7 @@ def read_fully(cdx):
|
|||||||
def cdx_index(warc, **options):
|
def cdx_index(warc, **options):
|
||||||
buff = BytesIO()
|
buff = BytesIO()
|
||||||
|
|
||||||
with open(TEST_WARC_DIR + warc) as fh:
|
with open(TEST_WARC_DIR + warc, 'rb') as fh:
|
||||||
write_cdx_index(buff, fh, warc, **options)
|
write_cdx_index(buff, fh, warc, **options)
|
||||||
|
|
||||||
return buff.getvalue()
|
return buff.getvalue()
|
||||||
@ -213,7 +213,7 @@ def cli_lines_with_dir(input_):
|
|||||||
|
|
||||||
print filename
|
print filename
|
||||||
|
|
||||||
with open(os.path.join(tmp_dir, filename), 'r') as fh:
|
with open(os.path.join(tmp_dir, filename), 'rb') as fh:
|
||||||
lines = fh.read(8192).rstrip().split('\n')
|
lines = fh.read(8192).rstrip().split('\n')
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
@ -33,13 +33,13 @@ PrefixResolver('http://myhost.example.com/warcs/', contains = '/')
|
|||||||
RedisResolver('redis://myhost.example.com:1234/1')
|
RedisResolver('redis://myhost.example.com:1234/1')
|
||||||
|
|
||||||
# a file
|
# a file
|
||||||
>>> r = make_best_resolver('file://' + os.path.realpath(__file__))
|
>>> r = make_best_resolver(to_file_url(os.path.realpath(__file__)))
|
||||||
>>> r.__class__.__name__
|
>>> r.__class__.__name__
|
||||||
'PathIndexResolver'
|
'PathIndexResolver'
|
||||||
|
|
||||||
# a dir
|
# a dir
|
||||||
>>> path = os.path.realpath(__file__)
|
>>> path = os.path.realpath(__file__)
|
||||||
>>> r = make_best_resolver('file://' + os.path.dirname(path))
|
>>> r = make_best_resolver(to_file_url(os.path.dirname(path)))
|
||||||
>>> r.__class__.__name__
|
>>> r.__class__.__name__
|
||||||
'PrefixResolver'
|
'PrefixResolver'
|
||||||
|
|
||||||
@ -54,8 +54,9 @@ RedisResolver('redis://myhost.example.com:1234/1')
|
|||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
from pywb.warc.pathresolvers import PrefixResolver, PathIndexResolver, RedisResolver
|
from pywb.warc.pathresolvers import PrefixResolver, PathIndexResolver, RedisResolver
|
||||||
from pywb.warc.pathresolvers import make_best_resolver, make_best_resolvers
|
from pywb.warc.pathresolvers import make_best_resolver, make_best_resolvers
|
||||||
import os
|
from pywb.utils.loaders import to_file_url
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
from fakeredis import FakeStrictRedis
|
from fakeredis import FakeStrictRedis
|
||||||
from mock import patch
|
from mock import patch
|
||||||
@ -68,7 +69,6 @@ def init_redis_resolver():
|
|||||||
def hset_path(filename, path):
|
def hset_path(filename, path):
|
||||||
redis_resolver.redis.hset(redis_resolver.key_prefix + filename, 'path', path)
|
redis_resolver.redis.hset(redis_resolver.key_prefix + filename, 'path', path)
|
||||||
|
|
||||||
|
|
||||||
redis_resolver = init_redis_resolver()
|
redis_resolver = init_redis_resolver()
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -65,7 +65,7 @@ class RangeCache(object):
|
|||||||
maxlen = min(maxlen, end - start + 1)
|
maxlen = min(maxlen, end - start + 1)
|
||||||
|
|
||||||
def read_range():
|
def read_range():
|
||||||
with open(spec['name']) as fh:
|
with open(spec['name'], 'rb') as fh:
|
||||||
fh.seek(start)
|
fh.seek(start)
|
||||||
fh = LimitReader.wrap_stream(fh, maxlen)
|
fh = LimitReader.wrap_stream(fh, maxlen)
|
||||||
while True:
|
while True:
|
||||||
|
20
pywb/webapp/test/test_view_filters.py
Normal file
20
pywb/webapp/test/test_view_filters.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
"""
|
||||||
|
>>> format_ts('20141226101000')
|
||||||
|
'Fri, Dec 26 2014 10:10:00'
|
||||||
|
|
||||||
|
>>> format_ts('20141226101000', '%s')
|
||||||
|
1419588600
|
||||||
|
|
||||||
|
>>> is_wb_handler(DebugEchoHandler())
|
||||||
|
False
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pywb.webapp.views import format_ts, is_wb_handler
|
||||||
|
from pywb.webapp.handlers import DebugEchoHandler
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
@ -1,4 +1,4 @@
|
|||||||
from pywb.utils.timeutils import timestamp_to_datetime
|
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
|
||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
from pywb.framework.memento import make_timemap, LINK_FORMAT
|
from pywb.framework.memento import make_timemap, LINK_FORMAT
|
||||||
|
|
||||||
@ -22,11 +22,7 @@ class template_filter(object):
|
|||||||
Otherwise, the func name is the filter name
|
Otherwise, the func name is the filter name
|
||||||
"""
|
"""
|
||||||
def __init__(self, param=None):
|
def __init__(self, param=None):
|
||||||
if hasattr(param, '__call__'):
|
self.name = param
|
||||||
self.name = None
|
|
||||||
self.__call__(param)
|
|
||||||
else:
|
|
||||||
self.name = param
|
|
||||||
|
|
||||||
def __call__(self, func):
|
def __call__(self, func):
|
||||||
name = self.name
|
name = self.name
|
||||||
@ -39,10 +35,13 @@ class template_filter(object):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Filters
|
# Filters
|
||||||
@template_filter
|
@template_filter()
|
||||||
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
|
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
|
||||||
value = timestamp_to_datetime(value)
|
if format_ == '%s':
|
||||||
return value.strftime(format_)
|
return timestamp_to_sec(value)
|
||||||
|
else:
|
||||||
|
value = timestamp_to_datetime(value)
|
||||||
|
return value.strftime(format_)
|
||||||
|
|
||||||
|
|
||||||
@template_filter('urlsplit')
|
@template_filter('urlsplit')
|
||||||
@ -51,17 +50,11 @@ def get_urlsplit(url):
|
|||||||
return split
|
return split
|
||||||
|
|
||||||
|
|
||||||
@template_filter()
|
|
||||||
def request_hostname(env):
|
|
||||||
return env.get('HTTP_HOST', 'localhost')
|
|
||||||
|
|
||||||
|
|
||||||
@template_filter()
|
@template_filter()
|
||||||
def is_wb_handler(obj):
|
def is_wb_handler(obj):
|
||||||
if not hasattr(obj, 'handler'):
|
if not hasattr(obj, 'handler'):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
#return isinstance(obj.handler, WBHandler)
|
|
||||||
return obj.handler.__class__.__name__ == "WBHandler"
|
return obj.handler.__class__.__name__ == "WBHandler"
|
||||||
|
|
||||||
|
|
||||||
|
12
setup.py
12
setup.py
@ -34,7 +34,7 @@ class PyTest(TestCommand):
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='pywb',
|
name='pywb',
|
||||||
version='0.7.2',
|
version='0.7.5',
|
||||||
url='https://github.com/ikreymer/pywb',
|
url='https://github.com/ikreymer/pywb',
|
||||||
author='Ilya Kreymer',
|
author='Ilya Kreymer',
|
||||||
author_email='ikreymer@gmail.com',
|
author_email='ikreymer@gmail.com',
|
||||||
@ -58,10 +58,10 @@ setup(
|
|||||||
'pywb': ['static/flowplayer/*', 'static/*.*', 'ui/*', '*.yaml'],
|
'pywb': ['static/flowplayer/*', 'static/*.*', 'ui/*', '*.yaml'],
|
||||||
},
|
},
|
||||||
data_files=[
|
data_files=[
|
||||||
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
|
||||||
('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
|
('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
|
||||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
|
('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
|
||||||
('sample_archive/text_content/',
|
('sample_archive/text_content',
|
||||||
glob.glob('sample_archive/text_content/*')),
|
glob.glob('sample_archive/text_content/*')),
|
||||||
],
|
],
|
||||||
install_requires=[
|
install_requires=[
|
||||||
@ -90,7 +90,7 @@ setup(
|
|||||||
live-rewrite-server = pywb.apps.live_rewrite_server:main
|
live-rewrite-server = pywb.apps.live_rewrite_server:main
|
||||||
proxy-cert-auth = pywb.framework.certauth:main
|
proxy-cert-auth = pywb.framework.certauth:main
|
||||||
""",
|
""",
|
||||||
zip_safe=False,
|
zip_safe=True,
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 4 - Beta',
|
'Development Status :: 4 - Beta',
|
||||||
'Environment :: Web Environment',
|
'Environment :: Web Environment',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user