1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'develop' for 0.7.5

This commit is contained in:
Ilya Kreymer 2015-01-12 00:50:16 -08:00
commit c935aa5ec9
28 changed files with 158 additions and 75 deletions

4
.gitattributes vendored Normal file
View File

@ -0,0 +1,4 @@
*.arc -text
*.warc -text
*.cdx -text
*.gz -text

View File

@ -1,3 +1,16 @@
pywb 0.7.5 changelist
~~~~~~~~~~~~~~~~~~~~~
* Cross-platform fixes to support Windows -- all tests now pass on Linux, OS X and Windows. Improved cross-platform support includes:
- read all files as binary to avoid line-ending issues
- properly convert between URLs and file paths
- avoid platform-dependent APIs
* Change any unhandled exceptions to result in a 500 error, instead of 400.
* More comprehensive client-side ``src`` attribute rewriting (via wombat.js), and additional server-side HTML tag rewriting.
pywb 0.7.2 changelist
~~~~~~~~~~~~~~~~~~~~~

View File

@ -1,4 +1,4 @@
PyWb 0.7.2
PyWb 0.7.5
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
@ -13,7 +13,7 @@ pywb is a python implementation of web archival replay tools, sometimes also kno
pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
The replay system is designed to accurately replay complex dynamic sites, including video and audio content.
pywb can be used as a traditional web application or an HTTP or HTTPS proxy server.
pywb can be used as a traditional web application or an HTTP or HTTPS proxy server, and has been tested on Linux, OS X and Windows platforms.
pywb is also fully compliant with the `Memento <http://mementoweb.org/>`_ protocol (`RFC-7089 <http://tools.ietf.org/html/rfc7089>`_).

View File

@ -30,7 +30,7 @@ class CDXFile(CDXSource):
def load_cdx(self, query):
def do_open():
try:
source = open(self.filename)
source = open(self.filename, 'rb')
gen = iter_range(source, query.key, query.end_key)
for line in gen:
yield line

View File

@ -26,7 +26,7 @@ test_cdx_dir = get_test_dir() + 'cdx/'
def load_cdx_into_redis(source, filename, key=None):
# load a cdx into mock redis
with open(test_cdx_dir + filename) as fh:
with open(test_cdx_dir + filename, 'rb') as fh:
for line in fh:
zadd_cdx(source, line, key)

View File

@ -84,7 +84,7 @@ class ZipNumCluster(CDXSource):
self.loc_mtime = new_mtime
logging.debug('Loading loc from: ' + self.loc_filename)
with open(self.loc_filename) as fh:
with open(self.loc_filename, 'rb') as fh:
for line in fh:
parts = line.rstrip().split('\t')
self.loc_map[parts[0]] = parts[1:]
@ -112,7 +112,7 @@ class ZipNumCluster(CDXSource):
def load_cdx(self, query):
self.load_loc()
reader = open(self.summary)
reader = open(self.summary, 'rb')
idx_iter = iter_range(reader,
query.key,

View File

@ -13,8 +13,8 @@ from argparse import ArgumentParser
#=================================================================
# Duration of 100 years
CERT_DURATION = 100 * 365 * 24 * 60 * 60
# Duration of 10 years
CERT_DURATION = 10 * 365 * 24 * 60 * 60
CERTS_DIR = './ca/certs/'

View File

@ -334,7 +334,7 @@ class ProxyRouter(object):
return None
buff = ''
with open(self.ca.ca_file) as fh:
with open(self.ca.ca_file, 'rb') as fh:
buff = fh.read()
content_type = 'application/x-x509-ca-cert'

View File

@ -5,8 +5,8 @@ import shutil
from pywb.framework.certauth import main, CertificateAuthority
TEST_CA_DIR = './pywb/framework/test/pywb_test_ca_certs'
TEST_CA_ROOT = './pywb/framework/test/pywb_test_ca.pem'
TEST_CA_DIR = os.path.join('.', 'pywb', 'framework', 'test', 'pywb_test_ca_certs')
TEST_CA_ROOT = os.path.join('.', 'pywb', 'framework', 'test', 'pywb_test_ca.pem')
def setup_module():
openssl_support = pytest.importorskip("OpenSSL")

View File

@ -14,7 +14,7 @@ class TestOkApp:
class TestErrApp:
def __call__(self, env):
raise Exception('Test Error')
raise Exception('Test Unexpected Error')
class TestCustomErrApp:
def __call__(self, env):
@ -41,8 +41,8 @@ def test_err_app():
testapp = webtest.TestApp(the_app)
resp = testapp.get('/abc', expect_errors=True)
assert resp.status_int == 400
assert '400 Bad Request Error: Test Error' in resp.body
assert resp.status_int == 500
assert '500 Internal Server Error Error: Test Unexpected Error' in resp.body
def test_custom_err_app():
the_app = init_app(initer(TestCustomErrApp), load_yaml=False)

View File

@ -118,7 +118,7 @@ class WSGIApp(object):
if hasattr(exc, 'status'):
status = exc.status()
else:
status = '400 Bad Request'
status = '500 Internal Server Error'
if hasattr(exc, 'url'):
err_url = exc.url

View File

@ -30,6 +30,8 @@ class HTMLRewriterMixin(object):
'base': {'href': defmod},
'blockquote': {'cite': defmod},
'body': {'background': 'im_'},
'button': {'formaction': defmod},
'command': {'icon': 'im_'},
'del': {'cite': defmod},
'embed': {'src': 'oe_'},
'head': {'': defmod}, # for head rewriting
@ -37,7 +39,8 @@ class HTMLRewriterMixin(object):
'img': {'src': 'im_',
'srcset': 'im_'},
'ins': {'cite': defmod},
'input': {'src': 'im_'},
'input': {'src': 'im_',
'formaction': defmod},
'form': {'action': defmod},
'frame': {'src': 'fr_'},
'link': {'href': 'oe_'},
@ -49,7 +52,8 @@ class HTMLRewriterMixin(object):
'ref': {'href': 'oe_'},
'script': {'src': 'js_'},
'source': {'src': 'oe_'},
'video': {'src': 'oe_'},
'video': {'src': 'oe_',
'poster': 'im_'},
'div': {'data-src': defmod,
'data-uri': defmod},

View File

@ -6,10 +6,11 @@ import requests
import datetime
import mimetypes
import logging
import os
from urlparse import urlsplit
from pywb.utils.loaders import is_http, LimitReader, BlockLoader
from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
@ -180,11 +181,18 @@ class LiveRewriter(object):
if url.startswith('//'):
url = 'http:' + url
if is_http(url):
is_remote = True
else:
is_remote = False
if not url.startswith('file:'):
url = to_file_url(url)
# explicit urlkey may be passed in (say for testing)
if not urlkey:
urlkey = canonicalize(url)
if is_http(url):
if is_remote:
(status_headers, stream) = self.fetch_http(url, urlkey, env,
req_headers,
follow_redirects,

View File

@ -103,12 +103,17 @@
'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b&param2=http://test.example.com'
# HttpsUrlRewriter tests
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
>>> httpsrewriter = HttpsUrlRewriter('http://example.com/', None)
>>> httpsrewriter.rewrite('https://example.com/abc')
'http://example.com/abc'
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('http://example.com/abc')
>>> httpsrewriter.rewrite('http://example.com/abc')
'http://example.com/abc'
# rebase is identity
>>> httpsrewriter.rebase_rewriter('https://example.com/') == httpsrewriter
True
"""

View File

@ -105,6 +105,8 @@ _WBWombat = (function() {
"http:/" + prefix, "https:/" + prefix];
}
var SRC_TAGS = ["IMG", "SCRIPT", "VIDEO", "AUDIO", "SOURCE", "EMBED", "INPUT"];
//============================================
function rewrite_url_(url) {
// If undefined, just return it
@ -692,12 +694,9 @@ _WBWombat = (function() {
}
override_attr(created, "src");
} else if (created.tagName == "IMG" || created.tagName == "VIDEO" || created.tagName == "AUDIO") {
} else if (created.tagName && starts_with(created.tagName, SRC_TAGS)) {
override_attr(created, "src");
}
// } else if (created.tagName == "A") {
// override_attr(created, "href");
// }
return created;
}

View File

@ -46,9 +46,6 @@ class BufferedReader(object):
self.buff_size = 0
def set_decomp(self, decomp_type):
if self.num_read > 0:
raise Exception('Attempting to change decompression mid-stream')
self._init_decomp(decomp_type)
def _init_decomp(self, decomp_type):

View File

@ -7,6 +7,7 @@ import os
import hmac
import urllib
import urllib2
import urlparse
import time
import pkg_resources
from io import open
@ -17,6 +18,15 @@ def is_http(filename):
return filename.startswith(('http://', 'https://'))
#=================================================================
def to_file_url(filename):
""" Convert a filename to a file:// url
"""
url = os.path.abspath(filename)
url = urlparse.urljoin('file:', urllib.pathname2url(url))
return url
#=================================================================
def load_yaml_config(config_file):
import yaml
@ -39,12 +49,12 @@ def extract_post_query(method, mime, length, stream):
not mime.lower().startswith('application/x-www-form-urlencoded'))):
return None
if not length or length == '0':
return None
try:
length = int(length)
except ValueError:
except (ValueError, TypeError):
return None
if length <= 0:
return None
#todo: encoding issues?
@ -129,9 +139,10 @@ class BlockLoader(object):
# if starting with . or /, can only be a file path..
file_only = url.startswith(('/', '.'))
# convert to filename
if url.startswith('file://'):
url = url[len('file://'):]
file_only = True
url = urllib.url2pathname(url[len('file://'):])
try:
# first, try as file

View File

@ -66,12 +66,12 @@ from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/'
def print_binsearch_results(key, iter_func):
with open(test_cdx_dir + 'iana.cdx') as cdx:
with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
for line in iter_func(cdx, key):
print line
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
with open(test_cdx_dir + 'iana.cdx') as cdx:
with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
print line

View File

@ -25,7 +25,7 @@ True
100
# no length specified, read full amount requested
>>> len(BlockLoader().load('file://' + test_cdx_dir + 'example.cdx', 0, -1).read(400))
>>> len(BlockLoader().load(to_file_url(test_cdx_dir + 'example.cdx'), 0, -1).read(400))
400
# HMAC Cookie Maker
@ -56,14 +56,41 @@ True
>>> extract_client_cookie(dict(HTTP_COOKIE='x'), 'x')
>>> extract_client_cookie({}, 'y')
# extract_post_query tests
# correct POST data
>>> post_data = 'foo=bar&dir=%2Fbaz'
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
'foo=bar&dir=/baz'
# unsupported method
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
# unsupported type
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
# invalid length
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data))
# length too short
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data))
'foo=bar&dir=%2'
# length too long
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
'foo=bar&dir=/baz'
"""
#=================================================================
import re
import os
from io import BytesIO
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, extract_client_cookie
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
from pywb import get_test_dir
@ -82,7 +109,6 @@ def seek_read_full(seekable_reader, offset):
return seekable_reader.readline()
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -6,8 +6,9 @@ class WbException(Exception):
Exception.__init__(self, msg)
self.url = url
def status(self):
return '500 Internal Server Error'
# Default Error Code
# def status(self):
# return '500 Internal Server Error'
#=================================================================

View File

@ -115,8 +115,8 @@ def write_multi_cdx_index(output, inputs, **options):
outpath = cdx_filename(filename)
outpath = os.path.join(output, outpath)
with open(outpath, 'w') as outfile:
with open(fullpath, 'r') as infile:
with open(outpath, 'wb') as outfile:
with open(fullpath, 'rb') as infile:
write_cdx_index(outfile, infile, filename, **options)
# write to one cdx file
@ -124,7 +124,7 @@ def write_multi_cdx_index(output, inputs, **options):
if output == '-':
outfile = sys.stdout
else:
outfile = open(output, 'w')
outfile = open(output, 'wb')
if options.get('sort'):
writer_cls = SortedCDXWriter
@ -133,7 +133,7 @@ def write_multi_cdx_index(output, inputs, **options):
with writer_cls(outfile, options.get('cdx09')) as writer:
for fullpath, filename in iter_file_or_dir(inputs):
with open(fullpath, 'r') as infile:
with open(fullpath, 'rb') as infile:
entry_iter = create_index_iter(infile, **options)
for entry in entry_iter:

View File

@ -3,6 +3,7 @@ import redis
from pywb.utils.binsearch import iter_exact
import urlparse
import urllib
import os
import logging
@ -56,7 +57,7 @@ class RedisResolver:
class PathIndexResolver:
def __init__(self, pathindex_file):
self.pathindex_file = pathindex_file
self.reader = open(pathindex_file)
self.reader = open(pathindex_file, 'rb')
def __call__(self, filename):
result = iter_exact(self.reader, filename, '\t')
@ -92,6 +93,7 @@ def make_best_resolver(param):
if url_parts.scheme == 'file':
path = url_parts.path
path = urllib.url2pathname(path)
if os.path.isfile(path):
logging.debug('Adding Path Index: ' + path)

View File

@ -160,7 +160,7 @@ TEST_CDX_DIR = get_test_dir() + 'cdx/'
TEST_WARC_DIR = get_test_dir() + 'warcs/'
def read_fully(cdx):
with open(TEST_CDX_DIR + cdx) as fh:
with open(TEST_CDX_DIR + cdx, 'rb') as fh:
curr = BytesIO()
while True:
b = fh.read()
@ -172,7 +172,7 @@ def read_fully(cdx):
def cdx_index(warc, **options):
buff = BytesIO()
with open(TEST_WARC_DIR + warc) as fh:
with open(TEST_WARC_DIR + warc, 'rb') as fh:
write_cdx_index(buff, fh, warc, **options)
return buff.getvalue()
@ -213,7 +213,7 @@ def cli_lines_with_dir(input_):
print filename
with open(os.path.join(tmp_dir, filename), 'r') as fh:
with open(os.path.join(tmp_dir, filename), 'rb') as fh:
lines = fh.read(8192).rstrip().split('\n')
finally:

View File

@ -33,13 +33,13 @@ PrefixResolver('http://myhost.example.com/warcs/', contains = '/')
RedisResolver('redis://myhost.example.com:1234/1')
# a file
>>> r = make_best_resolver('file://' + os.path.realpath(__file__))
>>> r = make_best_resolver(to_file_url(os.path.realpath(__file__)))
>>> r.__class__.__name__
'PathIndexResolver'
# a dir
>>> path = os.path.realpath(__file__)
>>> r = make_best_resolver('file://' + os.path.dirname(path))
>>> r = make_best_resolver(to_file_url(os.path.dirname(path)))
>>> r.__class__.__name__
'PrefixResolver'
@ -54,8 +54,9 @@ RedisResolver('redis://myhost.example.com:1234/1')
from pywb import get_test_dir
from pywb.warc.pathresolvers import PrefixResolver, PathIndexResolver, RedisResolver
from pywb.warc.pathresolvers import make_best_resolver, make_best_resolvers
import os
from pywb.utils.loaders import to_file_url
import os
from fakeredis import FakeStrictRedis
from mock import patch
@ -68,7 +69,6 @@ def init_redis_resolver():
def hset_path(filename, path):
redis_resolver.redis.hset(redis_resolver.key_prefix + filename, 'path', path)
redis_resolver = init_redis_resolver()
#=================================================================

View File

@ -65,7 +65,7 @@ class RangeCache(object):
maxlen = min(maxlen, end - start + 1)
def read_range():
with open(spec['name']) as fh:
with open(spec['name'], 'rb') as fh:
fh.seek(start)
fh = LimitReader.wrap_stream(fh, maxlen)
while True:

View File

@ -0,0 +1,20 @@
"""
>>> format_ts('20141226101000')
'Fri, Dec 26 2014 10:10:00'
>>> format_ts('20141226101000', '%s')
1419588600
>>> is_wb_handler(DebugEchoHandler())
False
"""
from pywb.webapp.views import format_ts, is_wb_handler
from pywb.webapp.handlers import DebugEchoHandler
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,4 +1,4 @@
from pywb.utils.timeutils import timestamp_to_datetime
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT
@ -22,11 +22,7 @@ class template_filter(object):
Otherwise, the func name is the filter name
"""
def __init__(self, param=None):
if hasattr(param, '__call__'):
self.name = None
self.__call__(param)
else:
self.name = param
self.name = param
def __call__(self, func):
name = self.name
@ -39,10 +35,13 @@ class template_filter(object):
#=================================================================
# Filters
@template_filter
@template_filter()
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
value = timestamp_to_datetime(value)
return value.strftime(format_)
if format_ == '%s':
return timestamp_to_sec(value)
else:
value = timestamp_to_datetime(value)
return value.strftime(format_)
@template_filter('urlsplit')
@ -51,17 +50,11 @@ def get_urlsplit(url):
return split
@template_filter()
def request_hostname(env):
return env.get('HTTP_HOST', 'localhost')
@template_filter()
def is_wb_handler(obj):
if not hasattr(obj, 'handler'):
return False
#return isinstance(obj.handler, WBHandler)
return obj.handler.__class__.__name__ == "WBHandler"

View File

@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.7.2',
version='0.7.5',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',
@ -58,10 +58,10 @@ setup(
'pywb': ['static/flowplayer/*', 'static/*.*', 'ui/*', '*.yaml'],
},
data_files=[
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content/',
('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content',
glob.glob('sample_archive/text_content/*')),
],
install_requires=[
@ -90,7 +90,7 @@ setup(
live-rewrite-server = pywb.apps.live_rewrite_server:main
proxy-cert-auth = pywb.framework.certauth:main
""",
zip_safe=False,
zip_safe=True,
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Web Environment',