1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'develop' for 0.6.4

This commit is contained in:
Ilya Kreymer 2014-11-06 00:34:32 -08:00
commit 71a8abe9c3
24 changed files with 171 additions and 64 deletions

View File

@ -1,3 +1,19 @@
pywb 0.6.4 changelist
~~~~~~~~~~~~~~~~~~~~~
* Ignore bad multiline headers in warc.
* Rewrite fix: Don't parse html entities in HTML rewriter.
* Ensure cdx iterator closed when reading.
* Rewrite fix: remove pywb prefix from any query params.
* Rewrite fix: better JS rewriting, avoid // comments when matching protocol-relative urls.
* WARC metadata and resource records included in cdx from cdx-indexer by default.
pywb 0.6.3 changelist
~~~~~~~~~~~~~~~~~~~~~

View File

@ -1,10 +1,10 @@
PyWb 0.6.3
PyWb 0.6.4
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
:target: https://travis-ci.org/ikreymer/pywb
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master
:target: https://coveralls.io/r/ikreymer/pywb?branch=master
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
:target: https://coveralls.io/r/ikreymer/pywb?branch=develop
.. image:: https://img.shields.io/gratipay/ikreymer.svg
:target: https://www.gratipay.com/ikreymer/

View File

@ -28,8 +28,17 @@ class CDXFile(CDXSource):
self.filename = filename
def load_cdx(self, query):
    """Lazily yield the cdx lines of ``self.filename`` selected by ``query``.

    The file is not opened until the returned generator is first advanced.
    It is closed again when iteration finishes, when an error propagates
    out of the loop, or when the generator is closed early.

    :param query: object providing ``key`` and ``end_key``, which are
                  passed to ``iter_range`` as the range bounds
    :return: generator over the lines produced by ``iter_range``
    :raises IOError: from ``open()`` on first iteration if the file
                     cannot be opened
    """
    def do_open():
        # ``with`` only closes a handle that was actually opened. A
        # try/finally calling ``source.close()`` would hit an unbound
        # ``source`` when open() itself fails and hide the real IOError.
        with open(self.filename) as source:
            for line in iter_range(source, query.key, query.end_key):
                yield line

    return do_open()
def __str__(self):
return 'CDX File - ' + self.filename

View File

@ -78,6 +78,8 @@ class WbRequest(object):
rel_prefix,
env.get('SCRIPT_NAME', '/'),
cookie_scope)
self.urlrewriter.deprefix_url()
else:
# no wb_url, just store blank wb_url
self.wb_url = None

View File

@ -136,9 +136,9 @@ class WSGIApp(object):
err_details = None
if error_view:
if err_url:
if err_url and isinstance(err_url, str):
err_url = err_url.decode('utf-8', 'ignore')
if err_msg:
if err_msg and isinstance(err_msg, str):
err_msg = err_msg.decode('utf-8', 'ignore')
return error_view.render_response(exc_type=type(exc).__name__,

View File

@ -263,10 +263,20 @@ class HTMLRewriterMixin(object):
#=================================================================
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
PARSETAG = re.compile('[<]')
def __init__(self, *args, **kwargs):
HTMLParser.__init__(self)
super(HTMLRewriter, self).__init__(*args, **kwargs)
def reset(self):
HTMLParser.reset(self)
self.interesting = self.PARSETAG
def clear_cdata_mode(self):
HTMLParser.clear_cdata_mode(self)
self.interesting = self.PARSETAG
def feed(self, string):
try:
HTMLParser.feed(self, string)
@ -311,11 +321,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def handle_data(self, data):
self.parse_data(data)
def handle_entityref(self, data):
self.out.write('&' + data + ';')
def handle_charref(self, data):
self.out.write('&#' + data + ';')
# overriding regex so that these are no longer called
#def handle_entityref(self, data):
# self.out.write('&' + data + ';')
#
#def handle_charref(self, data):
# self.out.write('&#' + data + ';')
def handle_comment(self, data):
self.out.write('<!--')

View File

@ -111,7 +111,8 @@ class JSLinkOnlyRewriter(RegexRewriter):
JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string
"""
JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
#JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])'
JS_HTTPX = r'(?<=["\';])(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.\-/\\?&#]+(?=["\';&\\])'
def __init__(self, rewriter, rules=[]):
rules = rules + [

View File

@ -28,8 +28,11 @@ ur"""
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
# HTML Entities
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
<a href="">&rsaquo; &nbsp; &#62;</div>
>>> parse('<a href="">&rsaquo; &nbsp; &#62; &#63</div>')
<a href="">&rsaquo; &nbsp; &#62; &#63</div>
>>> parse('<div>X&Y</div> </div>X&Y;</div>')
<div>X&Y</div> </div>X&Y;</div>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')

View File

@ -61,6 +61,9 @@ r"""
>>> _test_js('&quot;http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;')
'&quot;/web/20131010/http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;'
>>> _test_js('"http:\/\/sub-site.example.com\/path-dashes\/path_other\/foo_bar.txt"')
'"/web/20131010/http:\\/\\/sub-site.example.com\\/path-dashes\\/path_other\\/foo_bar.txt"'
#=================================================================
# XML Rewriting

View File

@ -105,10 +105,10 @@ def test_example_1():
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
def test_example_2_redirect():
status_headers, buff = get_rewritten('http://facebook.com/', urlrewriter)
status_headers, buff = get_rewritten('http://httpbin.org/redirect-to?url=http://example.com/', urlrewriter)
# redirect, no content
assert status_headers.get_statuscode() == '301'
assert status_headers.get_statuscode() == '302'
assert len(buff) == 0

View File

@ -74,6 +74,18 @@
>>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_new_url(timestamp='20131024')
'/123/20131024id_/http://example.com/file/path/blah.html'
# deprefix tests
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/20141226/http://example.com/', '/pywb/', 'http://localhost:8080/pywb/')
'http://example.com/file/path/blah.html?param=http://example.com/'
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/if_/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
>>> do_deprefix('http://example.com/file.html?param=http://localhost:8080/pywb/https%3A//example.com/filename.html&other=value&a=b&param2=http://localhost:8080/pywb/http://test.example.com', '/pywb/', 'http://localhost:8080/pywb/')
'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b&param2=http://test.example.com'
# HttpsUrlRewriter tests
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
@ -86,13 +98,22 @@
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
import urllib
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
    """Doctest helper: rewrite ``rel_url`` with a freshly built UrlRewriter.

    :param rel_url: url handed to ``UrlRewriter.rewrite``
    :param base_url: first argument to the ``UrlRewriter`` constructor
    :param prefix: second argument to the ``UrlRewriter`` constructor
    :param mod: optional modifier forwarded to ``rewrite``
    :param full_prefix: optional ``full_prefix`` keyword for the constructor
    :return: whatever ``UrlRewriter.rewrite`` returns
    """
    return UrlRewriter(base_url, prefix,
                       full_prefix=full_prefix).rewrite(rel_url, mod)
def do_deprefix(url, rel_prefix, full_prefix):
    """Doctest helper: run ``UrlRewriter.deprefix_url`` on ``url``.

    Every literal occurrence of ``full_prefix`` in ``url`` is first
    url-encoded with ``quote_plus``; the rewriter's result is decoded with
    ``unquote_plus`` before being returned.

    :param url: url that may contain ``full_prefix`` in its text
    :param rel_prefix: prefix argument for the ``UrlRewriter`` constructor
    :param full_prefix: full prefix, both encoded into ``url`` and passed
                        to the ``UrlRewriter`` constructor
    :return: the decoded result of ``deprefix_url``
    """
    # encode only the prefix occurrences; the rest of the url is untouched
    quoted_prefix = urllib.quote_plus(full_prefix)
    prefixed_url = url.replace(full_prefix, quoted_prefix)

    stripped = UrlRewriter(prefixed_url, rel_prefix,
                           full_prefix).deprefix_url()

    # decode so the doctest output reads as a plain url
    return urllib.unquote_plus(stripped)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,4 +1,3 @@
import copy
import urlparse
from wburl import WbUrl
@ -88,6 +87,9 @@ class UrlRewriter(object):
cls = get_cookie_rewriter(scope)
return cls(self)
def deprefix_url(self):
return self.wburl.deprefix_url(self.full_prefix)
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@ -150,3 +152,6 @@ class HttpsUrlRewriter(UrlRewriter):
def get_cookie_rewriter(self, scope=None):
return None
def deprefix_url(self):
return self.wburl.url

View File

@ -39,7 +39,7 @@ wayback url format.
"""
import re
import urllib
#=================================================================
class BaseWbUrl(object):
@ -149,6 +149,14 @@ class WbUrl(BaseWbUrl):
self.timestamp = timestamp
self.type = self.REPLAY
def deprefix_url(self, prefix):
    """Remove the url-encoded ``prefix`` from values following ``=``.

    Every ``=<encoded prefix>`` found in ``self.url`` is collapsed to a
    bare ``=``. The match also swallows, when present directly after the
    prefix: a run of digits, a two-character modifier ending in ``_``,
    and a single ``/``. The result is stored back on ``self.url``.

    :param prefix: un-encoded prefix; it is encoded with ``quote_plus``
                   before being searched for
    :return: the updated ``self.url``
    """
    # quote_plus lives in ``urllib`` on Python 2, ``urllib.parse`` on 3
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus

    # Raw string keeps ``\w`` a regex escape rather than a string escape.
    # The groups are non-capturing because the replacement never refers
    # to them.
    rex_query = '=' + re.escape(quote_plus(prefix)) + r'[0-9]*(?:\w{2}_)?/?'

    self.url = re.sub(rex_query, '=', self.url)
    return self.url
# Str Representation
# ====================
def to_str(self, **overrides):

View File

@ -11,18 +11,18 @@ rules:
# facebook rules
#=================================================================
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
- url_prefix: 'com,facebook)/ajax/ufi/'
fuzzy_lookup:
- ft_ent_identifier
- lsd
- url_prefix: 'com,facebook)/ajax/chat/hovercard/sidebar.php'
fuzzy_lookup:
fuzzy_lookup:
- ids[0]
- url_prefix: 'com,facebook)/login.php'
@ -82,20 +82,21 @@ rules:
#=================================================================
- url_prefix: 'com,google,plus)/_/stream/getactivities'
fuzzy_lookup: '(egk[^"]+).*(f.sid=[^&]+)'
# fuzzy_lookup: '(egk[^"]+)?.*(f.sid=[^&]+)'
fuzzy_lookup: 'f.req=.*\]\]\]\,\"([^"]+).*(f.sid=[^&]+)'
- url_prefix: 'com,google,plus)/_/stream/squarestream'
fuzzy_lookup: '(cai[^"]+).*(f.sid=[^&]+)'
- url_prefix: 'com,google,plus)/_/communities/rt/landing'
fuzzy_lookup: 'com,google,plus\)/_/.*?.*\,(\d{13}\])&.*(f.sid=[^&]+).*'
- url_prefix: 'com,google,plus)/_/'
fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'

View File

@ -708,11 +708,11 @@ WB_wombat_init = (function() {
}
//============================================
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp) {
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp, mod) {
wb_replay_prefix = replay_prefix;
if (wb_replay_prefix) {
wb_replay_date_prefix = replay_prefix + capture_date + "/";
wb_replay_date_prefix = replay_prefix + capture_date + mod + "/";
if (capture_date.length > 0) {
wb_capture_date_part = "/" + capture_date + "/";

View File

@ -7,7 +7,8 @@
"{{ cdx['timestamp'] if include_ts else ''}}",
"{{ urlsplit.scheme }}",
"{{ urlsplit.netloc }}",
"{{ cdx.timestamp | format_ts('%s') }}");
"{{ cdx.timestamp | format_ts('%s') }}",
"{{ wbrequest.wb_url.mod }}");
</script>
{% endif %}
<script>

View File

@ -169,7 +169,8 @@ class StatusAndHeadersParser(object):
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
value += next_line
if value is not None:
value += next_line
next_line, total_read = _strip_count(stream.readline(),
total_read)

View File

@ -32,6 +32,10 @@ False
# empty
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_3))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
"""
@ -54,6 +58,14 @@ status_headers_2 = """
"""
status_headers_3 = "\
HTTP/1.0 204 Empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"
if __name__ == "__main__":
import doctest

View File

@ -32,12 +32,11 @@ class ArchiveIterator(object):
self.member_info = None
def iter_records(self):
def iter_records(self, block_size=16384):
""" iterate over each record
"""
decomp_type = 'gzip'
block_size = 16384
self.reader = DecompressingBufferedReader(self.fh,
block_size=block_size)
@ -168,6 +167,8 @@ class ArchiveIndexEntry(object):
self.status = status_headers.get_statuscode()
if not self.status:
self.status = '-'
if self.status == '204' and 'Error' in status_headers.statusline:
self.status = '-'
def set_rec_info(self, offset, length, digest):
self.offset = str(offset)
@ -202,8 +203,9 @@ class ArchiveIndexEntry(object):
def create_record_iter(arcv_iter, options):
append_post = options.get('append_post')
include_all = options.get('include_all')
block_size = options.get('block_size', 16384)
for record in arcv_iter.iter_records():
for record in arcv_iter.iter_records(block_size):
entry = None
if not include_all and (record.status_headers.get_statuscode() == '-'):
@ -314,11 +316,11 @@ def parse_warc_record(record):
get_header('Content-Type'),
def_mime)
# status
if record.rec_type in ('request', 'revisit'):
entry.status = '-'
else:
# status -- only for response records (by convention):
if record.rec_type == 'response':
entry.extract_status(record.status_headers)
else:
entry.status = '-'
# digest
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')

View File

@ -36,7 +36,7 @@ class ArchiveLoadFailed(WbException):
#=================================================================
class ArcWarcRecordLoader:
# Standard ARC v1.0 headers
# TODO: support ARV v2.0 also?
# TODO: support ARC v2.0 also?
ARC_HEADERS = ["uri", "ip-address", "archive-date",
"content-type", "length"]
@ -128,9 +128,14 @@ class ArcWarcRecordLoader:
# limit stream to the length for all valid records
stream = LimitReader.wrap_stream(stream, length)
# if empty record (error or otherwise) set status to -
# if empty record (error or otherwise) set status to 204
if length == 0:
status_headers = StatusAndHeaders('- None', [])
if is_err:
msg = '204 Possible Error'
else:
msg = '204 No Content'
status_headers = StatusAndHeaders(msg, [])
# response record or non-empty revisit: parse HTTP status and headers!
elif (rec_type in ('response', 'revisit') and
@ -144,8 +149,10 @@ class ArcWarcRecordLoader:
# everything else: create a no-status entry, set content-type
else:
content_type_header = [('Content-Type', content_type)]
status_headers = StatusAndHeaders('- OK', content_type_header)
content_type_header = [('Content-Type', content_type),
('Content-Length', length)]
status_headers = StatusAndHeaders('200 OK', content_type_header)
return ArcWarcRecord(the_format, rec_type,
rec_headers, stream, status_headers,

View File

@ -43,12 +43,16 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
# wget warc, just responses
# wget warc, includes metadata by default
>>> print_cdx_index('example-wget-1-14.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# wget warc include all w/ metadata
# wget warc, includes metadata and request
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
@ -110,32 +114,32 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
200
Total: 206
# test sort, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
398
Total: 398
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
4
Total: 4
# test writing to stdout ('-' omitted)
>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
4
Total: 4
# test writing to temp dir, also use unicode filename
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
example.cdx
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
4
Total: 4
"""
from pywb import get_test_dir
@ -191,9 +195,9 @@ def cli_lines(cmds):
lines = buff.getvalue().rstrip().split('\n')
# print first, last, num lines
print (lines[1])
print (lines[-1])
print len(lines)
print(lines[1])
print(lines[-1])
print('Total: ' + str(len(lines)))
def cli_lines_with_dir(input_):
try:
@ -224,6 +228,6 @@ def cli_lines_with_dir(input_):
# print first, last, num lines
print (lines[1])
print (lines[-1])
print len(lines)
print('Total: ' + str(len(lines)))

View File

@ -74,8 +74,8 @@ class SearchPageWbUrlHandler(WbUrlHandler):
return self.handle_request(wbrequest)
def get_top_frame_params(self, wbrequest):
embed_url = wbrequest.wb_url.to_str(mod='')
def get_top_frame_params(self, wbrequest, mod=''):
embed_url = wbrequest.wb_url.to_str(mod=mod)
if wbrequest.wb_url.timestamp:
timestamp = wbrequest.wb_url.timestamp

View File

@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.6.3',
version='0.6.4',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',

View File

@ -14,8 +14,8 @@ class TestLiveRewriter:
assert resp.status_int == 200
def test_live_rewrite_redirect_2(self):
resp = self.testapp.get('/rewrite/http://facebook.com/')
assert resp.status_int == 301
resp = self.testapp.get('/rewrite/http://httpbin.org/redirect-to?url=http://example.com/')
assert resp.status_int == 302
def test_live_rewrite_post(self):
resp = self.testapp.post('/rewrite/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})