mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'develop' for 0.6.4
This commit is contained in:
commit
71a8abe9c3
16
CHANGES.rst
16
CHANGES.rst
@ -1,3 +1,19 @@
|
||||
pywb 0.6.4 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Ignore bad multiline headers in warc.
|
||||
|
||||
* Rewrite fix: Don't parse html entities in HTML rewriter.
|
||||
|
||||
* Ensure cdx iterator closed when reading.
|
||||
|
||||
* Rewrite fix: remove pywb prefix from any query params.
|
||||
|
||||
* Rewrite fix: better JS rewriting, avoid // comments when matching protocol-relative urls.
|
||||
|
||||
* WARC metadata and resource records are included in the cdx from cdx-indexer by default.
|
||||
|
||||
|
||||
pywb 0.6.3 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -1,10 +1,10 @@
|
||||
PyWb 0.6.3
|
||||
PyWb 0.6.4
|
||||
==========
|
||||
|
||||
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
|
||||
:target: https://travis-ci.org/ikreymer/pywb
|
||||
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master
|
||||
:target: https://coveralls.io/r/ikreymer/pywb?branch=master
|
||||
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
|
||||
:target: https://coveralls.io/r/ikreymer/pywb?branch=develop
|
||||
.. image:: https://img.shields.io/gratipay/ikreymer.svg
|
||||
:target: https://www.gratipay.com/ikreymer/
|
||||
|
||||
|
@ -28,8 +28,17 @@ class CDXFile(CDXSource):
|
||||
self.filename = filename
|
||||
|
||||
def load_cdx(self, query):
    """Return a lazy iterator over the cdx entries in the query's key range.

    The file named by ``self.filename`` is not opened until iteration
    starts, and it is closed as soon as the returned generator is
    exhausted or closed, so a reader that stops early does not leak the
    file handle.

    :param query: object exposing ``key`` and ``end_key``; both are passed
        through to ``iter_range`` as the range bounds
    :return: generator yielding whatever ``iter_range`` yields
    :raises IOError: on first iteration, if the file cannot be opened
    """
    def do_open():
        # A ``with`` block replaces the try/finally that wrapped open():
        # if open() itself fails there is no handle to close, and the
        # original error must propagate instead of being masked by an
        # unbound-name error raised from the cleanup clause.
        with open(self.filename) as source:
            for line in iter_range(source, query.key, query.end_key):
                yield line

    return do_open()
|
||||
|
||||
def __str__(self):
|
||||
return 'CDX File - ' + self.filename
|
||||
|
@ -78,6 +78,8 @@ class WbRequest(object):
|
||||
rel_prefix,
|
||||
env.get('SCRIPT_NAME', '/'),
|
||||
cookie_scope)
|
||||
|
||||
self.urlrewriter.deprefix_url()
|
||||
else:
|
||||
# no wb_url, just store blank wb_url
|
||||
self.wb_url = None
|
||||
|
@ -136,9 +136,9 @@ class WSGIApp(object):
|
||||
err_details = None
|
||||
|
||||
if error_view:
|
||||
if err_url:
|
||||
if err_url and isinstance(err_url, str):
|
||||
err_url = err_url.decode('utf-8', 'ignore')
|
||||
if err_msg:
|
||||
if err_msg and isinstance(err_msg, str):
|
||||
err_msg = err_msg.decode('utf-8', 'ignore')
|
||||
|
||||
return error_view.render_response(exc_type=type(exc).__name__,
|
||||
|
@ -263,10 +263,20 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
#=================================================================
|
||||
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
PARSETAG = re.compile('[<]')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
HTMLParser.__init__(self)
|
||||
super(HTMLRewriter, self).__init__(*args, **kwargs)
|
||||
|
||||
def reset(self):
|
||||
HTMLParser.reset(self)
|
||||
self.interesting = self.PARSETAG
|
||||
|
||||
def clear_cdata_mode(self):
|
||||
HTMLParser.clear_cdata_mode(self)
|
||||
self.interesting = self.PARSETAG
|
||||
|
||||
def feed(self, string):
|
||||
try:
|
||||
HTMLParser.feed(self, string)
|
||||
@ -311,11 +321,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
def handle_data(self, data):
|
||||
self.parse_data(data)
|
||||
|
||||
def handle_entityref(self, data):
|
||||
self.out.write('&' + data + ';')
|
||||
|
||||
def handle_charref(self, data):
|
||||
self.out.write('&#' + data + ';')
|
||||
# overriding regex so that these are no longer called
|
||||
#def handle_entityref(self, data):
|
||||
# self.out.write('&' + data + ';')
|
||||
#
|
||||
#def handle_charref(self, data):
|
||||
# self.out.write('&#' + data + ';')
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.out.write('<!--')
|
||||
|
@ -111,7 +111,8 @@ class JSLinkOnlyRewriter(RegexRewriter):
|
||||
JS Rewriter which rewrites absolute http://, https:// and // urls
|
||||
at the beginning of a string
|
||||
"""
|
||||
JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
|
||||
#JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])'
|
||||
JS_HTTPX = r'(?<=["\';])(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.\-/\\?&#]+(?=["\';&\\])'
|
||||
|
||||
def __init__(self, rewriter, rules=[]):
|
||||
rules = rules + [
|
||||
|
@ -28,8 +28,11 @@ ur"""
|
||||
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||
|
||||
# HTML Entities
|
||||
>>> parse('<a href="">› ></div>')
|
||||
<a href="">› ></div>
|
||||
>>> parse('<a href="">› > ?</div>')
|
||||
<a href="">› > ?</div>
|
||||
|
||||
>>> parse('<div>X&Y</div> </div>X&Y;</div>')
|
||||
<div>X&Y</div> </div>X&Y;</div>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
|
@ -61,6 +61,9 @@ r"""
|
||||
>>> _test_js('"http:\\/\\/www.example.com\\/some\\/path\\/?query=1"')
|
||||
'"/web/20131010/http:\\/\\/www.example.com\\/some\\/path\\/?query=1"'
|
||||
|
||||
>>> _test_js('"http:\/\/sub-site.example.com\/path-dashes\/path_other\/foo_bar.txt"')
|
||||
'"/web/20131010/http:\\/\\/sub-site.example.com\\/path-dashes\\/path_other\\/foo_bar.txt"'
|
||||
|
||||
|
||||
#=================================================================
|
||||
# XML Rewriting
|
||||
|
@ -105,10 +105,10 @@ def test_example_1():
|
||||
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
|
||||
|
||||
def test_example_2_redirect():
|
||||
status_headers, buff = get_rewritten('http://facebook.com/', urlrewriter)
|
||||
status_headers, buff = get_rewritten('http://httpbin.org/redirect-to?url=http://example.com/', urlrewriter)
|
||||
|
||||
# redirect, no content
|
||||
assert status_headers.get_statuscode() == '301'
|
||||
assert status_headers.get_statuscode() == '302'
|
||||
assert len(buff) == 0
|
||||
|
||||
|
||||
|
@ -74,6 +74,18 @@
|
||||
>>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_new_url(timestamp='20131024')
|
||||
'/123/20131024id_/http://example.com/file/path/blah.html'
|
||||
|
||||
# deprefix tests
|
||||
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/20141226/http://example.com/', '/pywb/', 'http://localhost:8080/pywb/')
|
||||
'http://example.com/file/path/blah.html?param=http://example.com/'
|
||||
|
||||
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/if_/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
|
||||
'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
|
||||
|
||||
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
|
||||
'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
|
||||
|
||||
>>> do_deprefix('http://example.com/file.html?param=http://localhost:8080/pywb/https%3A//example.com/filename.html&other=value&a=b¶m2=http://localhost:8080/pywb/http://test.example.com', '/pywb/', 'http://localhost:8080/pywb/')
|
||||
'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b¶m2=http://test.example.com'
|
||||
|
||||
# HttpsUrlRewriter tests
|
||||
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
|
||||
@ -86,13 +98,22 @@
|
||||
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
|
||||
|
||||
import urllib
|
||||
|
||||
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
|
||||
rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix)
|
||||
return rewriter.rewrite(rel_url, mod)
|
||||
|
||||
|
||||
def do_deprefix(url, rel_prefix, full_prefix):
    """Doctest helper: strip ``full_prefix`` from the query params of ``url``.

    Every occurrence of ``full_prefix`` in ``url`` is first replaced by its
    ``quote_plus``-encoded form, the result is run through
    ``UrlRewriter.deprefix_url()``, and the outcome is decoded again with
    ``unquote_plus`` so the doctests can compare plain, readable urls.
    """
    quoted_prefix = urllib.quote_plus(full_prefix)
    quoted_url = url.replace(full_prefix, quoted_prefix)

    deprefixed = UrlRewriter(quoted_url, rel_prefix, full_prefix).deprefix_url()
    return urllib.unquote_plus(deprefixed)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -1,4 +1,3 @@
|
||||
import copy
|
||||
import urlparse
|
||||
|
||||
from wburl import WbUrl
|
||||
@ -88,6 +87,9 @@ class UrlRewriter(object):
|
||||
cls = get_cookie_rewriter(scope)
|
||||
return cls(self)
|
||||
|
||||
def deprefix_url(self):
|
||||
return self.wburl.deprefix_url(self.full_prefix)
|
||||
|
||||
def __repr__(self):
|
||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||
|
||||
@ -150,3 +152,6 @@ class HttpsUrlRewriter(UrlRewriter):
|
||||
|
||||
def get_cookie_rewriter(self, scope=None):
|
||||
return None
|
||||
|
||||
def deprefix_url(self):
|
||||
return self.wburl.url
|
||||
|
@ -39,7 +39,7 @@ wayback url format.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import urllib
|
||||
|
||||
#=================================================================
|
||||
class BaseWbUrl(object):
|
||||
@ -149,6 +149,14 @@ class WbUrl(BaseWbUrl):
|
||||
self.timestamp = timestamp
|
||||
self.type = self.REPLAY
|
||||
|
||||
|
||||
def deprefix_url(self, prefix):
    """Remove the url-encoded replay ``prefix`` from query param values.

    A query value of the form ``=<quoted prefix>[digits][xx_][/]`` is
    collapsed to ``=``, so a param that points back into this archive
    (optionally with a timestamp and a two-character modifier such as
    ``if_``) is restored to the url that followed the prefix.

    ``self.url`` is updated in place and also returned.

    :param prefix: the unencoded prefix to strip; it is ``quote_plus``
        encoded here before matching
    :return: the updated ``self.url``
    """
    # quote_plus lives in ``urllib`` on Python 2 and ``urllib.parse`` on
    # Python 3; resolve it locally so this method works on either.
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus

    quoted_prefix = quote_plus(prefix)

    # Raw string for the regex tail: ``\w`` in a plain literal is an
    # invalid escape sequence on current Python versions.
    rex_query = '=' + re.escape(quoted_prefix) + r'([0-9])*([\w]{2}_)?/?'

    self.url = re.sub(rex_query, '=', self.url)
    return self.url
|
||||
|
||||
# Str Representation
|
||||
# ====================
|
||||
def to_str(self, **overrides):
|
||||
|
@ -11,18 +11,18 @@ rules:
|
||||
# facebook rules
|
||||
#=================================================================
|
||||
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'
|
||||
|
||||
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
|
||||
|
||||
|
||||
- url_prefix: 'com,facebook)/ajax/ufi/'
|
||||
|
||||
|
||||
fuzzy_lookup:
|
||||
- ft_ent_identifier
|
||||
- lsd
|
||||
|
||||
- url_prefix: 'com,facebook)/ajax/chat/hovercard/sidebar.php'
|
||||
|
||||
fuzzy_lookup:
|
||||
fuzzy_lookup:
|
||||
- ids[0]
|
||||
|
||||
- url_prefix: 'com,facebook)/login.php'
|
||||
@ -82,20 +82,21 @@ rules:
|
||||
#=================================================================
|
||||
|
||||
- url_prefix: 'com,google,plus)/_/stream/getactivities'
|
||||
|
||||
fuzzy_lookup: '(egk[^"]+).*(f.sid=[^&]+)'
|
||||
|
||||
|
||||
# fuzzy_lookup: '(egk[^"]+)?.*(f.sid=[^&]+)'
|
||||
fuzzy_lookup: 'f.req=.*\]\]\]\,\"([^"]+).*(f.sid=[^&]+)'
|
||||
|
||||
- url_prefix: 'com,google,plus)/_/stream/squarestream'
|
||||
|
||||
|
||||
fuzzy_lookup: '(cai[^"]+).*(f.sid=[^&]+)'
|
||||
|
||||
|
||||
- url_prefix: 'com,google,plus)/_/communities/rt/landing'
|
||||
|
||||
|
||||
fuzzy_lookup: 'com,google,plus\)/_/.*?.*\,(\d{13}\])&.*(f.sid=[^&]+).*'
|
||||
|
||||
|
||||
|
||||
- url_prefix: 'com,google,plus)/_/'
|
||||
|
||||
|
||||
fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'
|
||||
|
||||
|
||||
|
@ -708,11 +708,11 @@ WB_wombat_init = (function() {
|
||||
}
|
||||
|
||||
//============================================
|
||||
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp) {
|
||||
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp, mod) {
|
||||
wb_replay_prefix = replay_prefix;
|
||||
|
||||
if (wb_replay_prefix) {
|
||||
wb_replay_date_prefix = replay_prefix + capture_date + "/";
|
||||
wb_replay_date_prefix = replay_prefix + capture_date + mod + "/";
|
||||
|
||||
if (capture_date.length > 0) {
|
||||
wb_capture_date_part = "/" + capture_date + "/";
|
||||
|
@ -7,7 +7,8 @@
|
||||
"{{ cdx['timestamp'] if include_ts else ''}}",
|
||||
"{{ urlsplit.scheme }}",
|
||||
"{{ urlsplit.netloc }}",
|
||||
"{{ cdx.timestamp | format_ts('%s') }}");
|
||||
"{{ cdx.timestamp | format_ts('%s') }}",
|
||||
"{{ wbrequest.wb_url.mod }}");
|
||||
</script>
|
||||
{% endif %}
|
||||
<script>
|
||||
|
@ -169,7 +169,8 @@ class StatusAndHeadersParser(object):
|
||||
|
||||
# append continuation lines, if any
|
||||
while next_line and next_line.startswith((' ', '\t')):
|
||||
value += next_line
|
||||
if value is not None:
|
||||
value += next_line
|
||||
next_line, total_read = _strip_count(stream.readline(),
|
||||
total_read)
|
||||
|
||||
|
@ -32,6 +32,10 @@ False
|
||||
# empty
|
||||
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
|
||||
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
|
||||
|
||||
|
||||
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_3))
|
||||
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
|
||||
"""
|
||||
|
||||
|
||||
@ -54,6 +58,14 @@ status_headers_2 = """
|
||||
|
||||
"""
|
||||
|
||||
status_headers_3 = "\
|
||||
HTTP/1.0 204 Empty\r\n\
|
||||
Content-Type: Value\r\n\
|
||||
%Invalid%\r\n\
|
||||
\tMultiline\r\n\
|
||||
Content-Length: 0\r\n\
|
||||
\r\n"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
@ -32,12 +32,11 @@ class ArchiveIterator(object):
|
||||
|
||||
self.member_info = None
|
||||
|
||||
def iter_records(self):
|
||||
def iter_records(self, block_size=16384):
|
||||
""" iterate over each record
|
||||
"""
|
||||
|
||||
decomp_type = 'gzip'
|
||||
block_size = 16384
|
||||
|
||||
self.reader = DecompressingBufferedReader(self.fh,
|
||||
block_size=block_size)
|
||||
@ -168,6 +167,8 @@ class ArchiveIndexEntry(object):
|
||||
self.status = status_headers.get_statuscode()
|
||||
if not self.status:
|
||||
self.status = '-'
|
||||
if self.status == '204' and 'Error' in status_headers.statusline:
|
||||
self.status = '-'
|
||||
|
||||
def set_rec_info(self, offset, length, digest):
|
||||
self.offset = str(offset)
|
||||
@ -202,8 +203,9 @@ class ArchiveIndexEntry(object):
|
||||
def create_record_iter(arcv_iter, options):
|
||||
append_post = options.get('append_post')
|
||||
include_all = options.get('include_all')
|
||||
block_size = options.get('block_size', 16384)
|
||||
|
||||
for record in arcv_iter.iter_records():
|
||||
for record in arcv_iter.iter_records(block_size):
|
||||
entry = None
|
||||
|
||||
if not include_all and (record.status_headers.get_statuscode() == '-'):
|
||||
@ -314,11 +316,11 @@ def parse_warc_record(record):
|
||||
get_header('Content-Type'),
|
||||
def_mime)
|
||||
|
||||
# status
|
||||
if record.rec_type in ('request', 'revisit'):
|
||||
entry.status = '-'
|
||||
else:
|
||||
# status -- only for response records (by convention):
|
||||
if record.rec_type == 'response':
|
||||
entry.extract_status(record.status_headers)
|
||||
else:
|
||||
entry.status = '-'
|
||||
|
||||
# digest
|
||||
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
||||
|
@ -36,7 +36,7 @@ class ArchiveLoadFailed(WbException):
|
||||
#=================================================================
|
||||
class ArcWarcRecordLoader:
|
||||
# Standard ARC v1.0 headers
|
||||
# TODO: support ARV v2.0 also?
|
||||
# TODO: support ARC v2.0 also?
|
||||
ARC_HEADERS = ["uri", "ip-address", "archive-date",
|
||||
"content-type", "length"]
|
||||
|
||||
@ -128,9 +128,14 @@ class ArcWarcRecordLoader:
|
||||
# limit stream to the length for all valid records
|
||||
stream = LimitReader.wrap_stream(stream, length)
|
||||
|
||||
# if empty record (error or otherwise) set status to -
|
||||
# if empty record (error or otherwise) set status to 204
|
||||
if length == 0:
|
||||
status_headers = StatusAndHeaders('- None', [])
|
||||
if is_err:
|
||||
msg = '204 Possible Error'
|
||||
else:
|
||||
msg = '204 No Content'
|
||||
|
||||
status_headers = StatusAndHeaders(msg, [])
|
||||
|
||||
# response record or non-empty revisit: parse HTTP status and headers!
|
||||
elif (rec_type in ('response', 'revisit') and
|
||||
@ -144,8 +149,10 @@ class ArcWarcRecordLoader:
|
||||
|
||||
# everything else: create a no-status entry, set content-type
|
||||
else:
|
||||
content_type_header = [('Content-Type', content_type)]
|
||||
status_headers = StatusAndHeaders('- OK', content_type_header)
|
||||
content_type_header = [('Content-Type', content_type),
|
||||
('Content-Length', length)]
|
||||
|
||||
status_headers = StatusAndHeaders('200 OK', content_type_header)
|
||||
|
||||
return ArcWarcRecord(the_format, rec_type,
|
||||
rec_headers, stream, status_headers,
|
||||
|
@ -43,12 +43,16 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||
|
||||
# wget warc, just responses
|
||||
# wget warc, includes metadata by default
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||
|
||||
# wget warc include all w/ metadata
|
||||
|
||||
# wget warc, includes metadata and request
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
|
||||
@ -110,32 +114,32 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
|
||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||
200
|
||||
Total: 206
|
||||
|
||||
# test sort, multiple inputs, all records + post query
|
||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
||||
398
|
||||
Total: 398
|
||||
|
||||
# test writing to stdout
|
||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
4
|
||||
Total: 4
|
||||
|
||||
# test writing to stdout ('-' omitted)
|
||||
>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
4
|
||||
Total: 4
|
||||
|
||||
# test writing to temp dir, also use unicode filename
|
||||
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
||||
example.cdx
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
4
|
||||
Total: 4
|
||||
"""
|
||||
|
||||
from pywb import get_test_dir
|
||||
@ -191,9 +195,9 @@ def cli_lines(cmds):
|
||||
lines = buff.getvalue().rstrip().split('\n')
|
||||
|
||||
# print first, last, num lines
|
||||
print (lines[1])
|
||||
print (lines[-1])
|
||||
print len(lines)
|
||||
print(lines[1])
|
||||
print(lines[-1])
|
||||
print('Total: ' + str(len(lines)))
|
||||
|
||||
def cli_lines_with_dir(input_):
|
||||
try:
|
||||
@ -224,6 +228,6 @@ def cli_lines_with_dir(input_):
|
||||
# print first, last, num lines
|
||||
print (lines[1])
|
||||
print (lines[-1])
|
||||
print len(lines)
|
||||
print('Total: ' + str(len(lines)))
|
||||
|
||||
|
||||
|
@ -74,8 +74,8 @@ class SearchPageWbUrlHandler(WbUrlHandler):
|
||||
|
||||
return self.handle_request(wbrequest)
|
||||
|
||||
def get_top_frame_params(self, wbrequest):
|
||||
embed_url = wbrequest.wb_url.to_str(mod='')
|
||||
def get_top_frame_params(self, wbrequest, mod=''):
|
||||
embed_url = wbrequest.wb_url.to_str(mod=mod)
|
||||
|
||||
if wbrequest.wb_url.timestamp:
|
||||
timestamp = wbrequest.wb_url.timestamp
|
||||
|
2
setup.py
2
setup.py
@ -34,7 +34,7 @@ class PyTest(TestCommand):
|
||||
|
||||
setup(
|
||||
name='pywb',
|
||||
version='0.6.3',
|
||||
version='0.6.4',
|
||||
url='https://github.com/ikreymer/pywb',
|
||||
author='Ilya Kreymer',
|
||||
author_email='ikreymer@gmail.com',
|
||||
|
@ -14,8 +14,8 @@ class TestLiveRewriter:
|
||||
assert resp.status_int == 200
|
||||
|
||||
def test_live_rewrite_redirect_2(self):
|
||||
resp = self.testapp.get('/rewrite/http://facebook.com/')
|
||||
assert resp.status_int == 301
|
||||
resp = self.testapp.get('/rewrite/http://httpbin.org/redirect-to?url=http://example.com/')
|
||||
assert resp.status_int == 302
|
||||
|
||||
def test_live_rewrite_post(self):
|
||||
resp = self.testapp.post('/rewrite/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
|
||||
|
Loading…
x
Reference in New Issue
Block a user