mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

replay: better POST support via post query append!

record_loader can optionally parse 'request' records
archiveindexer gains an -a flag to write all records ('request' included)
and a -p flag to append the post query to the url key
add post-test.warc.gz and post-test.cdx test fixtures
POST redirects now use 307 so the original method and body are preserved
Ilya Kreymer 2014-06-10 19:21:46 -07:00
parent 028cdaa22e
commit e2349a74e2
17 changed files with 388 additions and 42 deletions
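The core of the change is a simple key scheme: for a form-urlencoded POST, the decoded body is appended to the request URL behind a '?&&&' marker before canonicalization, so the indexed request/response pair and a later replayed POST resolve to the same CDX key. A rough sketch of that transformation, using the two helpers this commit adds to pywb.utils.loaders (the httpbin.org URL and form values are simply those from the new test fixture):

from StringIO import StringIO
from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.utils.canonicalize import canonicalize

body = 'foo=bar&test=abc'

# only application/x-www-form-urlencoded POST bodies are extracted
post_query = extract_post_query('POST',
                                'application/x-www-form-urlencoded',
                                str(len(body)),
                                StringIO(body))

url = append_post_query('http://httpbin.org/post', post_query)
# -> 'http://httpbin.org/post?&&&foo=bar&test=abc'

print canonicalize(url, True)
# -> 'org,httpbin)/post?&&&foo=bar&test=abc'  (the key seen in post-test.cdx below)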

View File

@@ -87,9 +87,9 @@ class FuzzyQuery:
             matched_rule = rule
-            if len(m.groups()) == 1:
-                #filter_.append('~urlkey:' + m.group(1))
-                filter_.append(rule.filter.format(m.group(1)))
+            groups = m.groups()
+            for g in groups:
+                filter_.append(rule.filter.format(g))
             break
@@ -100,6 +100,11 @@
         if matched_rule.replace:
             repl = matched_rule.replace
+        if '/_/stream/squarestream?soc-app' in url and 'jserror' not in url:
+            print 'KEY ', urlkey
+            print 'RULE ', url, vars(matched_rule)
+            print 'FILTERS ', filter_
         inx = url.rfind(repl)
         if inx > 0:
             url = url[:inx + 1]
@@ -148,6 +153,6 @@ class CDXDomainSpecificRule(BaseRule):
         self.replace = unsurt(self.replace)
 if __name__ == "__main__":
     import doctest
     doctest.testmod()

View File

@@ -1,4 +1,5 @@
 from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.utils.loaders import extract_post_query, append_post_query
 import pprint
@@ -45,6 +46,8 @@ class WbRequest(object):
         else:
             self.request_uri = env.get('REL_REQUEST_URI')
+        self.method = self.env.get('REQUEST_METHOD')
         self.coll = coll
         if not host_prefix:
@@ -116,6 +119,22 @@
             wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):]
         return wburl_str
+    def normalize_post_query(self):
+        if self.method != 'POST':
+            return
+        if not self.wb_url:
+            return
+        mime = self.env.get('CONTENT_TYPE')
+        length = self.env.get('CONTENT_LENGTH')
+        stream = self.env['wsgi.input']
+        post_query = extract_post_query('POST', mime, length, stream)
+        if post_query:
+            self.wb_url.url = append_post_query(self.wb_url.url, post_query)
 #=================================================================
 class WbResponse(object):

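On the replay side, normalize_post_query() rewrites wb_url in place before the CDX lookup, so a live POST is looked up under the same appended key the indexer wrote. A minimal sketch of the inputs it reads from the WSGI environ (the dict below is hand-built for illustration; the keys are standard WSGI/CGI names):

from StringIO import StringIO

body = 'foo=bar&test=abc'
env = {
    'REQUEST_METHOD': 'POST',
    'CONTENT_TYPE': 'application/x-www-form-urlencoded',
    'CONTENT_LENGTH': str(len(body)),
    'wsgi.input': StringIO(body),
}

# normalize_post_query() boils down to:
#   post_query = extract_post_query('POST', mime, length, stream)
#   self.wb_url.url = append_post_query(self.wb_url.url, post_query)
# e.g. .../httpbin.org/post  ->  .../httpbin.org/post?&&&foo=bar&test=abc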
View File

@@ -18,7 +18,9 @@ class RewrittenStatusAndHeaders:
 #=================================================================
 class HeaderRewriter:
     REWRITE_TYPES = {
-        'html': ['text/html', 'application/xhtml'],
+        'html': ['text/html',
+                 'application/xhtml',
+                 'application/xhtml+xml'],
         'css': ['text/css'],

View File

@@ -40,6 +40,27 @@ rules:
         replace: '/'
+# google plus rules
+#=================================================================
+  - url_prefix: 'com,google,plus)/_/stream/getactivities'
+    fuzzy_lookup: '(egk[^"]+).*(f.sid=[^&]+)'
+  - url_prefix: 'com,google,plus)/_/stream/squarestream'
+    fuzzy_lookup: '(cai[^"]+).*(f.sid=[^&]+)'
+  - url_prefix: 'com,google,plus)/_/communities/rt/landing'
+    fuzzy_lookup: 'com,google,plus\)/_/.*?.*\,(\d{13}\])&.*(f.sid=[^&]+).*'
+  - url_prefix: 'com,google,plus)/_/'
+    fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'
 # testing rules -- not for valid domain
 #=================================================================
 # this rule block is a non-existent prefix merely for testing

View File

@@ -102,6 +102,6 @@ if (wbinfo.is_frame_mp && wbinfo.canon_url &&
     (window.self == window.top) &&
     window.location.href != wbinfo.canon_url) {
-    console.log('frame');
-    window.location.replace(wbinfo.canon_url);
+    //console.log('frame');
+    //window.location.replace(wbinfo.canon_url);
 }

View File

@@ -111,6 +111,10 @@ WB_wombat_init = (function() {
         if (!url) {
             return url;
         }
+        if (url.indexOf("hypothes.is") > 0) {
+            return url;
+        }
         var urltype_ = (typeof url);

View File

@@ -20,5 +20,8 @@
 </script>
 <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
 <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>
+<script type="text/javascript" src="https://hypothes.is/embed.js"></script>
 <!-- End WB Insert -->

View File

@@ -5,6 +5,7 @@ local and remote access
 import os
 import hmac
+import urllib
 import urllib2
 import time
 import pkg_resources
@@ -24,6 +25,56 @@ def load_yaml_config(config_file):
     return config
+#=================================================================
+def extract_post_query(method, mime, length, stream):
+    """
+    Extract a url-encoded form POST from the stream.
+    If the content type is not application/x-www-form-urlencoded,
+    or the content length is missing, return None
+    """
+    if method.upper() != 'POST':
+        return None
+    if (not mime or
+        not mime.lower().startswith('application/x-www-form-urlencoded')):
+        return None
+    if not length or length == '0':
+        return None
+    try:
+        length = int(length)
+    except ValueError:
+        return None
+    #todo: encoding issues?
+    post_query = ''
+    while length > 0:
+        buff = stream.read(length)
+        length -= len(buff)
+        if not buff:
+            break
+        post_query += buff
+    post_query = urllib.unquote_plus(post_query)
+    return post_query
+#=================================================================
+def append_post_query(url, post_query):
+    if not post_query:
+        return url
+    if '?' not in url:
+        url += '?'
+    url += '&&&' + post_query
+    return url
+#=================================================================
 class BlockLoader(object):
     """

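Two details of the helpers above are worth calling out, shown here as a small illustrative check: anything that is not a form-urlencoded POST yields None, and a URL that already has a query string keeps it, with the form body tacked on after the '&&&' marker (the reordering into the final cdx key happens later, in canonicalize):

from StringIO import StringIO
from pywb.utils.loaders import extract_post_query, append_post_query

# non-form bodies and non-POST methods are ignored
assert extract_post_query('POST', 'application/json', '2', StringIO('{}')) is None
assert extract_post_query('GET', 'application/x-www-form-urlencoded',
                          '7', StringIO('foo=bar')) is None

# an existing query string is preserved; the form body follows the '&&&' marker
assert append_post_query('http://httpbin.org/post?foo=bar', 'data=^') == \
    'http://httpbin.org/post?foo=bar&&&data=^'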
View File

@@ -1,6 +1,7 @@
 from pywb.utils.timeutils import iso_date_to_timestamp
 from pywb.utils.bufferedreaders import DecompressingBufferedReader
 from pywb.utils.canonicalize import canonicalize
+from pywb.utils.loaders import extract_post_query, append_post_query
 from recordloader import ArcWarcRecordLoader
@@ -21,15 +22,27 @@ class ArchiveIndexer(object):
     The indexer will automatically detect format, and decompress
     if necessary
     """
+    # arc/warc record types which are
+    # indexed by default, without 'include_all'
+    DEFAULT_REC_TYPES = ('response', 'revisit', 'metadata', 'resource')
     def __init__(self, fileobj, filename,
-                 out=sys.stdout, sort=False, writer=None, surt_ordered=True):
+                 out=sys.stdout, sort=False, writer=None, surt_ordered=True,
+                 include_all=False, append_post_query=False):
         self.fh = fileobj
         self.filename = filename
-        self.loader = ArcWarcRecordLoader()
+        loader_parse_req = include_all or append_post_query
+        self.loader = ArcWarcRecordLoader(parse_request=loader_parse_req)
         self.offset = 0
         self.known_format = None
         self.surt_ordered = surt_ordered
+        self.include_all = include_all
+        self.append_post_query = append_post_query
         if writer:
             self.writer = writer
         elif sort:
@@ -37,6 +50,12 @@ class ArchiveIndexer(object):
         else:
             self.writer = CDXWriter(out)
+        # todo: refactor this
+        self.writer.indexer = self
+        if append_post_query:
+            self.writer = PostResolveWriter(self.writer, self)
     def make_index(self):
         """ Output a cdx index!
         """
@@ -127,9 +146,23 @@ class ArchiveIndexer(object):
             self._read_to_record_end(reader, record)
             return record
+        post_query = None
+        if record.rec_type == 'request':
+            method = record.status_headers.protocol
+            mime = result[3]
+            len_ = record.status_headers.get_header('Content-Length')
+            post_query = extract_post_query(method,
+                                            mime,
+                                            len_,
+                                            record.stream)
+            # should be 0 if read query string
+            num = self.read_rest(record.stream)
         # generate digest if it doesn't exist and if not a revisit
         # if revisit, then nothing we can do here
-        if result[-1] == '-' and record.rec_type != 'revisit':
+        elif result[-1] == '-' and record.rec_type != 'revisit':
             digester = hashlib.sha1()
             self.read_rest(record.stream, digester)
             result[-1] = base64.b32encode(digester.digest())
@@ -146,7 +179,7 @@
         result.append(str(offset))
         result.append(self.filename)
-        self.writer.write(result)
+        self.writer.write(result, record.rec_type, post_query)
         return record
@@ -154,25 +187,31 @@ class ArchiveIndexer(object):
         """ Parse warc record to be included in index, or
        return none if skipping this type of record
        """
-        if record.rec_type not in ('response', 'revisit',
-                                   'metadata', 'resource'):
+        if (not self.append_post_query and
+            not self.include_record(record.rec_type)):
             return None
         url = record.rec_headers.get_header('WARC-Target-Uri')
+        if not url:
+            return None
         timestamp = record.rec_headers.get_header('WARC-Date')
         timestamp = iso_date_to_timestamp(timestamp)
         digest = record.rec_headers.get_header('WARC-Payload-Digest')
-        status = self._extract_status(record.status_headers)
         if record.rec_type == 'revisit':
             mime = 'warc/revisit'
             status = '-'
+        elif record.rec_type == 'request':
+            mime = record.status_headers.get_header('Content-Type')
+            mime = self._extract_mime(mime, '-')
+            status = '-'
         else:
             mime = record.status_headers.get_header('Content-Type')
             mime = self._extract_mime(mime)
+            status = self._extract_status(record.status_headers)
         if digest and digest.startswith('sha1:'):
             digest = digest[len('sha1:'):]
@@ -225,14 +264,14 @@ class ArchiveIndexer(object):
     MIME_RE = re.compile('[; ]')
-    def _extract_mime(self, mime):
+    def _extract_mime(self, mime, def_mime='unk'):
         """ Utility function to extract mimetype only
        from a full content type, removing charset settings
        """
         if mime:
             mime = self.MIME_RE.split(mime, 1)[0]
         if not mime:
-            mime = 'unk'
+            mime = def_mime
         return mime
     def _extract_status(self, status_headers):
@@ -256,17 +295,27 @@ class ArchiveIndexer(object):
             digester.update(b)
         return num
+    def include_record(self, type_):
+        return self.include_all or (type_ in self.DEFAULT_REC_TYPES)
+    def add_post_query(self, fields, post_query):
+        url = append_post_query(fields[2], post_query)
+        fields[0] = canonicalize(url, self.surt_ordered)
+        return fields
 #=================================================================
 class CDXWriter(object):
     def __init__(self, out):
         self.out = out
+        self.indexer = None
     def start(self):
         self.out.write(' CDX N b a m s k r M S V g\n')
-    def write(self, line):
-        self.out.write(' '.join(line) + '\n')
+    def write(self, line, rec_type, *args):
+        if not self.indexer or self.indexer.include_record(rec_type):
+            self.out.write(' '.join(line) + '\n')
     def end(self):
         pass
@@ -278,14 +327,66 @@ class SortedCDXWriter(CDXWriter):
         super(SortedCDXWriter, self).__init__(out)
         self.sortlist = []
-    def write(self, line):
-        line = ' '.join(line) + '\n'
-        insort(self.sortlist, line)
+    def write(self, line, rec_type, *args):
+        if not self.indexer or self.indexer.include_record(rec_type):
+            line = ' '.join(line) + '\n'
+            insort(self.sortlist, line)
     def end(self):
         self.out.write(''.join(self.sortlist))
+#=================================================================
+class PostResolveWriter(CDXWriter):
+    def __init__(self, writer, indexer):
+        self.writer = writer
+        self.indexer = indexer
+        self.prev_line = None
+        self.prev_post_query = None
+        self.prev_type = None
+    def start(self):
+        self.writer.start()
+    def write(self, line, rec_type, post_query):
+        if not self.prev_line:
+            self.prev_line = line
+            self.prev_post_query = post_query
+            self.prev_type = rec_type
+            return
+        #cdx original field
+        if self.prev_line[2] != line[2]:
+            self.writer.write(self.prev_line, self.prev_type)
+            self.prev_line = line
+            self.prev_post_query = post_query
+            return
+        if self.prev_post_query or post_query:
+            if self.prev_post_query:
+                self.indexer.add_post_query(line, self.prev_post_query)
+            else:
+                self.indexer.add_post_query(line, post_query)
+            # update prev url key too
+            self.prev_line[0] = line[0]
+        # write both lines
+        self.writer.write(self.prev_line, self.prev_type)
+        self.writer.write(line, rec_type)
+        # flush any cached lines
+        self.prev_line = None
+        self.prev_post_query = None
+        self.prev_type = None
+    def end(self):
+        if self.prev_line:
+            self.writer.write(self.prev_line, self.prev_type)
+        self.writer.end()
 #=================================================================
 class MultiFileMixin(object):
     def start_all(self):
@@ -323,7 +424,8 @@ def iter_file_or_dir(inputs):
             yield os.path.join(input_, filename), filename
-def index_to_file(inputs, output, sort, surt_ordered):
+def index_to_file(inputs, output, sort,
+                  surt_ordered, include_all, append_post_query):
     if output == '-':
         outfile = sys.stdout
     else:
@@ -343,7 +445,9 @@ def index_to_file(inputs, output, sort, surt_ordered):
                 ArchiveIndexer(fileobj=infile,
                                filename=filename,
                                writer=writer,
-                               surt_ordered=surt_ordered).make_index()
+                               surt_ordered=surt_ordered,
+                               append_post_query=append_post_query,
+                               include_all=include_all).make_index()
     finally:
         writer.end_all()
         if infile:
@@ -363,7 +467,8 @@ def cdx_filename(filename):
     return remove_ext(filename) + '.cdx'
-def index_to_dir(inputs, output, sort, surt_ordered):
+def index_to_dir(inputs, output, sort,
+                 surt_ordered, include_all, append_post_query):
     for fullpath, filename in iter_file_or_dir(inputs):
         outpath = cdx_filename(filename)
@@ -375,7 +480,9 @@ def index_to_dir(inputs, output, sort, surt_ordered):
                        filename=filename,
                        sort=sort,
                        out=outfile,
-                       surt_ordered=surt_ordered).make_index()
+                       surt_ordered=surt_ordered,
+                       append_post_query=append_post_query,
+                       include_all=include_all).make_index()
 def main(args=None):
@@ -418,6 +525,13 @@ Not-recommended for new cdx, use only for backwards-compatibility.
 - If directory, all archive files from that directory are read
 """
+    allrecords_help = """include all records.
+currently includes the 'request' records in addition to all
+response records"""
+    post_append_help = """for POST requests, append
+form query to url key. (Only applies to form url encoded posts)"""
     parser = ArgumentParser(description=description,
                             epilog=epilog,
                             formatter_class=RawTextHelpFormatter)
@@ -426,18 +540,28 @@ Not-recommended for new cdx, use only for backwards-compatibility.
                         action='store_true',
                         help=sort_help)
+    parser.add_argument('-a', '--allrecords',
+                        action='store_true',
+                        help=allrecords_help)
+    parser.add_argument('-p', '--postappend',
+                        action='store_true',
+                        help=post_append_help)
     parser.add_argument('-u', '--unsurt',
                         action='store_true',
                         help=unsurt_help)
-    parser.add_argument('output', help=output_help)
+    parser.add_argument('output', nargs='?', default='-', help=output_help)
     parser.add_argument('inputs', nargs='+', help=input_help)
     cmd = parser.parse_args(args=args)
     if cmd.output != '-' and os.path.isdir(cmd.output):
-        index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
+        index_to_dir(cmd.inputs, cmd.output, cmd.sort,
+                     not cmd.unsurt, cmd.allrecords, cmd.postappend)
     else:
-        index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
+        index_to_file(cmd.inputs, cmd.output, cmd.sort,
+                      not cmd.unsurt, cmd.allrecords, cmd.postappend)
 if __name__ == '__main__':

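The two new options map onto the -a/--allrecords and -p/--postappend CLI flags. A minimal sketch of driving them programmatically (the warc path is assumed to point at the post-test.warc.gz fixture added by this commit; the expected output is in the doctests further down):

import sys
from pywb.warc.archiveindexer import ArchiveIndexer

with open('sample_archive/warcs/post-test.warc.gz', 'rb') as fh:
    ArchiveIndexer(fh, 'post-test.warc.gz',
                   out=sys.stdout,
                   sort=True,
                   include_all=True,          # -a: also index 'request' records
                   append_post_query=True     # -p: fold the form body into the url key
                   ).make_index()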
View File

@@ -39,7 +39,15 @@ class ArcWarcRecordLoader:
     ARC_HEADERS = ["uri", "ip-address", "archive-date",
                    "content-type", "length"]
-    def __init__(self, loader=None, cookie_maker=None, block_size=8192):
+    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
+    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
+    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
+                  'OPTIONS', 'CONNECT', 'PATCH']
+    def __init__(self, loader=None, cookie_maker=None, block_size=8192,
+                 parse_request=False):
         if not loader:
             loader = BlockLoader(cookie_maker)
@@ -48,9 +56,13 @@ class ArcWarcRecordLoader:
         self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
-        warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
-        self.warc_parser = StatusAndHeadersParser(warc_types)
-        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
+        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
+        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)
+        self.parse_request = parse_request
+        if self.parse_request:
+            self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
     def load(self, url, offset, length):
         """ Load a single record from given url at offset with length
@@ -126,11 +138,16 @@ class ArcWarcRecordLoader:
             status_headers = StatusAndHeaders('200 OK', content_type)
         elif (rec_type == 'warcinfo' or
-              rec_type == 'arc_header' or
-              rec_type == 'request'):
+              rec_type == 'arc_header'):
             # not parsing these for now
             status_headers = StatusAndHeaders('204 No Content', [])
+        elif (rec_type == 'request'):
+            if self.parse_request:
+                status_headers = self.http_req_parser.parse(stream)
+            else:
+                status_headers = StatusAndHeaders('204 No Content', [])
         # special case: http 0.9 response, no status or headers
         #elif rec_type == 'response':
         #    content_type = rec_headers.get_header('Content-Type')

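With parse_request=True, a 'request' record comes back with its HTTP request line parsed into the usual StatusAndHeaders shape: the verb ends up in .protocol and the path plus HTTP version in .statusline. A short sketch, reusing the offset/length of the request record from the doctests below (the path is assumed to point at the sample archive):

from pywb.warc.recordloader import ArcWarcRecordLoader

loader = ArcWarcRecordLoader(parse_request=True)
record = loader.load('sample_archive/warcs/example.warc.gz', '1376', '488')

print record.status_headers.protocol              # 'GET'
print record.status_headers.statusline            # '/?example=1 HTTP/1.1'
print record.status_headers.get_header('Host')    # 'example.com'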
View File

@@ -8,6 +8,15 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+# warc.gz -- parse all
+>>> print_cdx_index('example.warc.gz', include_all=True)
+ CDX N b a m s k r M S V g
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+com,example)/?example=1 20140103030321 http://example.com?example=1 - - - - - 488 1376 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
 # warc
 >>> print_cdx_index('example.warc')
  CDX N b a m s k r M S V g
@@ -40,6 +49,45 @@ com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7X
 com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
 com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc
+# POST request tests
+#=================================================================
+# no post append, no requests
+>>> print_cdx_index('post-test.warc.gz')
+ CDX N b a m s k r M S V g
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+# post append
+>>> print_cdx_index('post-test.warc.gz', append_post_query=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+# no post append, requests included
+>>> print_cdx_index('post-test.warc.gz', include_all=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
+# post append + requests included
+>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post_query=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
 # Test CLI interface -- (check for num lines)
 #=================================================================
@@ -47,7 +95,7 @@ com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7X
 >>> cli_lines(['--sort', '-', TEST_WARC_DIR])
 com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
 org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
-201
+204
 # test writing to stdout
 >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
@@ -55,6 +103,12 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
 4
+# test writing to stdout ('-' omitted)
+>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+4
 # test writing to temp dir
 >>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
 example.cdx
@@ -86,19 +140,22 @@ def read_fully(cdx):
         curr.write(b)
     return curr.getvalue()
-def cdx_index(warc, sort=False):
+def cdx_index(warc, sort=False,
+              include_all=False, append_post_query=False):
     buff = BytesIO()
     with open(TEST_WARC_DIR + warc) as fh:
         indexer = ArchiveIndexer(fh, warc,
                                  out=buff,
-                                 sort=sort)
+                                 sort=sort,
+                                 include_all=include_all,
+                                 append_post_query=append_post_query)
         indexer.make_index()
     return buff.getvalue()
-def print_cdx_index(warc, sort=False):
-    sys.stdout.write(cdx_index(warc, sort))
+def print_cdx_index(*args, **kwargs):
+    sys.stdout.write(cdx_index(*args, **kwargs))
 def assert_cdx_match(cdx, warc, sort=False):
     assert read_fully(cdx) == cdx_index(warc, sort)

View File

@@ -54,6 +54,28 @@ Test loading different types of records from a variety of formats
  ('Content-Length', '1270'),
  ('Connection', 'close')]))
+# request parsing
+>>> load_test_archive('example.warc.gz', '1376', '488')
+(('warc', 'request'),
+ StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'request'),
+  ('WARC-Record-ID', '<urn:uuid:9a3ffea5-9556-4790-a6bf-c15231fd6b97>'),
+  ('WARC-Date', '2014-01-03T03:03:21Z'),
+  ('Content-Length', '323'),
+  ('Content-Type', 'application/http; msgtype=request'),
+  ('WARC-Concurrent-To', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
+  ('WARC-Target-URI', 'http://example.com?example=1'),
+  ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
+ StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'),
+  ( 'Accept',
+    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
+  ('Accept-Language', 'en-US,en;q=0.8'),
+  ( 'User-Agent',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'),
+  ('Host', 'example.com')]))
+ StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []))
 # Test of record loading based on cdx line
 # Print parsed http headers + 2 lines of content
@@ -308,7 +330,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
 def load_test_archive(test_file, offset, length):
     path = test_warc_dir + test_file
-    testloader = ArcWarcRecordLoader()
+    testloader = ArcWarcRecordLoader(parse_request=True)
     archive = testloader.load(path, offset, length)

View File

@@ -47,6 +47,8 @@ class QueryHandler(object):
         return QueryHandler(cdx_server, html_view, perms_policy)
     def load_for_request(self, wbrequest):
+        wbrequest.normalize_post_query()
         wb_url = wbrequest.wb_url
         # cdx server only supports text and cdxobject for now

View File

@@ -187,7 +187,12 @@ class ReplayView(object):
         new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
                                                           cdx['original'])
-        status_headers = StatusAndHeaders('302 Internal Redirect',
+        if wbrequest.method not in ('HEAD', 'GET'):
+            statusline = '307 Same-Method Internal Redirect'
+        else:
+            statusline = '302 Internal Redirect'
+        status_headers = StatusAndHeaders(statusline,
                                           [('Location', new_url)])
         # don't include cdx to indicate internal redirect

View File

@@ -0,0 +1,4 @@
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz

Binary file not shown.

View File

@@ -220,6 +220,16 @@ class TestWb:
         assert resp.status_int == 302
+    def test_post_1(self):
+        resp = self.testapp.post('/pywb/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
+        assert resp.status_int == 307
+        resp = self.testapp.post('/pywb/20140610000859/http://httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
+        assert resp.status_int == 200
+        assert '"foo": "bar"' in resp.body
+        assert '"test": "abc"' in resp.body
     def test_excluded_content(self):
         resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
         assert resp.status_int == 403