From e2349a74e2402519f2d73a875ce8bd624020cb57 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 10 Jun 2014 19:21:46 -0700 Subject: [PATCH] replay: better POST support via post query append! record_loader can optionally parse 'request' records archiveindexer has -a flag to write all records ('request' included), -p flag to append post query post-test.warc.gz and cdx POST redirects using 307 --- pywb/cdx/cdxdomainspecific.py | 13 +- pywb/framework/wbrequestresponse.py | 19 +++ pywb/rewrite/header_rewriter.py | 4 +- pywb/rules.yaml | 21 ++++ pywb/static/wb.js | 4 +- pywb/static/wombat.js | 4 + pywb/ui/head_insert.html | 3 + pywb/utils/loaders.py | 51 ++++++++ pywb/warc/archiveindexer.py | 168 +++++++++++++++++++++---- pywb/warc/recordloader.py | 29 ++++- pywb/warc/test/test_indexing.py | 67 +++++++++- pywb/warc/test/test_loading.py | 24 +++- pywb/webapp/query_handler.py | 2 + pywb/webapp/replay_views.py | 7 +- sample_archive/cdx/post-test.cdx | 4 + sample_archive/warcs/post-test.warc.gz | Bin 0 -> 3593 bytes tests/test_integration.py | 10 ++ 17 files changed, 388 insertions(+), 42 deletions(-) create mode 100644 sample_archive/cdx/post-test.cdx create mode 100644 sample_archive/warcs/post-test.warc.gz diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index fd830c17..da4725b7 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -87,9 +87,9 @@ class FuzzyQuery: matched_rule = rule - if len(m.groups()) == 1: - #filter_.append('~urlkey:' + m.group(1)) - filter_.append(rule.filter.format(m.group(1))) + groups = m.groups() + for g in groups: + filter_.append(rule.filter.format(g)) break @@ -100,6 +100,11 @@ class FuzzyQuery: if matched_rule.replace: repl = matched_rule.replace + if '/_/stream/squarestream?soc-app' in url and 'jserror' not in url: + print 'KEY ', urlkey + print 'RULE ', url, vars(matched_rule) + print 'FILTERS ', filter_ + inx = url.rfind(repl) if inx > 0: url = url[:inx + 1] @@ -148,6 +153,6 @@ class CDXDomainSpecificRule(BaseRule): self.replace = unsurt(self.replace) -if __name__ == "__main__": + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 446aa88a..85ff2eb8 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -1,4 +1,5 @@ from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.utils.loaders import extract_post_query, append_post_query import pprint @@ -45,6 +46,8 @@ class WbRequest(object): else: self.request_uri = env.get('REL_REQUEST_URI') + self.method = self.env.get('REQUEST_METHOD') + self.coll = coll if not host_prefix: @@ -116,6 +119,22 @@ class WbRequest(object): wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):] return wburl_str + def normalize_post_query(self): + if self.method != 'POST': + return + + if not self.wb_url: + return + + mime = self.env.get('CONTENT_TYPE') + length = self.env.get('CONTENT_LENGTH') + stream = self.env['wsgi.input'] + + post_query = extract_post_query('POST', mime, length, stream) + + if post_query: + self.wb_url.url = append_post_query(self.wb_url.url, post_query) + #================================================================= class WbResponse(object): diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 25b27de4..2b22c000 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -18,7 +18,9 @@ class RewrittenStatusAndHeaders: 
#================================================================= class HeaderRewriter: REWRITE_TYPES = { - 'html': ['text/html', 'application/xhtml'], + 'html': ['text/html', + 'application/xhtml', + 'application/xhtml+xml'], 'css': ['text/css'], diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 04327c92..ba92a652 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -40,6 +40,27 @@ rules: replace: '/' + # google plus rules + #================================================================= + + - url_prefix: 'com,google,plus)/_/stream/getactivities' + + fuzzy_lookup: '(egk[^"]+).*(f.sid=[^&]+)' + + - url_prefix: 'com,google,plus)/_/stream/squarestream' + + fuzzy_lookup: '(cai[^"]+).*(f.sid=[^&]+)' + + - url_prefix: 'com,google,plus)/_/communities/rt/landing' + + fuzzy_lookup: 'com,google,plus\)/_/.*?.*\,(\d{13}\])&.*(f.sid=[^&]+).*' + + + - url_prefix: 'com,google,plus)/_/' + + fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)' + + # testing rules -- not for valid domain #================================================================= # this rule block is a non-existent prefix merely for testing diff --git a/pywb/static/wb.js b/pywb/static/wb.js index e10a522e..357fcaa5 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -102,6 +102,6 @@ if (wbinfo.is_frame_mp && wbinfo.canon_url && (window.self == window.top) && window.location.href != wbinfo.canon_url) { - console.log('frame'); - window.location.replace(wbinfo.canon_url); + //console.log('frame'); + //window.location.replace(wbinfo.canon_url); } diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 78e4f7ea..d6cf60b5 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -111,6 +111,10 @@ WB_wombat_init = (function() { if (!url) { return url; } + + if (url.indexOf("hypothes.is") > 0) { + return url; + } var urltype_ = (typeof url); diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index 72d30142..196735f9 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -20,5 +20,8 @@ + + + diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index a1d12d27..e01f72bd 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -5,6 +5,7 @@ local and remote access import os import hmac +import urllib import urllib2 import time import pkg_resources @@ -24,6 +25,56 @@ def load_yaml_config(config_file): return config +#================================================================= +def extract_post_query(method, mime, length, stream): + """ + Extract a url-encoded form POST from stream + If not a application/x-www-form-urlencoded, or no missing + content length, return None + """ + if method.upper() != 'POST': + return None + + if (not mime or + not mime.lower().startswith('application/x-www-form-urlencoded')): + return None + + if not length or length == '0': + return None + + try: + length = int(length) + except ValueError: + return None + + #todo: encoding issues? + post_query = '' + + while length > 0: + buff = stream.read(length) + length -= len(buff) + + if not buff: + break + + post_query += buff + + post_query = urllib.unquote_plus(post_query) + return post_query + + +#================================================================= +def append_post_query(url, post_query): + if not post_query: + return url + + if '?' not in url: + url += '?' 
+ + url += '&&&' + post_query + return url + + #================================================================= class BlockLoader(object): """ diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py index df7eef66..84e4e022 100644 --- a/pywb/warc/archiveindexer.py +++ b/pywb/warc/archiveindexer.py @@ -1,6 +1,7 @@ from pywb.utils.timeutils import iso_date_to_timestamp from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.canonicalize import canonicalize +from pywb.utils.loaders import extract_post_query, append_post_query from recordloader import ArcWarcRecordLoader @@ -21,15 +22,27 @@ class ArchiveIndexer(object): The indexer will automatically detect format, and decompress if necessary """ + + # arc/warc record types which are + # indexed by default, without 'include_all' + DEFAULT_REC_TYPES = ('response', 'revisit', 'metadata', 'resource') + def __init__(self, fileobj, filename, - out=sys.stdout, sort=False, writer=None, surt_ordered=True): + out=sys.stdout, sort=False, writer=None, surt_ordered=True, + include_all=False, append_post_query=False): self.fh = fileobj self.filename = filename - self.loader = ArcWarcRecordLoader() + + loader_parse_req = include_all or append_post_query + self.loader = ArcWarcRecordLoader(parse_request=loader_parse_req) + self.offset = 0 self.known_format = None self.surt_ordered = surt_ordered + self.include_all = include_all + self.append_post_query = append_post_query + if writer: self.writer = writer elif sort: @@ -37,6 +50,12 @@ class ArchiveIndexer(object): else: self.writer = CDXWriter(out) + # todo: refactor this + self.writer.indexer = self + + if append_post_query: + self.writer = PostResolveWriter(self.writer, self) + def make_index(self): """ Output a cdx index! 
""" @@ -127,9 +146,23 @@ class ArchiveIndexer(object): self._read_to_record_end(reader, record) return record + post_query = None + + if record.rec_type == 'request': + method = record.status_headers.protocol + mime = result[3] + len_ = record.status_headers.get_header('Content-Length') + + post_query = extract_post_query(method, + mime, + len_, + record.stream) + + # should be 0 if read query string + num = self.read_rest(record.stream) # generate digest if it doesn't exist and if not a revisit # if revisit, then nothing we can do here - if result[-1] == '-' and record.rec_type != 'revisit': + elif result[-1] == '-' and record.rec_type != 'revisit': digester = hashlib.sha1() self.read_rest(record.stream, digester) result[-1] = base64.b32encode(digester.digest()) @@ -146,7 +179,7 @@ class ArchiveIndexer(object): result.append(str(offset)) result.append(self.filename) - self.writer.write(result) + self.writer.write(result, record.rec_type, post_query) return record @@ -154,25 +187,31 @@ class ArchiveIndexer(object): """ Parse warc record to be included in index, or return none if skipping this type of record """ - if record.rec_type not in ('response', 'revisit', - 'metadata', 'resource'): + + if (not self.append_post_query and + not self.include_record(record.rec_type)): return None url = record.rec_headers.get_header('WARC-Target-Uri') + if not url: + return None timestamp = record.rec_headers.get_header('WARC-Date') timestamp = iso_date_to_timestamp(timestamp) digest = record.rec_headers.get_header('WARC-Payload-Digest') - status = self._extract_status(record.status_headers) - if record.rec_type == 'revisit': mime = 'warc/revisit' status = '-' + elif record.rec_type == 'request': + mime = record.status_headers.get_header('Content-Type') + mime = self._extract_mime(mime, '-') + status = '-' else: mime = record.status_headers.get_header('Content-Type') mime = self._extract_mime(mime) + status = self._extract_status(record.status_headers) if digest and digest.startswith('sha1:'): digest = digest[len('sha1:'):] @@ -225,14 +264,14 @@ class ArchiveIndexer(object): MIME_RE = re.compile('[; ]') - def _extract_mime(self, mime): + def _extract_mime(self, mime, def_mime='unk'): """ Utility function to extract mimetype only from a full content type, removing charset settings """ if mime: mime = self.MIME_RE.split(mime, 1)[0] if not mime: - mime = 'unk' + mime = def_mime return mime def _extract_status(self, status_headers): @@ -256,17 +295,27 @@ class ArchiveIndexer(object): digester.update(b) return num + def include_record(self, type_): + return self.include_all or (type_ in self.DEFAULT_REC_TYPES) + + def add_post_query(self, fields, post_query): + url = append_post_query(fields[2], post_query) + fields[0] = canonicalize(url, self.surt_ordered) + return fields + #================================================================= class CDXWriter(object): def __init__(self, out): self.out = out + self.indexer = None def start(self): self.out.write(' CDX N b a m s k r M S V g\n') - def write(self, line): - self.out.write(' '.join(line) + '\n') + def write(self, line, rec_type, *args): + if not self.indexer or self.indexer.include_record(rec_type): + self.out.write(' '.join(line) + '\n') def end(self): pass @@ -278,14 +327,66 @@ class SortedCDXWriter(CDXWriter): super(SortedCDXWriter, self).__init__(out) self.sortlist = [] - def write(self, line): - line = ' '.join(line) + '\n' - insort(self.sortlist, line) + def write(self, line, rec_type, *args): + if not self.indexer or 
self.indexer.include_record(rec_type): + line = ' '.join(line) + '\n' + insort(self.sortlist, line) def end(self): self.out.write(''.join(self.sortlist)) +#================================================================= +class PostResolveWriter(CDXWriter): + def __init__(self, writer, indexer): + self.writer = writer + self.indexer = indexer + self.prev_line = None + self.prev_post_query = None + self.prev_type = None + + def start(self): + self.writer.start() + + def write(self, line, rec_type, post_query): + if not self.prev_line: + self.prev_line = line + self.prev_post_query = post_query + self.prev_type = rec_type + return + + #cdx original field + if self.prev_line[2] != line[2]: + self.writer.write(self.prev_line, self.prev_type) + self.prev_line = line + self.prev_post_query = post_query + return + + if self.prev_post_query or post_query: + if self.prev_post_query: + self.indexer.add_post_query(line, self.prev_post_query) + else: + self.indexer.add_post_query(line, post_query) + + # update prev url key too + self.prev_line[0] = line[0] + + # write both lines + self.writer.write(self.prev_line, self.prev_type) + self.writer.write(line, rec_type) + + # flush any cached lines + self.prev_line = None + self.prev_post_query = None + self.prev_type = None + + def end(self): + if self.prev_line: + self.writer.write(self.prev_line, self.prev_type) + + self.writer.end() + + #================================================================= class MultiFileMixin(object): def start_all(self): @@ -323,7 +424,8 @@ def iter_file_or_dir(inputs): yield os.path.join(input_, filename), filename -def index_to_file(inputs, output, sort, surt_ordered): +def index_to_file(inputs, output, sort, + surt_ordered, include_all, append_post_query): if output == '-': outfile = sys.stdout else: @@ -343,7 +445,9 @@ def index_to_file(inputs, output, sort, surt_ordered): ArchiveIndexer(fileobj=infile, filename=filename, writer=writer, - surt_ordered=surt_ordered).make_index() + surt_ordered=surt_ordered, + append_post_query=append_post_query, + include_all=include_all).make_index() finally: writer.end_all() if infile: @@ -363,7 +467,8 @@ def cdx_filename(filename): return remove_ext(filename) + '.cdx' -def index_to_dir(inputs, output, sort, surt_ordered): +def index_to_dir(inputs, output, sort, + surt_ordered, include_all, append_post_query): for fullpath, filename in iter_file_or_dir(inputs): outpath = cdx_filename(filename) @@ -375,7 +480,9 @@ def index_to_dir(inputs, output, sort, surt_ordered): filename=filename, sort=sort, out=outfile, - surt_ordered=surt_ordered).make_index() + surt_ordered=surt_ordered, + append_post_query=append_post_query, + include_all=include_all).make_index() def main(args=None): @@ -418,6 +525,13 @@ Not-recommended for new cdx, use only for backwards-compatibility. - If directory, all archive files from that directory are read """ + allrecords_help = """include all records. +currently includes the 'request' records in addition to all +response records""" + + post_append_help = """for POST requests, append +form query to url key. (Only applies to form url encoded posts)""" + parser = ArgumentParser(description=description, epilog=epilog, formatter_class=RawTextHelpFormatter) @@ -426,18 +540,28 @@ Not-recommended for new cdx, use only for backwards-compatibility. 
action='store_true', help=sort_help) + parser.add_argument('-a', '--allrecords', + action='store_true', + help=allrecords_help) + + parser.add_argument('-p', '--postappend', + action='store_true', + help=post_append_help) + parser.add_argument('-u', '--unsurt', action='store_true', help=unsurt_help) - parser.add_argument('output', help=output_help) + parser.add_argument('output', nargs='?', default='-', help=output_help) parser.add_argument('inputs', nargs='+', help=input_help) cmd = parser.parse_args(args=args) if cmd.output != '-' and os.path.isdir(cmd.output): - index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt) + index_to_dir(cmd.inputs, cmd.output, cmd.sort, + not cmd.unsurt, cmd.allrecords, cmd.postappend) else: - index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt) + index_to_file(cmd.inputs, cmd.output, cmd.sort, + not cmd.unsurt, cmd.allrecords, cmd.postappend) if __name__ == '__main__': diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 4c71dee3..c06c2b6f 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -39,7 +39,15 @@ class ArcWarcRecordLoader: ARC_HEADERS = ["uri", "ip-address", "archive-date", "content-type", "length"] - def __init__(self, loader=None, cookie_maker=None, block_size=8192): + WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18'] + + HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1'] + + HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE', + 'OPTIONS', 'CONNECT', 'PATCH'] + + def __init__(self, loader=None, cookie_maker=None, block_size=8192, + parse_request=False): if not loader: loader = BlockLoader(cookie_maker) @@ -48,9 +56,13 @@ class ArcWarcRecordLoader: self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) - warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18'] - self.warc_parser = StatusAndHeadersParser(warc_types) - self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1']) + self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES) + self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES) + + self.parse_request = parse_request + if self.parse_request: + self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS) + def load(self, url, offset, length): """ Load a single record from given url at offset with length @@ -126,11 +138,16 @@ class ArcWarcRecordLoader: status_headers = StatusAndHeaders('200 OK', content_type) elif (rec_type == 'warcinfo' or - rec_type == 'arc_header' or - rec_type == 'request'): + rec_type == 'arc_header'): # not parsing these for now status_headers = StatusAndHeaders('204 No Content', []) + elif (rec_type == 'request'): + if self.parse_request: + status_headers = self.http_req_parser.parse(stream) + else: + status_headers = StatusAndHeaders('204 No Content', []) + # special case: http 0.9 response, no status or headers #elif rec_type == 'response': # content_type = rec_headers.get_header('Content-Type') diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index 0a3d6038..4a7a91e9 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -8,6 +8,15 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +# warc.gz -- parse all +>>> print_cdx_index('example.warc.gz', 
include_all=True) + CDX N b a m s k r M S V g +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +com,example)/?example=1 20140103030321 http://example.com?example=1 - - - - - 488 1376 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz +org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz + # warc >>> print_cdx_index('example.warc') CDX N b a m s k r M S V g @@ -40,6 +49,45 @@ com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7X com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc + +# POST request tests +#================================================================= +# no post append, no requests +>>> print_cdx_index('post-test.warc.gz') + CDX N b a m s k r M S V g +org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz +org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz +org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz + +# post append +>>> print_cdx_index('post-test.warc.gz', append_post_query=True) + CDX N b a m s k r M S V g +org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz +org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz +org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz + +# no post append, requests included +>>> print_cdx_index('post-test.warc.gz', include_all=True) + CDX N b a m s k r M S V g +org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz +org,httpbin)/post 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz +org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz +org,httpbin)/post 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz +org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz +org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz + +# post append + requests included +>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post_query=True) + CDX N b a m s k r M S V g +org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 
M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz +org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz +org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz +org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz +org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz +org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz + + + # Test CLI interface -- (check for num lines) #================================================================= @@ -47,7 +95,7 @@ com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7X >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz -201 +204 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) @@ -55,6 +103,12 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz 4 +# test writing to stdout ('-' omitted) +>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz']) +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +4 + # test writing to temp dir >>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz') example.cdx @@ -86,19 +140,22 @@ def read_fully(cdx): curr.write(b) return curr.getvalue() -def cdx_index(warc, sort=False): +def cdx_index(warc, sort=False, + include_all=False, append_post_query=False): buff = BytesIO() with open(TEST_WARC_DIR + warc) as fh: indexer = ArchiveIndexer(fh, warc, out=buff, - sort=sort) + sort=sort, + include_all=include_all, + append_post_query=append_post_query) indexer.make_index() return buff.getvalue() -def print_cdx_index(warc, sort=False): - sys.stdout.write(cdx_index(warc, sort)) +def print_cdx_index(*args, **kwargs): + sys.stdout.write(cdx_index(*args, **kwargs)) def assert_cdx_match(cdx, warc, sort=False): assert read_fully(cdx) == cdx_index(warc, sort) diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index ae559126..f744aa9f 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -54,6 +54,28 @@ Test loading different types of records from a variety of formats ('Content-Length', '1270'), ('Connection', 'close')])) +# request parsing +>>> load_test_archive('example.warc.gz', '1376', '488') +(('warc', 'request'), + StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'request'), + ('WARC-Record-ID', ''), + ('WARC-Date', '2014-01-03T03:03:21Z'), + ('Content-Length', '323'), + 
('Content-Type', 'application/http; msgtype=request'), + ('WARC-Concurrent-To', ''), + ('WARC-Target-URI', 'http://example.com?example=1'), + ('WARC-Warcinfo-ID', '')]), + StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'), + ( 'Accept', + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'), + ('Accept-Language', 'en-US,en;q=0.8'), + ( 'User-Agent', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'), + ('Host', 'example.com')])) + + +StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])) + # Test of record loading based on cdx line # Print parsed http headers + 2 lines of content @@ -308,7 +330,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ def load_test_archive(test_file, offset, length): path = test_warc_dir + test_file - testloader = ArcWarcRecordLoader() + testloader = ArcWarcRecordLoader(parse_request=True) archive = testloader.load(path, offset, length) diff --git a/pywb/webapp/query_handler.py b/pywb/webapp/query_handler.py index 42c93806..b7d09ecb 100644 --- a/pywb/webapp/query_handler.py +++ b/pywb/webapp/query_handler.py @@ -47,6 +47,8 @@ class QueryHandler(object): return QueryHandler(cdx_server, html_view, perms_policy) def load_for_request(self, wbrequest): + wbrequest.normalize_post_query() + wb_url = wbrequest.wb_url # cdx server only supports text and cdxobject for now diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 07bcb7ce..0301b6ee 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -187,7 +187,12 @@ class ReplayView(object): new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original']) - status_headers = StatusAndHeaders('302 Internal Redirect', + if wbrequest.method not in ('HEAD', 'GET'): + statusline = '307 Same-Method Internal Redirect' + else: + statusline = '302 Internal Redirect' + + status_headers = StatusAndHeaders(statusline, [('Location', new_url)]) # don't include cdx to indicate internal redirect diff --git a/sample_archive/cdx/post-test.cdx b/sample_archive/cdx/post-test.cdx new file mode 100644 index 00000000..cb36fed7 --- /dev/null +++ b/sample_archive/cdx/post-test.cdx @@ -0,0 +1,4 @@ + CDX N b a m s k r M S V g +org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz +org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz +org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz diff --git a/sample_archive/warcs/post-test.warc.gz b/sample_archive/warcs/post-test.warc.gz new file mode 100644 index 0000000000000000000000000000000000000000..b9cc1f48102154696b7d95c36ade930003556a8f GIT binary patch literal 3593 zcmbW(_dgVl;|Fk+?J{ygMs|mA*15CyxUx=XB(h~>oN(58+asArc9G57I+EFvbt3a{ z#vz@^ii8W_KHqmgzJGeZ{)N};@qE0lKc=TIxSeBh@jTeo2_+>5mQ?^lWWm1!RfH*k zq4ElfQnE@gC75)uBg$FYJyhD)&(()2+~T1r5zKnVFMj@Z!yuNO+^y)0C`rkwZ^2@j zdtglkZqzDxrZ~Yf$~OhmCms3t@cjWNQ~!Q+VR%%@VC1#cP?PvFpZ2&Z6VGY z2u!8r3a@5L-Bd0|uMwj-^dt?<9kdAk4BtxM#Kb~-x7%ADr&C%lg9WlvJcaRb0|{3d z+1^z>?p|E5y^9d3i++4e9}|dNn^BmOfwv1L_YUjXv<{2`4#l%+a=5HPVDDR))UnNt zxoQ%}Dw23q#TUdWb#uY8(+h1}z5!7jG)UN1U=R#Dw*s 
z44V%!s^{)2P${EgUFl+;5_F#H!X<=0n7B*UkEd)ESe2s9O2d^h&1MpTwy*9WGu6w5 z&F3@nryqW#chx)IuC^ZHAw`8W1}u6mbEnZ5hm;Nv?kH$bWB-;L#4{f)X1(jp18K}0 z8kz;+&+%0=zC-h?=L|j&o+{dz7GJ@bw%zHZgTMc&+^aghvd4XZw16%I|cD>gT?YrHHN8t66 z#6IS`G^woyxH*BZeVaFl61uox2?UAzYBOL7;ywB>L6QkA`Q!Ri;+XwZ=C2D>ROjcH zsMI}LVV=}fRM-F8o?SBT`%Zv?AC`pIRF0Aol8DgrZ_~ntUMfj%8SUsLcE3PV*`*%G zY*xjXrm?p=Uom|d-Y^1tz53p`hv6N~VRuZY)?QdTfcw}BYY>LPmw&2=?v&Em(VxKi zdrRijL&arnqCtAp45C0w53Dxtsz|+1vQ>1C29tMtU@03j`x|QQdk(|9<3_)_UZGx; z#rYod={#w@#1#D%M8uRlVWd6HbbS#fdrdfuDGpOqDaLjg-yF|4!I}wOZtPW{8FA+S zJJU1CA2o>EZb8RBtmPHf>5otFezuMJzMGN$W2enT)+p^Ehf!qiGbyl_u;?`fPbKu+80lOH zQ4;Ctk4ulCnPswT?RGLe^+;%|$_ylLbPaQNFxH3rs=_bbgI$ilr(vhb$yTJ^I>x?I zD|bWwbVfOrFEXA-6qoA|{^W2TVzVoge9E%VZBofTIj0&GkQ*@Qsa?}L0{2<8?l0Oy z-1d$9)9s8-rHIwVf4Vg={uircWff(i3J{2_{J&d8MEtfY;^*x%tt71`)hFZ10*g?}c0+o;w$}X8zbls;Qha;(q^fB;Rv>tM8(tp@J2X4!elvUe22d3!Ee7VPIjnHh| za$M2}HLAs4v$Pt74|q;c<9~BylP%HC81&0bEc<0)*QU=vvHS<9sxiB{B@oLLzb;T1 z^g?TfhcdDM7368Ef1mEf-gjo zoRRsAu$_h&jBmA6tN9?PDSd>%Pf@?I4BHK1Rjg|Q@#|@6A@FxhZIsssZoa2%)7!M& z)bo}z^OMnKw#iXvw4gku(X<$C^hWG{52Xiuu5-EuDe!`t$~XBI*RH(PJZ&&1g2(b} z9Q)t+E(O2xYFwIhSnXy_d0aZr^!&q+q>hRoa}Z|J3N7aR!6J_<;Jf5UmJ6KwxAFTq z4ut&W-aLu(#x0K~iAf6nLA=9liZePIzjh^G-_g$H9Z>M7xU$34O8wFyH8BagcZr}z zt79o;QpoprW$;q^;kKiDr?{S@>RK|}rijUdu#&B9uC^0&)&7vr&~_5$oQ-l1qxN;HW17i1Nn`T4 z+u&w}x%3ZL3ZIK^q5g37A13XRVdq-_=8D_NIAZD9&}&t47&cf+b3_bNB( z1owqk0puf`xsC(;MgLWW8O3S$9X?HY8iS@}PU zu#%o?4H|Nv*keX;-lCCFWz_&-JxadgNnu$h>5EssnzYi`>HCE?;(}Q6_{X`(E>oXy zC)++7)0Y-Hbvwj(`GocU8)OH!xh~c|=#kI%?ItH;*WfvX@L53Eswyl)0N{O{V0Xynh=&3s^IA zK>!=&O$4wZv2+pf>d1BnWVWui7VOnL^Iqw)Oyg>+_Iyiq%)>MVL!4uRda%Y0^-0&A z8i+Y~4=j2?FlBBykW6V*O)Kf082qaQY2!BBeBhozZ}Fumd$9CZGq$;#bUfk=?kfTz zlCA>ADx6PVdZ@fKvbF!XZe$(}3jVNr^Hx^+(U3xwku4;TLf>d>R+|D$BQRJYkqm2G31Axnu{Fi7Y8VA23t3mx zNYSl$8RfHAG^1nqerT>PM&@pJ>!)^}#}YObr&~!9rdnjGv5d+tQ;53GS*(g|3lG1b z`-5RTlBoPidTPUKd<@3h=y9R_5Xhrn6>Z()K@o$2wL<^ZwaQ?rrkrf$OuTwN ziA7=x;I>m%qVy+Ge_JXNU!cuS8j2m6?{gW23%olRYz(jZT8#@RTHr!nq}c*&)% z{$HRQB*N+)alqbZQ}w4isO^T}q*dv}hYLKhxM$rChDl^|K81GltEeBmLQ$Q=p7-|)i^ZQ@A4GJ>Srw>rO8qeZxX_oy7 z{kqe!YP!QcRbDM*LaMB$(4P4uLfo}5CjZ8j**$adF^|GQLI;Xdm!7}B{W`bZqrts$ z_e3>#IO=PdYV>ZM*A)CLaiT)bp8J}o->f4J?(;LajS_)6%EjDoxN^cuo$$l%M3Aw^ zd-|no?vJEHEbt(L==bT#Hx$PS1sI_lx<^xd_eO>YD|C0qkzRo|p{6DfDjRwX`7cvpA(UpMjxY6i{RZJ&cwO3ep zm6J7LHEVM3dy}jQWH-7&l8u5*mu#W2>mRuilxn Re*NcMv23Ip-21K7{{a{yq$mIY literal 0 HcmV?d00001 diff --git a/tests/test_integration.py b/tests/test_integration.py index a710cfe4..b5299b96 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -220,6 +220,16 @@ class TestWb: assert resp.status_int == 302 + def test_post_1(self): + resp = self.testapp.post('/pywb/httpbin.org/post', {'foo': 'bar', 'test': 'abc'}) + assert resp.status_int == 307 + + resp = self.testapp.post('/pywb/20140610000859/http://httpbin.org/post', {'foo': 'bar', 'test': 'abc'}) + assert resp.status_int == 200 + assert '"foo": "bar"' in resp.body + assert '"test": "abc"' in resp.body + + def test_excluded_content(self): resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403) assert resp.status_int == 403
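
Usage note (not part of the patch itself): a minimal Python 2 sketch of how the two helpers added to pywb/utils/loaders.py behave. The '&&&'-prefixed form query they produce is the same key written by the indexer when include_all / append_post_query are set (the new -a / -p flags), and the same key computed at replay time by WbRequest.normalize_post_query(), which is what lets a POST request be matched against the CDX.

    from StringIO import StringIO
    from pywb.utils.loaders import extract_post_query, append_post_query

    # simulated form-encoded POST body, as it would be read from a
    # 'request' record or from the WSGI input stream
    body = 'foo=bar&test=abc'

    post_query = extract_post_query('POST',
                                    'application/x-www-form-urlencoded',
                                    str(len(body)),
                                    StringIO(body))

    # a '?' is added if the url has no query string, then the decoded
    # form data is appended behind the '&&&' marker
    print append_post_query('http://httpbin.org/post', post_query)
    # -> http://httpbin.org/post?&&&foo=bar&test=abc

After SURT canonicalization this yields the same keys shown in the new doctests and in sample_archive/cdx/post-test.cdx (e.g. org,httpbin)/post?&&&foo=bar&test=abc).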