diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py
index fd830c17..da4725b7 100644
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@@ -87,9 +87,9 @@ class FuzzyQuery:
matched_rule = rule
- if len(m.groups()) == 1:
- #filter_.append('~urlkey:' + m.group(1))
- filter_.append(rule.filter.format(m.group(1)))
+ groups = m.groups()
+ for g in groups:
+ filter_.append(rule.filter.format(g))
break
@@ -100,6 +100,11 @@ class FuzzyQuery:
if matched_rule.replace:
repl = matched_rule.replace
+ if '/_/stream/squarestream?soc-app' in url and 'jserror' not in url:
+ print 'KEY ', urlkey
+ print 'RULE ', url, vars(matched_rule)
+ print 'FILTERS ', filter_
+
inx = url.rfind(repl)
if inx > 0:
url = url[:inx + 1]
@@ -148,6 +153,6 @@ class CDXDomainSpecificRule(BaseRule):
self.replace = unsurt(self.replace)
-if __name__ == "__main__":
+ if __name__ == "__main__":
import doctest
doctest.testmod()
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index 446aa88a..85ff2eb8 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -1,4 +1,5 @@
from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.utils.loaders import extract_post_query, append_post_query
import pprint
@@ -45,6 +46,8 @@ class WbRequest(object):
else:
self.request_uri = env.get('REL_REQUEST_URI')
+ self.method = self.env.get('REQUEST_METHOD')
+
self.coll = coll
if not host_prefix:
@@ -116,6 +119,22 @@ class WbRequest(object):
wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):]
return wburl_str
+    def normalize_post_query(self):
+        """
+        For a POST request with a parsed wb_url, read the form body
+        from the WSGI environ and append it to self.wb_url.url.
+
+        Does nothing for other methods, when no wb_url is set, or when
+        no post query could be extracted.
+        """
+        if self.method != 'POST' or not self.wb_url:
+            return
+
+        environ = self.env
+        content_type = environ.get('CONTENT_TYPE')
+        content_length = environ.get('CONTENT_LENGTH')
+        body = environ['wsgi.input']
+
+        form_query = extract_post_query('POST', content_type,
+                                        content_length, body)
+        if not form_query:
+            return
+
+        self.wb_url.url = append_post_query(self.wb_url.url, form_query)
+
#=================================================================
class WbResponse(object):
diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py
index 25b27de4..2b22c000 100644
--- a/pywb/rewrite/header_rewriter.py
+++ b/pywb/rewrite/header_rewriter.py
@@ -18,7 +18,9 @@ class RewrittenStatusAndHeaders:
#=================================================================
class HeaderRewriter:
REWRITE_TYPES = {
- 'html': ['text/html', 'application/xhtml'],
+ 'html': ['text/html',
+ 'application/xhtml',
+ 'application/xhtml+xml'],
'css': ['text/css'],
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
index 04327c92..ba92a652 100644
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@@ -40,6 +40,27 @@ rules:
replace: '/'
+ # google plus rules
+ #=================================================================
+
+ - url_prefix: 'com,google,plus)/_/stream/getactivities'
+
+ fuzzy_lookup: '(egk[^"]+).*(f.sid=[^&]+)'
+
+ - url_prefix: 'com,google,plus)/_/stream/squarestream'
+
+ fuzzy_lookup: '(cai[^"]+).*(f.sid=[^&]+)'
+
+ - url_prefix: 'com,google,plus)/_/communities/rt/landing'
+
+ fuzzy_lookup: 'com,google,plus\)/_/.*?.*\,(\d{13}\])&.*(f.sid=[^&]+).*'
+
+
+ - url_prefix: 'com,google,plus)/_/'
+
+ fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'
+
+
# testing rules -- not for valid domain
#=================================================================
# this rule block is a non-existent prefix merely for testing
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index e10a522e..357fcaa5 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -102,6 +102,6 @@ if (wbinfo.is_frame_mp && wbinfo.canon_url &&
(window.self == window.top) &&
window.location.href != wbinfo.canon_url) {
- console.log('frame');
- window.location.replace(wbinfo.canon_url);
+ //console.log('frame');
+ //window.location.replace(wbinfo.canon_url);
}
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
index 78e4f7ea..d6cf60b5 100644
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@@ -111,6 +111,10 @@ WB_wombat_init = (function() {
if (!url) {
return url;
}
+
+ if (url.indexOf("hypothes.is") > 0) {
+ return url;
+ }
var urltype_ = (typeof url);
diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html
index 72d30142..196735f9 100644
--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@@ -20,5 +20,8 @@
+
+
+
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index a1d12d27..e01f72bd 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -5,6 +5,7 @@ local and remote access
import os
import hmac
+import urllib
import urllib2
import time
import pkg_resources
@@ -24,6 +25,56 @@ def load_yaml_config(config_file):
return config
+#=================================================================
+def extract_post_query(method, mime, length, stream):
+    """
+    Extract a url-encoded form POST body from stream.
+
+    Returns the url-decoded body, or None if the method is missing or
+    not POST, the content type is not
+    application/x-www-form-urlencoded, or the content length is
+    missing, zero or not a valid integer.
+
+    At most ``length`` bytes are read from ``stream``; reading stops
+    early if the stream returns no more data.
+    """
+    # a missing method is treated the same as a non-POST method
+    if not method or method.upper() != 'POST':
+        return None
+
+    if (not mime or
+        not mime.lower().startswith('application/x-www-form-urlencoded')):
+        return None
+
+    if not length or length == '0':
+        return None
+
+    try:
+        length = int(length)
+    except ValueError:
+        return None
+
+    #todo: encoding issues?
+    # collect chunks and join once, rather than re-building the string
+    chunks = []
+
+    while length > 0:
+        buff = stream.read(length)
+        if not buff:
+            break
+
+        length -= len(buff)
+        chunks.append(buff)
+
+    return urllib.unquote_plus(''.join(chunks))
+
+
+#=================================================================
+def append_post_query(url, post_query):
+    """
+    Return url with post_query appended after a '&&&' marker.
+
+    A '?' is added first if url does not already contain one.
+    If post_query is empty, url is returned unchanged.
+    """
+    if post_query:
+        marker = '&&&' if '?' in url else '?&&&'
+        return url + marker + post_query
+
+    return url
+
+
#=================================================================
class BlockLoader(object):
"""
diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py
index df7eef66..84e4e022 100644
--- a/pywb/warc/archiveindexer.py
+++ b/pywb/warc/archiveindexer.py
@@ -1,6 +1,7 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.canonicalize import canonicalize
+from pywb.utils.loaders import extract_post_query, append_post_query
from recordloader import ArcWarcRecordLoader
@@ -21,15 +22,27 @@ class ArchiveIndexer(object):
The indexer will automatically detect format, and decompress
if necessary
"""
+
+ # arc/warc record types which are
+ # indexed by default, without 'include_all'
+ DEFAULT_REC_TYPES = ('response', 'revisit', 'metadata', 'resource')
+
def __init__(self, fileobj, filename,
- out=sys.stdout, sort=False, writer=None, surt_ordered=True):
+ out=sys.stdout, sort=False, writer=None, surt_ordered=True,
+ include_all=False, append_post_query=False):
self.fh = fileobj
self.filename = filename
- self.loader = ArcWarcRecordLoader()
+
+ loader_parse_req = include_all or append_post_query
+ self.loader = ArcWarcRecordLoader(parse_request=loader_parse_req)
+
self.offset = 0
self.known_format = None
self.surt_ordered = surt_ordered
+ self.include_all = include_all
+ self.append_post_query = append_post_query
+
if writer:
self.writer = writer
elif sort:
@@ -37,6 +50,12 @@ class ArchiveIndexer(object):
else:
self.writer = CDXWriter(out)
+ # todo: refactor this
+ self.writer.indexer = self
+
+ if append_post_query:
+ self.writer = PostResolveWriter(self.writer, self)
+
def make_index(self):
""" Output a cdx index!
"""
@@ -127,9 +146,23 @@ class ArchiveIndexer(object):
self._read_to_record_end(reader, record)
return record
+ post_query = None
+
+ if record.rec_type == 'request':
+ method = record.status_headers.protocol
+ mime = result[3]
+ len_ = record.status_headers.get_header('Content-Length')
+
+ post_query = extract_post_query(method,
+ mime,
+ len_,
+ record.stream)
+
+ # should be 0 if read query string
+ num = self.read_rest(record.stream)
# generate digest if it doesn't exist and if not a revisit
# if revisit, then nothing we can do here
- if result[-1] == '-' and record.rec_type != 'revisit':
+ elif result[-1] == '-' and record.rec_type != 'revisit':
digester = hashlib.sha1()
self.read_rest(record.stream, digester)
result[-1] = base64.b32encode(digester.digest())
@@ -146,7 +179,7 @@ class ArchiveIndexer(object):
result.append(str(offset))
result.append(self.filename)
- self.writer.write(result)
+ self.writer.write(result, record.rec_type, post_query)
return record
@@ -154,25 +187,31 @@ class ArchiveIndexer(object):
""" Parse warc record to be included in index, or
return none if skipping this type of record
"""
- if record.rec_type not in ('response', 'revisit',
- 'metadata', 'resource'):
+
+ if (not self.append_post_query and
+ not self.include_record(record.rec_type)):
return None
url = record.rec_headers.get_header('WARC-Target-Uri')
+ if not url:
+ return None
timestamp = record.rec_headers.get_header('WARC-Date')
timestamp = iso_date_to_timestamp(timestamp)
digest = record.rec_headers.get_header('WARC-Payload-Digest')
- status = self._extract_status(record.status_headers)
-
if record.rec_type == 'revisit':
mime = 'warc/revisit'
status = '-'
+ elif record.rec_type == 'request':
+ mime = record.status_headers.get_header('Content-Type')
+ mime = self._extract_mime(mime, '-')
+ status = '-'
else:
mime = record.status_headers.get_header('Content-Type')
mime = self._extract_mime(mime)
+ status = self._extract_status(record.status_headers)
if digest and digest.startswith('sha1:'):
digest = digest[len('sha1:'):]
@@ -225,14 +264,14 @@ class ArchiveIndexer(object):
MIME_RE = re.compile('[; ]')
- def _extract_mime(self, mime):
+ def _extract_mime(self, mime, def_mime='unk'):
""" Utility function to extract mimetype only
from a full content type, removing charset settings
"""
if mime:
mime = self.MIME_RE.split(mime, 1)[0]
if not mime:
- mime = 'unk'
+ mime = def_mime
return mime
def _extract_status(self, status_headers):
@@ -256,17 +295,27 @@ class ArchiveIndexer(object):
digester.update(b)
return num
+    def include_record(self, type_):
+        """
+        Return a true value if a record of the given type should be
+        written: always when include_all is set, otherwise only for
+        the types listed in DEFAULT_REC_TYPES.
+        """
+        if self.include_all:
+            return self.include_all
+
+        return type_ in self.DEFAULT_REC_TYPES
+
+    def add_post_query(self, fields, post_query):
+        """
+        Recompute the url key (fields[0]) from the original url
+        (fields[2]) with post_query appended, canonicalized using
+        this indexer's surt_ordered setting.
+
+        fields is modified in place and also returned.
+        """
+        original_url = fields[2]
+        fields[0] = canonicalize(append_post_query(original_url, post_query),
+                                 self.surt_ordered)
+        return fields
+
#=================================================================
class CDXWriter(object):
def __init__(self, out):
self.out = out
+ self.indexer = None
def start(self):
self.out.write(' CDX N b a m s k r M S V g\n')
- def write(self, line):
- self.out.write(' '.join(line) + '\n')
+ def write(self, line, rec_type, *args):
+ if not self.indexer or self.indexer.include_record(rec_type):
+ self.out.write(' '.join(line) + '\n')
def end(self):
pass
@@ -278,14 +327,66 @@ class SortedCDXWriter(CDXWriter):
super(SortedCDXWriter, self).__init__(out)
self.sortlist = []
- def write(self, line):
- line = ' '.join(line) + '\n'
- insort(self.sortlist, line)
+ def write(self, line, rec_type, *args):
+ if not self.indexer or self.indexer.include_record(rec_type):
+ line = ' '.join(line) + '\n'
+ insort(self.sortlist, line)
def end(self):
self.out.write(''.join(self.sortlist))
+#=================================================================
+class PostResolveWriter(CDXWriter):
+    """
+    Writer wrapper which holds back one cdx line at a time so that two
+    consecutive lines with the same original url (field 2) can share
+    a url key that includes the post query.
+
+    Lines are passed on to the wrapped writer together with their
+    record type.
+    """
+    def __init__(self, writer, indexer):
+        self.writer = writer
+        self.indexer = indexer
+        self._reset()
+
+    def _reset(self):
+        """ Clear the cached line and its post query / record type """
+        self.prev_line = None
+        self.prev_post_query = None
+        self.prev_type = None
+
+    def _cache(self, line, rec_type, post_query):
+        """ Hold back a line until the following line is seen """
+        self.prev_line = line
+        self.prev_post_query = post_query
+        self.prev_type = rec_type
+
+    def start(self):
+        self.writer.start()
+
+    def write(self, line, rec_type, post_query):
+        """
+        Buffer or emit a cdx line.
+
+        line is the list of cdx fields, rec_type the record type
+        of the line, and post_query the post query extracted for
+        this record (or None).
+        """
+        if not self.prev_line:
+            self._cache(line, rec_type, post_query)
+            return
+
+        #cdx original field
+        if self.prev_line[2] != line[2]:
+            self.writer.write(self.prev_line, self.prev_type)
+            # cache the new line together with its own record type,
+            # otherwise it would later be written (and filtered)
+            # under the type of the line that was just flushed
+            self._cache(line, rec_type, post_query)
+            return
+
+        # same original url: use whichever of the two lines has a query
+        resolved_query = self.prev_post_query or post_query
+        if resolved_query:
+            self.indexer.add_post_query(line, resolved_query)
+
+            # update prev url key too
+            self.prev_line[0] = line[0]
+
+        # write both lines
+        self.writer.write(self.prev_line, self.prev_type)
+        self.writer.write(line, rec_type)
+
+        # flush any cached lines
+        self._reset()
+
+    def end(self):
+        if self.prev_line:
+            self.writer.write(self.prev_line, self.prev_type)
+
+        self.writer.end()
+
+
#=================================================================
class MultiFileMixin(object):
def start_all(self):
@@ -323,7 +424,8 @@ def iter_file_or_dir(inputs):
yield os.path.join(input_, filename), filename
-def index_to_file(inputs, output, sort, surt_ordered):
+def index_to_file(inputs, output, sort,
+ surt_ordered, include_all, append_post_query):
if output == '-':
outfile = sys.stdout
else:
@@ -343,7 +445,9 @@ def index_to_file(inputs, output, sort, surt_ordered):
ArchiveIndexer(fileobj=infile,
filename=filename,
writer=writer,
- surt_ordered=surt_ordered).make_index()
+ surt_ordered=surt_ordered,
+ append_post_query=append_post_query,
+ include_all=include_all).make_index()
finally:
writer.end_all()
if infile:
@@ -363,7 +467,8 @@ def cdx_filename(filename):
return remove_ext(filename) + '.cdx'
-def index_to_dir(inputs, output, sort, surt_ordered):
+def index_to_dir(inputs, output, sort,
+ surt_ordered, include_all, append_post_query):
for fullpath, filename in iter_file_or_dir(inputs):
outpath = cdx_filename(filename)
@@ -375,7 +480,9 @@ def index_to_dir(inputs, output, sort, surt_ordered):
filename=filename,
sort=sort,
out=outfile,
- surt_ordered=surt_ordered).make_index()
+ surt_ordered=surt_ordered,
+ append_post_query=append_post_query,
+ include_all=include_all).make_index()
def main(args=None):
@@ -418,6 +525,13 @@ Not-recommended for new cdx, use only for backwards-compatibility.
- If directory, all archive files from that directory are read
"""
+ allrecords_help = """include all records.
+currently includes the 'request' records in addition to all
+response records"""
+
+ post_append_help = """for POST requests, append
+form query to url key. (Only applies to form url encoded posts)"""
+
parser = ArgumentParser(description=description,
epilog=epilog,
formatter_class=RawTextHelpFormatter)
@@ -426,18 +540,28 @@ Not-recommended for new cdx, use only for backwards-compatibility.
action='store_true',
help=sort_help)
+ parser.add_argument('-a', '--allrecords',
+ action='store_true',
+ help=allrecords_help)
+
+ parser.add_argument('-p', '--postappend',
+ action='store_true',
+ help=post_append_help)
+
parser.add_argument('-u', '--unsurt',
action='store_true',
help=unsurt_help)
- parser.add_argument('output', help=output_help)
+ parser.add_argument('output', nargs='?', default='-', help=output_help)
parser.add_argument('inputs', nargs='+', help=input_help)
cmd = parser.parse_args(args=args)
if cmd.output != '-' and os.path.isdir(cmd.output):
- index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
+ index_to_dir(cmd.inputs, cmd.output, cmd.sort,
+ not cmd.unsurt, cmd.allrecords, cmd.postappend)
else:
- index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
+ index_to_file(cmd.inputs, cmd.output, cmd.sort,
+ not cmd.unsurt, cmd.allrecords, cmd.postappend)
if __name__ == '__main__':
diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py
index 4c71dee3..c06c2b6f 100644
--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@@ -39,7 +39,15 @@ class ArcWarcRecordLoader:
ARC_HEADERS = ["uri", "ip-address", "archive-date",
"content-type", "length"]
- def __init__(self, loader=None, cookie_maker=None, block_size=8192):
+ WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
+
+ HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
+
+ HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
+ 'OPTIONS', 'CONNECT', 'PATCH']
+
+ def __init__(self, loader=None, cookie_maker=None, block_size=8192,
+ parse_request=False):
if not loader:
loader = BlockLoader(cookie_maker)
@@ -48,9 +56,13 @@ class ArcWarcRecordLoader:
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
- warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
- self.warc_parser = StatusAndHeadersParser(warc_types)
- self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
+ self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
+ self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)
+
+ self.parse_request = parse_request
+ if self.parse_request:
+ self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
+
def load(self, url, offset, length):
""" Load a single record from given url at offset with length
@@ -126,11 +138,16 @@ class ArcWarcRecordLoader:
status_headers = StatusAndHeaders('200 OK', content_type)
elif (rec_type == 'warcinfo' or
- rec_type == 'arc_header' or
- rec_type == 'request'):
+ rec_type == 'arc_header'):
# not parsing these for now
status_headers = StatusAndHeaders('204 No Content', [])
+ elif (rec_type == 'request'):
+ if self.parse_request:
+ status_headers = self.http_req_parser.parse(stream)
+ else:
+ status_headers = StatusAndHeaders('204 No Content', [])
+
# special case: http 0.9 response, no status or headers
#elif rec_type == 'response':
# content_type = rec_headers.get_header('Content-Type')
diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py
index 0a3d6038..4a7a91e9 100644
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -8,6 +8,15 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+# warc.gz -- parse all
+>>> print_cdx_index('example.warc.gz', include_all=True)
+ CDX N b a m s k r M S V g
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+com,example)/?example=1 20140103030321 http://example.com?example=1 - - - - - 488 1376 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+
# warc
>>> print_cdx_index('example.warc')
CDX N b a m s k r M S V g
@@ -40,6 +49,45 @@ com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7X
com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc
+
+# POST request tests
+#=================================================================
+# no post append, no requests
+>>> print_cdx_index('post-test.warc.gz')
+ CDX N b a m s k r M S V g
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+
+# post append
+>>> print_cdx_index('post-test.warc.gz', append_post_query=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+
+# no post append, requests included
+>>> print_cdx_index('post-test.warc.gz', include_all=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
+
+# post append + requests included
+>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post_query=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
+
+
+
# Test CLI interface -- (check for num lines)
#=================================================================
@@ -47,7 +95,7 @@ com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7X
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
-201
+204
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
@@ -55,6 +103,12 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
4
+# test writing to stdout ('-' omitted)
+>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+4
+
# test writing to temp dir
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
example.cdx
@@ -86,19 +140,22 @@ def read_fully(cdx):
curr.write(b)
return curr.getvalue()
-def cdx_index(warc, sort=False):
+def cdx_index(warc, sort=False,
+ include_all=False, append_post_query=False):
buff = BytesIO()
with open(TEST_WARC_DIR + warc) as fh:
indexer = ArchiveIndexer(fh, warc,
out=buff,
- sort=sort)
+ sort=sort,
+ include_all=include_all,
+ append_post_query=append_post_query)
indexer.make_index()
return buff.getvalue()
-def print_cdx_index(warc, sort=False):
- sys.stdout.write(cdx_index(warc, sort))
+def print_cdx_index(*args, **kwargs):
+ sys.stdout.write(cdx_index(*args, **kwargs))
def assert_cdx_match(cdx, warc, sort=False):
assert read_fully(cdx) == cdx_index(warc, sort)
diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py
index ae559126..f744aa9f 100644
--- a/pywb/warc/test/test_loading.py
+++ b/pywb/warc/test/test_loading.py
@@ -54,6 +54,28 @@ Test loading different types of records from a variety of formats
('Content-Length', '1270'),
('Connection', 'close')]))
+# request parsing
+>>> load_test_archive('example.warc.gz', '1376', '488')
+(('warc', 'request'),
+ StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'request'),
+ ('WARC-Record-ID', ''),
+ ('WARC-Date', '2014-01-03T03:03:21Z'),
+ ('Content-Length', '323'),
+ ('Content-Type', 'application/http; msgtype=request'),
+ ('WARC-Concurrent-To', ''),
+ ('WARC-Target-URI', 'http://example.com?example=1'),
+ ('WARC-Warcinfo-ID', '')]),
+ StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'),
+ ( 'Accept',
+ 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
+ ('Accept-Language', 'en-US,en;q=0.8'),
+ ( 'User-Agent',
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'),
+ ('Host', 'example.com')]))
+
+
+StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []))
+
# Test of record loading based on cdx line
# Print parsed http headers + 2 lines of content
@@ -308,7 +330,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
def load_test_archive(test_file, offset, length):
path = test_warc_dir + test_file
- testloader = ArcWarcRecordLoader()
+ testloader = ArcWarcRecordLoader(parse_request=True)
archive = testloader.load(path, offset, length)
diff --git a/pywb/webapp/query_handler.py b/pywb/webapp/query_handler.py
index 42c93806..b7d09ecb 100644
--- a/pywb/webapp/query_handler.py
+++ b/pywb/webapp/query_handler.py
@@ -47,6 +47,8 @@ class QueryHandler(object):
return QueryHandler(cdx_server, html_view, perms_policy)
def load_for_request(self, wbrequest):
+ wbrequest.normalize_post_query()
+
wb_url = wbrequest.wb_url
# cdx server only supports text and cdxobject for now
diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py
index 07bcb7ce..0301b6ee 100644
--- a/pywb/webapp/replay_views.py
+++ b/pywb/webapp/replay_views.py
@@ -187,7 +187,12 @@ class ReplayView(object):
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
cdx['original'])
- status_headers = StatusAndHeaders('302 Internal Redirect',
+ if wbrequest.method not in ('HEAD', 'GET'):
+ statusline = '307 Same-Method Internal Redirect'
+ else:
+ statusline = '302 Internal Redirect'
+
+ status_headers = StatusAndHeaders(statusline,
[('Location', new_url)])
# don't include cdx to indicate internal redirect
diff --git a/sample_archive/cdx/post-test.cdx b/sample_archive/cdx/post-test.cdx
new file mode 100644
index 00000000..cb36fed7
--- /dev/null
+++ b/sample_archive/cdx/post-test.cdx
@@ -0,0 +1,4 @@
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
diff --git a/sample_archive/warcs/post-test.warc.gz b/sample_archive/warcs/post-test.warc.gz
new file mode 100644
index 00000000..b9cc1f48
Binary files /dev/null and b/sample_archive/warcs/post-test.warc.gz differ
diff --git a/tests/test_integration.py b/tests/test_integration.py
index a710cfe4..b5299b96 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -220,6 +220,16 @@ class TestWb:
assert resp.status_int == 302
+    def test_post_1(self):
+        """
+        A form POST without a timestamp is answered with a 307
+        redirect; the same POST to the timestamped url replays the
+        capture whose body echoes the posted form fields.
+        """
+        form = {'foo': 'bar', 'test': 'abc'}
+
+        redirect = self.testapp.post('/pywb/httpbin.org/post', dict(form))
+        assert redirect.status_int == 307
+
+        replay = self.testapp.post('/pywb/20140610000859/http://httpbin.org/post', dict(form))
+        assert replay.status_int == 200
+        assert '"foo": "bar"' in replay.body
+        assert '"test": "abc"' in replay.body
+
+
def test_excluded_content(self):
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
assert resp.status_int == 403