mirror of https://github.com/webrecorder/pywb.git
replay: better POST support via post query append!

- record_loader can optionally parse 'request' records
- archiveindexer has -a flag to write all records ('request' included), -p flag to append post query
- add post-test.warc.gz and cdx sample archive
- POST redirects using 307
parent 028cdaa22e
commit e2349a74e2
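
For form-encoded POSTs, the decoded request body is appended to the request URL behind a '&&&' marker, so the indexer (run with -p) and the replay-time lookup derive the same CDX url key. A minimal sketch of that convention, restating the append_post_query() helper added below (illustration only, not part of the commit):

def append_post_query(url, post_query):
    # fold the decoded form body into the URL behind a '&&&' marker,
    # adding a '?' first if the URL had no query string
    if not post_query:
        return url
    if '?' not in url:
        url += '?'
    return url + '&&&' + post_query

# index time (-p) and replay time produce the same lookup key:
print(append_post_query('http://httpbin.org/post', 'foo=bar&test=abc'))
# -> http://httpbin.org/post?&&&foo=bar&test=abc
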
@@ -87,9 +87,9 @@ class FuzzyQuery:
             matched_rule = rule
 
-            if len(m.groups()) == 1:
-                #filter_.append('~urlkey:' + m.group(1))
-                filter_.append(rule.filter.format(m.group(1)))
+            groups = m.groups()
+            for g in groups:
+                filter_.append(rule.filter.format(g))
 
             break
 
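
The change above means every capture group in a fuzzy rule's regex now contributes a filter, not only the first group; the two-group Google Plus rules added later in this commit depend on that. A rough illustration (a sketch only -- the '~urlkey:' filter template is borrowed from the commented-out line above and is illustrative, not the rule's actual configured filter):

import re

# one of the fuzzy_lookup patterns added to the rules file in this commit
pattern = '(egk[^"]+).*(f.sid=[^&]+)'
url = 'com,google,plus)/_/stream/getactivities?egk123"x&f.sid=abc'

m = re.search(pattern, url)
# before: only m.group(1) produced a filter; now each group does
filters = ['~urlkey:{0}'.format(g) for g in m.groups()]
# -> ['~urlkey:egk123', '~urlkey:f.sid=abc']
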
@@ -100,6 +100,11 @@ class FuzzyQuery:
         if matched_rule.replace:
             repl = matched_rule.replace
 
+        if '/_/stream/squarestream?soc-app' in url and 'jserror' not in url:
+            print 'KEY ', urlkey
+            print 'RULE ', url, vars(matched_rule)
+            print 'FILTERS ', filter_
+
         inx = url.rfind(repl)
         if inx > 0:
             url = url[:inx + 1]
@@ -148,6 +153,6 @@ class CDXDomainSpecificRule(BaseRule):
             self.replace = unsurt(self.replace)
 
 
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
@@ -1,4 +1,5 @@
 from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.utils.loaders import extract_post_query, append_post_query
 
 import pprint
 
@@ -45,6 +46,8 @@ class WbRequest(object):
         else:
             self.request_uri = env.get('REL_REQUEST_URI')
 
+        self.method = self.env.get('REQUEST_METHOD')
+
         self.coll = coll
 
         if not host_prefix:
@@ -116,6 +119,22 @@ class WbRequest(object):
             wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):]
             return wburl_str
 
+    def normalize_post_query(self):
+        if self.method != 'POST':
+            return
+
+        if not self.wb_url:
+            return
+
+        mime = self.env.get('CONTENT_TYPE')
+        length = self.env.get('CONTENT_LENGTH')
+        stream = self.env['wsgi.input']
+
+        post_query = extract_post_query('POST', mime, length, stream)
+
+        if post_query:
+            self.wb_url.url = append_post_query(self.wb_url.url, post_query)
+
 
 #=================================================================
 class WbResponse(object):
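
On the replay side, normalize_post_query() rewrites wb_url before the CDX lookup so a live POST matches the post-appended key written at index time. The inputs it reads from the WSGI environ look roughly like this (a sketch with a hand-built environ, not commit code; Python 2 StringIO stands in for the real wsgi.input stream):

from StringIO import StringIO  # Python 2, as in the surrounding code

body = 'foo=bar&test=abc'
env = {
    'REQUEST_METHOD': 'POST',
    'CONTENT_TYPE': 'application/x-www-form-urlencoded',
    'CONTENT_LENGTH': str(len(body)),
    'wsgi.input': StringIO(body),
}

# normalize_post_query() passes these to extract_post_query() and then
# rewrites the lookup URL with append_post_query(), e.g.
# 'http://httpbin.org/post' -> 'http://httpbin.org/post?&&&foo=bar&test=abc'
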
@@ -18,7 +18,9 @@ class RewrittenStatusAndHeaders:
 #=================================================================
 class HeaderRewriter:
     REWRITE_TYPES = {
-        'html': ['text/html', 'application/xhtml'],
+        'html': ['text/html',
+                 'application/xhtml',
+                 'application/xhtml+xml'],
 
         'css': ['text/css'],
 
@@ -40,6 +40,27 @@ rules:
       replace: '/'
 
 
+    # google plus rules
+    #=================================================================
+
+    - url_prefix: 'com,google,plus)/_/stream/getactivities'
+
+      fuzzy_lookup: '(egk[^"]+).*(f.sid=[^&]+)'
+
+    - url_prefix: 'com,google,plus)/_/stream/squarestream'
+
+      fuzzy_lookup: '(cai[^"]+).*(f.sid=[^&]+)'
+
+    - url_prefix: 'com,google,plus)/_/communities/rt/landing'
+
+      fuzzy_lookup: 'com,google,plus\)/_/.*?.*\,(\d{13}\])&.*(f.sid=[^&]+).*'
+
+
+    - url_prefix: 'com,google,plus)/_/'
+
+      fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'
+
+
     # testing rules -- not for valid domain
     #=================================================================
     # this rule block is a non-existent prefix merely for testing
@@ -102,6 +102,6 @@ if (wbinfo.is_frame_mp && wbinfo.canon_url &&
     (window.self == window.top) &&
     window.location.href != wbinfo.canon_url) {
 
-    console.log('frame');
-    window.location.replace(wbinfo.canon_url);
+    //console.log('frame');
+    //window.location.replace(wbinfo.canon_url);
 }
@@ -111,6 +111,10 @@ WB_wombat_init = (function() {
         if (!url) {
             return url;
         }
 
+        if (url.indexOf("hypothes.is") > 0) {
+            return url;
+        }
+
         var urltype_ = (typeof url);
 
@@ -20,5 +20,8 @@
 </script>
 <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
 <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>
+
+<script type="text/javascript" src="https://hypothes.is/embed.js"></script>
+
 <!-- End WB Insert -->
 
@@ -5,6 +5,7 @@ local and remote access
 
 import os
 import hmac
+import urllib
 import urllib2
 import time
 import pkg_resources
@@ -24,6 +25,56 @@ def load_yaml_config(config_file):
     return config
 
 
+#=================================================================
+def extract_post_query(method, mime, length, stream):
+    """
+    Extract a url-encoded form POST from stream
+    If not a application/x-www-form-urlencoded, or no missing
+    content length, return None
+    """
+    if method.upper() != 'POST':
+        return None
+
+    if (not mime or
+        not mime.lower().startswith('application/x-www-form-urlencoded')):
+        return None
+
+    if not length or length == '0':
+        return None
+
+    try:
+        length = int(length)
+    except ValueError:
+        return None
+
+    #todo: encoding issues?
+    post_query = ''
+
+    while length > 0:
+        buff = stream.read(length)
+        length -= len(buff)
+
+        if not buff:
+            break
+
+        post_query += buff
+
+    post_query = urllib.unquote_plus(post_query)
+    return post_query
+
+
+#=================================================================
+def append_post_query(url, post_query):
+    if not post_query:
+        return url
+
+    if '?' not in url:
+        url += '?'
+
+    url += '&&&' + post_query
+    return url
+
+
 #=================================================================
 class BlockLoader(object):
     """
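
Taken together, the two new helpers behave like this (a usage sketch, not part of the commit; it assumes the module is importable as pywb.utils.loaders, which is the import path used elsewhere in this diff, and uses Python 2 StringIO as a stand-in stream):

from StringIO import StringIO  # Python 2

from pywb.utils.loaders import extract_post_query, append_post_query

body = 'foo=bar&test=abc'
post_query = extract_post_query('POST',
                                'application/x-www-form-urlencoded',
                                str(len(body)),
                                StringIO(body))
assert post_query == 'foo=bar&test=abc'

# non-POST or non-form bodies are ignored
assert extract_post_query('GET', 'text/html', '5', StringIO('xxxxx')) is None

assert (append_post_query('http://httpbin.org/post', post_query) ==
        'http://httpbin.org/post?&&&foo=bar&test=abc')
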
@@ -1,6 +1,7 @@
 from pywb.utils.timeutils import iso_date_to_timestamp
 from pywb.utils.bufferedreaders import DecompressingBufferedReader
 from pywb.utils.canonicalize import canonicalize
+from pywb.utils.loaders import extract_post_query, append_post_query
 
 from recordloader import ArcWarcRecordLoader
 
@@ -21,15 +22,27 @@ class ArchiveIndexer(object):
     The indexer will automatically detect format, and decompress
     if necessary
     """
 
+    # arc/warc record types which are
+    # indexed by default, without 'include_all'
+    DEFAULT_REC_TYPES = ('response', 'revisit', 'metadata', 'resource')
+
     def __init__(self, fileobj, filename,
-                 out=sys.stdout, sort=False, writer=None, surt_ordered=True):
+                 out=sys.stdout, sort=False, writer=None, surt_ordered=True,
+                 include_all=False, append_post_query=False):
         self.fh = fileobj
         self.filename = filename
-        self.loader = ArcWarcRecordLoader()
+
+        loader_parse_req = include_all or append_post_query
+        self.loader = ArcWarcRecordLoader(parse_request=loader_parse_req)
+
         self.offset = 0
         self.known_format = None
         self.surt_ordered = surt_ordered
+
+        self.include_all = include_all
+        self.append_post_query = append_post_query
+
         if writer:
             self.writer = writer
         elif sort:
@@ -37,6 +50,12 @@ class ArchiveIndexer(object):
         else:
             self.writer = CDXWriter(out)
 
+        # todo: refactor this
+        self.writer.indexer = self
+
+        if append_post_query:
+            self.writer = PostResolveWriter(self.writer, self)
+
     def make_index(self):
         """ Output a cdx index!
         """
@@ -127,9 +146,23 @@ class ArchiveIndexer(object):
             self._read_to_record_end(reader, record)
             return record
 
+        post_query = None
+
+        if record.rec_type == 'request':
+            method = record.status_headers.protocol
+            mime = result[3]
+            len_ = record.status_headers.get_header('Content-Length')
+
+            post_query = extract_post_query(method,
+                                            mime,
+                                            len_,
+                                            record.stream)
+
+            # should be 0 if read query string
+            num = self.read_rest(record.stream)
         # generate digest if it doesn't exist and if not a revisit
         # if revisit, then nothing we can do here
-        if result[-1] == '-' and record.rec_type != 'revisit':
+        elif result[-1] == '-' and record.rec_type != 'revisit':
             digester = hashlib.sha1()
             self.read_rest(record.stream, digester)
             result[-1] = base64.b32encode(digester.digest())
@@ -146,7 +179,7 @@ class ArchiveIndexer(object):
         result.append(str(offset))
         result.append(self.filename)
 
-        self.writer.write(result)
+        self.writer.write(result, record.rec_type, post_query)
 
         return record
 
@@ -154,25 +187,31 @@ class ArchiveIndexer(object):
         """ Parse warc record to be included in index, or
        return none if skipping this type of record
        """
-        if record.rec_type not in ('response', 'revisit',
-                                   'metadata', 'resource'):
+        if (not self.append_post_query and
+            not self.include_record(record.rec_type)):
            return None
 
        url = record.rec_headers.get_header('WARC-Target-Uri')
+        if not url:
+            return None
+
        timestamp = record.rec_headers.get_header('WARC-Date')
        timestamp = iso_date_to_timestamp(timestamp)
 
        digest = record.rec_headers.get_header('WARC-Payload-Digest')
 
-        status = self._extract_status(record.status_headers)
-
        if record.rec_type == 'revisit':
            mime = 'warc/revisit'
            status = '-'
+        elif record.rec_type == 'request':
+            mime = record.status_headers.get_header('Content-Type')
+            mime = self._extract_mime(mime, '-')
+            status = '-'
        else:
            mime = record.status_headers.get_header('Content-Type')
            mime = self._extract_mime(mime)
+            status = self._extract_status(record.status_headers)
 
        if digest and digest.startswith('sha1:'):
            digest = digest[len('sha1:'):]
@@ -225,14 +264,14 @@ class ArchiveIndexer(object):
 
     MIME_RE = re.compile('[; ]')
 
-    def _extract_mime(self, mime):
+    def _extract_mime(self, mime, def_mime='unk'):
         """ Utility function to extract mimetype only
         from a full content type, removing charset settings
         """
         if mime:
             mime = self.MIME_RE.split(mime, 1)[0]
         if not mime:
-            mime = 'unk'
+            mime = def_mime
         return mime
 
     def _extract_status(self, status_headers):
@@ -256,17 +295,27 @@ class ArchiveIndexer(object):
             digester.update(b)
         return num
 
+    def include_record(self, type_):
+        return self.include_all or (type_ in self.DEFAULT_REC_TYPES)
+
+    def add_post_query(self, fields, post_query):
+        url = append_post_query(fields[2], post_query)
+        fields[0] = canonicalize(url, self.surt_ordered)
+        return fields
+
+
 #=================================================================
 class CDXWriter(object):
     def __init__(self, out):
         self.out = out
+        self.indexer = None
 
     def start(self):
         self.out.write(' CDX N b a m s k r M S V g\n')
 
-    def write(self, line):
-        self.out.write(' '.join(line) + '\n')
+    def write(self, line, rec_type, *args):
+        if not self.indexer or self.indexer.include_record(rec_type):
+            self.out.write(' '.join(line) + '\n')
 
     def end(self):
         pass
@@ -278,14 +327,66 @@ class SortedCDXWriter(CDXWriter):
         super(SortedCDXWriter, self).__init__(out)
         self.sortlist = []
 
-    def write(self, line):
-        line = ' '.join(line) + '\n'
-        insort(self.sortlist, line)
+    def write(self, line, rec_type, *args):
+        if not self.indexer or self.indexer.include_record(rec_type):
+            line = ' '.join(line) + '\n'
+            insort(self.sortlist, line)
 
     def end(self):
         self.out.write(''.join(self.sortlist))
 
 
+#=================================================================
+class PostResolveWriter(CDXWriter):
+    def __init__(self, writer, indexer):
+        self.writer = writer
+        self.indexer = indexer
+        self.prev_line = None
+        self.prev_post_query = None
+        self.prev_type = None
+
+    def start(self):
+        self.writer.start()
+
+    def write(self, line, rec_type, post_query):
+        if not self.prev_line:
+            self.prev_line = line
+            self.prev_post_query = post_query
+            self.prev_type = rec_type
+            return
+
+        #cdx original field
+        if self.prev_line[2] != line[2]:
+            self.writer.write(self.prev_line, self.prev_type)
+            self.prev_line = line
+            self.prev_post_query = post_query
+            return
+
+        if self.prev_post_query or post_query:
+            if self.prev_post_query:
+                self.indexer.add_post_query(line, self.prev_post_query)
+            else:
+                self.indexer.add_post_query(line, post_query)
+
+            # update prev url key too
+            self.prev_line[0] = line[0]
+
+        # write both lines
+        self.writer.write(self.prev_line, self.prev_type)
+        self.writer.write(line, rec_type)
+
+        # flush any cached lines
+        self.prev_line = None
+        self.prev_post_query = None
+        self.prev_type = None
+
+    def end(self):
+        if self.prev_line:
+            self.writer.write(self.prev_line, self.prev_type)
+
+        self.writer.end()
+
+
 #=================================================================
 class MultiFileMixin(object):
     def start_all(self):
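
PostResolveWriter buffers one CDX line at a time; when the next line has the same original URL (field 2), the pair is treated as the response and request records of a single capture. Whichever side carried a form body supplies the post query, add_post_query() rewrites the url key (field 0), the new key is copied onto the buffered line, and both lines are written. A compressed illustration with hand-written field lists (only urlkey, timestamp and original shown; a sketch, not commit code):

# fields: [urlkey, timestamp, original, ...] as produced by the indexer
response = ['org,httpbin)/post', '20140610000859', 'http://httpbin.org/post']
request  = ['org,httpbin)/post', '20140610000859', 'http://httpbin.org/post']

post_query = 'foo=bar&test=abc'   # extracted from the 'request' record

# add_post_query() appends the form query to the original URL and
# re-canonicalizes it, and the same key is copied to the buffered
# response line, so both rows end up sharing:
request[0] = response[0] = 'org,httpbin)/post?&&&foo=bar&test=abc'
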
@@ -323,7 +424,8 @@ def iter_file_or_dir(inputs):
             yield os.path.join(input_, filename), filename
 
 
-def index_to_file(inputs, output, sort, surt_ordered):
+def index_to_file(inputs, output, sort,
+                  surt_ordered, include_all, append_post_query):
     if output == '-':
         outfile = sys.stdout
     else:
@@ -343,7 +445,9 @@ def index_to_file(inputs, output, sort, surt_ordered):
             ArchiveIndexer(fileobj=infile,
                            filename=filename,
                            writer=writer,
-                           surt_ordered=surt_ordered).make_index()
+                           surt_ordered=surt_ordered,
+                           append_post_query=append_post_query,
+                           include_all=include_all).make_index()
     finally:
         writer.end_all()
         if infile:
@@ -363,7 +467,8 @@ def cdx_filename(filename):
     return remove_ext(filename) + '.cdx'
 
 
-def index_to_dir(inputs, output, sort, surt_ordered):
+def index_to_dir(inputs, output, sort,
+                 surt_ordered, include_all, append_post_query):
     for fullpath, filename in iter_file_or_dir(inputs):
 
         outpath = cdx_filename(filename)
@@ -375,7 +480,9 @@ def index_to_dir(inputs, output, sort, surt_ordered):
                        filename=filename,
                        sort=sort,
                        out=outfile,
-                       surt_ordered=surt_ordered).make_index()
+                       surt_ordered=surt_ordered,
+                       append_post_query=append_post_query,
+                       include_all=include_all).make_index()
 
 
 def main(args=None):
@@ -418,6 +525,13 @@ Not-recommended for new cdx, use only for backwards-compatibility.
 - If directory, all archive files from that directory are read
 """
 
+    allrecords_help = """include all records.
+currently includes the 'request' records in addition to all
+response records"""
+
+    post_append_help = """for POST requests, append
+form query to url key. (Only applies to form url encoded posts)"""
+
     parser = ArgumentParser(description=description,
                             epilog=epilog,
                             formatter_class=RawTextHelpFormatter)
@@ -426,18 +540,28 @@ Not-recommended for new cdx, use only for backwards-compatibility.
                         action='store_true',
                         help=sort_help)
 
+    parser.add_argument('-a', '--allrecords',
+                        action='store_true',
+                        help=allrecords_help)
+
+    parser.add_argument('-p', '--postappend',
+                        action='store_true',
+                        help=post_append_help)
+
     parser.add_argument('-u', '--unsurt',
                         action='store_true',
                         help=unsurt_help)
 
-    parser.add_argument('output', help=output_help)
+    parser.add_argument('output', nargs='?', default='-', help=output_help)
     parser.add_argument('inputs', nargs='+', help=input_help)
 
     cmd = parser.parse_args(args=args)
     if cmd.output != '-' and os.path.isdir(cmd.output):
-        index_to_dir(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
+        index_to_dir(cmd.inputs, cmd.output, cmd.sort,
+                     not cmd.unsurt, cmd.allrecords, cmd.postappend)
     else:
-        index_to_file(cmd.inputs, cmd.output, cmd.sort, not cmd.unsurt)
+        index_to_file(cmd.inputs, cmd.output, cmd.sort,
+                      not cmd.unsurt, cmd.allrecords, cmd.postappend)
 
 
 if __name__ == '__main__':
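
With the new arguments in place, the indexer CLI can be exercised like this (a sketch; the import path is an assumption, since this diff does not show the indexer's file name):

from pywb.warc.archiveindexer import main  # assumed module path

# 'output' is now optional and defaults to '-' (stdout)
main(['sample_archive/warcs/post-test.warc.gz'])

# -a / --allrecords: also index 'request' records
# -p / --postappend: append form POST queries to the url key
main(['-a', '-p', '-', 'sample_archive/warcs/post-test.warc.gz'])
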
@@ -39,7 +39,15 @@ class ArcWarcRecordLoader:
     ARC_HEADERS = ["uri", "ip-address", "archive-date",
                    "content-type", "length"]
 
-    def __init__(self, loader=None, cookie_maker=None, block_size=8192):
+    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
+
+    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
+
+    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
+                  'OPTIONS', 'CONNECT', 'PATCH']
+
+    def __init__(self, loader=None, cookie_maker=None, block_size=8192,
+                 parse_request=False):
         if not loader:
             loader = BlockLoader(cookie_maker)
 
@@ -48,9 +56,13 @@ class ArcWarcRecordLoader:
 
         self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
 
-        warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
-        self.warc_parser = StatusAndHeadersParser(warc_types)
-        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
+        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
+        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)
+
+        self.parse_request = parse_request
+        if self.parse_request:
+            self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
+
 
     def load(self, url, offset, length):
         """ Load a single record from given url at offset with length
@@ -126,11 +138,16 @@ class ArcWarcRecordLoader:
             status_headers = StatusAndHeaders('200 OK', content_type)
 
         elif (rec_type == 'warcinfo' or
-              rec_type == 'arc_header' or
-              rec_type == 'request'):
+              rec_type == 'arc_header'):
             # not parsing these for now
             status_headers = StatusAndHeaders('204 No Content', [])
 
+        elif (rec_type == 'request'):
+            if self.parse_request:
+                status_headers = self.http_req_parser.parse(stream)
+            else:
+                status_headers = StatusAndHeaders('204 No Content', [])
+
         # special case: http 0.9 response, no status or headers
         #elif rec_type == 'response':
         #    content_type = rec_headers.get_header('Content-Type')
@@ -8,6 +8,15 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
 
+# warc.gz -- parse all
+>>> print_cdx_index('example.warc.gz', include_all=True)
+ CDX N b a m s k r M S V g
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+com,example)/?example=1 20140103030321 http://example.com?example=1 - - - - - 488 1376 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+
 # warc
 >>> print_cdx_index('example.warc')
  CDX N b a m s k r M S V g
@@ -40,6 +49,45 @@ com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7X
 com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
 com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc
 
+
+# POST request tests
+#=================================================================
+# no post append, no requests
+>>> print_cdx_index('post-test.warc.gz')
+ CDX N b a m s k r M S V g
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+
+# post append
+>>> print_cdx_index('post-test.warc.gz', append_post_query=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+
+# no post append, requests included
+>>> print_cdx_index('post-test.warc.gz', include_all=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
+
+# post append + requests included
+>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post_query=True)
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
+
+
 # Test CLI interface -- (check for num lines)
 #=================================================================
 
|
|||||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||||
201
|
204
|
||||||
|
|
||||||
# test writing to stdout
|
# test writing to stdout
|
||||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||||
@@ -55,6 +103,12 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
 4
 
+# test writing to stdout ('-' omitted)
+>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+4
+
 # test writing to temp dir
 >>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
 example.cdx
@@ -86,19 +140,22 @@ def read_fully(cdx):
         curr.write(b)
     return curr.getvalue()
 
-def cdx_index(warc, sort=False):
+def cdx_index(warc, sort=False,
+              include_all=False, append_post_query=False):
     buff = BytesIO()
     with open(TEST_WARC_DIR + warc) as fh:
         indexer = ArchiveIndexer(fh, warc,
                                  out=buff,
-                                 sort=sort)
+                                 sort=sort,
+                                 include_all=include_all,
+                                 append_post_query=append_post_query)
 
         indexer.make_index()
 
     return buff.getvalue()
 
-def print_cdx_index(warc, sort=False):
-    sys.stdout.write(cdx_index(warc, sort))
+def print_cdx_index(*args, **kwargs):
+    sys.stdout.write(cdx_index(*args, **kwargs))
 
 def assert_cdx_match(cdx, warc, sort=False):
     assert read_fully(cdx) == cdx_index(warc, sort)
@@ -54,6 +54,28 @@ Test loading different types of records from a variety of formats
   ('Content-Length', '1270'),
   ('Connection', 'close')]))
 
+# request parsing
+>>> load_test_archive('example.warc.gz', '1376', '488')
+(('warc', 'request'),
+ StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'request'),
+  ('WARC-Record-ID', '<urn:uuid:9a3ffea5-9556-4790-a6bf-c15231fd6b97>'),
+  ('WARC-Date', '2014-01-03T03:03:21Z'),
+  ('Content-Length', '323'),
+  ('Content-Type', 'application/http; msgtype=request'),
+  ('WARC-Concurrent-To', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
+  ('WARC-Target-URI', 'http://example.com?example=1'),
+  ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
+ StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'),
+  ( 'Accept',
+    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
+  ('Accept-Language', 'en-US,en;q=0.8'),
+  ( 'User-Agent',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'),
+  ('Host', 'example.com')]))
+
+
+ StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []))
+
 
 # Test of record loading based on cdx line
 # Print parsed http headers + 2 lines of content
@@ -308,7 +330,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
 def load_test_archive(test_file, offset, length):
     path = test_warc_dir + test_file
 
-    testloader = ArcWarcRecordLoader()
+    testloader = ArcWarcRecordLoader(parse_request=True)
 
     archive = testloader.load(path, offset, length)
 
@@ -47,6 +47,8 @@ class QueryHandler(object):
         return QueryHandler(cdx_server, html_view, perms_policy)
 
     def load_for_request(self, wbrequest):
+        wbrequest.normalize_post_query()
+
         wb_url = wbrequest.wb_url
 
         # cdx server only supports text and cdxobject for now
@@ -187,7 +187,12 @@ class ReplayView(object):
         new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
                                                           cdx['original'])
 
-        status_headers = StatusAndHeaders('302 Internal Redirect',
+        if wbrequest.method not in ('HEAD', 'GET'):
+            statusline = '307 Same-Method Internal Redirect'
+        else:
+            statusline = '302 Internal Redirect'
+
+        status_headers = StatusAndHeaders(statusline,
                                           [('Location', new_url)])
 
         # don't include cdx to indicate internal redirect
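
This is what the new test_post_1 below exercises: a 302 answer to a POST is typically re-issued by clients as a GET without the body, while 307 requires the client to repeat the same method and body against the timestamped URL. Restating the selection logic (a sketch of the change above, not additional commit code):

def redirect_statusline(method):
    # only safe methods keep the old 302; anything that may carry a body
    # gets a 307 so the client re-sends the POST to the timestamped URL
    if method not in ('HEAD', 'GET'):
        return '307 Same-Method Internal Redirect'
    return '302 Internal Redirect'

assert redirect_statusline('POST') == '307 Same-Method Internal Redirect'
assert redirect_statusline('GET') == '302 Internal Redirect'
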
sample_archive/cdx/post-test.cdx (new file, 4 lines)
@@ -0,0 +1,4 @@
+ CDX N b a m s k r M S V g
+org,httpbin)/post?&&&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
+org,httpbin)/post?&&&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
+org,httpbin)/post?&&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
sample_archive/warcs/post-test.warc.gz (new binary file)
Binary file not shown.
@@ -220,6 +220,16 @@ class TestWb:
         assert resp.status_int == 302
 
+
+    def test_post_1(self):
+        resp = self.testapp.post('/pywb/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
+        assert resp.status_int == 307
+
+        resp = self.testapp.post('/pywb/20140610000859/http://httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
+        assert resp.status_int == 200
+        assert '"foo": "bar"' in resp.body
+        assert '"test": "abc"' in resp.body
+
     def test_excluded_content(self):
         resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
         assert resp.status_int == 403