mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
http OPTIONS and HEAD canonicalization: (#260)
* http OPTIONS canonicalization:
  - rename PostQueryExtractor to generic MethodQueryCanonicalizer, which handles the OPTIONS verb in addition to POST
  - use the more generic 'query' instead of 'post_query' for method-query canonicalization
  - append '__pywb_method=options' to OPTIONS responses to distinguish them from GET in MethodQueryCanonicalizer
* method canon: also add HEAD to __pywb_method query canonicalization
This commit is contained in:
parent
4b60dd5dda
commit
3e9087df3c
@ -188,7 +188,7 @@ class RewriterApp(object):
|
||||
|
||||
inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)
|
||||
|
||||
inputreq.include_post_query(wb_url.url)
|
||||
inputreq.include_method_query(wb_url.url)
|
||||
|
||||
mod_url = None
|
||||
use_206 = False
|
||||
|
@ -1,6 +1,6 @@
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
from pywb.warcserver.inputrequest import PostQueryExtractor
|
||||
from pywb.warcserver.inputrequest import MethodQueryCanonicalizer
|
||||
from pywb.utils.io import BUFF_SIZE
|
||||
|
||||
from warcio.timeutils import iso_date_to_timestamp
|
||||
@ -69,7 +69,7 @@ class ArchiveIndexEntryMixin(object):
|
||||
# merge POST/PUT body query
|
||||
post_query = other.get('_post_query')
|
||||
url = self['url']
|
||||
new_url = post_query.append_post_query(url)
|
||||
new_url = post_query.append_query(url)
|
||||
if post_query and new_url != url:
|
||||
self['urlkey'] = canonicalize(new_url, surt_ordered)
|
||||
other['urlkey'] = self['urlkey']
|
||||
@ -181,7 +181,7 @@ class DefaultRecordParser(object):
|
||||
method = record.http_headers.protocol
|
||||
len_ = record.http_headers.get_header('Content-Length')
|
||||
|
||||
post_query = PostQueryExtractor(method,
|
||||
post_query = MethodQueryCanonicalizer(method,
|
||||
entry.get('_content_type'),
|
||||
len_,
|
||||
record.raw_stream)
|
||||
|
@ -55,7 +55,7 @@ class IndexHandler(object):
|
||||
|
||||
input_req = params.get('_input_req')
|
||||
if input_req:
|
||||
params['alt_url'] = input_req.include_post_query(url)
|
||||
params['alt_url'] = input_req.include_method_query(url)
|
||||
|
||||
return self.fuzzy(self.index_source, params)
|
||||
|
||||
|
@ -71,22 +71,26 @@ class DirectWSGIInputRequest(object):
|
||||
def _get_header(self, name):
|
||||
return self.env.get('HTTP_' + name.upper().replace('-', '_'))
|
||||
|
||||
def include_post_query(self, url):
|
||||
if not url or self.get_req_method() != 'POST':
|
||||
def include_method_query(self, url):
|
||||
if not url:
|
||||
return url
|
||||
|
||||
method = self.get_req_method()
|
||||
|
||||
if method not in ('OPTIONS', 'HEAD', 'POST'):
|
||||
return url
|
||||
|
||||
mime = self._get_content_type()
|
||||
#mime = mime.split(';')[0] if mime else ''
|
||||
length = self._get_content_length()
|
||||
stream = self.env['wsgi.input']
|
||||
|
||||
buffered_stream = BytesIO()
|
||||
|
||||
post_query = PostQueryExtractor('POST', mime, length, stream,
|
||||
buffered_stream=buffered_stream,
|
||||
environ=self.env)
|
||||
query = MethodQueryCanonicalizer(method, mime, length, stream,
|
||||
buffered_stream=buffered_stream,
|
||||
environ=self.env)
|
||||
|
||||
new_url = post_query.append_post_query(url)
|
||||
new_url = query.append_query(url)
|
||||
if new_url != url:
|
||||
self.env['wsgi.input'] = buffered_stream
|
||||
|
||||
@ -176,19 +180,26 @@ class POSTInputRequest(DirectWSGIInputRequest):
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class PostQueryExtractor(object):
|
||||
class MethodQueryCanonicalizer(object):
|
||||
def __init__(self, method, mime, length, stream,
|
||||
buffered_stream=None,
|
||||
environ=None):
|
||||
"""
|
||||
Extract a url-encoded form POST from stream
|
||||
content length, return None
|
||||
Append the method for HEAD/OPTIONS as __pywb_method=<method>
|
||||
For POST requests, requests extract a url-encoded form from stream
|
||||
read content length and convert to query params, if possible
|
||||
Attempt to decode application/x-www-form-urlencoded or multipart/*,
|
||||
otherwise read whole block and b64encode
|
||||
"""
|
||||
self.post_query = b''
|
||||
self.query = b''
|
||||
|
||||
if method.upper() != 'POST':
|
||||
method = method.upper()
|
||||
|
||||
if method in ('OPTIONS', 'HEAD'):
|
||||
self.query = '__pywb_method=' + method.lower()
|
||||
return
|
||||
|
||||
if method != 'POST':
|
||||
return
|
||||
|
||||
try:
|
||||
@ -199,7 +210,7 @@ class PostQueryExtractor(object):
|
||||
if length <= 0:
|
||||
return
|
||||
|
||||
post_query = b''
|
||||
query = b''
|
||||
|
||||
while length > 0:
|
||||
buff = stream.read(length)
|
||||
@ -208,25 +219,25 @@ class PostQueryExtractor(object):
|
||||
if not buff:
|
||||
break
|
||||
|
||||
post_query += buff
|
||||
query += buff
|
||||
|
||||
if buffered_stream:
|
||||
buffered_stream.write(post_query)
|
||||
buffered_stream.write(query)
|
||||
buffered_stream.seek(0)
|
||||
|
||||
if not mime:
|
||||
mime = ''
|
||||
|
||||
if mime.startswith('application/x-www-form-urlencoded'):
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = unquote_plus(post_query)
|
||||
query = to_native_str(query)
|
||||
query = unquote_plus(query)
|
||||
|
||||
elif mime.startswith('multipart/'):
|
||||
env = {'REQUEST_METHOD': 'POST',
|
||||
'CONTENT_TYPE': mime,
|
||||
'CONTENT_LENGTH': len(post_query)}
|
||||
'CONTENT_LENGTH': len(query)}
|
||||
|
||||
args = dict(fp=BytesIO(post_query),
|
||||
args = dict(fp=BytesIO(query),
|
||||
environ=env,
|
||||
keep_blank_values=True)
|
||||
|
||||
@ -239,17 +250,17 @@ class PostQueryExtractor(object):
|
||||
for item in data.list:
|
||||
values.append((item.name, item.value))
|
||||
|
||||
post_query = urlencode(values, True)
|
||||
query = urlencode(values, True)
|
||||
|
||||
elif mime.startswith('application/x-amf'):
|
||||
post_query = self.amf_parse(post_query, environ)
|
||||
query = self.amf_parse(query, environ)
|
||||
|
||||
else:
|
||||
post_query = base64.b64encode(post_query)
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = '__wb_post_data=' + post_query
|
||||
query = base64.b64encode(query)
|
||||
query = to_native_str(query)
|
||||
query = '__wb_post_data=' + query
|
||||
|
||||
self.post_query = post_query
|
||||
self.query = query
|
||||
|
||||
def amf_parse(self, string, environ):
|
||||
try:
|
||||
@ -284,8 +295,8 @@ class PostQueryExtractor(object):
|
||||
print(e)
|
||||
return None
|
||||
|
||||
def append_post_query(self, url):
|
||||
if not self.post_query:
|
||||
def append_query(self, url):
|
||||
if not self.query:
|
||||
return url
|
||||
|
||||
if '?' not in url:
|
||||
@ -293,6 +304,6 @@ class PostQueryExtractor(object):
|
||||
else:
|
||||
url += '&'
|
||||
|
||||
url += self.post_query
|
||||
url += self.query
|
||||
return url
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, PostQueryExtractor
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, MethodQueryCanonicalizer
|
||||
from werkzeug.routing import Map, Rule
|
||||
|
||||
import webtest
|
||||
@ -84,54 +84,62 @@ class TestPostQueryExtract(object):
|
||||
cls.post_data = b'foo=bar&dir=%2Fbaz'
|
||||
|
||||
def test_post_extract_1(self):
|
||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
||||
|
||||
assert pq.append_post_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'
|
||||
assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'
|
||||
|
||||
def test_post_extract_wrong_method(self):
|
||||
pq = PostQueryExtractor('PUT', 'application/x-www-form-urlencoded',
|
||||
mq = MethodQueryCanonicalizer('PUT', 'application/x-www-form-urlencoded',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/'
|
||||
|
||||
def test_post_extract_non_form_data_1(self):
|
||||
pq = PostQueryExtractor('POST', 'application/octet-stream',
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/octet-stream',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
#base64 encoded data
|
||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
|
||||
def test_post_extract_non_form_data_2(self):
|
||||
pq = PostQueryExtractor('POST', 'text/plain',
|
||||
mq = MethodQueryCanonicalizer('POST', 'text/plain',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
#base64 encoded data
|
||||
assert pq.append_post_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
|
||||
def test_post_extract_length_invalid_ignore(self):
|
||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
0, BytesIO(self.post_data))
|
||||
|
||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/'
|
||||
|
||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
'abc', BytesIO(self.post_data))
|
||||
|
||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/'
|
||||
|
||||
def test_post_extract_length_too_short(self):
|
||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
len(self.post_data) - 4, BytesIO(self.post_data))
|
||||
|
||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'
|
||||
|
||||
def test_post_extract_length_too_long(self):
|
||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
len(self.post_data) + 4, BytesIO(self.post_data))
|
||||
|
||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
||||
|
||||
def test_options(self):
|
||||
mq = MethodQueryCanonicalizer('OPTIONS', '', 0, BytesIO())
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=options'
|
||||
|
||||
def test_head(self):
|
||||
mq = MethodQueryCanonicalizer('HEAD', '', 0, BytesIO())
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=head'
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user