1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

http OPTIONS and HEAD canonicalization: (#260)

* http OPTIONS canonicalization:
- rename PostQueryExtractor to generic MethodQueryCanonicalizer, handles OPTIONS verb in addition to POST
- use more generic 'query' instead of 'post_query' for method-query canonicalization
- append '__pywb_method=options' to the URL of OPTIONS requests to distinguish them from GET requests in MethodQueryCanonicalizer

* method canon: also add HEAD to __pywb_method query canonicalization
This commit is contained in:
Ilya Kreymer 2017-10-23 17:15:06 -07:00 committed by GitHub
parent 4b60dd5dda
commit 3e9087df3c
5 changed files with 70 additions and 51 deletions

View File

@ -188,7 +188,7 @@ class RewriterApp(object):
inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)
inputreq.include_post_query(wb_url.url)
inputreq.include_method_query(wb_url.url)
mod_url = None
use_206 = False

View File

@ -1,6 +1,6 @@
from pywb.utils.canonicalize import canonicalize
from pywb.warcserver.inputrequest import PostQueryExtractor
from pywb.warcserver.inputrequest import MethodQueryCanonicalizer
from pywb.utils.io import BUFF_SIZE
from warcio.timeutils import iso_date_to_timestamp
@ -69,7 +69,7 @@ class ArchiveIndexEntryMixin(object):
# merge POST/PUT body query
post_query = other.get('_post_query')
url = self['url']
new_url = post_query.append_post_query(url)
new_url = post_query.append_query(url)
if post_query and new_url != url:
self['urlkey'] = canonicalize(new_url, surt_ordered)
other['urlkey'] = self['urlkey']
@ -181,7 +181,7 @@ class DefaultRecordParser(object):
method = record.http_headers.protocol
len_ = record.http_headers.get_header('Content-Length')
post_query = PostQueryExtractor(method,
post_query = MethodQueryCanonicalizer(method,
entry.get('_content_type'),
len_,
record.raw_stream)

View File

@ -55,7 +55,7 @@ class IndexHandler(object):
input_req = params.get('_input_req')
if input_req:
params['alt_url'] = input_req.include_post_query(url)
params['alt_url'] = input_req.include_method_query(url)
return self.fuzzy(self.index_source, params)

View File

@ -71,22 +71,26 @@ class DirectWSGIInputRequest(object):
def _get_header(self, name):
return self.env.get('HTTP_' + name.upper().replace('-', '_'))
def include_post_query(self, url):
if not url or self.get_req_method() != 'POST':
def include_method_query(self, url):
if not url:
return url
method = self.get_req_method()
if method not in ('OPTIONS', 'HEAD', 'POST'):
return url
mime = self._get_content_type()
#mime = mime.split(';')[0] if mime else ''
length = self._get_content_length()
stream = self.env['wsgi.input']
buffered_stream = BytesIO()
post_query = PostQueryExtractor('POST', mime, length, stream,
buffered_stream=buffered_stream,
environ=self.env)
query = MethodQueryCanonicalizer(method, mime, length, stream,
buffered_stream=buffered_stream,
environ=self.env)
new_url = post_query.append_post_query(url)
new_url = query.append_query(url)
if new_url != url:
self.env['wsgi.input'] = buffered_stream
@ -176,19 +180,26 @@ class POSTInputRequest(DirectWSGIInputRequest):
# ============================================================================
class PostQueryExtractor(object):
class MethodQueryCanonicalizer(object):
def __init__(self, method, mime, length, stream,
buffered_stream=None,
environ=None):
"""
Extract a url-encoded form POST from stream
content length, return None
Append the method for HEAD/OPTIONS as __pywb_method=<method>
For POST requests, extract a url-encoded form from stream
read content length and convert to query params, if possible
Attempt to decode application/x-www-form-urlencoded or multipart/*,
otherwise read whole block and b64encode
"""
self.post_query = b''
self.query = b''
if method.upper() != 'POST':
method = method.upper()
if method in ('OPTIONS', 'HEAD'):
self.query = '__pywb_method=' + method.lower()
return
if method != 'POST':
return
try:
@ -199,7 +210,7 @@ class PostQueryExtractor(object):
if length <= 0:
return
post_query = b''
query = b''
while length > 0:
buff = stream.read(length)
@ -208,25 +219,25 @@ class PostQueryExtractor(object):
if not buff:
break
post_query += buff
query += buff
if buffered_stream:
buffered_stream.write(post_query)
buffered_stream.write(query)
buffered_stream.seek(0)
if not mime:
mime = ''
if mime.startswith('application/x-www-form-urlencoded'):
post_query = to_native_str(post_query)
post_query = unquote_plus(post_query)
query = to_native_str(query)
query = unquote_plus(query)
elif mime.startswith('multipart/'):
env = {'REQUEST_METHOD': 'POST',
'CONTENT_TYPE': mime,
'CONTENT_LENGTH': len(post_query)}
'CONTENT_LENGTH': len(query)}
args = dict(fp=BytesIO(post_query),
args = dict(fp=BytesIO(query),
environ=env,
keep_blank_values=True)
@ -239,17 +250,17 @@ class PostQueryExtractor(object):
for item in data.list:
values.append((item.name, item.value))
post_query = urlencode(values, True)
query = urlencode(values, True)
elif mime.startswith('application/x-amf'):
post_query = self.amf_parse(post_query, environ)
query = self.amf_parse(query, environ)
else:
post_query = base64.b64encode(post_query)
post_query = to_native_str(post_query)
post_query = '__wb_post_data=' + post_query
query = base64.b64encode(query)
query = to_native_str(query)
query = '__wb_post_data=' + query
self.post_query = post_query
self.query = query
def amf_parse(self, string, environ):
try:
@ -284,8 +295,8 @@ class PostQueryExtractor(object):
print(e)
return None
def append_post_query(self, url):
if not self.post_query:
def append_query(self, url):
if not self.query:
return url
if '?' not in url:
@ -293,6 +304,6 @@ class PostQueryExtractor(object):
else:
url += '&'
url += self.post_query
url += self.query
return url

View File

@ -1,4 +1,4 @@
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, PostQueryExtractor
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, MethodQueryCanonicalizer
from werkzeug.routing import Map, Rule
import webtest
@ -84,54 +84,62 @@ class TestPostQueryExtract(object):
cls.post_data = b'foo=bar&dir=%2Fbaz'
def test_post_extract_1(self):
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
len(self.post_data), BytesIO(self.post_data))
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
assert pq.append_post_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'
assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'
def test_post_extract_wrong_method(self):
pq = PostQueryExtractor('PUT', 'application/x-www-form-urlencoded',
mq = MethodQueryCanonicalizer('PUT', 'application/x-www-form-urlencoded',
len(self.post_data), BytesIO(self.post_data))
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
assert mq.append_query('http://example.com/') == 'http://example.com/'
def test_post_extract_non_form_data_1(self):
pq = PostQueryExtractor('POST', 'application/octet-stream',
mq = MethodQueryCanonicalizer('POST', 'application/octet-stream',
len(self.post_data), BytesIO(self.post_data))
#base64 encoded data
assert pq.append_post_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
def test_post_extract_non_form_data_2(self):
pq = PostQueryExtractor('POST', 'text/plain',
mq = MethodQueryCanonicalizer('POST', 'text/plain',
len(self.post_data), BytesIO(self.post_data))
#base64 encoded data
assert pq.append_post_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
def test_post_extract_length_invalid_ignore(self):
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
0, BytesIO(self.post_data))
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
assert mq.append_query('http://example.com/') == 'http://example.com/'
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
'abc', BytesIO(self.post_data))
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
assert mq.append_query('http://example.com/') == 'http://example.com/'
def test_post_extract_length_too_short(self):
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
len(self.post_data) - 4, BytesIO(self.post_data))
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'
def test_post_extract_length_too_long(self):
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
len(self.post_data) + 4, BytesIO(self.post_data))
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
def test_options(self):
mq = MethodQueryCanonicalizer('OPTIONS', '', 0, BytesIO())
assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=options'
def test_head(self):
mq = MethodQueryCanonicalizer('HEAD', '', 0, BytesIO())
assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=head'