mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
http OPTIONS and HEAD canonicalization: (#260)
* http OPTIONS canonicalization:
  - rename PostQueryExtractor to the generic MethodQueryCanonicalizer, which handles the OPTIONS verb in addition to POST
  - use the more generic 'query' instead of 'post_query' for method-query canonicalization
  - append '__pywb_method=options' to OPTIONS responses to distinguish them from GET in MethodQueryCanonicalizer
* method canon: also add HEAD to the __pywb_method query canonicalization
This commit is contained in:
parent
4b60dd5dda
commit
3e9087df3c
@ -188,7 +188,7 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)
|
inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)
|
||||||
|
|
||||||
inputreq.include_post_query(wb_url.url)
|
inputreq.include_method_query(wb_url.url)
|
||||||
|
|
||||||
mod_url = None
|
mod_url = None
|
||||||
use_206 = False
|
use_206 = False
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
|
||||||
from pywb.warcserver.inputrequest import PostQueryExtractor
|
from pywb.warcserver.inputrequest import MethodQueryCanonicalizer
|
||||||
from pywb.utils.io import BUFF_SIZE
|
from pywb.utils.io import BUFF_SIZE
|
||||||
|
|
||||||
from warcio.timeutils import iso_date_to_timestamp
|
from warcio.timeutils import iso_date_to_timestamp
|
||||||
@ -69,7 +69,7 @@ class ArchiveIndexEntryMixin(object):
|
|||||||
# merge POST/PUT body query
|
# merge POST/PUT body query
|
||||||
post_query = other.get('_post_query')
|
post_query = other.get('_post_query')
|
||||||
url = self['url']
|
url = self['url']
|
||||||
new_url = post_query.append_post_query(url)
|
new_url = post_query.append_query(url)
|
||||||
if post_query and new_url != url:
|
if post_query and new_url != url:
|
||||||
self['urlkey'] = canonicalize(new_url, surt_ordered)
|
self['urlkey'] = canonicalize(new_url, surt_ordered)
|
||||||
other['urlkey'] = self['urlkey']
|
other['urlkey'] = self['urlkey']
|
||||||
@ -181,7 +181,7 @@ class DefaultRecordParser(object):
|
|||||||
method = record.http_headers.protocol
|
method = record.http_headers.protocol
|
||||||
len_ = record.http_headers.get_header('Content-Length')
|
len_ = record.http_headers.get_header('Content-Length')
|
||||||
|
|
||||||
post_query = PostQueryExtractor(method,
|
post_query = MethodQueryCanonicalizer(method,
|
||||||
entry.get('_content_type'),
|
entry.get('_content_type'),
|
||||||
len_,
|
len_,
|
||||||
record.raw_stream)
|
record.raw_stream)
|
||||||
|
@ -55,7 +55,7 @@ class IndexHandler(object):
|
|||||||
|
|
||||||
input_req = params.get('_input_req')
|
input_req = params.get('_input_req')
|
||||||
if input_req:
|
if input_req:
|
||||||
params['alt_url'] = input_req.include_post_query(url)
|
params['alt_url'] = input_req.include_method_query(url)
|
||||||
|
|
||||||
return self.fuzzy(self.index_source, params)
|
return self.fuzzy(self.index_source, params)
|
||||||
|
|
||||||
|
@ -71,22 +71,26 @@ class DirectWSGIInputRequest(object):
|
|||||||
def _get_header(self, name):
|
def _get_header(self, name):
|
||||||
return self.env.get('HTTP_' + name.upper().replace('-', '_'))
|
return self.env.get('HTTP_' + name.upper().replace('-', '_'))
|
||||||
|
|
||||||
def include_post_query(self, url):
|
def include_method_query(self, url):
|
||||||
if not url or self.get_req_method() != 'POST':
|
if not url:
|
||||||
|
return url
|
||||||
|
|
||||||
|
method = self.get_req_method()
|
||||||
|
|
||||||
|
if method not in ('OPTIONS', 'HEAD', 'POST'):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
mime = self._get_content_type()
|
mime = self._get_content_type()
|
||||||
#mime = mime.split(';')[0] if mime else ''
|
|
||||||
length = self._get_content_length()
|
length = self._get_content_length()
|
||||||
stream = self.env['wsgi.input']
|
stream = self.env['wsgi.input']
|
||||||
|
|
||||||
buffered_stream = BytesIO()
|
buffered_stream = BytesIO()
|
||||||
|
|
||||||
post_query = PostQueryExtractor('POST', mime, length, stream,
|
query = MethodQueryCanonicalizer(method, mime, length, stream,
|
||||||
buffered_stream=buffered_stream,
|
buffered_stream=buffered_stream,
|
||||||
environ=self.env)
|
environ=self.env)
|
||||||
|
|
||||||
new_url = post_query.append_post_query(url)
|
new_url = query.append_query(url)
|
||||||
if new_url != url:
|
if new_url != url:
|
||||||
self.env['wsgi.input'] = buffered_stream
|
self.env['wsgi.input'] = buffered_stream
|
||||||
|
|
||||||
@ -176,19 +180,26 @@ class POSTInputRequest(DirectWSGIInputRequest):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class PostQueryExtractor(object):
|
class MethodQueryCanonicalizer(object):
|
||||||
def __init__(self, method, mime, length, stream,
|
def __init__(self, method, mime, length, stream,
|
||||||
buffered_stream=None,
|
buffered_stream=None,
|
||||||
environ=None):
|
environ=None):
|
||||||
"""
|
"""
|
||||||
Extract a url-encoded form POST from stream
|
Append the method for HEAD/OPTIONS as __pywb_method=<method>
|
||||||
content length, return None
|
For POST requests, extract a url-encoded form from stream
|
||||||
|
read content length and convert to query params, if possible
|
||||||
Attempt to decode application/x-www-form-urlencoded or multipart/*,
|
Attempt to decode application/x-www-form-urlencoded or multipart/*,
|
||||||
otherwise read whole block and b64encode
|
otherwise read whole block and b64encode
|
||||||
"""
|
"""
|
||||||
self.post_query = b''
|
self.query = b''
|
||||||
|
|
||||||
if method.upper() != 'POST':
|
method = method.upper()
|
||||||
|
|
||||||
|
if method in ('OPTIONS', 'HEAD'):
|
||||||
|
self.query = '__pywb_method=' + method.lower()
|
||||||
|
return
|
||||||
|
|
||||||
|
if method != 'POST':
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -199,7 +210,7 @@ class PostQueryExtractor(object):
|
|||||||
if length <= 0:
|
if length <= 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
post_query = b''
|
query = b''
|
||||||
|
|
||||||
while length > 0:
|
while length > 0:
|
||||||
buff = stream.read(length)
|
buff = stream.read(length)
|
||||||
@ -208,25 +219,25 @@ class PostQueryExtractor(object):
|
|||||||
if not buff:
|
if not buff:
|
||||||
break
|
break
|
||||||
|
|
||||||
post_query += buff
|
query += buff
|
||||||
|
|
||||||
if buffered_stream:
|
if buffered_stream:
|
||||||
buffered_stream.write(post_query)
|
buffered_stream.write(query)
|
||||||
buffered_stream.seek(0)
|
buffered_stream.seek(0)
|
||||||
|
|
||||||
if not mime:
|
if not mime:
|
||||||
mime = ''
|
mime = ''
|
||||||
|
|
||||||
if mime.startswith('application/x-www-form-urlencoded'):
|
if mime.startswith('application/x-www-form-urlencoded'):
|
||||||
post_query = to_native_str(post_query)
|
query = to_native_str(query)
|
||||||
post_query = unquote_plus(post_query)
|
query = unquote_plus(query)
|
||||||
|
|
||||||
elif mime.startswith('multipart/'):
|
elif mime.startswith('multipart/'):
|
||||||
env = {'REQUEST_METHOD': 'POST',
|
env = {'REQUEST_METHOD': 'POST',
|
||||||
'CONTENT_TYPE': mime,
|
'CONTENT_TYPE': mime,
|
||||||
'CONTENT_LENGTH': len(post_query)}
|
'CONTENT_LENGTH': len(query)}
|
||||||
|
|
||||||
args = dict(fp=BytesIO(post_query),
|
args = dict(fp=BytesIO(query),
|
||||||
environ=env,
|
environ=env,
|
||||||
keep_blank_values=True)
|
keep_blank_values=True)
|
||||||
|
|
||||||
@ -239,17 +250,17 @@ class PostQueryExtractor(object):
|
|||||||
for item in data.list:
|
for item in data.list:
|
||||||
values.append((item.name, item.value))
|
values.append((item.name, item.value))
|
||||||
|
|
||||||
post_query = urlencode(values, True)
|
query = urlencode(values, True)
|
||||||
|
|
||||||
elif mime.startswith('application/x-amf'):
|
elif mime.startswith('application/x-amf'):
|
||||||
post_query = self.amf_parse(post_query, environ)
|
query = self.amf_parse(query, environ)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
post_query = base64.b64encode(post_query)
|
query = base64.b64encode(query)
|
||||||
post_query = to_native_str(post_query)
|
query = to_native_str(query)
|
||||||
post_query = '__wb_post_data=' + post_query
|
query = '__wb_post_data=' + query
|
||||||
|
|
||||||
self.post_query = post_query
|
self.query = query
|
||||||
|
|
||||||
def amf_parse(self, string, environ):
|
def amf_parse(self, string, environ):
|
||||||
try:
|
try:
|
||||||
@ -284,8 +295,8 @@ class PostQueryExtractor(object):
|
|||||||
print(e)
|
print(e)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def append_post_query(self, url):
|
def append_query(self, url):
|
||||||
if not self.post_query:
|
if not self.query:
|
||||||
return url
|
return url
|
||||||
|
|
||||||
if '?' not in url:
|
if '?' not in url:
|
||||||
@ -293,6 +304,6 @@ class PostQueryExtractor(object):
|
|||||||
else:
|
else:
|
||||||
url += '&'
|
url += '&'
|
||||||
|
|
||||||
url += self.post_query
|
url += self.query
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, PostQueryExtractor
|
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, MethodQueryCanonicalizer
|
||||||
from werkzeug.routing import Map, Rule
|
from werkzeug.routing import Map, Rule
|
||||||
|
|
||||||
import webtest
|
import webtest
|
||||||
@ -84,54 +84,62 @@ class TestPostQueryExtract(object):
|
|||||||
cls.post_data = b'foo=bar&dir=%2Fbaz'
|
cls.post_data = b'foo=bar&dir=%2Fbaz'
|
||||||
|
|
||||||
def test_post_extract_1(self):
|
def test_post_extract_1(self):
|
||||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
||||||
|
|
||||||
assert pq.append_post_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'
|
assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'
|
||||||
|
|
||||||
def test_post_extract_wrong_method(self):
|
def test_post_extract_wrong_method(self):
|
||||||
pq = PostQueryExtractor('PUT', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('PUT', 'application/x-www-form-urlencoded',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
|
assert mq.append_query('http://example.com/') == 'http://example.com/'
|
||||||
|
|
||||||
def test_post_extract_non_form_data_1(self):
|
def test_post_extract_non_form_data_1(self):
|
||||||
pq = PostQueryExtractor('POST', 'application/octet-stream',
|
mq = MethodQueryCanonicalizer('POST', 'application/octet-stream',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
#base64 encoded data
|
#base64 encoded data
|
||||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||||
|
|
||||||
def test_post_extract_non_form_data_2(self):
|
def test_post_extract_non_form_data_2(self):
|
||||||
pq = PostQueryExtractor('POST', 'text/plain',
|
mq = MethodQueryCanonicalizer('POST', 'text/plain',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
#base64 encoded data
|
#base64 encoded data
|
||||||
assert pq.append_post_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||||
|
|
||||||
def test_post_extract_length_invalid_ignore(self):
|
def test_post_extract_length_invalid_ignore(self):
|
||||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
0, BytesIO(self.post_data))
|
0, BytesIO(self.post_data))
|
||||||
|
|
||||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
|
assert mq.append_query('http://example.com/') == 'http://example.com/'
|
||||||
|
|
||||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
'abc', BytesIO(self.post_data))
|
'abc', BytesIO(self.post_data))
|
||||||
|
|
||||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/'
|
assert mq.append_query('http://example.com/') == 'http://example.com/'
|
||||||
|
|
||||||
def test_post_extract_length_too_short(self):
|
def test_post_extract_length_too_short(self):
|
||||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
len(self.post_data) - 4, BytesIO(self.post_data))
|
len(self.post_data) - 4, BytesIO(self.post_data))
|
||||||
|
|
||||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'
|
||||||
|
|
||||||
def test_post_extract_length_too_long(self):
|
def test_post_extract_length_too_long(self):
|
||||||
pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
len(self.post_data) + 4, BytesIO(self.post_data))
|
len(self.post_data) + 4, BytesIO(self.post_data))
|
||||||
|
|
||||||
assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
||||||
|
|
||||||
|
def test_options(self):
|
||||||
|
mq = MethodQueryCanonicalizer('OPTIONS', '', 0, BytesIO())
|
||||||
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=options'
|
||||||
|
|
||||||
|
def test_head(self):
|
||||||
|
mq = MethodQueryCanonicalizer('HEAD', '', 0, BytesIO())
|
||||||
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=head'
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user