From 3e9087df3c1948f006689bd1db7cacd1e62d3c85 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 23 Oct 2017 17:15:06 -0700 Subject: [PATCH] http OPTIONS and HEAD canonicalization: (#260) * http OPTIONS canonicalization: - rename PostQueryExtractor to generic MethodQueryCanonicalizer, handles OPTIONS verb in addition to POST - use more generic 'query' instead of 'post_query' for method-query canonicalization - append '__pywb_method=options' to OPTIONS responses to distinguish from get in MethodQueryCanonicalizer * method canon: also add HEAD to __pywb_method query canonicalization --- pywb/apps/rewriterapp.py | 2 +- pywb/indexer/archiveindexer.py | 6 +-- pywb/warcserver/handlers.py | 2 +- pywb/warcserver/inputrequest.py | 67 ++++++++++++++++----------- pywb/warcserver/test/test_inputreq.py | 44 +++++++++++------- 5 files changed, 70 insertions(+), 51 deletions(-) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index d7829081..f199be68 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -188,7 +188,7 @@ class RewriterApp(object): inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw) - inputreq.include_post_query(wb_url.url) + inputreq.include_method_query(wb_url.url) mod_url = None use_206 = False diff --git a/pywb/indexer/archiveindexer.py b/pywb/indexer/archiveindexer.py index b3ff8f35..8c2fbbc0 100644 --- a/pywb/indexer/archiveindexer.py +++ b/pywb/indexer/archiveindexer.py @@ -1,6 +1,6 @@ from pywb.utils.canonicalize import canonicalize -from pywb.warcserver.inputrequest import PostQueryExtractor +from pywb.warcserver.inputrequest import MethodQueryCanonicalizer from pywb.utils.io import BUFF_SIZE from warcio.timeutils import iso_date_to_timestamp @@ -69,7 +69,7 @@ class ArchiveIndexEntryMixin(object): # merge POST/PUT body query post_query = other.get('_post_query') url = self['url'] - new_url = post_query.append_post_query(url) + new_url = post_query.append_query(url) if post_query and new_url != url: 
self['urlkey'] = canonicalize(new_url, surt_ordered) other['urlkey'] = self['urlkey'] @@ -181,7 +181,7 @@ class DefaultRecordParser(object): method = record.http_headers.protocol len_ = record.http_headers.get_header('Content-Length') - post_query = PostQueryExtractor(method, + post_query = MethodQueryCanonicalizer(method, entry.get('_content_type'), len_, record.raw_stream) diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 8b5f61e1..9b77df65 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -55,7 +55,7 @@ class IndexHandler(object): input_req = params.get('_input_req') if input_req: - params['alt_url'] = input_req.include_post_query(url) + params['alt_url'] = input_req.include_method_query(url) return self.fuzzy(self.index_source, params) diff --git a/pywb/warcserver/inputrequest.py b/pywb/warcserver/inputrequest.py index 0af5faf1..6b888516 100644 --- a/pywb/warcserver/inputrequest.py +++ b/pywb/warcserver/inputrequest.py @@ -71,22 +71,26 @@ class DirectWSGIInputRequest(object): def _get_header(self, name): return self.env.get('HTTP_' + name.upper().replace('-', '_')) - def include_post_query(self, url): - if not url or self.get_req_method() != 'POST': + def include_method_query(self, url): + if not url: + return url + + method = self.get_req_method() + + if method not in ('OPTIONS', 'HEAD', 'POST'): return url mime = self._get_content_type() - #mime = mime.split(';')[0] if mime else '' length = self._get_content_length() stream = self.env['wsgi.input'] buffered_stream = BytesIO() - post_query = PostQueryExtractor('POST', mime, length, stream, - buffered_stream=buffered_stream, - environ=self.env) + query = MethodQueryCanonicalizer(method, mime, length, stream, + buffered_stream=buffered_stream, + environ=self.env) - new_url = post_query.append_post_query(url) + new_url = query.append_query(url) if new_url != url: self.env['wsgi.input'] = buffered_stream @@ -176,19 +180,26 @@ class 
POSTInputRequest(DirectWSGIInputRequest): # ============================================================================ -class PostQueryExtractor(object): +class MethodQueryCanonicalizer(object): def __init__(self, method, mime, length, stream, buffered_stream=None, environ=None): """ - Extract a url-encoded form POST from stream - content length, return None + Append the method for HEAD/OPTIONS as __pywb_method= + For POST requests, extract a url-encoded form from stream + read content length and convert to query params, if possible Attempt to decode application/x-www-form-urlencoded or multipart/*, otherwise read whole block and b64encode """ - self.post_query = b'' + self.query = b'' - if method.upper() != 'POST': + method = method.upper() + + if method in ('OPTIONS', 'HEAD'): + self.query = '__pywb_method=' + method.lower() + return + + if method != 'POST': return try: @@ -199,7 +210,7 @@ class PostQueryExtractor(object): if length <= 0: return - post_query = b'' + query = b'' while length > 0: buff = stream.read(length) @@ -208,25 +219,25 @@ class PostQueryExtractor(object): if not buff: break - post_query += buff + query += buff if buffered_stream: - buffered_stream.write(post_query) + buffered_stream.write(query) buffered_stream.seek(0) if not mime: mime = '' if mime.startswith('application/x-www-form-urlencoded'): - post_query = to_native_str(post_query) - post_query = unquote_plus(post_query) + query = to_native_str(query) + query = unquote_plus(query) elif mime.startswith('multipart/'): env = {'REQUEST_METHOD': 'POST', 'CONTENT_TYPE': mime, - 'CONTENT_LENGTH': len(post_query)} + 'CONTENT_LENGTH': len(query)} - args = dict(fp=BytesIO(post_query), + args = dict(fp=BytesIO(query), environ=env, keep_blank_values=True) @@ -239,17 +250,17 @@ class PostQueryExtractor(object): for item in data.list: values.append((item.name, item.value)) - post_query = urlencode(values, True) + query = urlencode(values, True) elif mime.startswith('application/x-amf'): -
post_query = self.amf_parse(post_query, environ) + query = self.amf_parse(query, environ) else: - post_query = base64.b64encode(post_query) - post_query = to_native_str(post_query) - post_query = '__wb_post_data=' + post_query + query = base64.b64encode(query) + query = to_native_str(query) + query = '__wb_post_data=' + query - self.post_query = post_query + self.query = query def amf_parse(self, string, environ): try: @@ -284,8 +295,8 @@ class PostQueryExtractor(object): print(e) return None - def append_post_query(self, url): - if not self.post_query: + def append_query(self, url): + if not self.query: return url if '?' not in url: @@ -293,6 +304,6 @@ class PostQueryExtractor(object): else: url += '&' - url += self.post_query + url += self.query return url diff --git a/pywb/warcserver/test/test_inputreq.py b/pywb/warcserver/test/test_inputreq.py index 0857e897..2936e424 100644 --- a/pywb/warcserver/test/test_inputreq.py +++ b/pywb/warcserver/test/test_inputreq.py @@ -1,4 +1,4 @@ -from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, PostQueryExtractor +from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, MethodQueryCanonicalizer from werkzeug.routing import Map, Rule import webtest @@ -84,54 +84,62 @@ class TestPostQueryExtract(object): cls.post_data = b'foo=bar&dir=%2Fbaz' def test_post_extract_1(self): - pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded', len(self.post_data), BytesIO(self.post_data)) - assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz' + assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz' - assert pq.append_post_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz' + assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz' def 
test_post_extract_wrong_method(self): - pq = PostQueryExtractor('PUT', 'application/x-www-form-urlencoded', + mq = MethodQueryCanonicalizer('PUT', 'application/x-www-form-urlencoded', len(self.post_data), BytesIO(self.post_data)) - assert pq.append_post_query('http://example.com/') == 'http://example.com/' + assert mq.append_query('http://example.com/') == 'http://example.com/' def test_post_extract_non_form_data_1(self): - pq = PostQueryExtractor('POST', 'application/octet-stream', + mq = MethodQueryCanonicalizer('POST', 'application/octet-stream', len(self.post_data), BytesIO(self.post_data)) #base64 encoded data - assert pq.append_post_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' + assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' def test_post_extract_non_form_data_2(self): - pq = PostQueryExtractor('POST', 'text/plain', + mq = MethodQueryCanonicalizer('POST', 'text/plain', len(self.post_data), BytesIO(self.post_data)) #base64 encoded data - assert pq.append_post_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' + assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' def test_post_extract_length_invalid_ignore(self): - pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded', 0, BytesIO(self.post_data)) - assert pq.append_post_query('http://example.com/') == 'http://example.com/' + assert mq.append_query('http://example.com/') == 'http://example.com/' - pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(self.post_data)) - assert pq.append_post_query('http://example.com/') == 'http://example.com/' + assert 
mq.append_query('http://example.com/') == 'http://example.com/' def test_post_extract_length_too_short(self): - pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded', len(self.post_data) - 4, BytesIO(self.post_data)) - assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2' + assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2' def test_post_extract_length_too_long(self): - pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded', len(self.post_data) + 4, BytesIO(self.post_data)) - assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz' + assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz' + + def test_options(self): + mq = MethodQueryCanonicalizer('OPTIONS', '', 0, BytesIO()) + assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=options' + + def test_head(self): + mq = MethodQueryCanonicalizer('HEAD', '', 0, BytesIO()) + assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=head'