mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Post request mapping improvements (work on #178), including:
- mapping multipart/form-data the same way as application/x-www-form-urlencoded
- parsing application/x-amf with pyamf
- RewriteContentAMF for rewriting an AMF response to match its request
- default encoding of other POST data as a base64-encoded __wb_post_data param
This commit is contained in:
parent
e5e7c5a7df
commit
87da25c703
@ -184,14 +184,15 @@ class WbRequest(object):
|
||||
if not self.wb_url:
|
||||
return
|
||||
|
||||
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
|
||||
mime = self.env.get('CONTENT_TYPE', '')
|
||||
length = self.env.get('CONTENT_LENGTH')
|
||||
stream = self.env['wsgi.input']
|
||||
|
||||
buffered_stream = BytesIO()
|
||||
|
||||
post_query = extract_post_query('POST', mime, length, stream,
|
||||
buffered_stream=buffered_stream)
|
||||
buffered_stream=buffered_stream,
|
||||
environ=self.env)
|
||||
|
||||
if post_query:
|
||||
self.env['wsgi.input'] = buffered_stream
|
||||
|
@ -5,7 +5,8 @@ import yaml
|
||||
import re
|
||||
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
from io import BytesIO
|
||||
from io import BytesIO, BufferedReader
|
||||
from six.moves import zip
|
||||
|
||||
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
|
||||
|
||||
@ -21,7 +22,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteContent:
|
||||
class RewriteContent(object):
|
||||
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
|
||||
|
||||
TAG_REGEX = re.compile(b'^\s*\<')
|
||||
@ -94,7 +95,7 @@ class RewriteContent:
|
||||
|
||||
def rewrite_content(self, urlrewriter, status_headers, stream,
|
||||
head_insert_func=None, urlkey='',
|
||||
cdx=None, cookie_rewriter=None):
|
||||
cdx=None, cookie_rewriter=None, env=None):
|
||||
|
||||
wb_url = urlrewriter.wburl
|
||||
|
||||
@ -118,9 +119,12 @@ class RewriteContent:
|
||||
|
||||
status_headers = rewritten_headers.status_headers
|
||||
|
||||
# use rewritten headers, but no further rewriting needed
|
||||
if rewritten_headers.text_type is None:
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
res = self.handle_custom_rewrite(rewritten_headers.text_type,
|
||||
status_headers,
|
||||
stream,
|
||||
env)
|
||||
if res:
|
||||
return res
|
||||
|
||||
# Handle text content rewriting
|
||||
# ====================================================================
|
||||
@ -237,6 +241,11 @@ class RewriteContent:
|
||||
|
||||
return (status_headers, gen, True)
|
||||
|
||||
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
    """Short-circuit hook for responses that need no content rewriting.

    Subclasses may override this to transform ``stream`` first.

    :param text_type: rewritable text type of the response, or None
    :param status_headers: already rewritten status and headers
    :param stream: response body stream
    :param env: request environ (unused by this implementation)
    :return: ``(status_headers, generator, False)`` when ``text_type`` is
             None, otherwise None
    """
    if text_type is not None:
        return None

    # headers are already rewritten; the body is passed through untouched
    body_gen = self.stream_to_gen(stream)
    return (status_headers, body_gen, False)
|
||||
|
||||
@staticmethod
|
||||
def _extract_html_charset(buff, status_headers):
|
||||
charset = None
|
||||
@ -360,3 +369,57 @@ class RewriteContent:
|
||||
|
||||
finally:
|
||||
stream.close()
|
||||
|
||||
|
||||
# =================================================================
|
||||
class RewriteContentAMF(RewriteContent):
    """RewriteContent variant that adjusts AMF responses to match the request.

    For responses whose Content-Type is ``application/x-amf`` the body is
    decoded with pyamf, paired with the decoded request stored in
    ``env['pywb.inputdata']`` (when present) and re-encoded.
    """

    def handle_custom_rewrite(self, text_type, status_headers, stream, env):
        """Rewrite an AMF body (if any), then defer to the base class.

        :param text_type: rewritable text type of the response, or None
        :param status_headers: already rewritten status and headers
        :param stream: response body stream
        :param env: request environ, may be None
        :return: the result of ``RewriteContent.handle_custom_rewrite``
        """
        content_type = status_headers.get_header('Content-Type') or ''

        # Compare the bare media type only, so that parameters
        # (e.g. "; charset=...") or letter case do not defeat the match.
        mime = content_type.split(';', 1)[0].strip().lower()
        if mime == 'application/x-amf':
            stream = self.rewrite_amf(stream, env)

        # NOTE(review): the replacement stream is only passed to the base
        # method; if that returns None the caller still holds the original
        # (now consumed) stream -- confirm AMF responses always have
        # text_type None.
        return (super(RewriteContentAMF, self).
                handle_custom_rewrite(text_type, status_headers, stream, env))

    def rewrite_amf(self, stream, env):
        """Return a stream holding the AMF response aligned with the request.

        The whole of ``stream`` is read and decoded. If ``env`` carries a
        decoded request under ``'pywb.inputdata'``, each response body gets
        the target of the matching request body, and its ``correlationId``
        is set to that request's ``messageId``.

        This is best-effort: on any failure the traceback is printed and the
        unmodified payload is returned instead.

        :param stream: response body stream, consumed by this call
        :param env: request environ, may be None
        :return: a stream positioned at the start of the payload
        """
        buffered = BytesIO()

        try:
            from pyamf import remoting

            while True:
                chunk = stream.read()
                if not chunk:
                    break
                buffered.write(chunk)

            buffered.seek(0)
            res = remoting.decode(buffered)

            # env defaults to None in rewrite_content(), so guard before use
            inputdata = env.get('pywb.inputdata') if env else None

            if inputdata:
                new_list = []

                for src, target in zip(inputdata.bodies, res.bodies):
                    target[1].body.correlationId = src[1].body[0].messageId
                    new_list.append((src[0], target[1]))

                res.bodies = new_list

            return BytesIO(remoting.encode(res).getvalue())

        except Exception:
            import traceback
            traceback.print_exc()

            # The source stream may already be consumed at this point, so
            # replay whatever was buffered rather than handing back an
            # exhausted stream; if nothing was read, the original is intact.
            data = buffered.getvalue()
            return BytesIO(data) if data else stream
|
||||
|
||||
|
||||
|
@ -9,10 +9,12 @@ import requests
|
||||
|
||||
import six
|
||||
from six.moves.urllib.request import pathname2url, url2pathname
|
||||
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit
|
||||
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
|
||||
|
||||
import time
|
||||
import pkg_resources
|
||||
import base64
|
||||
import cgi
|
||||
|
||||
from io import open, BytesIO
|
||||
|
||||
@ -65,19 +67,18 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
|
||||
|
||||
|
||||
#=================================================================
|
||||
def extract_post_query(method, mime, length, stream, buffered_stream=None):
|
||||
def extract_post_query(method, mime, length, stream,
|
||||
buffered_stream=None,
|
||||
environ=None):
|
||||
"""
|
||||
Extract a url-encoded form POST from stream
|
||||
If not a application/x-www-form-urlencoded, or no missing
|
||||
content length, return None
|
||||
Attempt to decode application/x-www-form-urlencoded or multipart/*,
|
||||
otherwise read whole block and b64encode
|
||||
"""
|
||||
if method.upper() != 'POST':
|
||||
return None
|
||||
|
||||
if ((not mime or
|
||||
not mime.lower().startswith('application/x-www-form-urlencoded'))):
|
||||
return None
|
||||
|
||||
try:
|
||||
length = int(length)
|
||||
except (ValueError, TypeError):
|
||||
@ -101,11 +102,79 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
|
||||
buffered_stream.write(post_query)
|
||||
buffered_stream.seek(0)
|
||||
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = unquote_plus(post_query)
|
||||
if not mime:
|
||||
mime = ''
|
||||
|
||||
if mime.startswith('application/x-www-form-urlencoded'):
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = unquote_plus(post_query)
|
||||
|
||||
elif mime.startswith('multipart/'):
|
||||
env = {'REQUEST_METHOD': 'POST',
|
||||
'CONTENT_TYPE': mime,
|
||||
'CONTENT_LENGTH': len(post_query)}
|
||||
|
||||
args = dict(fp=BytesIO(post_query),
|
||||
environ=env,
|
||||
keep_blank_values=True)
|
||||
|
||||
if six.PY3:
|
||||
args['encoding'] = 'utf-8'
|
||||
|
||||
data = cgi.FieldStorage(**args)
|
||||
|
||||
values = []
|
||||
for item in data.list:
|
||||
values.append((item.name, item.value))
|
||||
|
||||
post_query = urlencode(values, True)
|
||||
|
||||
elif mime.startswith('application/x-amf'):
|
||||
post_query = amf_parse(post_query, environ)
|
||||
|
||||
else:
|
||||
post_query = base64.b64encode(post_query)
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = '&__wb_post_data=' + post_query
|
||||
|
||||
return post_query
|
||||
|
||||
|
||||
#=================================================================
|
||||
def amf_parse(string, environ):
    """Convert an AMF request body into a url-encoded query string.

    The payload is decoded with pyamf and the ``body``, ``source`` and
    ``operation`` attributes (when present) of
    ``res.bodies[0][1].body[0]`` are emitted as the ``body``, ``source``
    and ``op`` query parameters, always in that order.

    :param string: raw AMF request payload (bytes)
    :param environ: optional dict; when not None the decoded request is
                    stored under ``'pywb.inputdata'``
    :return: the url-encoded query string, or None if the payload could not
             be decoded (the traceback is printed in that case)
    """
    try:
        from pyamf import remoting

        res = remoting.decode(BytesIO(string))

        body = res.bodies[0][1].body[0]

        # Use an ordered list of pairs rather than a dict: the resulting
        # query must be identical on every run, and plain dict ordering is
        # not guaranteed on the older Pythons this code supports.
        fields = (('body', 'body'),
                  ('source', 'source'),
                  ('operation', 'op'))

        values = [(param, getattr(body, attr))
                  for attr, param in fields
                  if hasattr(body, attr)]

        if environ is not None:
            environ['pywb.inputdata'] = res

        return urlencode(values)

    except Exception:
        # best-effort: an unparseable AMF payload yields no post query
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
|
||||
#=================================================================
|
||||
def append_post_query(url, post_query):
|
||||
if not post_query:
|
||||
|
@ -90,8 +90,9 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
|
||||
# unsupported method
|
||||
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
||||
|
||||
# unsupported type
|
||||
# base64 encode
|
||||
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
|
||||
'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
|
||||
# invalid length
|
||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
|
||||
|
@ -253,6 +253,7 @@ class ArchiveIndexEntryMixin(object):
|
||||
self['mime'] = def_mime
|
||||
if mime:
|
||||
self['mime'] = self.MIME_RE.split(mime, 1)[0]
|
||||
self['_content_type'] = mime
|
||||
|
||||
def extract_status(self, status_headers):
|
||||
""" Extract status code only from status line
|
||||
@ -390,7 +391,7 @@ class DefaultRecordParser(object):
|
||||
len_ = record.status_headers.get_header('Content-Length')
|
||||
|
||||
post_query = extract_post_query(method,
|
||||
entry.get('mime'),
|
||||
entry.get('_content_type'),
|
||||
len_,
|
||||
record.stream)
|
||||
|
||||
|
@ -13,7 +13,7 @@ from pywb.utils.timeutils import timestamp_now
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.memento import MementoResponse
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
from pywb.rewrite.rewrite_content import RewriteContentAMF
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
|
||||
from pywb.webapp.views import HeadInsertView
|
||||
@ -40,7 +40,7 @@ class ReplayView(object):
|
||||
self.content_loader = content_loader
|
||||
|
||||
framed = config.get('framed_replay')
|
||||
self.content_rewriter = RewriteContent(is_framed_replay=framed)
|
||||
self.content_rewriter = RewriteContentAMF(is_framed_replay=framed)
|
||||
|
||||
self.head_insert_view = HeadInsertView.init_from_config(config)
|
||||
|
||||
@ -174,7 +174,8 @@ class ReplayView(object):
|
||||
stream=stream,
|
||||
head_insert_func=head_insert_func,
|
||||
urlkey=cdx['urlkey'],
|
||||
cdx=cdx))
|
||||
cdx=cdx,
|
||||
env=wbrequest.env))
|
||||
|
||||
(status_headers, response_iter, is_rewritten) = result
|
||||
|
||||
|
3
setup.py
3
setup.py
@ -82,6 +82,7 @@ setup(
|
||||
'pyyaml',
|
||||
'watchdog',
|
||||
'webencodings',
|
||||
'pyamf',
|
||||
],
|
||||
tests_require=[
|
||||
'pytest',
|
||||
@ -91,7 +92,7 @@ setup(
|
||||
'mock',
|
||||
],
|
||||
dependency_links=[
|
||||
'git+https://github.com/ikreymer/fakeredis.git@zset-lex-ops#egg=fakeredis-0.6.2-zset-lex-ops',
|
||||
'git+https://github.com/t0m/pyamf.git@python3'
|
||||
],
|
||||
cmdclass={'test': PyTest},
|
||||
test_suite='',
|
||||
|
Loading…
x
Reference in New Issue
Block a user