1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

post request mapping improvements: work on #178, including:

- mapping multipart/form-data same as x-www-form-urlencoded
- parsing application/x-amf with pyamf
- RewriteContentAMF for rewriting AMF response to match request
- default encoding of other POST data as base64 encoded __wb_post_data param
This commit is contained in:
Ilya Kreymer 2016-05-06 10:19:08 -07:00
parent e5e7c5a7df
commit 87da25c703
7 changed files with 160 additions and 23 deletions

View File

@ -184,14 +184,15 @@ class WbRequest(object):
if not self.wb_url:
return
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
mime = self.env.get('CONTENT_TYPE', '')
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']
buffered_stream = BytesIO()
post_query = extract_post_query('POST', mime, length, stream,
buffered_stream=buffered_stream)
buffered_stream=buffered_stream,
environ=self.env)
if post_query:
self.env['wsgi.input'] = buffered_stream

View File

@ -5,7 +5,8 @@ import yaml
import re
from chardet.universaldetector import UniversalDetector
from io import BytesIO
from io import BytesIO, BufferedReader
from six.moves import zip
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
@ -21,7 +22,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
#=================================================================
class RewriteContent:
class RewriteContent(object):
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
TAG_REGEX = re.compile(b'^\s*\<')
@ -94,7 +95,7 @@ class RewriteContent:
def rewrite_content(self, urlrewriter, status_headers, stream,
head_insert_func=None, urlkey='',
cdx=None, cookie_rewriter=None):
cdx=None, cookie_rewriter=None, env=None):
wb_url = urlrewriter.wburl
@ -118,9 +119,12 @@ class RewriteContent:
status_headers = rewritten_headers.status_headers
# use rewritten headers, but no further rewriting needed
if rewritten_headers.text_type is None:
return (status_headers, self.stream_to_gen(stream), False)
res = self.handle_custom_rewrite(rewritten_headers.text_type,
status_headers,
stream,
env)
if res:
return res
# Handle text content rewriting
# ====================================================================
@ -237,6 +241,11 @@ class RewriteContent:
return (status_headers, gen, True)
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
    """
    Hook for responses that need no text rewriting.

    When ``text_type`` is None the rewritten headers are kept and the
    body is passed through as-is: returns the tuple
    ``(status_headers, generator over stream, False)``.
    For any other ``text_type`` returns None.

    ``env`` is not used by this implementation.
    """
    if text_type is not None:
        return None

    passthrough = self.stream_to_gen(stream)
    return (status_headers, passthrough, False)
@staticmethod
def _extract_html_charset(buff, status_headers):
charset = None
@ -360,3 +369,57 @@ class RewriteContent:
finally:
stream.close()
# =================================================================
class RewriteContentAMF(RewriteContent):
    """
    RewriteContent variant that additionally patches AMF
    (``application/x-amf``) response bodies before the regular
    custom-rewrite handling runs.
    """

    def handle_custom_rewrite(self, text_type, status_headers, stream, env):
        """
        If the response Content-Type is exactly ``application/x-amf``,
        replace ``stream`` with the output of ``rewrite_amf`` and then
        delegate to the parent ``handle_custom_rewrite``.

        Returns whatever the parent implementation returns.
        """
        if status_headers.get_header('Content-Type') == 'application/x-amf':
            stream = self.rewrite_amf(stream, env)

        return (super(RewriteContentAMF, self).
                handle_custom_rewrite(text_type, status_headers, stream, env))

    def rewrite_amf(self, stream, env):
        """
        Decode an AMF response and pair its bodies with the request bodies.

        The decoded request is looked up in ``env['pywb.inputdata']``.
        For each (request body, response body) pair, in order, the response
        message's ``correlationId`` is set to the request message's
        ``messageId`` and the response is re-keyed with the request's
        first tuple element. If ``env`` is empty/None or has no request
        data, the response is decoded and re-encoded without changes.

        :param stream: file-like object with a ``read()`` method
        :param env: dict-like request environment, or None
        :return: a BytesIO with the re-encoded response. On failure
                 (best effort, never raises): the original ``stream`` if
                 it was not fully read yet, otherwise a BytesIO holding
                 the unmodified bytes that were read.
        """
        # stays None until the whole stream has been buffered, so the
        # error path knows whether 'stream' is still usable
        raw = None

        try:
            from pyamf import remoting

            buff = BytesIO()
            while True:
                chunk = stream.read()
                if not chunk:
                    break
                buff.write(chunk)

            raw = buff.getvalue()

            res = remoting.decode(BytesIO(raw))

            # env may be None when the caller did not pass an environ
            inputdata = env.get('pywb.inputdata') if env else None

            if inputdata:
                new_list = []

                for src, target in zip(inputdata.bodies, res.bodies):
                    target[1].body.correlationId = src[1].body[0].messageId
                    new_list.append((src[0], target[1]))

                res.bodies = new_list

            return BytesIO(remoting.encode(res).getvalue())

        except Exception:
            import logging
            logging.getLogger(__name__).exception('AMF response rewrite failed')

            if raw is None:
                return stream

            # 'stream' is exhausted at this point: hand back the buffered
            # bytes so the original response body is still served
            return BytesIO(raw)

View File

@ -9,10 +9,12 @@ import requests
import six
from six.moves.urllib.request import pathname2url, url2pathname
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
import time
import pkg_resources
import base64
import cgi
from io import open, BytesIO
@ -65,19 +67,18 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
#=================================================================
def extract_post_query(method, mime, length, stream, buffered_stream=None):
def extract_post_query(method, mime, length, stream,
buffered_stream=None,
environ=None):
"""
Extract a url-encoded form POST from stream
If not a application/x-www-form-urlencoded, or no missing
content length, return None
Attempt to decode application/x-www-form-urlencoded or multipart/*,
otherwise read whole block and b64encode
"""
if method.upper() != 'POST':
return None
if ((not mime or
not mime.lower().startswith('application/x-www-form-urlencoded'))):
return None
try:
length = int(length)
except (ValueError, TypeError):
@ -101,11 +102,79 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
buffered_stream.write(post_query)
buffered_stream.seek(0)
post_query = to_native_str(post_query)
post_query = unquote_plus(post_query)
if not mime:
mime = ''
if mime.startswith('application/x-www-form-urlencoded'):
post_query = to_native_str(post_query)
post_query = unquote_plus(post_query)
elif mime.startswith('multipart/'):
env = {'REQUEST_METHOD': 'POST',
'CONTENT_TYPE': mime,
'CONTENT_LENGTH': len(post_query)}
args = dict(fp=BytesIO(post_query),
environ=env,
keep_blank_values=True)
if six.PY3:
args['encoding'] = 'utf-8'
data = cgi.FieldStorage(**args)
values = []
for item in data.list:
values.append((item.name, item.value))
post_query = urlencode(values, True)
elif mime.startswith('application/x-amf'):
post_query = amf_parse(post_query, environ)
else:
post_query = base64.b64encode(post_query)
post_query = to_native_str(post_query)
post_query = '&__wb_post_data=' + post_query
return post_query
#=================================================================
def amf_parse(string, environ):
    """
    Build a url-encoded query string from an AMF request payload.

    Only the first message of the first body in the decoded envelope is
    inspected. Its ``body``, ``source`` and ``operation`` attributes,
    when present, are emitted as the ``body``, ``source`` and ``op``
    query parameters, always in that order.

    :param string: raw AMF request bytes
    :param environ: optional dict; if not None, the decoded envelope is
                    stored under ``environ['pywb.inputdata']``
    :return: the url-encoded query, or None if pyamf is unavailable or
             the payload could not be decoded (best effort, never raises)
    """
    try:
        from pyamf import remoting

        res = remoting.decode(BytesIO(string))
        body = res.bodies[0][1].body[0]

        # list of pairs instead of a dict: keeps the parameter order
        # stable regardless of interpreter dict ordering
        values = []
        for param, attr in (('body', 'body'),
                            ('source', 'source'),
                            ('op', 'operation')):
            if hasattr(body, attr):
                values.append((param, getattr(body, attr)))

        if environ is not None:
            environ['pywb.inputdata'] = res

        return urlencode(values)

    except Exception:
        import logging
        logging.getLogger(__name__).exception('AMF request parse failed')
        return None
#=================================================================
def append_post_query(url, post_query):
if not post_query:

View File

@ -90,8 +90,9 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
# unsupported method
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
# unsupported type
# base64 encode
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
# invalid length
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))

View File

@ -253,6 +253,7 @@ class ArchiveIndexEntryMixin(object):
self['mime'] = def_mime
if mime:
self['mime'] = self.MIME_RE.split(mime, 1)[0]
self['_content_type'] = mime
def extract_status(self, status_headers):
""" Extract status code only from status line
@ -390,7 +391,7 @@ class DefaultRecordParser(object):
len_ = record.status_headers.get_header('Content-Length')
post_query = extract_post_query(method,
entry.get('mime'),
entry.get('_content_type'),
len_,
record.stream)

View File

@ -13,7 +13,7 @@ from pywb.utils.timeutils import timestamp_now
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.rewrite_content import RewriteContentAMF
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.webapp.views import HeadInsertView
@ -40,7 +40,7 @@ class ReplayView(object):
self.content_loader = content_loader
framed = config.get('framed_replay')
self.content_rewriter = RewriteContent(is_framed_replay=framed)
self.content_rewriter = RewriteContentAMF(is_framed_replay=framed)
self.head_insert_view = HeadInsertView.init_from_config(config)
@ -174,7 +174,8 @@ class ReplayView(object):
stream=stream,
head_insert_func=head_insert_func,
urlkey=cdx['urlkey'],
cdx=cdx))
cdx=cdx,
env=wbrequest.env))
(status_headers, response_iter, is_rewritten) = result

View File

@ -82,6 +82,7 @@ setup(
'pyyaml',
'watchdog',
'webencodings',
'pyamf',
],
tests_require=[
'pytest',
@ -91,7 +92,7 @@ setup(
'mock',
],
dependency_links=[
'git+https://github.com/ikreymer/fakeredis.git@zset-lex-ops#egg=fakeredis-0.6.2-zset-lex-ops',
'git+https://github.com/t0m/pyamf.git@python3'
],
cmdclass={'test': PyTest},
test_suite='',