diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index 0d2634f5..36afff40 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -184,14 +184,15 @@ class WbRequest(object):
         if not self.wb_url:
             return
 
-        mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
+        mime = self.env.get('CONTENT_TYPE', '')
         length = self.env.get('CONTENT_LENGTH')
         stream = self.env['wsgi.input']
 
         buffered_stream = BytesIO()
 
         post_query = extract_post_query('POST', mime, length, stream,
-                                        buffered_stream=buffered_stream)
+                                        buffered_stream=buffered_stream,
+                                        environ=self.env)
 
         if post_query:
             self.env['wsgi.input'] = buffered_stream
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 4454ea3c..e9ad9fd0 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -5,7 +5,8 @@ import yaml
 import re
 
 from chardet.universaldetector import UniversalDetector
 from io import BytesIO
+from six.moves import zip
 
 from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
 
@@ -21,7 +22,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
 
 
 #=================================================================
-class RewriteContent:
+class RewriteContent(object):
     HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
 
     TAG_REGEX = re.compile(b'^\s*\<')
@@ -94,7 +95,7 @@ class RewriteContent:
 
     def rewrite_content(self, urlrewriter, status_headers, stream,
                         head_insert_func=None, urlkey='',
-                        cdx=None, cookie_rewriter=None):
+                        cdx=None, cookie_rewriter=None, env=None):
 
         wb_url = urlrewriter.wburl
 
@@ -118,9 +119,12 @@ class RewriteContent:
 
         status_headers = rewritten_headers.status_headers
 
-        # use rewritten headers, but no further rewriting needed
-        if rewritten_headers.text_type is None:
-            return (status_headers, self.stream_to_gen(stream), False)
+        res = self.handle_custom_rewrite(rewritten_headers.text_type,
+                                         status_headers,
+                                         stream,
+                                         env)
+        if res:
+            return res
 
         # Handle text content rewriting
         # ====================================================================
@@ -237,6 +241,11 @@ class RewriteContent:
 
         return (status_headers, gen, True)
 
+    def handle_custom_rewrite(self, text_type, status_headers, stream, env):
+        # use rewritten headers, but no further rewriting needed
+        if text_type is None:
+            return (status_headers, self.stream_to_gen(stream), False)
+
     @staticmethod
     def _extract_html_charset(buff, status_headers):
         charset = None
@@ -360,3 +369,57 @@ class RewriteContent:
 
         finally:
             stream.close()
+
+
+# =================================================================
+class RewriteContentAMF(RewriteContent):
+    def handle_custom_rewrite(self, text_type, status_headers, stream, env):
+        # header may be missing or carry parameters, so match on the prefix
+        content_type = status_headers.get_header('Content-Type') or ''
+
+        if content_type.lower().startswith('application/x-amf'):
+            stream = self.rewrite_amf(stream, env)
+
+        return (super(RewriteContentAMF, self).
+                handle_custom_rewrite(text_type, status_headers, stream, env))
+
+    def rewrite_amf(self, stream, env):
+        # buffer the whole payload first: the source stream is drained by
+        # reading it, so any failure below has to fall back to these bytes
+        iobuff = BytesIO()
+        while True:
+            buff = stream.read()
+            if not buff:
+                break
+            iobuff.write(buff)
+
+        iobuff.seek(0)
+
+        try:
+            from pyamf import remoting
+
+            res = remoting.decode(iobuff)
+
+            inputdata = env.get('pywb.inputdata') if env else None
+
+            if inputdata:
+                new_list = []
+
+                for src, target in zip(inputdata.bodies, res.bodies):
+                    # src comes from the decoded request kept in env,
+                    # target from the response being rewritten
+                    target[1].body.correlationId = src[1].body[0].messageId
+
+                    new_list.append((src[0], target[1]))
+
+                res.bodies = new_list
+
+            return BytesIO(remoting.encode(res).getvalue())
+
+        except Exception:
+            import traceback
+            traceback.print_exc()
+
+            # best effort: serve the payload exactly as it was read
+            iobuff.seek(0)
+            return iobuff
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 6dbbf1e2..6a262db4 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -9,10 +9,12 @@ import requests
 
 import six
 from six.moves.urllib.request import pathname2url, url2pathname
-from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit
+from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
 
 import time
 import pkg_resources
+import base64
+import cgi
 
 from io import open, BytesIO
 
@@ -65,19 +67,17 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
 
 
 #=================================================================
-def extract_post_query(method, mime, length, stream, buffered_stream=None):
+def extract_post_query(method, mime, length, stream,
+                       buffered_stream=None,
+                       environ=None):
     """
     Extract a url-encoded form POST from stream
-    If not a application/x-www-form-urlencoded, or no missing
-    content length, return None
+    Attempt to decode application/x-www-form-urlencoded or multipart/*,
+    otherwise read whole block and b64encode
     """
     if method.upper() != 'POST':
         return None
 
-    if ((not mime or
-         not mime.lower().startswith('application/x-www-form-urlencoded'))):
-        return None
-
     try:
         length = int(length)
     except (ValueError, TypeError):
@@ -101,11 +101,85 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
         buffered_stream.write(post_query)
         buffered_stream.seek(0)
 
-    post_query = to_native_str(post_query)
-    post_query = unquote_plus(post_query)
+    mime = mime or ''
+
+    # match on a lower-cased copy only: the multipart boundary parameter
+    # in the original header value is case-sensitive
+    mime_lower = mime.lower()
+
+    if mime_lower.startswith('application/x-www-form-urlencoded'):
+        post_query = to_native_str(post_query)
+        post_query = unquote_plus(post_query)
+
+    elif mime_lower.startswith('multipart/'):
+        env = {'REQUEST_METHOD': 'POST',
+               'CONTENT_TYPE': mime,
+               'CONTENT_LENGTH': len(post_query)}
+
+        args = dict(fp=BytesIO(post_query),
+                    environ=env,
+                    keep_blank_values=True)
+
+        if six.PY3:
+            args['encoding'] = 'utf-8'
+
+        data = cgi.FieldStorage(**args)
+
+        values = [(item.name, item.value) for item in data.list]
+
+        post_query = urlencode(values, True)
+
+    elif mime_lower.startswith('application/x-amf'):
+        post_query = amf_parse(post_query, environ)
+
+    else:
+        post_query = base64.b64encode(post_query)
+        post_query = to_native_str(post_query)
+        post_query = '&__wb_post_data=' + post_query
+
     return post_query
 
 
+#=================================================================
+def amf_parse(string, environ):
+    """
+    Decode an AMF remoting request and derive a query string from the
+    body, source and operation of its first message.
+
+    If environ is given, the decoded request is stored in it under
+    'pywb.inputdata'. Returns None if the request can not be parsed.
+    """
+    try:
+        from pyamf import remoting
+
+        res = remoting.decode(BytesIO(string))
+
+        body = res.bodies[0][1].body[0]
+
+        # ordered pairs, not a dict: the query must be built in the same
+        # order every time this is called for the same request
+        values = []
+
+        if hasattr(body, 'body'):
+            values.append(('body', body.body))
+
+        if hasattr(body, 'source'):
+            values.append(('source', body.source))
+
+        if hasattr(body, 'operation'):
+            values.append(('op', body.operation))
+
+        if environ is not None:
+            environ['pywb.inputdata'] = res
+
+        return urlencode(values)
+
+    except Exception:
+        import traceback
+        traceback.print_exc()
+        return None
+
+
 #=================================================================
 def append_post_query(url, post_query):
     if not post_query:
diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py
index abf0acfa..0a751712 100644
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@@ -90,8 +90,13 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
 # unsupported method
 >>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
 
-# unsupported type
+# base64 encode
 >>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
+'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
+
+# mime type match is case-insensitive
+>>> extract_post_query('POST', 'Application/X-WWW-Form-URLEncoded', len(post_data), BytesIO(post_data))
+'foo=bar&dir=/baz'
 
 # invalid length
 >>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py
index 76a76abd..4ff500d4 100644
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -253,6 +253,7 @@ class ArchiveIndexEntryMixin(object):
         self['mime'] = def_mime
         if mime:
             self['mime'] = self.MIME_RE.split(mime, 1)[0]
+            self['_content_type'] = mime
 
     def extract_status(self, status_headers):
         """ Extract status code only from status line
@@ -390,7 +391,7 @@ class DefaultRecordParser(object):
                 len_ = record.status_headers.get_header('Content-Length')
 
                 post_query = extract_post_query(method,
-                                                entry.get('mime'),
+                                                entry.get('_content_type'),
                                                 len_,
                                                 record.stream)
 
diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py
index d3771c68..6af44ba3 100644
--- a/pywb/webapp/replay_views.py
+++ b/pywb/webapp/replay_views.py
@@ -13,7 +13,7 @@ from pywb.utils.timeutils import timestamp_now
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import MementoResponse
 
-from pywb.rewrite.rewrite_content import RewriteContent
+from pywb.rewrite.rewrite_content import RewriteContentAMF
 from pywb.warc.recordloader import ArchiveLoadFailed
 
 from pywb.webapp.views import HeadInsertView
@@ -40,7 +40,7 @@ class ReplayView(object):
         self.content_loader = content_loader
 
         framed = config.get('framed_replay')
-        self.content_rewriter = RewriteContent(is_framed_replay=framed)
+        self.content_rewriter = RewriteContentAMF(is_framed_replay=framed)
 
         self.head_insert_view = HeadInsertView.init_from_config(config)
 
@@ -174,7 +174,8 @@ class ReplayView(object):
                                   stream=stream,
                                   head_insert_func=head_insert_func,
                                   urlkey=cdx['urlkey'],
-                                  cdx=cdx))
+                                  cdx=cdx,
+                                  env=wbrequest.env))
 
         (status_headers, response_iter, is_rewritten) = result
 
diff --git a/setup.py b/setup.py
index 6dba8420..6936b3f8 100755
--- a/setup.py
+++ b/setup.py
@@ -82,6 +82,7 @@ setup(
         'pyyaml',
         'watchdog',
         'webencodings',
+        'pyamf',
        ],
     tests_require=[
         'pytest',
@@ -91,7 +92,8 @@ setup(
         'mock',
        ],
     dependency_links=[
         'git+https://github.com/ikreymer/fakeredis.git@zset-lex-ops#egg=fakeredis-0.6.2-zset-lex-ops',
+        'git+https://github.com/t0m/pyamf.git@python3#egg=pyamf',
     ],
     cmdclass={'test': PyTest},
     test_suite='',