diff --git a/pywb/indexer/archiveindexer.py b/pywb/indexer/archiveindexer.py
index afcb1564..2a651525 100644
--- a/pywb/indexer/archiveindexer.py
+++ b/pywb/indexer/archiveindexer.py
@@ -1,7 +1,7 @@
 from pywb.utils.canonicalize import canonicalize
-from pywb.utils.loaders import extract_post_query, append_post_query
-from pywb.webagg.utils import BUFF_SIZE
+from pywb.warcserver.inputrequest import PostQueryExtractor
+from pywb.warcserver.utils import BUFF_SIZE
 
 from warcio.timeutils import iso_date_to_timestamp
 from warcio.archiveiterator import ArchiveIterator
 
@@ -68,9 +68,11 @@ class ArchiveIndexEntryMixin(object):
 
         # merge POST/PUT body query
         post_query = other.get('_post_query')
-        if post_query:
-            url = append_post_query(self['url'], post_query)
-            self['urlkey'] = canonicalize(url, surt_ordered)
+        url = self['url']
+        # '_post_query' may be missing (None): guard before dereferencing it
+        new_url = post_query.append_post_query(url) if post_query else url
+        if new_url != url:
+            self['urlkey'] = canonicalize(new_url, surt_ordered)
             other['urlkey'] = self['urlkey']
 
         referer = other.record.http_headers.get_header('referer')
@@ -180,7 +182,7 @@ class DefaultRecordParser(object):
                 method = record.http_headers.protocol
                 len_ = record.http_headers.get_header('Content-Length')
 
-                post_query = extract_post_query(method,
+                post_query = PostQueryExtractor(method,
                                                 entry.get('_content_type'),
                                                 len_,
                                                 record.raw_stream)
diff --git a/pywb/indexer/cdxindexer.py b/pywb/indexer/cdxindexer.py
index fd633eed..4939b9fb 100644
--- a/pywb/indexer/cdxindexer.py
+++ b/pywb/indexer/cdxindexer.py
@@ -31,7 +31,7 @@ from bisect import insort
 
 from six import StringIO
 
-from pywb.warc.archiveindexer import DefaultRecordParser
+from pywb.indexer.archiveindexer import DefaultRecordParser
 import codecs
 import six
 
diff --git a/pywb/indexer/test/test_indexing.py b/pywb/indexer/test/test_indexing.py
index fbe1413b..b2bd6dfe 100644
--- a/pywb/indexer/test/test_indexing.py
+++ b/pywb/indexer/test/test_indexing.py
@@ -200,9 +200,9 @@ Total: 4
 
 from pywb import get_test_dir
 
-from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
+from pywb.indexer.cdxindexer import write_cdx_index, main, cdx_filename
 
-from pywb.cdx.cdxobject import CDXObject
+from pywb.warcserver.index.cdxobject import CDXObject
 
 from io import BytesIO
 import sys
diff --git a/pywb/manager/migrate.py b/pywb/manager/migrate.py
index f340bfe1..d8bebdf1 100644
--- a/pywb/manager/migrate.py
+++ b/pywb/manager/migrate.py
@@ -1,6 +1,6 @@
 from pywb.utils.canonicalize import canonicalize
-from pywb.cdx.cdxobject import CDXObject, URLKEY, ORIGINAL
-from pywb.warc.cdxindexer import CDXJ
+from pywb.warcserver.index.cdxobject import CDXObject, URLKEY, ORIGINAL
+from pywb.indexer.cdxindexer import CDXJ
 
 import os
 import shutil
diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py
index 16fd5ba1..8dd3df63 100644
--- a/pywb/recorder/multifilewarcwriter.py
+++ b/pywb/recorder/multifilewarcwriter.py
@@ -10,7 +10,7 @@ import portalocker
 from warcio.timeutils import timestamp20_now
 from warcio.warcwriter import BaseWARCWriter
 
-from pywb.webagg.utils import res_template
+from pywb.warcserver.utils import res_template
 
 
 # ============================================================================
diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py
index b1fdecb5..3233962d 100644
--- a/pywb/recorder/recorderapp.py
+++ b/pywb/recorder/recorderapp.py
@@ -1,6 +1,6 @@
-from pywb.webagg.utils import StreamIter, BUFF_SIZE
-from pywb.webagg.utils import ParamFormatter, res_template
-from pywb.webagg.inputrequest import DirectWSGIInputRequest
+from pywb.warcserver.utils import StreamIter, BUFF_SIZE
+from pywb.warcserver.utils import ParamFormatter, res_template
+from pywb.warcserver.inputrequest import DirectWSGIInputRequest
 
 from warcio.recordloader import ArcWarcRecordLoader
 
diff --git a/pywb/recorder/redisindexer.py b/pywb/recorder/redisindexer.py
index 142dad1e..55433464 100644
--- a/pywb/recorder/redisindexer.py
+++ b/pywb/recorder/redisindexer.py
@@ -1,15 +1,15 @@
-from pywb.utils.canonicalize import calc_search_range
-from pywb.cdx.cdxobject import CDXObject
-from pywb.warc.cdxindexer import write_cdx_index
-
 from warcio.timeutils import iso_date_to_timestamp
 from io import BytesIO
 import os
 
-from pywb.webagg.indexsource import RedisIndexSource
-from pywb.webagg.aggregator import SimpleAggregator
-from pywb.webagg.utils import res_template
+from pywb.utils.canonicalize import calc_search_range
+from pywb.indexer.cdxindexer import write_cdx_index
+
+from pywb.warcserver.index.cdxobject import CDXObject
+from pywb.warcserver.index.indexsource import RedisIndexSource
+from pywb.warcserver.index.aggregator import SimpleAggregator
+from pywb.warcserver.utils import res_template
 
 from pywb.recorder.filters import WriteRevisitDupePolicy
 
 
diff --git a/pywb/recorder/test/test_recorder.py b/pywb/recorder/test/test_recorder.py
index 5e7f960f..a9526228 100644
--- a/pywb/recorder/test/test_recorder.py
+++ b/pywb/recorder/test/test_recorder.py
@@ -1,9 +1,8 @@
 from gevent import monkey; monkey.patch_all()
 import gevent
 
-import pywb.webagg
-from pywb.webagg.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
-from pywb.webagg.test.testutils import FakeRedisTests
+from pywb.warcserver.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
+from pywb.warcserver.test.testutils import FakeRedisTests
 
 import os
 import webtest
@@ -17,16 +16,16 @@ from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARC
 from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
 from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
 
-from pywb.webagg.utils import MementoUtils
+from pywb.warcserver.utils import MementoUtils
 
-from pywb.cdx.cdxobject import CDXObject
+from pywb.warcserver.index.cdxobject import CDXObject
 
 from warcio.statusandheaders import StatusAndHeadersParser
 from warcio.bufferedreaders import DecompressingBufferedReader
 from warcio.recordloader import ArcWarcRecordLoader
 from warcio.archiveiterator import ArchiveIterator
 
-from pywb.warc.cdxindexer import write_cdx_index
+from pywb.indexer.cdxindexer import write_cdx_index
 
 from six.moves.urllib.parse import quote, unquote, urlencode
 from io import BytesIO
diff --git a/pywb/urlrewrite/geventserver.py b/pywb/utils/geventserver.py
similarity index 100%
rename from pywb/urlrewrite/geventserver.py
rename to pywb/utils/geventserver.py
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 84d28785..13b58397 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -18,6 +18,7 @@ import cgi
 
 from io import open, BytesIO
 from warcio.limitreader import LimitReader
+from warcio.utils import to_native_str
 
 try:
     from boto import connect_s3
@@ -60,140 +61,6 @@ def load_yaml_config(config_file):
     return config
 
 
-#=================================================================
-def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
-    if isinstance(value, str):
-        return value
-
-    if six.PY3 and isinstance(value, six.binary_type): #pragma: no cover
-        return func(value.decode(encoding))
-    elif six.PY2 and isinstance(value, six.text_type): #pragma: no cover
-        return func(value.encode(encoding))
-
-
-#=================================================================
-def extract_post_query(method, mime, length, stream,
-                       buffered_stream=None,
-                       environ=None):
-    """
-    Extract a url-encoded form POST from stream
-    content length, return None
-    Attempt to decode application/x-www-form-urlencoded or multipart/*,
-    otherwise read whole block and b64encode
-    """
-    if method.upper() != 'POST':
-        return None
-
-    try:
-        length = int(length)
-    except (ValueError, TypeError):
-        return None
-
-    if length <= 0:
-        return None
-
-    post_query = b''
-
-    while length > 0:
-        buff = stream.read(length)
-        length -= len(buff)
-
-        if not buff:
-            break
-
-        post_query += buff
-
-    if buffered_stream:
-        buffered_stream.write(post_query)
-        buffered_stream.seek(0)
-
-    if not mime:
-        mime = ''
-
-    if mime.startswith('application/x-www-form-urlencoded'):
-        post_query = to_native_str(post_query)
-        post_query = unquote_plus(post_query)
-
-    elif mime.startswith('multipart/'):
-        env = {'REQUEST_METHOD': 'POST',
-               'CONTENT_TYPE': mime,
-               'CONTENT_LENGTH': len(post_query)}
-
-        args = dict(fp=BytesIO(post_query),
-                    environ=env,
-                    keep_blank_values=True)
-
-        if six.PY3:
-            args['encoding'] = 'utf-8'
-
-        data = cgi.FieldStorage(**args)
-
-        values = []
-        for item in data.list:
-            values.append((item.name, item.value))
-
-        post_query = urlencode(values, True)
-
-    elif mime.startswith('application/x-amf'):
-        post_query = amf_parse(post_query, environ)
-
-    else:
-        post_query = base64.b64encode(post_query)
-        post_query = to_native_str(post_query)
-        post_query = '&__wb_post_data=' + post_query
-
-    return post_query
-
-
-#=================================================================
-def amf_parse(string, environ):
-    try:
-        from pyamf import remoting
-
-        res = remoting.decode(BytesIO(string))
-
-        #print(res)
-        body = res.bodies[0][1].body[0]
-
-        values = {}
-
-        if hasattr(body, 'body'):
-            values['body'] = body.body
-
-        if hasattr(body, 'source'):
-            values['source'] = body.source
-
-        if hasattr(body, 'operation'):
-            values['op'] = body.operation
-
-        if environ is not None:
-            environ['pywb.inputdata'] = res
-
-        query = urlencode(values)
-        #print(query)
-        return query
-
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        print(e)
-        return None
-
-
-#=================================================================
-def append_post_query(url, post_query):
-    if not post_query:
-        return url
-
-    if '?' not in url:
-        url += '?'
-    else:
-        url += '&'
-
-    url += post_query
-    return url
-
-
 #=================================================================
 def extract_client_cookie(env, cookie_name):
     cookie_header = env.get('HTTP_COOKIE')
diff --git a/pywb/warcserver/Dockerfile b/pywb/warcserver/Dockerfile
deleted file mode 100644
index 9dc3c623..00000000
--- a/pywb/warcserver/Dockerfile
+++ /dev/null
@@ -1,14 +0,0 @@
-FROM python:3.5
-
-WORKDIR /code/
-
-RUN pip install -U git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.30.0-develop
-RUN pip install uwsgi gevent bottle
-
-ADD . /code/webagg/
-ADD ./test/ /code/test/
-
-WORKDIR /code/
-CMD uwsgi /code/test/live.ini
-
-
diff --git a/pywb/warcserver/README.rst b/pywb/warcserver/README.rst
deleted file mode 100644
index f06334b6..00000000
--- a/pywb/warcserver/README.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Resource Memento/Aggregator
-===========================
-
-This is a reference implementation of the `Resource/Memento Aggregator `_
-from the `Webrecorder Platform `_
-
diff --git a/pywb/warcserver/inputrequest.py b/pywb/warcserver/inputrequest.py
index 119f0ca7..58e44a35 100644
--- a/pywb/warcserver/inputrequest.py
+++ b/pywb/warcserver/inputrequest.py
@@ -1,12 +1,17 @@
 from warcio.limitreader import LimitReader
 from warcio.statusandheaders import StatusAndHeadersParser
 
-from pywb.utils.loaders import extract_post_query, append_post_query
+from warcio.utils import to_native_str
 
-from six.moves.urllib.parse import urlsplit, quote
+from six.moves.urllib.parse import urlsplit, quote, unquote_plus, urlencode
 from six import iteritems, StringIO
 from io import BytesIO
 
 
+import base64
+import cgi
+import six
+
+
 #=============================================================================
 class DirectWSGIInputRequest(object):
@@ -78,13 +83,14 @@ class DirectWSGIInputRequest(object):
 
         buffered_stream = BytesIO()
 
-        post_query = extract_post_query('POST', mime, length, stream,
+        post_query = PostQueryExtractor('POST', mime, length, stream,
                                         buffered_stream=buffered_stream,
                                         environ=self.env)
 
-        if post_query:
+        # a changed url means a body was read: replay it from the buffer
+        new_url = post_query.append_post_query(url)
+        if new_url != url:
             self.env['wsgi.input'] = buffered_stream
-            url = append_post_query(url, post_query)
 
-        return url
+        return new_url
 
@@ -171,4 +177,124 @@ class POSTInputRequest(DirectWSGIInputRequest):
         return self.status_headers.get_header(name)
 
 
+# ============================================================================
+class PostQueryExtractor(object):
+    def __init__(self, method, mime, length, stream,
+                 buffered_stream=None,
+                 environ=None):
+        """
+        Read up to ``length`` bytes of a POST body from ``stream`` and keep
+        it in ``self.post_query`` as a query string: form-encoded bodies are
+        unquoted, multipart/* and AMF bodies are re-encoded, anything else is
+        base64-encoded. Stays b'' for non-POST or a missing/invalid length.
+        """
+        self.post_query = b''
+
+        if method.upper() != 'POST':
+            return
+
+        try:
+            length = int(length)
+        except (ValueError, TypeError):
+            return
+
+        if length <= 0:
+            return
+
+        post_query = b''
+
+        while length > 0:
+            buff = stream.read(length)
+            length -= len(buff)
+
+            if not buff:
+                break
+
+            post_query += buff
+
+        if buffered_stream:
+            buffered_stream.write(post_query)
+            buffered_stream.seek(0)
+
+        if not mime:
+            mime = ''
+
+        if mime.startswith('application/x-www-form-urlencoded'):
+            post_query = to_native_str(post_query)
+            post_query = unquote_plus(post_query)
+
+        elif mime.startswith('multipart/'):
+            env = {'REQUEST_METHOD': 'POST',
+                   'CONTENT_TYPE': mime,
+                   'CONTENT_LENGTH': len(post_query)}
+
+            args = dict(fp=BytesIO(post_query),
+                        environ=env,
+                        keep_blank_values=True)
+
+            if six.PY3:
+                args['encoding'] = 'utf-8'
+
+            data = cgi.FieldStorage(**args)
+
+            values = []
+            for item in data.list:
+                values.append((item.name, item.value))
+
+            post_query = urlencode(values, True)
+
+        elif mime.startswith('application/x-amf'):
+            post_query = self.amf_parse(post_query, environ)
+
+        else:
+            post_query = base64.b64encode(post_query)
+            post_query = to_native_str(post_query)
+            post_query = '&__wb_post_data=' + post_query
+
+        self.post_query = post_query
+
+    def amf_parse(self, string, environ):
+        try:
+            from pyamf import remoting
+
+            res = remoting.decode(BytesIO(string))
+
+            #print(res)
+            body = res.bodies[0][1].body[0]
+
+            values = {}
+
+            if hasattr(body, 'body'):
+                values['body'] = body.body
+
+            if hasattr(body, 'source'):
+                values['source'] = body.source
+
+            if hasattr(body, 'operation'):
+                values['op'] = body.operation
+
+            if environ is not None:
+                environ['pywb.inputdata'] = res
+
+            query = urlencode(values)
+            #print(query)
+            return query
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            print(e)
+            return None
+
+    def append_post_query(self, url):
+        if not self.post_query:
+            return url
+
+        if '?' not in url:
+            url += '?'
+        else:
+            url += '&'
+
+        url += self.post_query
+        return url
 
diff --git a/pywb/warcserver/test/testutils.py b/pywb/warcserver/test/testutils.py
index ef8d44e7..bacea908 100644
--- a/pywb/warcserver/test/testutils.py
+++ b/pywb/warcserver/test/testutils.py
@@ -14,7 +14,7 @@ from pywb.warcserver.handlers import DefaultResourceHandler
 from pywb.warcserver.index.aggregator import SimpleAggregator
 from pywb.warcserver.index.indexsource import LiveIndexSource, MementoIndexSource
 
-from pywb.urlrewrite.geventserver import GeventServer
+from pywb.utils.geventserver import GeventServer
 from pywb import get_test_dir
 from pywb.utils.wbexception import NotFoundException
 