mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor:
- fix pywb.indexer, pywb.manager, pywb.recorder packages; tests pass
- move geventserver from pywb.urlrewrite to pywb.utils
- move extract_post_query/append_post_query to inputrequest.PostQueryExtractor
- remove to_native_str() from pywb.utils, redundant with the warcio.utils version
- remove obsolete README and Dockerfile
This commit is contained in:
parent
ad33dc6728
commit
2907ed01c8
@ -1,7 +1,7 @@
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.loaders import extract_post_query, append_post_query
|
||||
|
||||
from pywb.webagg.utils import BUFF_SIZE
|
||||
from pywb.warcserver.inputrequest import PostQueryExtractor
|
||||
from pywb.warcserver.utils import BUFF_SIZE
|
||||
|
||||
from warcio.timeutils import iso_date_to_timestamp
|
||||
from warcio.archiveiterator import ArchiveIterator
|
||||
@ -68,9 +68,10 @@ class ArchiveIndexEntryMixin(object):
|
||||
|
||||
# merge POST/PUT body query
|
||||
post_query = other.get('_post_query')
|
||||
if post_query:
|
||||
url = append_post_query(self['url'], post_query)
|
||||
self['urlkey'] = canonicalize(url, surt_ordered)
|
||||
url = self['url']
|
||||
new_url = post_query.append_post_query(url)
|
||||
if post_query and new_url != url:
|
||||
self['urlkey'] = canonicalize(new_url, surt_ordered)
|
||||
other['urlkey'] = self['urlkey']
|
||||
|
||||
referer = other.record.http_headers.get_header('referer')
|
||||
@ -180,7 +181,7 @@ class DefaultRecordParser(object):
|
||||
method = record.http_headers.protocol
|
||||
len_ = record.http_headers.get_header('Content-Length')
|
||||
|
||||
post_query = extract_post_query(method,
|
||||
post_query = PostQueryExtractor(method,
|
||||
entry.get('_content_type'),
|
||||
len_,
|
||||
record.raw_stream)
|
||||
|
@ -31,7 +31,7 @@ from bisect import insort
|
||||
|
||||
from six import StringIO
|
||||
|
||||
from pywb.warc.archiveindexer import DefaultRecordParser
|
||||
from pywb.indexer.archiveindexer import DefaultRecordParser
|
||||
import codecs
|
||||
import six
|
||||
|
||||
|
@ -200,9 +200,9 @@ Total: 4
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
|
||||
from pywb.indexer.cdxindexer import write_cdx_index, main, cdx_filename
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
from io import BytesIO
|
||||
import sys
|
||||
|
@ -1,6 +1,6 @@
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.cdx.cdxobject import CDXObject, URLKEY, ORIGINAL
|
||||
from pywb.warc.cdxindexer import CDXJ
|
||||
from pywb.warcserver.index.cdxobject import CDXObject, URLKEY, ORIGINAL
|
||||
from pywb.indexer.cdxindexer import CDXJ
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
@ -10,7 +10,7 @@ import portalocker
|
||||
from warcio.timeutils import timestamp20_now
|
||||
from warcio.warcwriter import BaseWARCWriter
|
||||
|
||||
from pywb.webagg.utils import res_template
|
||||
from pywb.warcserver.utils import res_template
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -1,6 +1,6 @@
|
||||
from pywb.webagg.utils import StreamIter, BUFF_SIZE
|
||||
from pywb.webagg.utils import ParamFormatter, res_template
|
||||
from pywb.webagg.inputrequest import DirectWSGIInputRequest
|
||||
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
|
||||
from pywb.warcserver.utils import ParamFormatter, res_template
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
|
||||
|
@ -1,15 +1,15 @@
|
||||
from pywb.utils.canonicalize import calc_search_range
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.warc.cdxindexer import write_cdx_index
|
||||
|
||||
from warcio.timeutils import iso_date_to_timestamp
|
||||
|
||||
from io import BytesIO
|
||||
import os
|
||||
|
||||
from pywb.webagg.indexsource import RedisIndexSource
|
||||
from pywb.webagg.aggregator import SimpleAggregator
|
||||
from pywb.webagg.utils import res_template
|
||||
from pywb.utils.canonicalize import calc_search_range
|
||||
from pywb.indexer.cdxindexer import write_cdx_index
|
||||
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
from pywb.warcserver.index.indexsource import RedisIndexSource
|
||||
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||
from pywb.warcserver.utils import res_template
|
||||
|
||||
from pywb.recorder.filters import WriteRevisitDupePolicy
|
||||
|
||||
|
@ -1,9 +1,8 @@
|
||||
from gevent import monkey; monkey.patch_all()
|
||||
import gevent
|
||||
|
||||
import pywb.webagg
|
||||
from pywb.webagg.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
|
||||
from pywb.webagg.test.testutils import FakeRedisTests
|
||||
from pywb.warcserver.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
|
||||
from pywb.warcserver.test.testutils import FakeRedisTests
|
||||
|
||||
import os
|
||||
import webtest
|
||||
@ -17,16 +16,16 @@ from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARC
|
||||
from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
|
||||
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
||||
|
||||
from pywb.webagg.utils import MementoUtils
|
||||
from pywb.warcserver.utils import MementoUtils
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
from warcio.statusandheaders import StatusAndHeadersParser
|
||||
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
from warcio.archiveiterator import ArchiveIterator
|
||||
|
||||
from pywb.warc.cdxindexer import write_cdx_index
|
||||
from pywb.indexer.cdxindexer import write_cdx_index
|
||||
|
||||
from six.moves.urllib.parse import quote, unquote, urlencode
|
||||
from io import BytesIO
|
||||
|
@ -18,6 +18,7 @@ import cgi
|
||||
|
||||
from io import open, BytesIO
|
||||
from warcio.limitreader import LimitReader
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
try:
|
||||
from boto import connect_s3
|
||||
@ -60,140 +61,6 @@ def load_yaml_config(config_file):
|
||||
return config
|
||||
|
||||
|
||||
#=================================================================
|
||||
def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
    """
    Convert ``value`` to the native ``str`` type of the running interpreter.

    :param value: the value to convert
    :param encoding: codec used to decode bytes (Python 3) or to encode
        unicode text (Python 2)
    :param func: callable applied to the converted string; it is NOT applied
        when ``value`` already is a native ``str``
    :return: ``value`` itself if it is already a native ``str``, the converted
        (and ``func``-processed) string for bytes/unicode input, or ``value``
        unchanged for any other type
    """
    if isinstance(value, str):
        return value

    # Only reachable on Python 3: on Python 2 ``bytes`` is ``str`` and was
    # already returned above.
    if isinstance(value, bytes):  # pragma: no cover
        return func(value.decode(encoding))

    # Only reachable on Python 2: ``type(u'')`` is ``unicode`` there, while on
    # Python 3 it is ``str`` and was already returned above.
    if isinstance(value, type(u'')):  # pragma: no cover
        return func(value.encode(encoding))

    # Anything else (None, numbers, ...) is passed through untouched instead
    # of being silently replaced by None.
    return value
|
||||
|
||||
|
||||
#=================================================================
|
||||
def extract_post_query(method, mime, length, stream,
                       buffered_stream=None,
                       environ=None):
    """
    Read the body of a POST request from ``stream`` and convert it into a
    query string.

    The conversion depends on ``mime``:

    - ``application/x-www-form-urlencoded``: the body is decoded and unquoted
    - ``multipart/*``: the form fields are parsed and url-encoded
    - ``application/x-amf``: the body is passed to ``amf_parse()``
    - anything else: the whole body is base64-encoded and returned as
      ``&__wb_post_data=<base64>``

    :param method: request method; anything other than POST
        (case-insensitive) yields None
    :param mime: request content type, may be None or empty
    :param length: content length, anything accepted by ``int()``
    :param stream: object with a ``read(n)`` method supplying the body
    :param buffered_stream: optional writable/seekable stream; if given, the
        raw body is written to it and it is rewound to position 0
    :param environ: optional dict handed on to ``amf_parse()``
    :return: the query string, or None if the method is not POST or
        ``length`` is missing, not an integer, or not positive
    """
    if method.upper() != 'POST':
        return None

    try:
        length = int(length)
    except (ValueError, TypeError):
        return None

    if length <= 0:
        return None

    # Collect the chunks and join once instead of repeatedly re-allocating
    # the growing bytes object.
    chunks = []

    while length > 0:
        buff = stream.read(length)

        # stream ended before ``length`` bytes were available
        if not buff:
            break

        length -= len(buff)
        chunks.append(buff)

    post_query = b''.join(chunks)

    if buffered_stream:
        buffered_stream.write(post_query)
        buffered_stream.seek(0)

    if not mime:
        mime = ''

    if mime.startswith('application/x-www-form-urlencoded'):
        post_query = to_native_str(post_query)
        post_query = unquote_plus(post_query)

    elif mime.startswith('multipart/'):
        # CONTENT_LENGTH is the number of bytes actually read, which may be
        # less than the declared length
        env = {'REQUEST_METHOD': 'POST',
               'CONTENT_TYPE': mime,
               'CONTENT_LENGTH': len(post_query)}

        args = dict(fp=BytesIO(post_query),
                    environ=env,
                    keep_blank_values=True)

        if six.PY3:
            args['encoding'] = 'utf-8'

        data = cgi.FieldStorage(**args)

        values = [(item.name, item.value) for item in data.list]

        post_query = urlencode(values, True)

    elif mime.startswith('application/x-amf'):
        post_query = amf_parse(post_query, environ)

    else:
        post_query = base64.b64encode(post_query)
        post_query = to_native_str(post_query)
        post_query = '&__wb_post_data=' + post_query

    return post_query
|
||||
|
||||
|
||||
#=================================================================
|
||||
def amf_parse(string, environ):
    """
    Decode an AMF remoting request body into a url-encoded query string.

    The query is built from the ``body``, ``source`` and ``operation``
    attributes (the latter under the key ``op``) of the first element of the
    first message body, for whichever of those attributes exist.

    :param string: raw request body as bytes
    :param environ: optional dict; if not None, the decoded AMF message is
        stored in it under the key ``pywb.inputdata``
    :return: the url-encoded query string, or None if decoding fails for any
        reason (including ``pyamf`` not being installed)
    """
    try:
        # imported lazily so that pyamf is only needed when AMF is seen
        from pyamf import remoting

        res = remoting.decode(BytesIO(string))

        body = res.bodies[0][1].body[0]

        values = {}

        if hasattr(body, 'body'):
            values['body'] = body.body

        if hasattr(body, 'source'):
            values['source'] = body.source

        if hasattr(body, 'operation'):
            values['op'] = body.operation

        if environ is not None:
            environ['pywb.inputdata'] = res

        return urlencode(values)

    except Exception:
        # Deliberately best-effort: an undecodable body yields no query.
        # The traceback (which already contains the exception message) goes
        # to stderr; nothing is written to stdout.
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
|
||||
#=================================================================
|
||||
def append_post_query(url, post_query):
    """
    Return ``url`` with ``post_query`` appended as (additional) query string.

    ``?`` is used as separator if ``url`` contains no ``?`` yet, ``&``
    otherwise. If ``post_query`` is empty or None, ``url`` is returned
    unchanged.
    """
    if post_query:
        separator = '&' if '?' in url else '?'
        url = url + separator + post_query

    return url
|
||||
|
||||
|
||||
#=================================================================
|
||||
def extract_client_cookie(env, cookie_name):
|
||||
cookie_header = env.get('HTTP_COOKIE')
|
||||
|
@ -1,14 +0,0 @@
|
||||
FROM python:3.5
|
||||
|
||||
WORKDIR /code/
|
||||
|
||||
RUN pip install -U git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.30.0-develop
|
||||
RUN pip install uwsgi gevent bottle
|
||||
|
||||
ADD . /code/webagg/
|
||||
ADD ./test/ /code/test/
|
||||
|
||||
WORKDIR /code/
|
||||
CMD uwsgi /code/test/live.ini
|
||||
|
||||
|
@ -1,6 +0,0 @@
|
||||
Resource Memento/Aggregator
|
||||
===========================
|
||||
|
||||
This is a reference implementation of the `Resource/Memento Aggregator <https://github.com/webrecorder/platform-spec/wiki/ResourceMementoAggregator>`_
|
||||
from the `Webrecorder Platform <https://github.com/webrecorder/platform-spec/wiki>`_
|
||||
|
@ -1,12 +1,16 @@
|
||||
from warcio.limitreader import LimitReader
|
||||
from warcio.statusandheaders import StatusAndHeadersParser
|
||||
|
||||
from pywb.utils.loaders import extract_post_query, append_post_query
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
from six.moves.urllib.parse import urlsplit, quote
|
||||
from six.moves.urllib.parse import urlsplit, quote, unquote_plus
|
||||
from six import iteritems, StringIO
|
||||
from io import BytesIO
|
||||
|
||||
import base64
|
||||
import cgi
|
||||
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class DirectWSGIInputRequest(object):
|
||||
@ -78,13 +82,12 @@ class DirectWSGIInputRequest(object):
|
||||
|
||||
buffered_stream = BytesIO()
|
||||
|
||||
post_query = extract_post_query('POST', mime, length, stream,
|
||||
post_query = PostQueryExtractor('POST', mime, length, stream,
|
||||
buffered_stream=buffered_stream,
|
||||
environ=self.env)
|
||||
|
||||
if post_query:
|
||||
if post_query.append_post_query(url) != url:
|
||||
self.env['wsgi.input'] = buffered_stream
|
||||
url = append_post_query(url, post_query)
|
||||
|
||||
return url
|
||||
|
||||
@ -171,4 +174,124 @@ class POSTInputRequest(DirectWSGIInputRequest):
|
||||
return self.status_headers.get_header(name)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class PostQueryExtractor(object):
    """
    Reads the body of a POST request and converts it into a query string
    that can be appended to the request url via ``append_post_query()``.

    The result is kept in ``self.post_query``. It stays empty (``b''``) if
    the request is not a POST or has no usable content length, and is None
    if an AMF body could not be decoded.
    """
    def __init__(self, method, mime, length, stream,
                 buffered_stream=None,
                 environ=None):
        """
        Read up to ``length`` bytes from ``stream`` and convert them
        according to ``mime``:

        - ``application/x-www-form-urlencoded``: decoded and unquoted
        - ``multipart/*``: form fields are parsed and url-encoded
        - ``application/x-amf``: passed to ``amf_parse()``
        - anything else: base64-encoded as ``&__wb_post_data=<base64>``

        :param method: request method; only POST (case-insensitive) is read
        :param mime: request content type, may be None or empty
        :param length: content length, anything accepted by ``int()``
        :param stream: object with a ``read(n)`` method supplying the body
        :param buffered_stream: optional writable/seekable stream; if given,
            the raw body is written to it and it is rewound to position 0
        :param environ: optional dict handed on to ``amf_parse()``
        """
        # stays empty unless a body is read and converted below
        self.post_query = b''

        if method.upper() != 'POST':
            return

        try:
            length = int(length)
        except (ValueError, TypeError):
            return

        if length <= 0:
            return

        # Collect the chunks and join once instead of repeatedly
        # re-allocating the growing bytes object.
        chunks = []

        while length > 0:
            buff = stream.read(length)

            # stream ended before ``length`` bytes were available
            if not buff:
                break

            length -= len(buff)
            chunks.append(buff)

        post_query = b''.join(chunks)

        if buffered_stream:
            buffered_stream.write(post_query)
            buffered_stream.seek(0)

        if not mime:
            mime = ''

        if mime.startswith('application/x-www-form-urlencoded'):
            post_query = to_native_str(post_query)
            post_query = unquote_plus(post_query)

        elif mime.startswith('multipart/'):
            # Imported here because the module-level imports visible for
            # this file provide neither ``six`` itself nor ``urlencode``.
            import six
            from six.moves.urllib.parse import urlencode

            # CONTENT_LENGTH is the number of bytes actually read, which may
            # be less than the declared length
            env = {'REQUEST_METHOD': 'POST',
                   'CONTENT_TYPE': mime,
                   'CONTENT_LENGTH': len(post_query)}

            args = dict(fp=BytesIO(post_query),
                        environ=env,
                        keep_blank_values=True)

            if six.PY3:
                args['encoding'] = 'utf-8'

            data = cgi.FieldStorage(**args)

            values = [(item.name, item.value) for item in data.list]

            post_query = urlencode(values, True)

        elif mime.startswith('application/x-amf'):
            post_query = self.amf_parse(post_query, environ)

        else:
            post_query = base64.b64encode(post_query)
            post_query = to_native_str(post_query)
            post_query = '&__wb_post_data=' + post_query

        self.post_query = post_query

    def amf_parse(self, string, environ):
        """
        Decode an AMF remoting request body into a url-encoded query string.

        The query is built from the ``body``, ``source`` and ``operation``
        attributes (the latter under the key ``op``) of the first element of
        the first message body, for whichever of those attributes exist.

        :param string: raw request body as bytes
        :param environ: optional dict; if not None, the decoded AMF message
            is stored in it under the key ``pywb.inputdata``
        :return: the url-encoded query string, or None if decoding fails for
            any reason (including ``pyamf`` not being installed)
        """
        try:
            # imported lazily so that pyamf is only needed when AMF is seen
            from pyamf import remoting

            # not provided by the module-level imports visible for this file
            from six.moves.urllib.parse import urlencode

            res = remoting.decode(BytesIO(string))

            body = res.bodies[0][1].body[0]

            values = {}

            if hasattr(body, 'body'):
                values['body'] = body.body

            if hasattr(body, 'source'):
                values['source'] = body.source

            if hasattr(body, 'operation'):
                values['op'] = body.operation

            if environ is not None:
                environ['pywb.inputdata'] = res

            return urlencode(values)

        except Exception:
            # Deliberately best-effort: an undecodable body yields no query.
            # The traceback (which already contains the exception message)
            # goes to stderr; nothing is written to stdout.
            import traceback
            traceback.print_exc()
            return None

    def append_post_query(self, url):
        """
        Return ``url`` with the extracted post query appended.

        ``?`` is used as separator if ``url`` contains no ``?`` yet, ``&``
        otherwise. If no post query was extracted, ``url`` is returned
        unchanged.
        """
        if not self.post_query:
            return url

        separator = '&' if '?' in url else '?'

        return url + separator + self.post_query
|
||||
|
||||
|
@ -14,7 +14,7 @@ from pywb.warcserver.handlers import DefaultResourceHandler
|
||||
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||
from pywb.warcserver.index.indexsource import LiveIndexSource, MementoIndexSource
|
||||
|
||||
from pywb.urlrewrite.geventserver import GeventServer
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
|
||||
from pywb import get_test_dir
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
Loading…
x
Reference in New Issue
Block a user