mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor:
- fix pywb.indexer, pywb.manager, pywb.recorder packages; tests pass
- rename/move geventserver: pywb.urlrewrite.geventserver -> pywb.utils.geventserver
- move extract_post_query/append_post_query to inputrequest.PostQueryExtractor
- remove to_native_str() in pywb.utils (redundant with the warcio.utils version)
- remove obsolete README and Dockerfile
This commit is contained in:
parent
ad33dc6728
commit
2907ed01c8
@ -1,7 +1,7 @@
|
|||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from pywb.utils.loaders import extract_post_query, append_post_query
|
|
||||||
|
|
||||||
from pywb.webagg.utils import BUFF_SIZE
|
from pywb.warcserver.inputrequest import PostQueryExtractor
|
||||||
|
from pywb.warcserver.utils import BUFF_SIZE
|
||||||
|
|
||||||
from warcio.timeutils import iso_date_to_timestamp
|
from warcio.timeutils import iso_date_to_timestamp
|
||||||
from warcio.archiveiterator import ArchiveIterator
|
from warcio.archiveiterator import ArchiveIterator
|
||||||
@ -68,9 +68,10 @@ class ArchiveIndexEntryMixin(object):
|
|||||||
|
|
||||||
# merge POST/PUT body query
|
# merge POST/PUT body query
|
||||||
post_query = other.get('_post_query')
|
post_query = other.get('_post_query')
|
||||||
if post_query:
|
url = self['url']
|
||||||
url = append_post_query(self['url'], post_query)
|
new_url = post_query.append_post_query(url)
|
||||||
self['urlkey'] = canonicalize(url, surt_ordered)
|
if post_query and new_url != url:
|
||||||
|
self['urlkey'] = canonicalize(new_url, surt_ordered)
|
||||||
other['urlkey'] = self['urlkey']
|
other['urlkey'] = self['urlkey']
|
||||||
|
|
||||||
referer = other.record.http_headers.get_header('referer')
|
referer = other.record.http_headers.get_header('referer')
|
||||||
@ -180,7 +181,7 @@ class DefaultRecordParser(object):
|
|||||||
method = record.http_headers.protocol
|
method = record.http_headers.protocol
|
||||||
len_ = record.http_headers.get_header('Content-Length')
|
len_ = record.http_headers.get_header('Content-Length')
|
||||||
|
|
||||||
post_query = extract_post_query(method,
|
post_query = PostQueryExtractor(method,
|
||||||
entry.get('_content_type'),
|
entry.get('_content_type'),
|
||||||
len_,
|
len_,
|
||||||
record.raw_stream)
|
record.raw_stream)
|
||||||
|
@ -31,7 +31,7 @@ from bisect import insort
|
|||||||
|
|
||||||
from six import StringIO
|
from six import StringIO
|
||||||
|
|
||||||
from pywb.warc.archiveindexer import DefaultRecordParser
|
from pywb.indexer.archiveindexer import DefaultRecordParser
|
||||||
import codecs
|
import codecs
|
||||||
import six
|
import six
|
||||||
|
|
||||||
|
@ -200,9 +200,9 @@ Total: 4
|
|||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
|
from pywb.indexer.cdxindexer import write_cdx_index, main, cdx_filename
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import sys
|
import sys
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from pywb.cdx.cdxobject import CDXObject, URLKEY, ORIGINAL
|
from pywb.warcserver.index.cdxobject import CDXObject, URLKEY, ORIGINAL
|
||||||
from pywb.warc.cdxindexer import CDXJ
|
from pywb.indexer.cdxindexer import CDXJ
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -10,7 +10,7 @@ import portalocker
|
|||||||
from warcio.timeutils import timestamp20_now
|
from warcio.timeutils import timestamp20_now
|
||||||
from warcio.warcwriter import BaseWARCWriter
|
from warcio.warcwriter import BaseWARCWriter
|
||||||
|
|
||||||
from pywb.webagg.utils import res_template
|
from pywb.warcserver.utils import res_template
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from pywb.webagg.utils import StreamIter, BUFF_SIZE
|
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
|
||||||
from pywb.webagg.utils import ParamFormatter, res_template
|
from pywb.warcserver.utils import ParamFormatter, res_template
|
||||||
from pywb.webagg.inputrequest import DirectWSGIInputRequest
|
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||||
|
|
||||||
from warcio.recordloader import ArcWarcRecordLoader
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
|
@ -1,15 +1,15 @@
|
|||||||
from pywb.utils.canonicalize import calc_search_range
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
|
||||||
from pywb.warc.cdxindexer import write_cdx_index
|
|
||||||
|
|
||||||
from warcio.timeutils import iso_date_to_timestamp
|
from warcio.timeutils import iso_date_to_timestamp
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from pywb.webagg.indexsource import RedisIndexSource
|
from pywb.utils.canonicalize import calc_search_range
|
||||||
from pywb.webagg.aggregator import SimpleAggregator
|
from pywb.indexer.cdxindexer import write_cdx_index
|
||||||
from pywb.webagg.utils import res_template
|
|
||||||
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
from pywb.warcserver.index.indexsource import RedisIndexSource
|
||||||
|
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||||
|
from pywb.warcserver.utils import res_template
|
||||||
|
|
||||||
from pywb.recorder.filters import WriteRevisitDupePolicy
|
from pywb.recorder.filters import WriteRevisitDupePolicy
|
||||||
|
|
||||||
|
@ -1,9 +1,8 @@
|
|||||||
from gevent import monkey; monkey.patch_all()
|
from gevent import monkey; monkey.patch_all()
|
||||||
import gevent
|
import gevent
|
||||||
|
|
||||||
import pywb.webagg
|
from pywb.warcserver.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
|
||||||
from pywb.webagg.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
|
from pywb.warcserver.test.testutils import FakeRedisTests
|
||||||
from pywb.webagg.test.testutils import FakeRedisTests
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import webtest
|
import webtest
|
||||||
@ -17,16 +16,16 @@ from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARC
|
|||||||
from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
|
from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
|
||||||
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
||||||
|
|
||||||
from pywb.webagg.utils import MementoUtils
|
from pywb.warcserver.utils import MementoUtils
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
from warcio.statusandheaders import StatusAndHeadersParser
|
from warcio.statusandheaders import StatusAndHeadersParser
|
||||||
from warcio.bufferedreaders import DecompressingBufferedReader
|
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||||
from warcio.recordloader import ArcWarcRecordLoader
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
from warcio.archiveiterator import ArchiveIterator
|
from warcio.archiveiterator import ArchiveIterator
|
||||||
|
|
||||||
from pywb.warc.cdxindexer import write_cdx_index
|
from pywb.indexer.cdxindexer import write_cdx_index
|
||||||
|
|
||||||
from six.moves.urllib.parse import quote, unquote, urlencode
|
from six.moves.urllib.parse import quote, unquote, urlencode
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
@ -18,6 +18,7 @@ import cgi
|
|||||||
|
|
||||||
from io import open, BytesIO
|
from io import open, BytesIO
|
||||||
from warcio.limitreader import LimitReader
|
from warcio.limitreader import LimitReader
|
||||||
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from boto import connect_s3
|
from boto import connect_s3
|
||||||
@ -60,140 +61,6 @@ def load_yaml_config(config_file):
|
|||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
|
|
||||||
if isinstance(value, str):
|
|
||||||
return value
|
|
||||||
|
|
||||||
if six.PY3 and isinstance(value, six.binary_type): #pragma: no cover
|
|
||||||
return func(value.decode(encoding))
|
|
||||||
elif six.PY2 and isinstance(value, six.text_type): #pragma: no cover
|
|
||||||
return func(value.encode(encoding))
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def extract_post_query(method, mime, length, stream,
|
|
||||||
buffered_stream=None,
|
|
||||||
environ=None):
|
|
||||||
"""
|
|
||||||
Extract a url-encoded form POST from stream
|
|
||||||
content length, return None
|
|
||||||
Attempt to decode application/x-www-form-urlencoded or multipart/*,
|
|
||||||
otherwise read whole block and b64encode
|
|
||||||
"""
|
|
||||||
if method.upper() != 'POST':
|
|
||||||
return None
|
|
||||||
|
|
||||||
try:
|
|
||||||
length = int(length)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
return None
|
|
||||||
|
|
||||||
if length <= 0:
|
|
||||||
return None
|
|
||||||
|
|
||||||
post_query = b''
|
|
||||||
|
|
||||||
while length > 0:
|
|
||||||
buff = stream.read(length)
|
|
||||||
length -= len(buff)
|
|
||||||
|
|
||||||
if not buff:
|
|
||||||
break
|
|
||||||
|
|
||||||
post_query += buff
|
|
||||||
|
|
||||||
if buffered_stream:
|
|
||||||
buffered_stream.write(post_query)
|
|
||||||
buffered_stream.seek(0)
|
|
||||||
|
|
||||||
if not mime:
|
|
||||||
mime = ''
|
|
||||||
|
|
||||||
if mime.startswith('application/x-www-form-urlencoded'):
|
|
||||||
post_query = to_native_str(post_query)
|
|
||||||
post_query = unquote_plus(post_query)
|
|
||||||
|
|
||||||
elif mime.startswith('multipart/'):
|
|
||||||
env = {'REQUEST_METHOD': 'POST',
|
|
||||||
'CONTENT_TYPE': mime,
|
|
||||||
'CONTENT_LENGTH': len(post_query)}
|
|
||||||
|
|
||||||
args = dict(fp=BytesIO(post_query),
|
|
||||||
environ=env,
|
|
||||||
keep_blank_values=True)
|
|
||||||
|
|
||||||
if six.PY3:
|
|
||||||
args['encoding'] = 'utf-8'
|
|
||||||
|
|
||||||
data = cgi.FieldStorage(**args)
|
|
||||||
|
|
||||||
values = []
|
|
||||||
for item in data.list:
|
|
||||||
values.append((item.name, item.value))
|
|
||||||
|
|
||||||
post_query = urlencode(values, True)
|
|
||||||
|
|
||||||
elif mime.startswith('application/x-amf'):
|
|
||||||
post_query = amf_parse(post_query, environ)
|
|
||||||
|
|
||||||
else:
|
|
||||||
post_query = base64.b64encode(post_query)
|
|
||||||
post_query = to_native_str(post_query)
|
|
||||||
post_query = '&__wb_post_data=' + post_query
|
|
||||||
|
|
||||||
return post_query
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def amf_parse(string, environ):
|
|
||||||
try:
|
|
||||||
from pyamf import remoting
|
|
||||||
|
|
||||||
res = remoting.decode(BytesIO(string))
|
|
||||||
|
|
||||||
#print(res)
|
|
||||||
body = res.bodies[0][1].body[0]
|
|
||||||
|
|
||||||
values = {}
|
|
||||||
|
|
||||||
if hasattr(body, 'body'):
|
|
||||||
values['body'] = body.body
|
|
||||||
|
|
||||||
if hasattr(body, 'source'):
|
|
||||||
values['source'] = body.source
|
|
||||||
|
|
||||||
if hasattr(body, 'operation'):
|
|
||||||
values['op'] = body.operation
|
|
||||||
|
|
||||||
if environ is not None:
|
|
||||||
environ['pywb.inputdata'] = res
|
|
||||||
|
|
||||||
query = urlencode(values)
|
|
||||||
#print(query)
|
|
||||||
return query
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
print(e)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def append_post_query(url, post_query):
|
|
||||||
if not post_query:
|
|
||||||
return url
|
|
||||||
|
|
||||||
if '?' not in url:
|
|
||||||
url += '?'
|
|
||||||
else:
|
|
||||||
url += '&'
|
|
||||||
|
|
||||||
url += post_query
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def extract_client_cookie(env, cookie_name):
|
def extract_client_cookie(env, cookie_name):
|
||||||
cookie_header = env.get('HTTP_COOKIE')
|
cookie_header = env.get('HTTP_COOKIE')
|
||||||
|
@ -1,14 +0,0 @@
|
|||||||
FROM python:3.5
|
|
||||||
|
|
||||||
WORKDIR /code/
|
|
||||||
|
|
||||||
RUN pip install -U git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.30.0-develop
|
|
||||||
RUN pip install uwsgi gevent bottle
|
|
||||||
|
|
||||||
ADD . /code/webagg/
|
|
||||||
ADD ./test/ /code/test/
|
|
||||||
|
|
||||||
WORKDIR /code/
|
|
||||||
CMD uwsgi /code/test/live.ini
|
|
||||||
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
|||||||
Resource Memento/Aggregator
|
|
||||||
===========================
|
|
||||||
|
|
||||||
This is a reference implementation of the `Resource/Memento Aggregator <https://github.com/webrecorder/platform-spec/wiki/ResourceMementoAggregator>`_
|
|
||||||
from the `Webrecorder Platform <https://github.com/webrecorder/platform-spec/wiki>`_
|
|
||||||
|
|
@ -1,12 +1,16 @@
|
|||||||
from warcio.limitreader import LimitReader
|
from warcio.limitreader import LimitReader
|
||||||
from warcio.statusandheaders import StatusAndHeadersParser
|
from warcio.statusandheaders import StatusAndHeadersParser
|
||||||
|
|
||||||
from pywb.utils.loaders import extract_post_query, append_post_query
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit, quote
|
from six.moves.urllib.parse import urlsplit, quote, unquote_plus
|
||||||
from six import iteritems, StringIO
|
from six import iteritems, StringIO
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import cgi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class DirectWSGIInputRequest(object):
|
class DirectWSGIInputRequest(object):
|
||||||
@ -78,13 +82,12 @@ class DirectWSGIInputRequest(object):
|
|||||||
|
|
||||||
buffered_stream = BytesIO()
|
buffered_stream = BytesIO()
|
||||||
|
|
||||||
post_query = extract_post_query('POST', mime, length, stream,
|
post_query = PostQueryExtractor('POST', mime, length, stream,
|
||||||
buffered_stream=buffered_stream,
|
buffered_stream=buffered_stream,
|
||||||
environ=self.env)
|
environ=self.env)
|
||||||
|
|
||||||
if post_query:
|
if post_query.append_post_query(url) != url:
|
||||||
self.env['wsgi.input'] = buffered_stream
|
self.env['wsgi.input'] = buffered_stream
|
||||||
url = append_post_query(url, post_query)
|
|
||||||
|
|
||||||
return url
|
return url
|
||||||
|
|
||||||
@ -171,4 +174,124 @@ class POSTInputRequest(DirectWSGIInputRequest):
|
|||||||
return self.status_headers.get_header(name)
|
return self.status_headers.get_header(name)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class PostQueryExtractor(object):
    """Read a POST request body and expose it as a query-string fragment.

    The constructor consumes up to ``length`` bytes from ``stream`` and
    converts them according to ``mime``:

    * ``application/x-www-form-urlencoded``: decoded to a native string and
      passed through ``unquote_plus``
    * ``multipart/*``: parsed with ``cgi.FieldStorage`` and re-encoded with
      ``urlencode``
    * ``application/x-amf``: handed to :meth:`amf_parse`
    * anything else: base64-encoded and prefixed with ``&__wb_post_data=``

    The result is stored in ``self.post_query``. It stays falsy (``b''``)
    when the method is not POST or the content length is missing, invalid
    or not positive, and is ``None`` when AMF parsing fails.
    """

    def __init__(self, method, mime, length, stream,
                 buffered_stream=None,
                 environ=None):
        """Extract the POST body from ``stream``.

        :param method: HTTP method; only ``POST`` (any case) is processed
        :param mime: request content type, may be ``None``
        :param length: content length, anything accepted by ``int()``
        :param stream: file-like object providing ``read(n)``
        :param buffered_stream: optional writable/seekable stream that
            receives a copy of the raw body and is rewound to position 0
        :param environ: optional dict passed through to :meth:`amf_parse`
        """
        # Falsy default so append_post_query() is a no-op unless a body
        # was actually read and converted below.
        self.post_query = b''

        if not method or method.upper() != 'POST':
            return

        try:
            length = int(length)
        except (ValueError, TypeError):
            return

        if length <= 0:
            return

        body = self._read_body(stream, length)

        if buffered_stream is not None:
            # Keep a replayable copy, since ``stream`` has been consumed.
            buffered_stream.write(body)
            buffered_stream.seek(0)

        mime = mime or ''

        if mime.startswith('application/x-www-form-urlencoded'):
            post_query = unquote_plus(to_native_str(body))

        elif mime.startswith('multipart/'):
            post_query = self._parse_multipart(body, mime)

        elif mime.startswith('application/x-amf'):
            post_query = self.amf_parse(body, environ)

        else:
            encoded = to_native_str(base64.b64encode(body))
            # NOTE: the leading '&' is kept as in the original code;
            # changing it would alter the URLs produced for such bodies.
            post_query = '&__wb_post_data=' + encoded

        self.post_query = post_query

    @staticmethod
    def _read_body(stream, length):
        """Read up to ``length`` bytes from ``stream``, stopping early on EOF."""
        chunks = []

        while length > 0:
            buff = stream.read(length)
            if not buff:
                break

            length -= len(buff)
            chunks.append(buff)

        return b''.join(chunks)

    @staticmethod
    def _parse_multipart(body, mime):
        """Parse a multipart body and return its fields url-encoded."""
        # Imported locally: the module-level imports of this file do not
        # provide ``six`` (as a module) or ``urlencode``.
        import six
        from six.moves.urllib.parse import urlencode

        env = {'REQUEST_METHOD': 'POST',
               'CONTENT_TYPE': mime,
               'CONTENT_LENGTH': len(body)}

        args = dict(fp=BytesIO(body),
                    environ=env,
                    keep_blank_values=True)

        if six.PY3:
            args['encoding'] = 'utf-8'

        data = cgi.FieldStorage(**args)

        values = [(item.name, item.value) for item in data.list]

        return urlencode(values, True)

    def amf_parse(self, string, environ):
        """Decode an AMF request body into a url-encoded query string.

        Uses ``pyamf.remoting.decode`` and reads the first element of the
        first decoded body, collecting its ``body``, ``source`` and
        ``operation`` attributes (the latter under the key ``op``) when
        present. If ``environ`` is not ``None``, the decoded result is
        stored in ``environ['pywb.inputdata']``.

        :param string: raw AMF request body (bytes)
        :param environ: optional dict receiving the decoded result
        :return: url-encoded query string, or ``None`` if anything fails
            (including ``pyamf`` not being installed); the traceback is
            printed in that case
        """
        try:
            from pyamf import remoting
            # Imported locally: ``urlencode`` is not among this file's
            # module-level imports.
            from six.moves.urllib.parse import urlencode

            res = remoting.decode(BytesIO(string))

            body = res.bodies[0][1].body[0]

            values = {}

            if hasattr(body, 'body'):
                values['body'] = body.body

            if hasattr(body, 'source'):
                values['source'] = body.source

            if hasattr(body, 'operation'):
                values['op'] = body.operation

            if environ is not None:
                environ['pywb.inputdata'] = res

            return urlencode(values)

        except Exception:
            # Best effort: an unparseable AMF body must not abort the request.
            import traceback
            traceback.print_exc()
            return None

    def append_post_query(self, url):
        """Return ``url`` with the extracted POST query appended.

        ``url`` is returned unchanged when no query was extracted.
        Otherwise the query is joined with ``?`` if ``url`` contains no
        ``?`` yet, and with ``&`` if it does.
        """
        if not self.post_query:
            return url

        separator = '&' if '?' in url else '?'

        return url + separator + self.post_query
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@ from pywb.warcserver.handlers import DefaultResourceHandler
|
|||||||
from pywb.warcserver.index.aggregator import SimpleAggregator
|
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||||
from pywb.warcserver.index.indexsource import LiveIndexSource, MementoIndexSource
|
from pywb.warcserver.index.indexsource import LiveIndexSource, MementoIndexSource
|
||||||
|
|
||||||
from pywb.urlrewrite.geventserver import GeventServer
|
from pywb.utils.geventserver import GeventServer
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
from pywb.utils.wbexception import NotFoundException
|
from pywb.utils.wbexception import NotFoundException
|
||||||
|
Loading…
x
Reference in New Issue
Block a user