1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor:

- fix pywb.indexer, pywb.manager, pywb.recorder packages, tests pass
move pywb.urlrewrite.geventserver -> pywb.utils.geventserver
move extract_post_query/append_post_query to inputrequest.PostQueryExtractor
remove to_native_str() in pywb.utils, redundant with warcio.utils version
remove obsolete readme, dockerfile
This commit is contained in:
Ilya Kreymer 2017-05-23 16:41:02 -07:00
parent ad33dc6728
commit 2907ed01c8
14 changed files with 158 additions and 188 deletions

View File

@ -1,7 +1,7 @@
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.webagg.utils import BUFF_SIZE from pywb.warcserver.inputrequest import PostQueryExtractor
from pywb.warcserver.utils import BUFF_SIZE
from warcio.timeutils import iso_date_to_timestamp from warcio.timeutils import iso_date_to_timestamp
from warcio.archiveiterator import ArchiveIterator from warcio.archiveiterator import ArchiveIterator
@ -68,9 +68,10 @@ class ArchiveIndexEntryMixin(object):
# merge POST/PUT body query # merge POST/PUT body query
post_query = other.get('_post_query') post_query = other.get('_post_query')
if post_query: url = self['url']
url = append_post_query(self['url'], post_query) new_url = post_query.append_post_query(url)
self['urlkey'] = canonicalize(url, surt_ordered) if post_query and new_url != url:
self['urlkey'] = canonicalize(new_url, surt_ordered)
other['urlkey'] = self['urlkey'] other['urlkey'] = self['urlkey']
referer = other.record.http_headers.get_header('referer') referer = other.record.http_headers.get_header('referer')
@ -180,7 +181,7 @@ class DefaultRecordParser(object):
method = record.http_headers.protocol method = record.http_headers.protocol
len_ = record.http_headers.get_header('Content-Length') len_ = record.http_headers.get_header('Content-Length')
post_query = extract_post_query(method, post_query = PostQueryExtractor(method,
entry.get('_content_type'), entry.get('_content_type'),
len_, len_,
record.raw_stream) record.raw_stream)

View File

@ -31,7 +31,7 @@ from bisect import insort
from six import StringIO from six import StringIO
from pywb.warc.archiveindexer import DefaultRecordParser from pywb.indexer.archiveindexer import DefaultRecordParser
import codecs import codecs
import six import six

View File

@ -200,9 +200,9 @@ Total: 4
from pywb import get_test_dir from pywb import get_test_dir
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename from pywb.indexer.cdxindexer import write_cdx_index, main, cdx_filename
from pywb.cdx.cdxobject import CDXObject from pywb.warcserver.index.cdxobject import CDXObject
from io import BytesIO from io import BytesIO
import sys import sys

View File

@ -1,6 +1,6 @@
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.cdx.cdxobject import CDXObject, URLKEY, ORIGINAL from pywb.warcserver.index.cdxobject import CDXObject, URLKEY, ORIGINAL
from pywb.warc.cdxindexer import CDXJ from pywb.indexer.cdxindexer import CDXJ
import os import os
import shutil import shutil

View File

@ -10,7 +10,7 @@ import portalocker
from warcio.timeutils import timestamp20_now from warcio.timeutils import timestamp20_now
from warcio.warcwriter import BaseWARCWriter from warcio.warcwriter import BaseWARCWriter
from pywb.webagg.utils import res_template from pywb.warcserver.utils import res_template
# ============================================================================ # ============================================================================

View File

@ -1,6 +1,6 @@
from pywb.webagg.utils import StreamIter, BUFF_SIZE from pywb.warcserver.utils import StreamIter, BUFF_SIZE
from pywb.webagg.utils import ParamFormatter, res_template from pywb.warcserver.utils import ParamFormatter, res_template
from pywb.webagg.inputrequest import DirectWSGIInputRequest from pywb.warcserver.inputrequest import DirectWSGIInputRequest
from warcio.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader

View File

@ -1,15 +1,15 @@
from pywb.utils.canonicalize import calc_search_range
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.cdxindexer import write_cdx_index
from warcio.timeutils import iso_date_to_timestamp from warcio.timeutils import iso_date_to_timestamp
from io import BytesIO from io import BytesIO
import os import os
from pywb.webagg.indexsource import RedisIndexSource from pywb.utils.canonicalize import calc_search_range
from pywb.webagg.aggregator import SimpleAggregator from pywb.indexer.cdxindexer import write_cdx_index
from pywb.webagg.utils import res_template
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.warcserver.index.indexsource import RedisIndexSource
from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.warcserver.utils import res_template
from pywb.recorder.filters import WriteRevisitDupePolicy from pywb.recorder.filters import WriteRevisitDupePolicy

View File

@ -1,9 +1,8 @@
from gevent import monkey; monkey.patch_all() from gevent import monkey; monkey.patch_all()
import gevent import gevent
import pywb.webagg from pywb.warcserver.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
from pywb.webagg.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path from pywb.warcserver.test.testutils import FakeRedisTests
from pywb.webagg.test.testutils import FakeRedisTests
import os import os
import webtest import webtest
@ -17,16 +16,16 @@ from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARC
from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
from pywb.webagg.utils import MementoUtils from pywb.warcserver.utils import MementoUtils
from pywb.cdx.cdxobject import CDXObject from pywb.warcserver.index.cdxobject import CDXObject
from warcio.statusandheaders import StatusAndHeadersParser from warcio.statusandheaders import StatusAndHeadersParser
from warcio.bufferedreaders import DecompressingBufferedReader from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
from warcio.archiveiterator import ArchiveIterator from warcio.archiveiterator import ArchiveIterator
from pywb.warc.cdxindexer import write_cdx_index from pywb.indexer.cdxindexer import write_cdx_index
from six.moves.urllib.parse import quote, unquote, urlencode from six.moves.urllib.parse import quote, unquote, urlencode
from io import BytesIO from io import BytesIO

View File

@ -18,6 +18,7 @@ import cgi
from io import open, BytesIO from io import open, BytesIO
from warcio.limitreader import LimitReader from warcio.limitreader import LimitReader
from warcio.utils import to_native_str
try: try:
from boto import connect_s3 from boto import connect_s3
@ -60,140 +61,6 @@ def load_yaml_config(config_file):
return config return config
#=================================================================
def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
    """
    Coerce ``value`` to the interpreter's native ``str`` type.

    A native ``str`` is returned untouched (``func`` is not applied to it).
    On Python 3, bytes are decoded with ``encoding``; on Python 2, unicode
    text is encoded with ``encoding``. In both of those cases the converted
    string is passed through ``func`` before being returned.

    Any other kind of value yields ``None``.
    """
    if isinstance(value, str):
        return value

    converted = None

    if six.PY3 and isinstance(value, six.binary_type):  #pragma: no cover
        converted = func(value.decode(encoding))
    elif six.PY2 and isinstance(value, six.text_type):  #pragma: no cover
        converted = func(value.encode(encoding))

    return converted
#=================================================================
def extract_post_query(method, mime, length, stream,
                       buffered_stream=None,
                       environ=None):
    """
    Read a POST body from ``stream`` and convert it to a query string.

    Returns None when ``method`` is not POST, or when ``length`` is not a
    positive integer. Otherwise up to ``length`` bytes are read and, if
    ``buffered_stream`` is given, a copy of the body is written to it and
    it is rewound to the start.

    The body is then converted according to ``mime``:

    - application/x-www-form-urlencoded: the url-decoded body
    - multipart/*: the form fields, re-encoded as a url-encoded query
    - application/x-amf: the result of ``amf_parse()`` (None on failure)
    - anything else: '&__wb_post_data=' followed by the base64 of the body
    """
    if method.upper() != 'POST':
        return None

    try:
        remaining = int(length)
    except (ValueError, TypeError):
        return None

    if remaining <= 0:
        return None

    # read() may return short, so keep reading until the declared
    # length is consumed or the stream runs dry
    body = b''

    while remaining > 0:
        chunk = stream.read(remaining)
        remaining -= len(chunk)

        if not chunk:
            break

        body += chunk

    if buffered_stream:
        buffered_stream.write(body)
        buffered_stream.seek(0)

    content_type = mime or ''

    if content_type.startswith('application/x-www-form-urlencoded'):
        return unquote_plus(to_native_str(body))

    if content_type.startswith('multipart/'):
        cgi_env = {'REQUEST_METHOD': 'POST',
                   'CONTENT_TYPE': content_type,
                   'CONTENT_LENGTH': len(body)}

        field_args = dict(fp=BytesIO(body),
                          environ=cgi_env,
                          keep_blank_values=True)

        if six.PY3:
            field_args['encoding'] = 'utf-8'

        form = cgi.FieldStorage(**field_args)

        pairs = [(item.name, item.value) for item in form.list]
        return urlencode(pairs, True)

    if content_type.startswith('application/x-amf'):
        return amf_parse(body, environ)

    encoded = to_native_str(base64.b64encode(body))
    return '&__wb_post_data=' + encoded
#=================================================================
def amf_parse(string, environ):
    """
    Decode an AMF remoting request body into a url-encoded query string.

    The first element of the first decoded message body is inspected and
    its ``body``, ``source`` and ``operation`` attributes, where present,
    are emitted as the ``body``, ``source`` and ``op`` query params.

    If ``environ`` is not None, the full decoded request is stored in it
    under 'pywb.inputdata'.

    Best-effort: on any failure (including pyamf not being importable)
    the traceback and the error are printed and None is returned.
    """
    try:
        from pyamf import remoting

        decoded = remoting.decode(BytesIO(string))
        message = decoded.bodies[0][1].body[0]

        # (attribute on the AMF message, query param name), in output order
        attr_to_param = (('body', 'body'),
                         ('source', 'source'),
                         ('operation', 'op'))

        values = {}
        for attr, param in attr_to_param:
            if hasattr(message, attr):
                values[param] = getattr(message, attr)

        if environ is not None:
            environ['pywb.inputdata'] = decoded

        return urlencode(values)

    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
        return None
#=================================================================
def append_post_query(url, post_query):
    """
    Return ``url`` with ``post_query`` appended as query params.

    The query is joined with '?' when ``url`` has no '?' yet, and with
    '&' otherwise. An empty or None ``post_query`` leaves ``url`` as is.
    """
    if not post_query:
        return url

    separator = '&' if '?' in url else '?'
    return url + separator + post_query
#================================================================= #=================================================================
def extract_client_cookie(env, cookie_name): def extract_client_cookie(env, cookie_name):
cookie_header = env.get('HTTP_COOKIE') cookie_header = env.get('HTTP_COOKIE')

View File

@ -1,14 +0,0 @@
FROM python:3.5
WORKDIR /code/
RUN pip install -U git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.30.0-develop
RUN pip install uwsgi gevent bottle
ADD . /code/webagg/
ADD ./test/ /code/test/
WORKDIR /code/
CMD uwsgi /code/test/live.ini

View File

@ -1,6 +0,0 @@
Resource Memento/Aggregator
===========================
This is a reference implementation of the `Resource/Memento Aggregator <https://github.com/webrecorder/platform-spec/wiki/ResourceMementoAggregator>`_
from the `Webrecorder Platform <https://github.com/webrecorder/platform-spec/wiki>`_

View File

@ -1,12 +1,16 @@
from warcio.limitreader import LimitReader from warcio.limitreader import LimitReader
from warcio.statusandheaders import StatusAndHeadersParser from warcio.statusandheaders import StatusAndHeadersParser
from pywb.utils.loaders import extract_post_query, append_post_query from warcio.utils import to_native_str
from six.moves.urllib.parse import urlsplit, quote from six.moves.urllib.parse import urlsplit, quote, unquote_plus
from six import iteritems, StringIO from six import iteritems, StringIO
from io import BytesIO from io import BytesIO
import base64
import cgi
#============================================================================= #=============================================================================
class DirectWSGIInputRequest(object): class DirectWSGIInputRequest(object):
@ -78,13 +82,12 @@ class DirectWSGIInputRequest(object):
buffered_stream = BytesIO() buffered_stream = BytesIO()
post_query = extract_post_query('POST', mime, length, stream, post_query = PostQueryExtractor('POST', mime, length, stream,
buffered_stream=buffered_stream, buffered_stream=buffered_stream,
environ=self.env) environ=self.env)
if post_query: if post_query.append_post_query(url) != url:
self.env['wsgi.input'] = buffered_stream self.env['wsgi.input'] = buffered_stream
url = append_post_query(url, post_query)
return url return url
@ -171,4 +174,124 @@ class POSTInputRequest(DirectWSGIInputRequest):
return self.status_headers.get_header(name) return self.status_headers.get_header(name)
# ============================================================================
class PostQueryExtractor(object):
    """
    Reads a POST request body and converts it into a query string which
    can then be appended to the request url via ``append_post_query()``.

    After construction, ``post_query`` holds the extracted query:
    ``b''`` if nothing was extracted (not a POST, or no usable content
    length), None if AMF decoding failed, otherwise a native string.
    """
    def __init__(self, method, mime, length, stream,
                 buffered_stream=None,
                 environ=None):
        """
        Extract the POST body from ``stream`` and store the resulting
        query in ``self.post_query``.

        :param method: request method; only POST (any case) is processed
        :param mime: request content type, may be None
        :param length: declared content length; must parse to an int > 0
        :param stream: object with ``read(n)`` providing the body bytes
        :param buffered_stream: optional writable/seekable stream which
            receives a copy of the body and is rewound to the start
        :param environ: optional dict; for AMF bodies the decoded request
            is stored in it under 'pywb.inputdata'

        Conversion by content type:

        - application/x-www-form-urlencoded: the url-decoded body
        - multipart/*: the form fields, re-encoded as a url-encoded query
        - application/x-amf: the result of ``amf_parse()``
        - anything else: '&__wb_post_data=' followed by base64 of the body
        """
        self.post_query = b''

        if method.upper() != 'POST':
            return

        try:
            length = int(length)
        except (ValueError, TypeError):
            return

        if length <= 0:
            return

        # read() may return short, so keep reading until the declared
        # length is consumed or the stream runs dry; chunks are joined
        # once to avoid repeatedly copying the growing body
        chunks = []

        while length > 0:
            buff = stream.read(length)
            if not buff:
                break

            length -= len(buff)
            chunks.append(buff)

        post_query = b''.join(chunks)

        if buffered_stream:
            buffered_stream.write(post_query)
            buffered_stream.seek(0)

        if not mime:
            mime = ''

        if mime.startswith('application/x-www-form-urlencoded'):
            post_query = to_native_str(post_query)
            post_query = unquote_plus(post_query)

        elif mime.startswith('multipart/'):
            # imported here because the module header only pulls in
            # other names from six / six.moves.urllib.parse
            import six
            from six.moves.urllib.parse import urlencode

            env = {'REQUEST_METHOD': 'POST',
                   'CONTENT_TYPE': mime,
                   'CONTENT_LENGTH': len(post_query)}

            args = dict(fp=BytesIO(post_query),
                        environ=env,
                        keep_blank_values=True)

            if six.PY3:
                args['encoding'] = 'utf-8'

            data = cgi.FieldStorage(**args)

            values = [(item.name, item.value) for item in data.list]
            post_query = urlencode(values, True)

        elif mime.startswith('application/x-amf'):
            post_query = self.amf_parse(post_query, environ)

        else:
            post_query = base64.b64encode(post_query)
            post_query = to_native_str(post_query)
            # NOTE(review): the leading '&' is kept as-is, so
            # append_post_query() yields '?&__wb_post_data=...' for a url
            # without a query -- confirm before changing, since existing
            # index keys may depend on this exact form
            post_query = '&__wb_post_data=' + post_query

        self.post_query = post_query

    def amf_parse(self, string, environ):
        """
        Decode an AMF remoting request body into a url-encoded query.

        The first element of the first decoded message body is inspected
        and its ``body``, ``source`` and ``operation`` attributes, where
        present, become the ``body``, ``source`` and ``op`` query params.
        If ``environ`` is not None, the full decoded request is stored in
        it under 'pywb.inputdata'.

        Best-effort: on any failure (including pyamf not being importable)
        the traceback and the error are printed and None is returned.
        """
        try:
            from pyamf import remoting
            # imported here because the module header does not import
            # urlencode
            from six.moves.urllib.parse import urlencode

            res = remoting.decode(BytesIO(string))
            body = res.bodies[0][1].body[0]

            values = {}

            if hasattr(body, 'body'):
                values['body'] = body.body

            if hasattr(body, 'source'):
                values['source'] = body.source

            if hasattr(body, 'operation'):
                values['op'] = body.operation

            if environ is not None:
                environ['pywb.inputdata'] = res

            return urlencode(values)

        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            return None

    def append_post_query(self, url):
        """
        Return ``url`` with the extracted post query appended.

        The query is joined with '?' when ``url`` has no '?' yet, and with
        '&' otherwise. If no query was extracted, ``url`` is returned
        unchanged, so callers can compare the result with ``url`` to tell
        whether anything was appended.
        """
        if not self.post_query:
            return url

        if '?' not in url:
            url += '?'
        else:
            url += '&'

        url += self.post_query
        return url

View File

@ -14,7 +14,7 @@ from pywb.warcserver.handlers import DefaultResourceHandler
from pywb.warcserver.index.aggregator import SimpleAggregator from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.warcserver.index.indexsource import LiveIndexSource, MementoIndexSource from pywb.warcserver.index.indexsource import LiveIndexSource, MementoIndexSource
from pywb.urlrewrite.geventserver import GeventServer from pywb.utils.geventserver import GeventServer
from pywb import get_test_dir from pywb import get_test_dir
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException