mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor:
- merge pywb.urlrewrite -> pywb.rewrite, remove obsolete stuff (rewrite_content.py, rewrite_live.py, dsrules.py)
- move wbrequestresponse -> pywb.apps
- move pywb.webapp.handlers -> pywb.apps.static_handler
- remove pywb.webapp, pywb.framework packages
- disable old header_rewriter, content_rewriter tests
- finish renaming from previous warcserver refactor
- all other tests passing!
This commit is contained in:
parent
2907ed01c8
commit
97182b71b7
@ -6,7 +6,7 @@ import logging
|
||||
|
||||
#=============================================================================
|
||||
def webagg(args=None):
|
||||
WebaggCli(args=args,
|
||||
WarcServerCli(args=args,
|
||||
default_port=8070,
|
||||
desc='pywb Web Aggregator Server').run()
|
||||
|
||||
@ -103,18 +103,18 @@ class ReplayCli(BaseCli):
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class WebaggCli(BaseCli):
|
||||
class WarcServerCli(BaseCli):
|
||||
def load(self):
|
||||
from pywb.webagg.autoapp import AutoConfigApp
|
||||
from pywb.warcserver.warcserver import WarcServer
|
||||
|
||||
super(WebaggCli, self).load()
|
||||
return AutoConfigApp(custom_config=self.extra_config)
|
||||
super(WarcServerCli, self).load()
|
||||
return WarcServer(custom_config=self.extra_config)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class WaybackCli(ReplayCli):
|
||||
def load(self):
|
||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
|
||||
super(WaybackCli, self).load()
|
||||
return FrontEndApp(custom_config=self.extra_config)
|
||||
@ -123,7 +123,7 @@ class WaybackCli(ReplayCli):
|
||||
#=============================================================================
|
||||
class LiveCli(BaseCli):
|
||||
def load(self):
|
||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
|
||||
self.r.live = True
|
||||
|
||||
|
@ -8,16 +8,15 @@ from six.moves.urllib.parse import urljoin
|
||||
from six import iteritems
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config, to_native_str
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
|
||||
from pywb.webagg.autoapp import AutoConfigApp
|
||||
from pywb.webapp.handlers import StaticHandler
|
||||
from pywb.warcserver.warcserver import WarcServer
|
||||
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.rewrite.templateview import BaseInsertView
|
||||
|
||||
from pywb.urlrewrite.geventserver import GeventServer
|
||||
from pywb.urlrewrite.templateview import BaseInsertView
|
||||
|
||||
from pywb.urlrewrite.rewriterapp import RewriterApp, UpstreamException
|
||||
from pywb.apps.static_handler import StaticHandler
|
||||
from pywb.apps.rewriterapp import RewriterApp, UpstreamException
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
import os
|
||||
import traceback
|
||||
@ -27,14 +26,14 @@ import traceback
|
||||
class FrontEndApp(object):
|
||||
def __init__(self, config_file='./config.yaml', custom_config=None):
|
||||
self.debug = True
|
||||
self.webagg = AutoConfigApp(config_file=config_file,
|
||||
custom_config=custom_config)
|
||||
self.warcserver = WarcServer(config_file=config_file,
|
||||
custom_config=custom_config)
|
||||
|
||||
framed_replay = self.webagg.config.get('framed_replay', True)
|
||||
framed_replay = self.warcserver.config.get('framed_replay', True)
|
||||
|
||||
self.rewriterapp = RewriterApp(framed_replay, config=self.webagg.config)
|
||||
self.rewriterapp = RewriterApp(framed_replay, config=self.warcserver.config)
|
||||
|
||||
self.webagg_server = GeventServer(self.webagg, port=0)
|
||||
self.warcserver_server = GeventServer(self.warcserver, port=0)
|
||||
|
||||
self.static_handler = StaticHandler('pywb/static/')
|
||||
|
||||
@ -46,12 +45,12 @@ class FrontEndApp(object):
|
||||
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
|
||||
self.url_map.add(Rule('/', endpoint=self.serve_home))
|
||||
|
||||
self.rewriterapp.paths = self.get_upstream_paths(self.webagg_server.port)
|
||||
self.rewriterapp.paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||
|
||||
self.templates_dir = self.webagg.config.get('templates_dir', 'templates')
|
||||
self.static_dir = self.webagg.config.get('static_dir', 'static')
|
||||
self.templates_dir = self.warcserver.config.get('templates_dir', 'templates')
|
||||
self.static_dir = self.warcserver.config.get('static_dir', 'static')
|
||||
|
||||
metadata_templ = os.path.join(self.webagg.root_dir, '{coll}', 'metadata.yaml')
|
||||
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
||||
self.metadata_cache = MetadataCache(metadata_templ)
|
||||
|
||||
def get_upstream_paths(self, port):
|
||||
@ -61,8 +60,8 @@ class FrontEndApp(object):
|
||||
|
||||
def serve_home(self, environ):
|
||||
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
|
||||
fixed_routes = self.webagg.list_fixed_routes()
|
||||
dynamic_routes = self.webagg.list_dynamic_routes()
|
||||
fixed_routes = self.warcserver.list_fixed_routes()
|
||||
dynamic_routes = self.warcserver.list_dynamic_routes()
|
||||
|
||||
routes = fixed_routes + dynamic_routes
|
||||
|
||||
@ -76,7 +75,7 @@ class FrontEndApp(object):
|
||||
|
||||
def serve_static(self, environ, coll='', filepath=''):
|
||||
if coll:
|
||||
path = os.path.join(self.webagg.root_dir, coll, self.static_dir)
|
||||
path = os.path.join(self.warcserver.root_dir, coll, self.static_dir)
|
||||
else:
|
||||
path = self.static_dir
|
||||
|
||||
@ -116,7 +115,7 @@ class FrontEndApp(object):
|
||||
|
||||
kwargs = {'coll': coll}
|
||||
|
||||
if coll in self.webagg.list_fixed_routes():
|
||||
if coll in self.warcserver.list_fixed_routes():
|
||||
kwargs['type'] = 'replay-fixed'
|
||||
else:
|
||||
kwargs['type'] = 'replay-dyn'
|
||||
@ -131,23 +130,23 @@ class FrontEndApp(object):
|
||||
|
||||
def setup_paths(self, environ, coll):
|
||||
pop_path_info(environ)
|
||||
if not coll or not self.webagg.root_dir:
|
||||
if not coll or not self.warcserver.root_dir:
|
||||
return
|
||||
|
||||
environ['pywb.templates_dir'] = os.path.join(self.webagg.root_dir,
|
||||
environ['pywb.templates_dir'] = os.path.join(self.warcserver.root_dir,
|
||||
coll,
|
||||
self.templates_dir)
|
||||
|
||||
def serve_listing(self, environ):
|
||||
result = {'fixed': self.webagg.list_fixed_routes(),
|
||||
'dynamic': self.webagg.list_dynamic_routes()
|
||||
result = {'fixed': self.warcserver.list_fixed_routes(),
|
||||
'dynamic': self.warcserver.list_dynamic_routes()
|
||||
}
|
||||
|
||||
return WbResponse.json_response(result)
|
||||
|
||||
def is_valid_coll(self, coll):
|
||||
return (coll in self.webagg.list_fixed_routes() or
|
||||
coll in self.webagg.list_dynamic_routes())
|
||||
return (coll in self.warcserver.list_fixed_routes() or
|
||||
coll in self.warcserver.list_dynamic_routes())
|
||||
|
||||
def raise_not_found(self, environ, msg):
|
||||
raise NotFound(response=self.rewriterapp._error_response(environ, msg))
|
@ -1,5 +1,5 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
|
||||
application = FrontEndApp(config_file=None,
|
||||
custom_config={'collections': {'live': '$live'}})
|
||||
|
@ -1,9 +1,12 @@
|
||||
import requests
|
||||
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||
|
||||
#from pywb.rewrite.rewrite_amf import RewriteAMFMixin
|
||||
#from pywb.rewrite.rewrite_dash import RewriteDASHMixin
|
||||
#from pywb.rewrite.rewrite_content import RewriteContent
|
||||
from pywb.urlrewrite.rewriter import DefaultRewriter
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
|
||||
@ -16,18 +19,14 @@ from warcio.timeutils import http_date_to_timestamp
|
||||
from warcio.bufferedreaders import BufferedReader
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from pywb.webagg.utils import BUFF_SIZE
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.warcserver.utils import BUFF_SIZE
|
||||
from pywb.warcserver.utils import MementoUtils
|
||||
|
||||
from pywb.webagg.utils import MementoUtils
|
||||
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||
|
||||
from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest
|
||||
from pywb.urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
||||
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||
|
||||
|
||||
from io import BytesIO
|
||||
@ -71,7 +70,7 @@ class RewriterApp(object):
|
||||
#frame_type = 'inverse' if framed_replay else False
|
||||
|
||||
#self.content_rewriter = Rewriter(is_framed_replay=frame_type)
|
||||
self.content_rw = DefaultRewriter('pkg://pywb/rules.yaml', self.replay_mod)
|
||||
self.content_rw = DefaultRewriter(replay_mod=self.replay_mod)
|
||||
|
||||
if not jinja_env:
|
||||
jinja_env = JinjaEnv(globals={'static_path': 'static'})
|
@ -3,7 +3,7 @@ import os
|
||||
|
||||
from pywb.utils.loaders import LocalFileLoader
|
||||
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
|
||||
#=================================================================
|
@ -1,9 +1,9 @@
|
||||
from gevent import monkey; monkey.patch_all(thread=False)
|
||||
|
||||
from pywb.webagg.test.testutils import LiveServerTests, BaseTestClass
|
||||
from pywb.webagg.test.testutils import FakeRedisTests
|
||||
from pywb.warcserver.test.testutils import LiveServerTests, BaseTestClass
|
||||
from pywb.warcserver.test.testutils import FakeRedisTests
|
||||
|
||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
|
||||
import os
|
||||
import webtest
|
||||
@ -12,10 +12,10 @@ import webtest
|
||||
LIVE_CONFIG = {'collections': {'live': '$live'}}
|
||||
|
||||
|
||||
class TestRewriter(FakeRedisTests, BaseTestClass):
|
||||
class TestRewriterApp(FakeRedisTests, BaseTestClass):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super(TestRewriter, cls).setup_class()
|
||||
super(TestRewriterApp, cls).setup_class()
|
||||
|
||||
#cls.app = RWApp.create_app(replay_port=cls.server.port)
|
||||
#cls.testapp = webtest.TestApp(cls.app.app)
|
@ -1,4 +1,4 @@
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
|
||||
|
7
pywb/apps/warcserverapp.py
Normal file
7
pywb/apps/warcserverapp.py
Normal file
@ -0,0 +1,7 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
from pywb.warcserver.warcserver import WarcServer
|
||||
|
||||
application = WarcServer(custom_config={'collections': {'live': '$live'}})
|
||||
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
|
||||
application = FrontEndApp()
|
||||
|
||||
|
@ -1,7 +0,0 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
from pywb.webagg.autoapp import AutoConfigApp
|
||||
|
||||
application = AutoConfigApp(custom_config={'collections': {'live': '$live'}})
|
||||
|
||||
|
||||
|
@ -122,7 +122,7 @@ directory structure expected by pywb
|
||||
self._cdx_index(cdx_file, [self.archive_dir])
|
||||
|
||||
def _cdx_index(self, out, input_, rel_root=None):
|
||||
from pywb.warc.cdxindexer import write_multi_cdx_index
|
||||
from pywb.indexer.cdxindexer import write_multi_cdx_index
|
||||
|
||||
options = dict(append_post=True,
|
||||
cdxj=True,
|
||||
|
@ -9,7 +9,7 @@ import re
|
||||
import webencodings
|
||||
import tempfile
|
||||
|
||||
from pywb.webagg.utils import StreamIter, BUFF_SIZE
|
||||
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
|
||||
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
@ -277,8 +277,9 @@ class RewriteInfo(object):
|
||||
|
||||
self.cookie_rewriter = cookie_rewriter
|
||||
|
||||
self._fill_text_type_and_charset()
|
||||
self._resolve_text_type()
|
||||
if self.record:
|
||||
self._fill_text_type_and_charset()
|
||||
self._resolve_text_type()
|
||||
|
||||
def _fill_text_type_and_charset(self):
|
||||
content_type = self.record.http_headers.get_header('Content-Type')
|
||||
|
@ -7,7 +7,7 @@ from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
|
||||
|
||||
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter
|
||||
from pywb.rewrite.header_rewriter import PrefixHeaderRewriter
|
||||
|
||||
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
||||
|
||||
@ -75,6 +75,10 @@ class DefaultRewriter(BaseContentRewriter):
|
||||
'text/plain': 'plain',
|
||||
}
|
||||
|
||||
def __init__(self, rules_file=None, replay_mod=''):
|
||||
rules_file = rules_file or 'pkg://pywb/rules.yaml'
|
||||
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
|
||||
|
||||
def init_js_regex(self, regexs):
|
||||
return RegexRewriter.parse_rules_from_config(regexs)
|
||||
|
@ -1,102 +1,87 @@
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
from warcio.timeutils import datetime_to_http_date
|
||||
from datetime import datetime, timedelta
|
||||
import six
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewrittenStatusAndHeaders(object):
|
||||
def __init__(self, statusline, headers,
|
||||
removed_header_dict, text_type, charset):
|
||||
#=============================================================================
|
||||
class PrefixHeaderRewriter(object):
|
||||
header_rules = {
|
||||
'content-type': 'keep',
|
||||
'content-disposition': 'keep',
|
||||
'content-range': 'keep',
|
||||
'accept-rangees': 'keep',
|
||||
'www-authenticate': 'keep',
|
||||
'proxy-authenticate': 'keep',
|
||||
|
||||
self.status_headers = StatusAndHeaders(statusline, headers)
|
||||
self.removed_header_dict = removed_header_dict
|
||||
self.text_type = text_type
|
||||
self.charset = charset
|
||||
'location': 'url-rewrite',
|
||||
'content-location': 'url-rewrite',
|
||||
'content-base': 'url-rewrite',
|
||||
|
||||
def contains_removed_header(self, name, value):
|
||||
return self.removed_header_dict.get(name) == value
|
||||
'transfer-encoding': 'prefix',
|
||||
'connection': 'prefix',
|
||||
|
||||
def readd_rewrite_removed(self):
|
||||
for name in HeaderRewriter.KEEP_NO_REWRITE_HEADERS:
|
||||
value = self.removed_header_dict.get(name)
|
||||
if value is not None:
|
||||
self.status_headers.headers.append((name, value))
|
||||
'content-encoding': 'keep-if-no-content-rewrite',
|
||||
'content-length': 'content-length',
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HeaderRewriter(object):
|
||||
REWRITE_TYPES = {
|
||||
'html': ['text/html',
|
||||
'application/xhtml',
|
||||
'application/xhtml+xml'],
|
||||
|
||||
'css': ['text/css'],
|
||||
|
||||
'js': ['text/javascript',
|
||||
'application/javascript',
|
||||
'application/x-javascript'],
|
||||
|
||||
'json': ['application/json'],
|
||||
|
||||
'hls': ['application/x-mpegURL'],
|
||||
|
||||
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
||||
|
||||
'plain': ['text/plain'],
|
||||
'set-cookie': 'cookie',
|
||||
'cookie': 'cookie',
|
||||
}
|
||||
|
||||
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range',
|
||||
'accept-ranges', 'www-authenticate', 'proxy-authenticate']
|
||||
|
||||
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
|
||||
|
||||
REMOVE_ALWAYS_HEADERS = ['transfer-encoding']
|
||||
|
||||
KEEP_PROXY_HEADERS = ['content-security-policy', 'strict-transport-security']
|
||||
|
||||
KEEP_NO_REWRITE_HEADERS = ['content-length', 'content-encoding']
|
||||
|
||||
COOKIE_HEADERS = ['set-cookie', 'cookie']
|
||||
|
||||
CACHE_HEADERS = ['cache-control', 'expires', 'etag', 'last-modified']
|
||||
|
||||
|
||||
def __init__(self, header_prefix='X-Archive-Orig-'):
|
||||
def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
|
||||
self.header_prefix = header_prefix
|
||||
self.rwinfo = rwinfo
|
||||
self.http_headers = rwinfo.record.http_headers
|
||||
|
||||
def rewrite(self, status_headers, urlrewriter, cookie_rewriter):
|
||||
content_type = status_headers.get_header('Content-Type')
|
||||
text_type = None
|
||||
charset = None
|
||||
content_modified = False
|
||||
http_cache = None
|
||||
if urlrewriter:
|
||||
http_cache = urlrewriter.rewrite_opts.get('http_cache')
|
||||
if rwinfo.is_url_rw():
|
||||
self.default_rule = 'prefix'
|
||||
else:
|
||||
self.default_rule = 'keep'
|
||||
|
||||
if content_type:
|
||||
text_type = self._extract_text_type(content_type)
|
||||
if text_type:
|
||||
charset = self._extract_char_set(content_type)
|
||||
content_modified = True
|
||||
def __call__(self):
|
||||
new_headers_list = []
|
||||
for name, value in self.http_headers.headers:
|
||||
rule = self.header_rules.get(name.lower(), self.default_rule)
|
||||
new_header = self.rewrite_header(name, value, rule)
|
||||
if new_header:
|
||||
if isinstance(new_header, list):
|
||||
new_headers_list.extend(new_header)
|
||||
else:
|
||||
new_headers_list.append(new_header)
|
||||
|
||||
result = self._rewrite_headers(status_headers.headers,
|
||||
urlrewriter,
|
||||
cookie_rewriter,
|
||||
content_modified,
|
||||
http_cache)
|
||||
return StatusAndHeaders(self.http_headers.statusline,
|
||||
headers=new_headers_list,
|
||||
protocol=self.http_headers.protocol)
|
||||
|
||||
new_headers = result[0]
|
||||
removed_header_dict = result[1]
|
||||
def rewrite_header(self, name, value, rule):
|
||||
if rule == 'keep':
|
||||
return (name, value)
|
||||
|
||||
if http_cache != None and http_cache != 'pass':
|
||||
self._add_cache_headers(new_headers, http_cache)
|
||||
elif rule == 'url-rewrite':
|
||||
return (name, self.rwinfo.url_rewriter.rewrite(value))
|
||||
|
||||
return RewrittenStatusAndHeaders(status_headers.statusline,
|
||||
new_headers,
|
||||
removed_header_dict,
|
||||
text_type,
|
||||
charset)
|
||||
elif rule == 'keep-if-no-content-rewrite':
|
||||
if not self.rwinfo.is_content_rw:
|
||||
return (name, value)
|
||||
|
||||
elif rule == 'content-length':
|
||||
if value == '0':
|
||||
return (name, value)
|
||||
|
||||
if not self.rwinfo.is_content_rw:
|
||||
try:
|
||||
if int(value) >= 0:
|
||||
return (name, value)
|
||||
except:
|
||||
pass
|
||||
|
||||
elif rule == 'cookie':
|
||||
if self.rwinfo.cookie_rewriter:
|
||||
return self.rwinfo.cookie_rewriter.rewrite(value)
|
||||
else:
|
||||
return (name, value)
|
||||
|
||||
# default 'prefix'
|
||||
return (self.header_prefix + name, value)
|
||||
|
||||
def _add_cache_headers(self, new_headers, http_cache):
|
||||
try:
|
||||
@ -112,76 +97,4 @@ class HeaderRewriter(object):
|
||||
new_headers.append(('Cache-Control', 'max-age=' + str(age)))
|
||||
new_headers.append(('Expires', datetime_to_http_date(dt)))
|
||||
|
||||
def _extract_text_type(self, content_type):
|
||||
for ctype, mimelist in six.iteritems(self.REWRITE_TYPES):
|
||||
if any((mime in content_type) for mime in mimelist):
|
||||
return ctype
|
||||
|
||||
return None
|
||||
|
||||
def _extract_char_set(self, content_type):
|
||||
CHARSET_TOKEN = 'charset='
|
||||
idx = content_type.find(CHARSET_TOKEN)
|
||||
if idx < 0:
|
||||
return None
|
||||
|
||||
return content_type[idx + len(CHARSET_TOKEN):].lower()
|
||||
|
||||
def _rewrite_headers(self, headers, urlrewriter,
|
||||
cookie_rewriter,
|
||||
content_modified,
|
||||
http_cache):
|
||||
|
||||
new_headers = []
|
||||
removed_header_dict = {}
|
||||
|
||||
def add_header(name, value):
|
||||
new_headers.append((name, value))
|
||||
|
||||
def add_prefixed_header(name, value):
|
||||
new_headers.append((self.header_prefix + name, value))
|
||||
|
||||
for (name, value) in headers:
|
||||
lowername = name.lower()
|
||||
|
||||
if lowername in self.PROXY_HEADERS:
|
||||
add_header(name, value)
|
||||
|
||||
elif urlrewriter and urlrewriter.prefix and lowername in self.URL_REWRITE_HEADERS:
|
||||
new_headers.append((name, urlrewriter.rewrite(value)))
|
||||
|
||||
elif lowername in self.KEEP_NO_REWRITE_HEADERS:
|
||||
if content_modified and value != '0':
|
||||
removed_header_dict[lowername] = value
|
||||
add_prefixed_header(name, value)
|
||||
else:
|
||||
add_header(name, value)
|
||||
|
||||
elif lowername in self.KEEP_PROXY_HEADERS:
|
||||
if urlrewriter.prefix:
|
||||
removed_header_dict[lowername] = value
|
||||
add_prefixed_header(name, value)
|
||||
else:
|
||||
add_header(name, value)
|
||||
|
||||
elif lowername in self.REMOVE_ALWAYS_HEADERS:
|
||||
removed_header_dict[lowername] = value
|
||||
add_prefixed_header(name, value)
|
||||
|
||||
elif (lowername in self.COOKIE_HEADERS and
|
||||
cookie_rewriter):
|
||||
cookie_list = cookie_rewriter.rewrite(value)
|
||||
new_headers.extend(cookie_list)
|
||||
|
||||
elif (lowername in self.CACHE_HEADERS):
|
||||
if http_cache == 'pass':
|
||||
add_header(name, value)
|
||||
else:
|
||||
add_prefixed_header(name, value)
|
||||
|
||||
elif urlrewriter and urlrewriter.prefix:
|
||||
add_prefixed_header(name, value)
|
||||
else:
|
||||
add_header(name, value)
|
||||
|
||||
return (new_headers, removed_header_dict)
|
||||
|
@ -1,403 +0,0 @@
|
||||
#import chardet
|
||||
import pkgutil
|
||||
import webencodings
|
||||
import yaml
|
||||
import re
|
||||
|
||||
#from chardet.universaldetector import UniversalDetector
|
||||
from io import BytesIO
|
||||
|
||||
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
|
||||
|
||||
from pywb.rewrite.rewriterules import RewriteRules
|
||||
|
||||
from pywb.utils.dsrules import RuleSet
|
||||
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||
from warcio.bufferedreaders import ChunkedDataReader, BufferedReader
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteContent(object):
|
||||
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
|
||||
|
||||
TAG_REGEX = re.compile(b'^\s*\<')
|
||||
|
||||
CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
|
||||
|
||||
BUFF_SIZE = 16384
|
||||
|
||||
def __init__(self, ds_rules_file=None, is_framed_replay=False):
|
||||
self.ruleset = RuleSet(RewriteRules, 'rewrite',
|
||||
default_rule_config={},
|
||||
ds_rules_file=ds_rules_file)
|
||||
|
||||
if is_framed_replay == 'inverse':
|
||||
self.defmod = 'mp_'
|
||||
else:
|
||||
self.defmod = ''
|
||||
|
||||
def sanitize_content(self, status_headers, stream):
|
||||
# remove transfer encoding chunked and wrap in a dechunking stream
|
||||
if (status_headers.remove_header('transfer-encoding')):
|
||||
stream = ChunkedDataReader(stream)
|
||||
|
||||
return (status_headers, stream)
|
||||
|
||||
def _rewrite_headers(self, urlrewriter, rule, status_headers, stream,
|
||||
urlkey='', cookie_rewriter=None):
|
||||
|
||||
header_rewriter_class = rule.rewriters['header']
|
||||
|
||||
if urlrewriter and not cookie_rewriter:
|
||||
cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)
|
||||
|
||||
rewritten_headers = (header_rewriter_class().
|
||||
rewrite(status_headers,
|
||||
urlrewriter,
|
||||
cookie_rewriter))
|
||||
|
||||
# note: since chunk encoding may/may not be valid,
|
||||
# the approach taken here is to *always* attempt
|
||||
# to dechunk if 'transfer-encoding: chunked' is present
|
||||
#
|
||||
# an alternative may be to serve chunked unless
|
||||
# content rewriting is needed
|
||||
# todo: possible revisit this approach
|
||||
|
||||
if (rewritten_headers.
|
||||
contains_removed_header('transfer-encoding', 'chunked')):
|
||||
|
||||
stream = ChunkedDataReader(stream)
|
||||
|
||||
return (rewritten_headers, stream)
|
||||
|
||||
def _decoding_stream(self, rewritten_headers, stream):
|
||||
for decomp_type in BufferedReader.get_supported_decompressors():
|
||||
matched, stream = self._check_encoding(rewritten_headers,
|
||||
stream,
|
||||
decomp_type)
|
||||
if matched:
|
||||
break
|
||||
|
||||
return stream
|
||||
|
||||
def _check_encoding(self, rewritten_headers, stream, enc):
|
||||
matched = False
|
||||
if (rewritten_headers.
|
||||
contains_removed_header('content-encoding', enc)):
|
||||
|
||||
#optimize: if already a ChunkedDataReader, add the encoding
|
||||
if isinstance(stream, ChunkedDataReader):
|
||||
stream.set_decomp(enc)
|
||||
else:
|
||||
stream = DecompressingBufferedReader(stream, decomp_type=enc)
|
||||
|
||||
rewritten_headers.status_headers.remove_header('content-length')
|
||||
matched = True
|
||||
|
||||
return matched, stream
|
||||
|
||||
|
||||
|
||||
def rewrite_content(self, urlrewriter, status_headers, stream,
|
||||
head_insert_func=None, urlkey='',
|
||||
cdx=None, cookie_rewriter=None, env=None):
|
||||
|
||||
wb_url = urlrewriter.wburl
|
||||
|
||||
if (wb_url.is_identity or
|
||||
(not head_insert_func and wb_url.is_banner_only)):
|
||||
status_headers, stream = self.sanitize_content(status_headers,
|
||||
stream)
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
|
||||
if urlrewriter and cdx and cdx.get('is_live'):
|
||||
urlrewriter.rewrite_opts['is_live'] = True
|
||||
|
||||
rule = self.ruleset.get_first_match(urlkey)
|
||||
|
||||
(rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
|
||||
rule,
|
||||
status_headers,
|
||||
stream,
|
||||
urlkey,
|
||||
cookie_rewriter)
|
||||
|
||||
res = self.handle_custom_rewrite(rewritten_headers,
|
||||
stream,
|
||||
urlrewriter,
|
||||
wb_url.mod,
|
||||
env)
|
||||
if res:
|
||||
return res
|
||||
|
||||
# Handle text content rewriting
|
||||
# ====================================================================
|
||||
# special case -- need to ungzip the body
|
||||
|
||||
status_headers = rewritten_headers.status_headers
|
||||
text_type = rewritten_headers.text_type
|
||||
|
||||
# see known js/css modifier specified, the context should run
|
||||
# default text_type
|
||||
mod = wb_url.mod
|
||||
|
||||
stream_raw = False
|
||||
encoding = None
|
||||
first_buff = b''
|
||||
|
||||
stream = self._decoding_stream(rewritten_headers, stream)
|
||||
|
||||
if mod == 'js_':
|
||||
text_type, stream = self._resolve_text_type('js',
|
||||
text_type,
|
||||
stream)
|
||||
elif mod == 'cs_':
|
||||
text_type, stream = self._resolve_text_type('css',
|
||||
text_type,
|
||||
stream)
|
||||
|
||||
# for proxy mode: use special js_proxy rewriter
|
||||
# which may be none rewriter + custom rules (if any)
|
||||
if text_type == 'js' and not urlrewriter.prefix:
|
||||
rewriter_class = rule.rewriters['js_proxy']
|
||||
else:
|
||||
rewriter_class = rule.rewriters[text_type]
|
||||
|
||||
# for html, need to perform header insert, supply js, css, xml
|
||||
# rewriters
|
||||
if text_type == 'html':
|
||||
head_insert_str = ''
|
||||
charset = rewritten_headers.charset
|
||||
|
||||
# if no charset set, attempt to extract from first 1024
|
||||
if not rewritten_headers.charset:
|
||||
first_buff = stream.read(1024)
|
||||
charset = self._extract_html_charset(first_buff,
|
||||
status_headers)
|
||||
|
||||
if head_insert_func and not wb_url.is_url_rewrite_only:
|
||||
head_insert_orig = head_insert_func(rule, cdx)
|
||||
|
||||
if charset:
|
||||
try:
|
||||
head_insert_str = webencodings.encode(head_insert_orig, charset)
|
||||
except:
|
||||
pass
|
||||
|
||||
if not head_insert_str:
|
||||
charset = 'utf-8'
|
||||
head_insert_str = head_insert_orig.encode(charset)
|
||||
|
||||
head_insert_buf = head_insert_str
|
||||
#head_insert_str = to_native_str(head_insert_str)
|
||||
head_insert_str = head_insert_str.decode('iso-8859-1')
|
||||
|
||||
|
||||
if wb_url.is_banner_only:
|
||||
gen = self._head_insert_only_gen(head_insert_buf,
|
||||
stream,
|
||||
first_buff)
|
||||
|
||||
content_len = status_headers.get_header('Content-Length')
|
||||
try:
|
||||
content_len = int(content_len)
|
||||
except Exception:
|
||||
content_len = None
|
||||
|
||||
if content_len is not None and content_len >= 0:
|
||||
content_len = str(content_len + len(head_insert_str))
|
||||
status_headers.replace_header('Content-Length',
|
||||
content_len)
|
||||
|
||||
return (status_headers, gen, False)
|
||||
|
||||
# if proxy, use js_proxy rewriter
|
||||
if not urlrewriter.prefix:
|
||||
js_rewriter_class = rule.rewriters['js_proxy']
|
||||
else:
|
||||
js_rewriter_class = rule.rewriters['js']
|
||||
|
||||
css_rewriter_class = rule.rewriters['css']
|
||||
|
||||
if wb_url.is_url_rewrite_only:
|
||||
js_rewriter_class = JSNoneRewriter
|
||||
|
||||
rewriter = rewriter_class(urlrewriter,
|
||||
js_rewriter_class=js_rewriter_class,
|
||||
css_rewriter_class=css_rewriter_class,
|
||||
head_insert=head_insert_str,
|
||||
url=wb_url.url,
|
||||
defmod=self.defmod,
|
||||
parse_comments=rule.parse_comments)
|
||||
|
||||
else:
|
||||
if wb_url.is_banner_only:
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
|
||||
# url-only rewriter, but not rewriting urls in JS, so return
|
||||
if wb_url.is_url_rewrite_only and text_type == 'js':
|
||||
#return (status_headers, self.stream_to_gen(stream), False)
|
||||
rewriter_class = JSLinkOnlyRewriter
|
||||
|
||||
# apply one of (js, css, xml) rewriters
|
||||
rewriter = rewriter_class(urlrewriter)
|
||||
|
||||
|
||||
# align to line end for all non-html rewriting
|
||||
align = (text_type != 'html')
|
||||
|
||||
# Create rewriting generator
|
||||
gen = self.rewrite_text_stream_to_gen(stream,
|
||||
rewrite_func=rewriter.rewrite,
|
||||
final_read_func=rewriter.close,
|
||||
first_buff=first_buff,
|
||||
align_to_line=align)
|
||||
|
||||
return (status_headers, gen, True)
|
||||
|
||||
def handle_custom_rewrite(self, rewritten_headers, stream,
|
||||
urlrewriter, mod, env):
|
||||
|
||||
text_type = rewritten_headers.text_type
|
||||
status_headers = rewritten_headers.status_headers
|
||||
|
||||
# use rewritten headers, but no further rewriting needed
|
||||
if text_type is None:
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
|
||||
if ((text_type == 'html' and urlrewriter.rewrite_opts.get('is_ajax')) or
|
||||
(text_type == 'plain' and not mod in ('js_', 'cs_'))):
|
||||
rewritten_headers.readd_rewrite_removed()
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
|
||||
@staticmethod
|
||||
def _extract_html_charset(buff, status_headers):
|
||||
charset = None
|
||||
m = RewriteContent.CHARSET_REGEX.search(buff)
|
||||
if m:
|
||||
charset = m.group(1)
|
||||
charset = to_native_str(charset)
|
||||
# content_type = 'text/html; charset=' + charset
|
||||
# status_headers.replace_header('content-type', content_type)
|
||||
|
||||
return charset
|
||||
|
||||
@staticmethod
|
||||
def _resolve_text_type(mod, text_type, stream):
|
||||
if text_type == 'css' and mod == 'js':
|
||||
return 'css', stream
|
||||
|
||||
# only attempt to resolve between html and other text types
|
||||
if text_type != 'html':
|
||||
return mod, stream
|
||||
|
||||
buff = stream.read(128)
|
||||
|
||||
wrapped_stream = BufferedReader(stream, starting_data=buff)
|
||||
|
||||
# check if starts with a tag, then likely html
|
||||
if RewriteContent.TAG_REGEX.match(buff):
|
||||
mod = 'html'
|
||||
|
||||
return mod, wrapped_stream
|
||||
|
||||
def _head_insert_only_gen(self, insert_str, stream, first_buff=b''):
|
||||
buff = first_buff
|
||||
max_len = 1024 - len(first_buff)
|
||||
while max_len > 0:
|
||||
curr = stream.read(max_len)
|
||||
if not curr:
|
||||
break
|
||||
|
||||
max_len -= len(buff)
|
||||
buff += curr
|
||||
|
||||
matcher = self.HEAD_REGEX.search(buff)
|
||||
|
||||
if matcher:
|
||||
yield buff[:matcher.end()]
|
||||
yield insert_str
|
||||
yield buff[matcher.end():]
|
||||
else:
|
||||
yield insert_str
|
||||
yield buff
|
||||
|
||||
for buff in self.stream_to_gen(stream):
|
||||
yield buff
|
||||
|
||||
@staticmethod
|
||||
def _decode_buff(buff, stream, encoding): # pragma: no coverage
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
except UnicodeDecodeError as e:
|
||||
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
|
||||
for i in range(3):
|
||||
buff += stream.read(1)
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
|
||||
return buff
|
||||
|
||||
@staticmethod
def stream_to_gen(stream):
    """
    Turn ``stream`` into a generator of chunks, each read with
    ``RewriteContent.BUFF_SIZE``.

    The terminating empty read is yielded as well before iteration stops.
    The stream is closed once the generator finishes or is closed.
    """
    try:
        chunk = stream.read(RewriteContent.BUFF_SIZE)
        yield chunk

        while chunk:
            chunk = stream.read(RewriteContent.BUFF_SIZE)
            yield chunk

    finally:
        stream.close()
|
||||
|
||||
@staticmethod
def rewrite_text_stream_to_gen(stream, rewrite_func,
                               final_read_func, first_buff,
                               align_to_line):
    """
    Generate the rewritten content of ``stream`` chunk by chunk.

    Each chunk (starting with ``first_buff``) is decoded as iso-8859-1,
    passed to ``rewrite_func`` and re-encoded.  When ``align_to_line`` is
    set, every chunk read from the stream is extended to the next line
    end.  After the stream is exhausted the result of ``final_read_func``
    is emitted if non-empty.  The stream is always closed at the end.
    """
    try:
        can_check_closed = hasattr(stream, 'closed')
        chunk = first_buff

        while True:
            if chunk:
                rewritten = rewrite_func(chunk.decode('iso-8859-1'))
                yield rewritten.encode('iso-8859-1')

            chunk = stream.read(RewriteContent.BUFF_SIZE)
            if not chunk:
                break

            # on 2.6, readline() (but not read()) throws an exception
            # if stream already closed, so check stream.closed if present
            if align_to_line and (not can_check_closed or
                                  not stream.closed):
                chunk += stream.readline()

        # gives the rewriter a chance to add a tail / flush a final buffer
        tail = final_read_func()
        if tail:
            yield tail.encode('iso-8859-1')

    finally:
        stream.close()
|
||||
|
||||
|
@ -1,315 +0,0 @@
|
||||
"""
|
||||
Fetch a url from live web and apply rewriting rules
|
||||
"""
|
||||
|
||||
from requests import request as live_request
|
||||
|
||||
import mimetypes
|
||||
import logging
|
||||
import os
|
||||
|
||||
from six.moves.urllib.parse import urlsplit
|
||||
import six
|
||||
|
||||
from warcio.timeutils import timestamp_now
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LiveRewriter(object):
    """
    Fetch a url from the live web (or a local file) and apply rewriting.

    Requests are made through ``self.live_request`` (``requests.request``
    by default) and the response is passed to a ``RewriteContent``
    instance.  When ``proxies`` is set, requests go through that proxy
    unless recording is explicitly skipped.
    """
    def __init__(self, is_framed_replay=False, proxies=None):
        self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)

        self.proxies = proxies

        self.live_request = live_request

        if self.proxies:
            logging.debug('Live Rewrite via proxy ' + str(proxies))

            # a single proxy url is used for both schemes;
            # string_types also matches unicode strings on py2
            if isinstance(proxies, six.string_types):
                self.proxies = {'http': proxies,
                                'https': proxies}

        else:
            logging.debug('Live Rewrite Direct (no proxy)')

    def is_recording(self):
        """Return True if a proxy setting was supplied."""
        return self.proxies is not None

    def fetch_local_file(self, uri):
        """
        Load a local file and return ``(status_headers, stream)`` with a
        synthetic '200 OK' status and a guessed Content-Type.
        """
        fh = LocalFileLoader().load(uri)

        content_type, _ = mimetypes.guess_type(uri)

        # create fake headers for local file
        status_headers = StatusAndHeaders('200 OK',
                                          [('Content-Type', content_type)])
        stream = fh

        return (status_headers, stream)

    def translate_headers(self, url, urlkey, env):
        """
        Convert a WSGI ``env`` into a dict of request headers for ``url``.

        Host, Origin and X-Forwarded-Proto are replaced with values
        derived from ``url``; Referer and X-Pywb-Requested-With are
        dropped; the cookie is passed through the request cookie rewrite
        rules matching ``urlkey``.  Env entries that are not recognized,
        or that have an empty value, are omitted.
        """
        headers = {}

        splits = urlsplit(url)
        has_cookies = False

        for name, value in six.iteritems(env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (splits.scheme + '://' + splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # prefer the token stored in the client cookie, if any
                cookie_val = extract_client_cookie(env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_REFERER':
                continue

            elif name == 'HTTP_X_PYWB_REQUESTED_WITH':
                continue

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(urlkey, value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            elif name == 'REL_REFERER':
                name = 'Referer'
            else:
                value = None

            if value:
                headers[name] = value

        # rules may add a cookie even if the client sent none
        if not has_cookies:
            value = self._req_cookie_rewrite(urlkey, '')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, urlkey, value):
        """
        Apply the ``req_cookie_rewrite`` regex substitutions of the first
        rule matching ``urlkey`` to the cookie ``value``.
        """
        rule = self.rewriter.ruleset.get_first_match(urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # entry lacks 'rx' or 'replace', skip it
                pass

        return value

    def fetch_http(self, url,
                   urlkey=None,
                   env=None,
                   req_headers=None,
                   follow_redirects=False,
                   skip_recording=False,
                   verify=True):
        """
        Perform the live http request and return
        ``(status_headers, raw_stream)``.

        If ``env`` is given, the method, request headers and (for POST
        and PUT) the request body are taken from it.  Note that a
        ``req_headers`` dict passed in is updated in place.
        """
        method = 'GET'
        data = None

        proxies = None
        if not skip_recording:
            proxies = self.proxies

        if not req_headers:
            req_headers = {}

        if env is not None:
            method = env['REQUEST_METHOD'].upper()
            input_ = env['wsgi.input']

            req_headers.update(self.translate_headers(url, urlkey, env))

            if method in ('POST', 'PUT'):
                len_ = env.get('CONTENT_LENGTH')
                if len_:
                    data = LimitReader(input_, int(len_))
                else:
                    data = input_

        response = self.live_request(method=method,
                                     url=url,
                                     data=data,
                                     headers=req_headers,
                                     allow_redirects=follow_redirects,
                                     proxies=proxies,
                                     stream=True,
                                     verify=verify)

        statusline = str(response.status_code) + ' ' + response.reason

        stream = response.raw

        # read headers from the underlying response object, which keeps
        # repeated headers as separate entries
        try:  # pragma: no cover
            # PY 3
            headers = stream._original_response.headers._headers
        except AttributeError:  # pragma: no cover
            # PY 2
            headers = []
            resp_headers = stream._original_response.msg.headers
            for h in resp_headers:
                n, v = h.split(':', 1)
                n = n.strip()
                v = v.strip()
                headers.append((n, v))

        status_headers = StatusAndHeaders(statusline, headers)

        return (status_headers, stream)

    def fetch_request(self, url, urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers=None,
                      timestamp=None,
                      follow_redirects=False,
                      skip_recording=False,
                      verify=True,
                      remote_only=True):
        """
        Fetch ``url`` (remote, or local if ``remote_only`` is False and
        the url is not http) and return the result of
        ``RewriteContent.rewrite_content`` for it.

        A synthetic cdx dict describing the fetch is built and, if ``env``
        is given, stored under ``env['pywb.cdx']``.
        """
        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if remote_only or is_http(url):
            is_remote = True
        else:
            is_remote = False
            if not url.startswith('file:'):
                url = to_file_url(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if is_remote:
            (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                       req_headers,
                                                       follow_redirects,
                                                       skip_recording,
                                                       verify)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        if timestamp is None:
            timestamp = timestamp_now()

        cdx = {'urlkey': urlkey,
               'timestamp': timestamp,
               'url': url,
               'status': status_headers.get_statuscode(),
               'mime': status_headers.get_header('Content-Type'),
               'is_live': True,
              }

        result = (self.rewriter.
                  rewrite_content(urlrewriter,
                                  status_headers,
                                  stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=urlkey,
                                  cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result

    def fetch_async(self, url, headers):
        """
        Issue a GET through the proxy and close the response right away.
        """
        # NOTE(review): certificate verification is disabled here
        resp = self.live_request(method='GET',
                                 url=url,
                                 headers=headers,
                                 proxies=self.proxies,
                                 verify=False,
                                 stream=True)

        # don't actually read whole response,
        # proxy response for writing it
        resp.close()

    def add_metadata(self, url, headers, data):
        """Send ``data`` for ``url`` using the custom PUTMETA method."""
        # NOTE(review): certificate verification is disabled here
        return self.live_request(method='PUTMETA',
                                 url=url,
                                 data=data,
                                 headers=headers,
                                 proxies=self.proxies,
                                 verify=False)

    def get_rewritten(self, *args, **kwargs):
        """
        Same as ``fetch_request`` but joins the rewritten content into a
        single bytes buffer; returns ``(status_headers, buff)``.
        """
        result = self.fetch_request(*args, **kwargs)

        status_headers, gen, is_rewritten = result

        buff = b''.join(gen)

        return (status_headers, buff)

    def get_video_info(self, url):
        """Return the module-level ``youtubedl`` wrapper's info for ``url``."""
        return youtubedl.extract_info(url)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class YoutubeDLWrapper(object):  # pragma: no cover
    """
    Thin wrapper around youtube-dl, initialized only if it is installed
    """
    def __init__(self):
        try:
            from youtube_dl import YoutubeDL as YoutubeDL
        except ImportError:
            # youtube-dl not available, extract_info() will return None
            self.ydl = None
            return

        options = dict(simulate=True,
                       youtube_include_dash_manifest=False)

        self.ydl = YoutubeDL(options)
        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """Return youtube-dl info for ``url``, or None if unavailable."""
        if self.ydl:
            return self.ydl.extract_info(url)

        return None
|
||||
|
||||
|
||||
#=================================================================
|
||||
# module-level wrapper instance, used by LiveRewriter.get_video_info
youtubedl = YoutubeDLWrapper()
|
||||
|
@ -1,4 +1,4 @@
|
||||
from pywb.webagg.inputrequest import DirectWSGIInputRequest
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
|
||||
from six import iteritems
|
@ -1,80 +0,0 @@
|
||||
from pywb.utils.dsrules import BaseRule
|
||||
|
||||
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
|
||||
|
||||
from pywb.rewrite.header_rewriter import HeaderRewriter
|
||||
from pywb.rewrite.html_rewriter import HTMLRewriter
|
||||
|
||||
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
||||
|
||||
import re
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteRules(BaseRule):
    """
    Rewriting rules for a url prefix: selects the rewriter class used for
    each content type, plus cookie related settings, from ``config``.
    """
    def __init__(self, url_prefix, config=None):
        # avoid sharing one mutable default dict between instances
        if config is None:
            config = {}

        super(RewriteRules, self).__init__(url_prefix, config)

        self.rewriters = {}

        self.rewriters['header'] = config.get('header_class', HeaderRewriter)
        self.rewriters['css'] = config.get('css_class', CSSRewriter)
        self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
        self.rewriters['html'] = config.get('html_class', HTMLRewriter)
        self.rewriters['json'] = config.get('json_class', JSONPRewriter)

        self.parse_comments = config.get('parse_comments', False)

        # Custom handling for js rewriting, often the most complex
        self.js_rewrite_location = config.get('js_rewrite_location', 'location')

        # ability to toggle rewriting
        if self.js_rewrite_location == 'all':
            js_default_class = JSLinkAndLocationRewriter
        elif self.js_rewrite_location == 'location':
            js_default_class = JSLocationOnlyRewriter
        elif self.js_rewrite_location == 'none':
            js_default_class = JSNoneRewriter
        else:
            # any other value: rewrite links only
            js_default_class = JSLinkOnlyRewriter

        # set js class, using either default or override from config
        self.rewriters['js'] = config.get('js_class', js_default_class)

        self.rewriters['js_proxy'] = JSNoneRewriter

        # add any regexs for js rewriter
        self._add_custom_regexs('js', 'js_regexs', config)
        self._add_custom_regexs('js_proxy', 'js_regexs', config)

        # cookie rewrite scope
        self.cookie_scope = config.get('cookie_scope', 'default')

        # pre-compile the 'match' pattern of each request cookie rule;
        # note this stores the compiled regex back into the config entry
        req_cookie_rewrite = config.get('req_cookie_rewrite', [])
        for rc in req_cookie_rewrite:
            rc['rx'] = re.compile(rc.get('match', ''))

        self.req_cookie_rewrite = req_cookie_rewrite

    def _add_custom_regexs(self, rw_id, field, config):
        """
        If ``config[field]`` is set, replace the rewriter registered under
        ``rw_id`` with a factory that creates the same rewriter extended
        with the custom regex rules.
        """
        regexs = config.get(field)
        if not regexs:
            return

        rewriter_cls = self.rewriters[rw_id]

        parse_rules_func = RegexRewriter.parse_rules_from_config(regexs)

        def extend_rewriter_with_regex(urlrewriter):
            # rules are resolved per urlrewriter at instantiation time
            rule_def_tuples = parse_rules_func(urlrewriter)
            return rewriter_cls(urlrewriter, rule_def_tuples)

        self.rewriters[rw_id] = extend_rewriter_with_regex
|
||||
|
@ -1,271 +0,0 @@
|
||||
from pywb.rewrite.rewrite_live import LiveRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
from pywb.utils.loaders import to_native_str
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
# This module has some rewriting tests against the 'live web'
|
||||
# As such, the content may change and the test may break
|
||||
|
||||
# rewriter shared by the tests below: replay of example.com at a fixed timestamp
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||
bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/')
|
||||
|
||||
def head_insert_func(rule, cdx):
    """Return the wombat script tag, or '' for rules that rewrite 'urls' only."""
    if rule.js_rewrite_location == 'urls':
        return ''

    return '<script src="/static/__pywb/wombat.js"> </script>'
|
||||
|
||||
def test_csrf_token_headers():
    """X-CSRFToken takes its value from the csrftoken client cookie."""
    environ = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'}

    translated = LiveRewriter().translate_headers('http://example.com/',
                                                  'com,example)/',
                                                  environ)

    assert translated == {'X-CSRFToken': 'foobar', 'Cookie': 'csrftoken=foobar'}
|
||||
|
||||
def test_forwarded_scheme():
    """X-Forwarded-Proto becomes the scheme of the target url; other env keys are dropped."""
    environ = {'HTTP_X_FORWARDED_PROTO': 'https', 'Other': 'Value'}

    translated = LiveRewriter().translate_headers('http://example.com/',
                                                  'com,example)/',
                                                  environ)

    assert translated == {'X-Forwarded-Proto': 'http'}
|
||||
|
||||
def test_req_cookie_rewrite_1():
    """An unrelated request cookie is kept and the FOO cookie is appended."""
    environ = {'HTTP_COOKIE': 'A=B'}
    key = 'example,example,test)/'
    target = 'test.example.example/'

    translated = LiveRewriter().translate_headers(target, key, environ)

    assert translated == {'Cookie': 'A=B; FOO=&bar=1'}
|
||||
|
||||
def test_req_cookie_rewrite_2():
    """An existing FOO request cookie is replaced by the rewritten value."""
    environ = {'HTTP_COOKIE': 'FOO=goo'}
    key = 'example,example,test)/'
    target = 'test.example.example/'

    translated = LiveRewriter().translate_headers(target, key, environ)

    assert translated == {'Cookie': 'FOO=&bar=1'}
|
||||
|
||||
def test_req_cookie_rewrite_3():
    """A Cookie header is produced by the rewrite even with an empty env."""
    environ = {}
    key = 'example,example,test)/'
    target = 'test.example.example/'

    translated = LiveRewriter().translate_headers(target, key, environ)

    assert translated == {'Cookie': '; FOO=&bar=1'}
|
||||
|
||||
def test_local_1():
    """Local sample page: wombat insert, JS location/link and html link rewriting."""
    sample = get_test_dir() + 'text_content/sample.html'
    headers, body = get_rewritten(sample,
                                  urlrewriter,
                                  head_insert_func,
                                  'example,example,test,all)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in body, body

    # JS location and JS link rewritten
    assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in body

    # link rewritten
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in body
|
||||
|
||||
|
||||
def test_local_no_head():
    """Page without a head tag still gets the insert and is rewritten."""
    sample = get_test_dir() + 'text_content/sample_no_head.html'
    headers, body = get_rewritten(sample,
                                  urlrewriter,
                                  head_insert_func,
                                  'com,example,test)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in body, body

    # location rewritten
    assert 'window.WB_wombat_location = "/other.html"' in body, body

    # link rewritten
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in body, body
|
||||
|
||||
def test_local_no_head_only_title():
    """The second no-head sample page still gets the wombat insert."""
    sample = get_test_dir() + 'text_content/sample_no_head_2.html'
    headers, body = get_rewritten(sample,
                                  urlrewriter,
                                  head_insert_func,
                                  'com,example,test)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in body
|
||||
|
||||
|
||||
def test_local_no_head_banner_only():
    """Banner-only mode on a page without head: insert added, nothing rewritten."""
    sample = get_test_dir() + 'text_content/sample_no_head.html'
    headers, body = get_rewritten(sample,
                                  bn_urlrewriter,
                                  head_insert_func,
                                  'com,example,test)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in body

    # location NOT rewritten
    assert 'window.location = "/other.html"' in body

    # link NOT rewritten
    assert '"/some/path/another.html"' in body
|
||||
|
||||
def test_local_banner_only_no_rewrite():
    """Banner-only mode on the sample page: insert added, nothing rewritten."""
    sample = get_test_dir() + 'text_content/sample.html'
    headers, body = get_rewritten(sample,
                                  bn_urlrewriter,
                                  head_insert_func,
                                  'com,example,test)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in body

    # JS location NOT rewritten, JS link NOT rewritten
    assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in body, body

    # link NOT rewritten
    assert '"/some/path/another.html"' in body
|
||||
|
||||
def test_local_2_link_only_rewrite():
    """Link-only rule: no wombat insert, JS links rewritten, JS location untouched."""
    sample = get_test_dir() + 'text_content/sample.html'
    headers, body = get_rewritten(sample,
                                  urlrewriter,
                                  head_insert_func,
                                  'example,example,test)/nolocation_rewrite')

    # no wombat insert
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' not in body

    # JS location NOT rewritten, JS link rewritten
    assert 'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in body

    # still link rewrite
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in body
|
||||
|
||||
|
||||
def test_local_2_js_loc_only_rewrite():
    """Location-only rule: JS location rewritten, JS links left alone."""
    sample = get_test_dir() + 'text_content/sample.html'
    headers, body = get_rewritten(sample,
                                  urlrewriter,
                                  head_insert_func,
                                  'example,example,test,loconly)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in body

    # JS location rewritten, JS link NOT rewritten
    assert 'window.WB_wombat_location = "http:\/\/example.com/dynamic_page.html"' in body

    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in body
|
||||
|
||||
def test_local_2_no_rewrite():
    """No-JS-rewrite rule: JS untouched, html links still rewritten."""
    sample = get_test_dir() + 'text_content/sample.html'
    headers, body = get_rewritten(sample,
                                  urlrewriter,
                                  head_insert_func,
                                  'example,example,test,norewrite)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in body

    # JS location NOT rewritten, JS link NOT rewritten
    assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in body

    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in body
|
||||
|
||||
def test_local_unclosed_script():
    """Sample with an unclosed script tag is rewritten and gets a closing tag."""
    sample = get_test_dir() + 'text_content/sample_unclosed_script.html'
    headers, body = get_rewritten(sample,
                                  urlrewriter,
                                  head_insert_func,
                                  'example,example,test,all)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in body, body

    # JS location and JS link rewritten
    assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html";' in body, body

    assert '</script>' in body, body
|
||||
|
||||
|
||||
def test_example_1():
    """Live fetch of example.com: header and link rewriting."""
    extra_headers = {'Connection': 'close', 'Accept-Encoding': 'identity'}
    headers, body = get_rewritten('http://example.com/', urlrewriter, req_headers=extra_headers)

    # verify header rewriting
    assert headers.get_header('x-archive-orig-content-length') == '1270', headers

    # content-type is expected without a charset suffix
    assert headers.get_header('content-type') == 'text/html'

    assert '/pywb/20131226101010/http://www.iana.org/domains/example' in body, body
|
||||
|
||||
def test_example_2_redirect():
    """A redirect response is returned as a 302 with an empty body."""
    target = 'http://httpbin.org/redirect-to?url=http://example.com/'
    headers, body = get_rewritten(target, urlrewriter)

    # redirect, no content
    assert headers.get_statuscode() == '302'
    assert len(body) == 0
|
||||
|
||||
|
||||
def test_example_3_rel():
    """A scheme-relative url can be fetched."""
    headers, body = get_rewritten('//example.com/', urlrewriter)

    assert headers.get_statuscode() == '200'
|
||||
|
||||
|
||||
def test_example_4_rewrite_err():
    """A url containing '///' from a rewrite mismatch is fixed up and fetched."""
    # may occur in case of rewrite mismatch, the /// gets stripped off
    broken = 'http://localhost:8080///example.com/'
    headers, body = get_rewritten(broken, urlrewriter)

    assert headers.get_statuscode() == '200'
|
||||
|
||||
def test_example_domain_specific_3():
    """Domain specific rule: Bootloader.configurePage is commented out if present."""
    target = 'http://facebook.com/digitalpreservation'
    headers, body = get_rewritten(target, urlrewriter, follow_redirects=True)

    # comment out Bootloader.configurePage, if it is still there
    if 'Bootloader.configurePage' in body:
        assert '/* Bootloader.configurePage' in body
|
||||
|
||||
def test_wombat_top():
    """'top' in the local JS sample is rewritten to WB_wombat_top."""
    script = get_test_dir() + 'text_content/toptest.js'
    headers, body = get_rewritten(script, urlrewriter)

    assert 'WB_wombat_top!==window' in body
|
||||
|
||||
def test_post():
    """A POST request with a body from wsgi.input succeeds."""
    payload = BytesIO(b'ABC=DEF')

    environ = {'REQUEST_METHOD': 'POST',
               'HTTP_ORIGIN': 'http://httpbin.org',
               'HTTP_HOST': 'httpbin.org',
               'wsgi.input': payload}

    headers, body = get_rewritten('http://httpbin.org/post', urlrewriter, env=environ)

    assert headers.get_statuscode() == '200', headers
|
||||
|
||||
def test_multiple_set_cookies():
    """Each Set-Cookie header is kept as its own entry with a rewritten path."""
    target = 'http://httpbin.org/cookies/set?A=B&C=D'
    headers, body = get_rewritten(target, urlrewriter)

    assert headers.get_statuscode() == '302'

    print(headers.headers)

    expected_path = '/pywb/20131226101010/http://example.com/'
    assert ('Set-Cookie', 'A=B; Path=' + expected_path) in headers.headers
    assert ('Set-Cookie', 'C=D; Path=' + expected_path) in headers.headers
|
||||
|
||||
|
||||
def get_rewritten(*args, **kwargs):
    """
    Run LiveRewriter.get_rewritten with remote_only=False and return the
    status headers together with the content as a native str.
    """
    result = LiveRewriter().get_rewritten(remote_only=False, *args, **kwargs)
    headers, raw = result
    return headers, to_native_str(raw)
|
@ -1,100 +0,0 @@
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
from warcio.timeutils import datetime_to_http_date
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class PrefixHeaderRewriter(object):
    """
    Rewrite http response headers according to a per-header rule.

    Each header name (lowercased) is looked up in ``header_rules``;
    headers without an entry get the default rule, which is 'prefix'
    when url rewriting is active and 'keep' otherwise.
    """
    header_rules = {
        'content-type': 'keep',
        'content-disposition': 'keep',
        'content-range': 'keep',
        # was misspelled 'accept-rangees', so the rule never applied
        'accept-ranges': 'keep',
        'www-authenticate': 'keep',
        'proxy-authenticate': 'keep',

        'location': 'url-rewrite',
        'content-location': 'url-rewrite',
        'content-base': 'url-rewrite',

        'transfer-encoding': 'prefix',
        'connection': 'prefix',

        'content-encoding': 'keep-if-no-content-rewrite',
        'content-length': 'content-length',

        'set-cookie': 'cookie',
        'cookie': 'cookie',
    }

    def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
        self.header_prefix = header_prefix
        self.rwinfo = rwinfo
        self.http_headers = rwinfo.record.http_headers

        if rwinfo.is_url_rw():
            self.default_rule = 'prefix'
        else:
            self.default_rule = 'keep'

    def __call__(self):
        """Return a new StatusAndHeaders with every header rewritten."""
        new_headers_list = []
        for name, value in self.http_headers.headers:
            rule = self.header_rules.get(name.lower(), self.default_rule)
            new_header = self.rewrite_header(name, value, rule)
            if new_header:
                # a rewrite may produce several headers, returned as a list
                if isinstance(new_header, list):
                    new_headers_list.extend(new_header)
                else:
                    new_headers_list.append(new_header)

        return StatusAndHeaders(self.http_headers.statusline,
                                headers=new_headers_list,
                                protocol=self.http_headers.protocol)

    def rewrite_header(self, name, value, rule):
        """
        Apply ``rule`` to a single header.

        Returns a ``(name, value)`` tuple, or whatever the cookie rewriter
        returns for the 'cookie' rule.  Any case not kept or rewritten
        below falls through to the prefixed header name.
        """
        if rule == 'keep':
            return (name, value)

        elif rule == 'url-rewrite':
            return (name, self.rwinfo.url_rewriter.rewrite(value))

        elif rule == 'keep-if-no-content-rewrite':
            if not self.rwinfo.is_content_rw:
                return (name, value)

        elif rule == 'content-length':
            if value == '0':
                return (name, value)

            if not self.rwinfo.is_content_rw:
                try:
                    if int(value) >= 0:
                        return (name, value)
                except (ValueError, TypeError):
                    # not a valid length, fall through to prefix
                    pass

        elif rule == 'cookie':
            if self.rwinfo.cookie_rewriter:
                return self.rwinfo.cookie_rewriter.rewrite(value)
            else:
                return (name, value)

        # default 'prefix'
        return (self.header_prefix + name, value)

    def _add_cache_headers(self, new_headers, http_cache):
        """
        Append cache headers to ``new_headers``: no caching when
        ``http_cache`` is not a positive integer number of seconds,
        otherwise max-age and a matching Expires date.
        """
        try:
            age = int(http_cache)
        except (ValueError, TypeError):
            age = 0

        if age <= 0:
            new_headers.append(('Cache-Control', 'no-cache; no-store'))
        else:
            dt = datetime.utcnow()
            dt = dt + timedelta(seconds=age)
            new_headers.append(('Cache-Control', 'max-age=' + str(age)))
            new_headers.append(('Expires', datetime_to_http_date(dt)))
|
||||
|
||||
|
@ -1,18 +0,0 @@
|
||||
[uwsgi]
|
||||
if-not-env = PORT
|
||||
http-socket = :8090
|
||||
endif =
|
||||
|
||||
master = true
|
||||
buffer-size = 65536
|
||||
die-on-term = true
|
||||
|
||||
if-env = VIRTUAL_ENV
|
||||
venv = $(VIRTUAL_ENV)
|
||||
endif =
|
||||
|
||||
gevent = 100
|
||||
|
||||
wsgi = urlrewrite.test.simpleapp
|
||||
|
||||
|
@ -1,81 +0,0 @@
|
||||
import pkgutil
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
|
||||
|
||||
#=================================================================
|
||||
DEFAULT_RULES_FILE = 'pywb/rules.yaml'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RuleSet(object):
    """
    Ordered list of domain specific rules loaded from a yaml rules file.
    """
    DEFAULT_KEY = ''

    def __init__(self, rule_cls, fieldname, **kwargs):
        """
        A domain specific rules block, inited via config map.
        If config map not specified, it is loaded from default location.

        The rules are represented as a map by domain.
        Each rules configuration will load its own field type
        from the list and given a specified rule_cls.

        Recognized keyword arguments: ``ds_rules_file`` (path of the rules
        file, defaults to DEFAULT_RULES_FILE) and ``default_rule_config``
        (config for a catch-all rule added if the file defines none).
        """

        self.rules = []

        default_rule_config = kwargs.get('default_rule_config')

        ds_rules_file = kwargs.get('ds_rules_file')

        if not ds_rules_file:
            ds_rules_file = DEFAULT_RULES_FILE

        config = load_yaml_config(ds_rules_file)

        # load rules list, or use empty list if config or 'rules' is missing
        rulesmap = (config.get('rules') if config else None) or []

        def_key_found = False

        # iterate over master rules file
        for value in rulesmap:
            url_prefix = value.get('url_prefix')
            rules_def = value.get(fieldname)
            # entries without this field type do not concern this ruleset
            if not rules_def:
                continue

            if url_prefix == self.DEFAULT_KEY:
                def_key_found = True

            self.rules.append(rule_cls(url_prefix, rules_def))

        # if default_rule_config provided, always init a default ruleset
        if not def_key_found and default_rule_config is not None:
            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))

    def iter_matching(self, urlkey):
        """
        Iterate over all matching rules for given urlkey
        """
        for rule in self.rules:
            if rule.applies(urlkey):
                yield rule

    def get_first_match(self, urlkey):
        """
        Return the first rule matching urlkey, or None if there is none
        """
        return next(self.iter_matching(urlkey), None)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BaseRule(object):
    """
    Base class for rules tied to one url_prefix key or a list of them;
    subclasses interpret the actual rules config
    """
    def __init__(self, url_prefix, rules):
        # always store the prefixes as a list
        if isinstance(url_prefix, list):
            self.url_prefix = url_prefix
        else:
            self.url_prefix = [url_prefix]

    def applies(self, urlkey):
        """Return True if urlkey starts with any of the rule's prefixes."""
        for prefix in self.url_prefix:
            if urlkey.startswith(prefix):
                return True

        return False
|
@ -52,43 +52,6 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
|
||||
|
||||
>>> extract_client_cookie({}, 'y')
|
||||
|
||||
# append_post_query
|
||||
>>> append_post_query('http://example.com/?abc=def', 'foo=bar')
|
||||
'http://example.com/?abc=def&foo=bar'
|
||||
|
||||
>>> append_post_query('http://example.com/', '')
|
||||
'http://example.com/'
|
||||
|
||||
>>> append_post_query('http://example.com/', 'foo=bar')
|
||||
'http://example.com/?foo=bar'
|
||||
|
||||
# extract_post_query tests
|
||||
|
||||
# correct POST data
|
||||
>>> post_data = b'foo=bar&dir=%2Fbaz'
|
||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
||||
'foo=bar&dir=/baz'
|
||||
|
||||
# unsupported method
|
||||
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
||||
|
||||
# base64 encode
|
||||
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
|
||||
'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
|
||||
# invalid length
|
||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
|
||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data))
|
||||
|
||||
# length too short
|
||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data))
|
||||
'foo=bar&dir=%2'
|
||||
|
||||
# length too long
|
||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
|
||||
'foo=bar&dir=/baz'
|
||||
|
||||
|
||||
# test read_last_line
|
||||
>>> print_str(read_last_line(BytesIO(b'A\nB\nC')))
|
||||
'C'
|
||||
@ -119,8 +82,8 @@ from io import BytesIO
|
||||
import requests
|
||||
|
||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||
from pywb.utils.loaders import extract_client_cookie, extract_post_query
|
||||
from pywb.utils.loaders import append_post_query, read_last_line
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.loaders import read_last_line
|
||||
|
||||
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
|
@ -86,10 +86,11 @@ class DirectWSGIInputRequest(object):
|
||||
buffered_stream=buffered_stream,
|
||||
environ=self.env)
|
||||
|
||||
if post_query.append_post_query(url) != url:
|
||||
new_url = post_query.append_post_query(url)
|
||||
if new_url != url:
|
||||
self.env['wsgi.input'] = buffered_stream
|
||||
|
||||
return url
|
||||
return new_url
|
||||
|
||||
def get_full_request_uri(self):
|
||||
req_uri = self.env.get('REQUEST_URI')
|
||||
@ -246,7 +247,7 @@ class PostQueryExtractor(object):
|
||||
else:
|
||||
post_query = base64.b64encode(post_query)
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = '&__wb_post_data=' + post_query
|
||||
post_query = '__wb_post_data=' + post_query
|
||||
|
||||
self.post_query = post_query
|
||||
|
||||
|
@ -1,9 +1,10 @@
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, PostQueryExtractor
|
||||
from werkzeug.routing import Map, Rule
|
||||
|
||||
import webtest
|
||||
import traceback
|
||||
from six.moves.urllib.parse import parse_qsl
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -76,3 +77,61 @@ Foo: Bar\r\n\
|
||||
\r\n\
|
||||
'
|
||||
|
||||
|
||||
class TestPostQueryExtract(object):
    """
    Tests for PostQueryExtractor.append_post_query() over different
    request methods, content types and declared content lengths.
    """
    @classmethod
    def setup_class(cls):
        # url-encoded form body shared by all tests
        cls.post_data = b'foo=bar&dir=%2Fbaz'

    def test_post_extract_1(self):
        """Form-encoded POST data is appended to the url as a query."""
        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                len(self.post_data), BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'

        assert pq.append_post_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'

    def test_post_extract_wrong_method(self):
        """With a PUT request the url is returned unchanged."""
        pq = PostQueryExtractor('PUT', 'application/x-www-form-urlencoded',
                                len(self.post_data), BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

    def test_post_extract_non_form_data_1(self):
        """An application/octet-stream body is appended as __wb_post_data."""
        pq = PostQueryExtractor('POST', 'application/octet-stream',
                                len(self.post_data), BytesIO(self.post_data))

        # base64 encoded data
        assert pq.append_post_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'

    def test_post_extract_non_form_data_2(self):
        """A text/plain body is appended after an existing query string."""
        pq = PostQueryExtractor('POST', 'text/plain',
                                len(self.post_data), BytesIO(self.post_data))

        # base64 encoded data
        assert pq.append_post_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'

    def test_post_extract_length_invalid_ignore(self):
        """A zero or non-numeric length leaves the url unchanged."""
        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                0, BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                'abc', BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

    def test_post_extract_length_too_short(self):
        """A length shorter than the body yields a truncated query."""
        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                len(self.post_data) - 4, BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'

    def test_post_extract_length_too_long(self):
        """A length longer than the body still yields the full query."""
        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                len(self.post_data) + 4, BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
||||
|
||||
|
||||
|
@ -3,9 +3,9 @@ from gevent import monkey; monkey.patch_all(thread=False)
|
||||
import pytest
|
||||
import webtest
|
||||
|
||||
from pywb.webagg.test.testutils import BaseTestClass
|
||||
from pywb.warcserver.test.testutils import BaseTestClass
|
||||
|
||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
import os
|
||||
|
||||
|
||||
|
@ -17,16 +17,16 @@ from pytest import raises
|
||||
from mock import patch
|
||||
|
||||
from pywb import get_test_dir
|
||||
from pywb.webagg.test.testutils import TempDirTests, BaseTestClass
|
||||
from pywb.warcserver.test.testutils import TempDirTests, BaseTestClass
|
||||
|
||||
from pywb.manager.manager import main
|
||||
|
||||
import pywb.manager.autoindex
|
||||
|
||||
from pywb.warc.cdxindexer import main as cdxindexer_main
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.indexer.cdxindexer import main as cdxindexer_main
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
|
||||
|
||||
#=============================================================================
|
||||
|
@ -8,10 +8,10 @@ import webtest
|
||||
|
||||
from six.moves.urllib.parse import urlencode
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
from pywb.webagg.test.testutils import BaseTestClass
|
||||
from pywb.webagg.autoapp import AutoConfigApp
|
||||
from pywb.warcserver.test.testutils import BaseTestClass
|
||||
from pywb.warcserver.warcserver import WarcServer
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -20,7 +20,7 @@ class TestCDXApp(BaseTestClass):
|
||||
def setup_class(cls):
|
||||
super(TestCDXApp, cls).setup_class()
|
||||
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config_test.yaml')
|
||||
cls.testapp = webtest.TestApp(AutoConfigApp(config_file=config_file))
|
||||
cls.testapp = webtest.TestApp(WarcServer(config_file=config_file))
|
||||
|
||||
def query(self, url, is_error=False, **params):
|
||||
params['url'] = url
|
||||
|
@ -1,6 +1,6 @@
|
||||
from .base_config_test import BaseConfigTest, fmod
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
Loading…
x
Reference in New Issue
Block a user