mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor:
- merge pywb.urlrewrite -> pywb.rewrite, remove obsolete stuff (rewrite_content.py, rewrite_live.py, dsrules.py)
- move wbrequestresponse -> pywb.apps
- move pywb.webapp.handlers -> pywb.apps.static_handler
- remove pywb.webapp, pywb.framework packages
- disable old header_rewriter, content_rewriter tests
- finish renaming from previous warcserver refactor
- all other tests passing!
This commit is contained in:
parent
2907ed01c8
commit
97182b71b7
@ -6,7 +6,7 @@ import logging
|
|||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
def webagg(args=None):
|
def webagg(args=None):
|
||||||
WebaggCli(args=args,
|
WarcServerCli(args=args,
|
||||||
default_port=8070,
|
default_port=8070,
|
||||||
desc='pywb Web Aggregator Server').run()
|
desc='pywb Web Aggregator Server').run()
|
||||||
|
|
||||||
@ -103,18 +103,18 @@ class ReplayCli(BaseCli):
|
|||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class WebaggCli(BaseCli):
|
class WarcServerCli(BaseCli):
|
||||||
def load(self):
|
def load(self):
|
||||||
from pywb.webagg.autoapp import AutoConfigApp
|
from pywb.warcserver.warcserver import WarcServer
|
||||||
|
|
||||||
super(WebaggCli, self).load()
|
super(WarcServerCli, self).load()
|
||||||
return AutoConfigApp(custom_config=self.extra_config)
|
return WarcServer(custom_config=self.extra_config)
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class WaybackCli(ReplayCli):
|
class WaybackCli(ReplayCli):
|
||||||
def load(self):
|
def load(self):
|
||||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
from pywb.apps.frontendapp import FrontEndApp
|
||||||
|
|
||||||
super(WaybackCli, self).load()
|
super(WaybackCli, self).load()
|
||||||
return FrontEndApp(custom_config=self.extra_config)
|
return FrontEndApp(custom_config=self.extra_config)
|
||||||
@ -123,7 +123,7 @@ class WaybackCli(ReplayCli):
|
|||||||
#=============================================================================
|
#=============================================================================
|
||||||
class LiveCli(BaseCli):
|
class LiveCli(BaseCli):
|
||||||
def load(self):
|
def load(self):
|
||||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
from pywb.apps.frontendapp import FrontEndApp
|
||||||
|
|
||||||
self.r.live = True
|
self.r.live = True
|
||||||
|
|
||||||
|
@ -8,16 +8,15 @@ from six.moves.urllib.parse import urljoin
|
|||||||
from six import iteritems
|
from six import iteritems
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config, to_native_str
|
from pywb.utils.loaders import load_yaml_config, to_native_str
|
||||||
|
from pywb.utils.geventserver import GeventServer
|
||||||
|
|
||||||
from pywb.webagg.autoapp import AutoConfigApp
|
from pywb.warcserver.warcserver import WarcServer
|
||||||
from pywb.webapp.handlers import StaticHandler
|
|
||||||
|
|
||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.rewrite.templateview import BaseInsertView
|
||||||
|
|
||||||
from pywb.urlrewrite.geventserver import GeventServer
|
from pywb.apps.static_handler import StaticHandler
|
||||||
from pywb.urlrewrite.templateview import BaseInsertView
|
from pywb.apps.rewriterapp import RewriterApp, UpstreamException
|
||||||
|
from pywb.apps.wbrequestresponse import WbResponse
|
||||||
from pywb.urlrewrite.rewriterapp import RewriterApp, UpstreamException
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
@ -27,14 +26,14 @@ import traceback
|
|||||||
class FrontEndApp(object):
|
class FrontEndApp(object):
|
||||||
def __init__(self, config_file='./config.yaml', custom_config=None):
|
def __init__(self, config_file='./config.yaml', custom_config=None):
|
||||||
self.debug = True
|
self.debug = True
|
||||||
self.webagg = AutoConfigApp(config_file=config_file,
|
self.warcserver = WarcServer(config_file=config_file,
|
||||||
custom_config=custom_config)
|
custom_config=custom_config)
|
||||||
|
|
||||||
framed_replay = self.webagg.config.get('framed_replay', True)
|
framed_replay = self.warcserver.config.get('framed_replay', True)
|
||||||
|
|
||||||
self.rewriterapp = RewriterApp(framed_replay, config=self.webagg.config)
|
self.rewriterapp = RewriterApp(framed_replay, config=self.warcserver.config)
|
||||||
|
|
||||||
self.webagg_server = GeventServer(self.webagg, port=0)
|
self.warcserver_server = GeventServer(self.warcserver, port=0)
|
||||||
|
|
||||||
self.static_handler = StaticHandler('pywb/static/')
|
self.static_handler = StaticHandler('pywb/static/')
|
||||||
|
|
||||||
@ -46,12 +45,12 @@ class FrontEndApp(object):
|
|||||||
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
|
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
|
||||||
self.url_map.add(Rule('/', endpoint=self.serve_home))
|
self.url_map.add(Rule('/', endpoint=self.serve_home))
|
||||||
|
|
||||||
self.rewriterapp.paths = self.get_upstream_paths(self.webagg_server.port)
|
self.rewriterapp.paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||||
|
|
||||||
self.templates_dir = self.webagg.config.get('templates_dir', 'templates')
|
self.templates_dir = self.warcserver.config.get('templates_dir', 'templates')
|
||||||
self.static_dir = self.webagg.config.get('static_dir', 'static')
|
self.static_dir = self.warcserver.config.get('static_dir', 'static')
|
||||||
|
|
||||||
metadata_templ = os.path.join(self.webagg.root_dir, '{coll}', 'metadata.yaml')
|
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
||||||
self.metadata_cache = MetadataCache(metadata_templ)
|
self.metadata_cache = MetadataCache(metadata_templ)
|
||||||
|
|
||||||
def get_upstream_paths(self, port):
|
def get_upstream_paths(self, port):
|
||||||
@ -61,8 +60,8 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
def serve_home(self, environ):
|
def serve_home(self, environ):
|
||||||
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
|
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
|
||||||
fixed_routes = self.webagg.list_fixed_routes()
|
fixed_routes = self.warcserver.list_fixed_routes()
|
||||||
dynamic_routes = self.webagg.list_dynamic_routes()
|
dynamic_routes = self.warcserver.list_dynamic_routes()
|
||||||
|
|
||||||
routes = fixed_routes + dynamic_routes
|
routes = fixed_routes + dynamic_routes
|
||||||
|
|
||||||
@ -76,7 +75,7 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
def serve_static(self, environ, coll='', filepath=''):
|
def serve_static(self, environ, coll='', filepath=''):
|
||||||
if coll:
|
if coll:
|
||||||
path = os.path.join(self.webagg.root_dir, coll, self.static_dir)
|
path = os.path.join(self.warcserver.root_dir, coll, self.static_dir)
|
||||||
else:
|
else:
|
||||||
path = self.static_dir
|
path = self.static_dir
|
||||||
|
|
||||||
@ -116,7 +115,7 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
kwargs = {'coll': coll}
|
kwargs = {'coll': coll}
|
||||||
|
|
||||||
if coll in self.webagg.list_fixed_routes():
|
if coll in self.warcserver.list_fixed_routes():
|
||||||
kwargs['type'] = 'replay-fixed'
|
kwargs['type'] = 'replay-fixed'
|
||||||
else:
|
else:
|
||||||
kwargs['type'] = 'replay-dyn'
|
kwargs['type'] = 'replay-dyn'
|
||||||
@ -131,23 +130,23 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
def setup_paths(self, environ, coll):
|
def setup_paths(self, environ, coll):
|
||||||
pop_path_info(environ)
|
pop_path_info(environ)
|
||||||
if not coll or not self.webagg.root_dir:
|
if not coll or not self.warcserver.root_dir:
|
||||||
return
|
return
|
||||||
|
|
||||||
environ['pywb.templates_dir'] = os.path.join(self.webagg.root_dir,
|
environ['pywb.templates_dir'] = os.path.join(self.warcserver.root_dir,
|
||||||
coll,
|
coll,
|
||||||
self.templates_dir)
|
self.templates_dir)
|
||||||
|
|
||||||
def serve_listing(self, environ):
|
def serve_listing(self, environ):
|
||||||
result = {'fixed': self.webagg.list_fixed_routes(),
|
result = {'fixed': self.warcserver.list_fixed_routes(),
|
||||||
'dynamic': self.webagg.list_dynamic_routes()
|
'dynamic': self.warcserver.list_dynamic_routes()
|
||||||
}
|
}
|
||||||
|
|
||||||
return WbResponse.json_response(result)
|
return WbResponse.json_response(result)
|
||||||
|
|
||||||
def is_valid_coll(self, coll):
|
def is_valid_coll(self, coll):
|
||||||
return (coll in self.webagg.list_fixed_routes() or
|
return (coll in self.warcserver.list_fixed_routes() or
|
||||||
coll in self.webagg.list_dynamic_routes())
|
coll in self.warcserver.list_dynamic_routes())
|
||||||
|
|
||||||
def raise_not_found(self, environ, msg):
|
def raise_not_found(self, environ, msg):
|
||||||
raise NotFound(response=self.rewriterapp._error_response(environ, msg))
|
raise NotFound(response=self.rewriterapp._error_response(environ, msg))
|
@ -1,5 +1,5 @@
|
|||||||
from gevent.monkey import patch_all; patch_all()
|
from gevent.monkey import patch_all; patch_all()
|
||||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
from pywb.apps.frontendapp import FrontEndApp
|
||||||
|
|
||||||
application = FrontEndApp(config_file=None,
|
application = FrontEndApp(config_file=None,
|
||||||
custom_config={'collections': {'live': '$live'}})
|
custom_config={'collections': {'live': '$live'}})
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from werkzeug.http import HTTP_STATUS_CODES
|
||||||
|
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||||
|
|
||||||
#from pywb.rewrite.rewrite_amf import RewriteAMFMixin
|
#from pywb.rewrite.rewrite_amf import RewriteAMFMixin
|
||||||
#from pywb.rewrite.rewrite_dash import RewriteDASHMixin
|
#from pywb.rewrite.rewrite_dash import RewriteDASHMixin
|
||||||
#from pywb.rewrite.rewrite_content import RewriteContent
|
#from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
from pywb.urlrewrite.rewriter import DefaultRewriter
|
from pywb.rewrite.default_rewriter import DefaultRewriter
|
||||||
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
|
||||||
@ -16,18 +19,14 @@ from warcio.timeutils import http_date_to_timestamp
|
|||||||
from warcio.bufferedreaders import BufferedReader
|
from warcio.bufferedreaders import BufferedReader
|
||||||
from warcio.recordloader import ArcWarcRecordLoader
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
from pywb.webagg.utils import BUFF_SIZE
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
from pywb.apps.wbrequestresponse import WbResponse
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.warcserver.utils import BUFF_SIZE
|
||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.warcserver.utils import MementoUtils
|
||||||
|
|
||||||
from pywb.webagg.utils import MementoUtils
|
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
||||||
|
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||||
from werkzeug.http import HTTP_STATUS_CODES
|
|
||||||
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
|
||||||
|
|
||||||
from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest
|
|
||||||
from pywb.urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
|
||||||
|
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -71,7 +70,7 @@ class RewriterApp(object):
|
|||||||
#frame_type = 'inverse' if framed_replay else False
|
#frame_type = 'inverse' if framed_replay else False
|
||||||
|
|
||||||
#self.content_rewriter = Rewriter(is_framed_replay=frame_type)
|
#self.content_rewriter = Rewriter(is_framed_replay=frame_type)
|
||||||
self.content_rw = DefaultRewriter('pkg://pywb/rules.yaml', self.replay_mod)
|
self.content_rw = DefaultRewriter(replay_mod=self.replay_mod)
|
||||||
|
|
||||||
if not jinja_env:
|
if not jinja_env:
|
||||||
jinja_env = JinjaEnv(globals={'static_path': 'static'})
|
jinja_env = JinjaEnv(globals={'static_path': 'static'})
|
@ -3,7 +3,7 @@ import os
|
|||||||
|
|
||||||
from pywb.utils.loaders import LocalFileLoader
|
from pywb.utils.loaders import LocalFileLoader
|
||||||
|
|
||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.apps.wbrequestresponse import WbResponse
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
@ -1,9 +1,9 @@
|
|||||||
from gevent import monkey; monkey.patch_all(thread=False)
|
from gevent import monkey; monkey.patch_all(thread=False)
|
||||||
|
|
||||||
from pywb.webagg.test.testutils import LiveServerTests, BaseTestClass
|
from pywb.warcserver.test.testutils import LiveServerTests, BaseTestClass
|
||||||
from pywb.webagg.test.testutils import FakeRedisTests
|
from pywb.warcserver.test.testutils import FakeRedisTests
|
||||||
|
|
||||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
from pywb.apps.frontendapp import FrontEndApp
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import webtest
|
import webtest
|
||||||
@ -12,10 +12,10 @@ import webtest
|
|||||||
LIVE_CONFIG = {'collections': {'live': '$live'}}
|
LIVE_CONFIG = {'collections': {'live': '$live'}}
|
||||||
|
|
||||||
|
|
||||||
class TestRewriter(FakeRedisTests, BaseTestClass):
|
class TestRewriterApp(FakeRedisTests, BaseTestClass):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
super(TestRewriter, cls).setup_class()
|
super(TestRewriterApp, cls).setup_class()
|
||||||
|
|
||||||
#cls.app = RWApp.create_app(replay_port=cls.server.port)
|
#cls.app = RWApp.create_app(replay_port=cls.server.port)
|
||||||
#cls.testapp = webtest.TestApp(cls.app.app)
|
#cls.testapp = webtest.TestApp(cls.app.app)
|
@ -1,4 +1,4 @@
|
|||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.apps.wbrequestresponse import WbResponse
|
||||||
from warcio.statusandheaders import StatusAndHeaders
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
|
|
7
pywb/apps/warcserverapp.py
Normal file
7
pywb/apps/warcserverapp.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from gevent.monkey import patch_all; patch_all()
|
||||||
|
from pywb.warcserver.warcserver import WarcServer
|
||||||
|
|
||||||
|
application = WarcServer(custom_config={'collections': {'live': '$live'}})
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
|||||||
from gevent.monkey import patch_all; patch_all()
|
from gevent.monkey import patch_all; patch_all()
|
||||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
from pywb.apps.frontendapp import FrontEndApp
|
||||||
|
|
||||||
application = FrontEndApp()
|
application = FrontEndApp()
|
||||||
|
|
||||||
|
@ -1,7 +0,0 @@
|
|||||||
from gevent.monkey import patch_all; patch_all()
|
|
||||||
from pywb.webagg.autoapp import AutoConfigApp
|
|
||||||
|
|
||||||
application = AutoConfigApp(custom_config={'collections': {'live': '$live'}})
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -122,7 +122,7 @@ directory structure expected by pywb
|
|||||||
self._cdx_index(cdx_file, [self.archive_dir])
|
self._cdx_index(cdx_file, [self.archive_dir])
|
||||||
|
|
||||||
def _cdx_index(self, out, input_, rel_root=None):
|
def _cdx_index(self, out, input_, rel_root=None):
|
||||||
from pywb.warc.cdxindexer import write_multi_cdx_index
|
from pywb.indexer.cdxindexer import write_multi_cdx_index
|
||||||
|
|
||||||
options = dict(append_post=True,
|
options = dict(append_post=True,
|
||||||
cdxj=True,
|
cdxj=True,
|
||||||
|
@ -9,7 +9,7 @@ import re
|
|||||||
import webencodings
|
import webencodings
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
from pywb.webagg.utils import StreamIter, BUFF_SIZE
|
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
|
||||||
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
|
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config
|
||||||
@ -277,6 +277,7 @@ class RewriteInfo(object):
|
|||||||
|
|
||||||
self.cookie_rewriter = cookie_rewriter
|
self.cookie_rewriter = cookie_rewriter
|
||||||
|
|
||||||
|
if self.record:
|
||||||
self._fill_text_type_and_charset()
|
self._fill_text_type_and_charset()
|
||||||
self._resolve_text_type()
|
self._resolve_text_type()
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
|||||||
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||||
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
|
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
|
||||||
|
|
||||||
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter
|
from pywb.rewrite.header_rewriter import PrefixHeaderRewriter
|
||||||
|
|
||||||
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
||||||
|
|
||||||
@ -75,6 +75,10 @@ class DefaultRewriter(BaseContentRewriter):
|
|||||||
'text/plain': 'plain',
|
'text/plain': 'plain',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def __init__(self, rules_file=None, replay_mod=''):
|
||||||
|
rules_file = rules_file or 'pkg://pywb/rules.yaml'
|
||||||
|
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
|
||||||
|
|
||||||
def init_js_regex(self, regexs):
|
def init_js_regex(self, regexs):
|
||||||
return RegexRewriter.parse_rules_from_config(regexs)
|
return RegexRewriter.parse_rules_from_config(regexs)
|
||||||
|
|
@ -1,102 +1,87 @@
|
|||||||
from warcio.statusandheaders import StatusAndHeaders
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
from warcio.timeutils import datetime_to_http_date
|
from warcio.timeutils import datetime_to_http_date
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import six
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=============================================================================
|
||||||
class RewrittenStatusAndHeaders(object):
|
class PrefixHeaderRewriter(object):
|
||||||
def __init__(self, statusline, headers,
|
header_rules = {
|
||||||
removed_header_dict, text_type, charset):
|
'content-type': 'keep',
|
||||||
|
'content-disposition': 'keep',
|
||||||
|
'content-range': 'keep',
|
||||||
|
'accept-rangees': 'keep',
|
||||||
|
'www-authenticate': 'keep',
|
||||||
|
'proxy-authenticate': 'keep',
|
||||||
|
|
||||||
self.status_headers = StatusAndHeaders(statusline, headers)
|
'location': 'url-rewrite',
|
||||||
self.removed_header_dict = removed_header_dict
|
'content-location': 'url-rewrite',
|
||||||
self.text_type = text_type
|
'content-base': 'url-rewrite',
|
||||||
self.charset = charset
|
|
||||||
|
|
||||||
def contains_removed_header(self, name, value):
|
'transfer-encoding': 'prefix',
|
||||||
return self.removed_header_dict.get(name) == value
|
'connection': 'prefix',
|
||||||
|
|
||||||
def readd_rewrite_removed(self):
|
'content-encoding': 'keep-if-no-content-rewrite',
|
||||||
for name in HeaderRewriter.KEEP_NO_REWRITE_HEADERS:
|
'content-length': 'content-length',
|
||||||
value = self.removed_header_dict.get(name)
|
|
||||||
if value is not None:
|
|
||||||
self.status_headers.headers.append((name, value))
|
|
||||||
|
|
||||||
|
'set-cookie': 'cookie',
|
||||||
#=================================================================
|
'cookie': 'cookie',
|
||||||
class HeaderRewriter(object):
|
|
||||||
REWRITE_TYPES = {
|
|
||||||
'html': ['text/html',
|
|
||||||
'application/xhtml',
|
|
||||||
'application/xhtml+xml'],
|
|
||||||
|
|
||||||
'css': ['text/css'],
|
|
||||||
|
|
||||||
'js': ['text/javascript',
|
|
||||||
'application/javascript',
|
|
||||||
'application/x-javascript'],
|
|
||||||
|
|
||||||
'json': ['application/json'],
|
|
||||||
|
|
||||||
'hls': ['application/x-mpegURL'],
|
|
||||||
|
|
||||||
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
|
||||||
|
|
||||||
'plain': ['text/plain'],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range',
|
def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
|
||||||
'accept-ranges', 'www-authenticate', 'proxy-authenticate']
|
|
||||||
|
|
||||||
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
|
|
||||||
|
|
||||||
REMOVE_ALWAYS_HEADERS = ['transfer-encoding']
|
|
||||||
|
|
||||||
KEEP_PROXY_HEADERS = ['content-security-policy', 'strict-transport-security']
|
|
||||||
|
|
||||||
KEEP_NO_REWRITE_HEADERS = ['content-length', 'content-encoding']
|
|
||||||
|
|
||||||
COOKIE_HEADERS = ['set-cookie', 'cookie']
|
|
||||||
|
|
||||||
CACHE_HEADERS = ['cache-control', 'expires', 'etag', 'last-modified']
|
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, header_prefix='X-Archive-Orig-'):
|
|
||||||
self.header_prefix = header_prefix
|
self.header_prefix = header_prefix
|
||||||
|
self.rwinfo = rwinfo
|
||||||
|
self.http_headers = rwinfo.record.http_headers
|
||||||
|
|
||||||
def rewrite(self, status_headers, urlrewriter, cookie_rewriter):
|
if rwinfo.is_url_rw():
|
||||||
content_type = status_headers.get_header('Content-Type')
|
self.default_rule = 'prefix'
|
||||||
text_type = None
|
else:
|
||||||
charset = None
|
self.default_rule = 'keep'
|
||||||
content_modified = False
|
|
||||||
http_cache = None
|
|
||||||
if urlrewriter:
|
|
||||||
http_cache = urlrewriter.rewrite_opts.get('http_cache')
|
|
||||||
|
|
||||||
if content_type:
|
def __call__(self):
|
||||||
text_type = self._extract_text_type(content_type)
|
new_headers_list = []
|
||||||
if text_type:
|
for name, value in self.http_headers.headers:
|
||||||
charset = self._extract_char_set(content_type)
|
rule = self.header_rules.get(name.lower(), self.default_rule)
|
||||||
content_modified = True
|
new_header = self.rewrite_header(name, value, rule)
|
||||||
|
if new_header:
|
||||||
|
if isinstance(new_header, list):
|
||||||
|
new_headers_list.extend(new_header)
|
||||||
|
else:
|
||||||
|
new_headers_list.append(new_header)
|
||||||
|
|
||||||
result = self._rewrite_headers(status_headers.headers,
|
return StatusAndHeaders(self.http_headers.statusline,
|
||||||
urlrewriter,
|
headers=new_headers_list,
|
||||||
cookie_rewriter,
|
protocol=self.http_headers.protocol)
|
||||||
content_modified,
|
|
||||||
http_cache)
|
|
||||||
|
|
||||||
new_headers = result[0]
|
def rewrite_header(self, name, value, rule):
|
||||||
removed_header_dict = result[1]
|
if rule == 'keep':
|
||||||
|
return (name, value)
|
||||||
|
|
||||||
if http_cache != None and http_cache != 'pass':
|
elif rule == 'url-rewrite':
|
||||||
self._add_cache_headers(new_headers, http_cache)
|
return (name, self.rwinfo.url_rewriter.rewrite(value))
|
||||||
|
|
||||||
return RewrittenStatusAndHeaders(status_headers.statusline,
|
elif rule == 'keep-if-no-content-rewrite':
|
||||||
new_headers,
|
if not self.rwinfo.is_content_rw:
|
||||||
removed_header_dict,
|
return (name, value)
|
||||||
text_type,
|
|
||||||
charset)
|
elif rule == 'content-length':
|
||||||
|
if value == '0':
|
||||||
|
return (name, value)
|
||||||
|
|
||||||
|
if not self.rwinfo.is_content_rw:
|
||||||
|
try:
|
||||||
|
if int(value) >= 0:
|
||||||
|
return (name, value)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
elif rule == 'cookie':
|
||||||
|
if self.rwinfo.cookie_rewriter:
|
||||||
|
return self.rwinfo.cookie_rewriter.rewrite(value)
|
||||||
|
else:
|
||||||
|
return (name, value)
|
||||||
|
|
||||||
|
# default 'prefix'
|
||||||
|
return (self.header_prefix + name, value)
|
||||||
|
|
||||||
def _add_cache_headers(self, new_headers, http_cache):
|
def _add_cache_headers(self, new_headers, http_cache):
|
||||||
try:
|
try:
|
||||||
@ -112,76 +97,4 @@ class HeaderRewriter(object):
|
|||||||
new_headers.append(('Cache-Control', 'max-age=' + str(age)))
|
new_headers.append(('Cache-Control', 'max-age=' + str(age)))
|
||||||
new_headers.append(('Expires', datetime_to_http_date(dt)))
|
new_headers.append(('Expires', datetime_to_http_date(dt)))
|
||||||
|
|
||||||
def _extract_text_type(self, content_type):
|
|
||||||
for ctype, mimelist in six.iteritems(self.REWRITE_TYPES):
|
|
||||||
if any((mime in content_type) for mime in mimelist):
|
|
||||||
return ctype
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _extract_char_set(self, content_type):
|
|
||||||
CHARSET_TOKEN = 'charset='
|
|
||||||
idx = content_type.find(CHARSET_TOKEN)
|
|
||||||
if idx < 0:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return content_type[idx + len(CHARSET_TOKEN):].lower()
|
|
||||||
|
|
||||||
def _rewrite_headers(self, headers, urlrewriter,
|
|
||||||
cookie_rewriter,
|
|
||||||
content_modified,
|
|
||||||
http_cache):
|
|
||||||
|
|
||||||
new_headers = []
|
|
||||||
removed_header_dict = {}
|
|
||||||
|
|
||||||
def add_header(name, value):
|
|
||||||
new_headers.append((name, value))
|
|
||||||
|
|
||||||
def add_prefixed_header(name, value):
|
|
||||||
new_headers.append((self.header_prefix + name, value))
|
|
||||||
|
|
||||||
for (name, value) in headers:
|
|
||||||
lowername = name.lower()
|
|
||||||
|
|
||||||
if lowername in self.PROXY_HEADERS:
|
|
||||||
add_header(name, value)
|
|
||||||
|
|
||||||
elif urlrewriter and urlrewriter.prefix and lowername in self.URL_REWRITE_HEADERS:
|
|
||||||
new_headers.append((name, urlrewriter.rewrite(value)))
|
|
||||||
|
|
||||||
elif lowername in self.KEEP_NO_REWRITE_HEADERS:
|
|
||||||
if content_modified and value != '0':
|
|
||||||
removed_header_dict[lowername] = value
|
|
||||||
add_prefixed_header(name, value)
|
|
||||||
else:
|
|
||||||
add_header(name, value)
|
|
||||||
|
|
||||||
elif lowername in self.KEEP_PROXY_HEADERS:
|
|
||||||
if urlrewriter.prefix:
|
|
||||||
removed_header_dict[lowername] = value
|
|
||||||
add_prefixed_header(name, value)
|
|
||||||
else:
|
|
||||||
add_header(name, value)
|
|
||||||
|
|
||||||
elif lowername in self.REMOVE_ALWAYS_HEADERS:
|
|
||||||
removed_header_dict[lowername] = value
|
|
||||||
add_prefixed_header(name, value)
|
|
||||||
|
|
||||||
elif (lowername in self.COOKIE_HEADERS and
|
|
||||||
cookie_rewriter):
|
|
||||||
cookie_list = cookie_rewriter.rewrite(value)
|
|
||||||
new_headers.extend(cookie_list)
|
|
||||||
|
|
||||||
elif (lowername in self.CACHE_HEADERS):
|
|
||||||
if http_cache == 'pass':
|
|
||||||
add_header(name, value)
|
|
||||||
else:
|
|
||||||
add_prefixed_header(name, value)
|
|
||||||
|
|
||||||
elif urlrewriter and urlrewriter.prefix:
|
|
||||||
add_prefixed_header(name, value)
|
|
||||||
else:
|
|
||||||
add_header(name, value)
|
|
||||||
|
|
||||||
return (new_headers, removed_header_dict)
|
|
||||||
|
@ -1,403 +0,0 @@
|
|||||||
#import chardet
|
|
||||||
import pkgutil
|
|
||||||
import webencodings
|
|
||||||
import yaml
|
|
||||||
import re
|
|
||||||
|
|
||||||
#from chardet.universaldetector import UniversalDetector
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
|
|
||||||
|
|
||||||
from pywb.rewrite.rewriterules import RewriteRules
|
|
||||||
|
|
||||||
from pywb.utils.dsrules import RuleSet
|
|
||||||
|
|
||||||
from warcio.statusandheaders import StatusAndHeaders
|
|
||||||
from warcio.bufferedreaders import DecompressingBufferedReader
|
|
||||||
from warcio.bufferedreaders import ChunkedDataReader, BufferedReader
|
|
||||||
from warcio.utils import to_native_str
|
|
||||||
|
|
||||||
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class RewriteContent(object):
|
|
||||||
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
|
|
||||||
|
|
||||||
TAG_REGEX = re.compile(b'^\s*\<')
|
|
||||||
|
|
||||||
CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
|
|
||||||
|
|
||||||
BUFF_SIZE = 16384
|
|
||||||
|
|
||||||
def __init__(self, ds_rules_file=None, is_framed_replay=False):
|
|
||||||
self.ruleset = RuleSet(RewriteRules, 'rewrite',
|
|
||||||
default_rule_config={},
|
|
||||||
ds_rules_file=ds_rules_file)
|
|
||||||
|
|
||||||
if is_framed_replay == 'inverse':
|
|
||||||
self.defmod = 'mp_'
|
|
||||||
else:
|
|
||||||
self.defmod = ''
|
|
||||||
|
|
||||||
def sanitize_content(self, status_headers, stream):
|
|
||||||
# remove transfer encoding chunked and wrap in a dechunking stream
|
|
||||||
if (status_headers.remove_header('transfer-encoding')):
|
|
||||||
stream = ChunkedDataReader(stream)
|
|
||||||
|
|
||||||
return (status_headers, stream)
|
|
||||||
|
|
||||||
def _rewrite_headers(self, urlrewriter, rule, status_headers, stream,
|
|
||||||
urlkey='', cookie_rewriter=None):
|
|
||||||
|
|
||||||
header_rewriter_class = rule.rewriters['header']
|
|
||||||
|
|
||||||
if urlrewriter and not cookie_rewriter:
|
|
||||||
cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)
|
|
||||||
|
|
||||||
rewritten_headers = (header_rewriter_class().
|
|
||||||
rewrite(status_headers,
|
|
||||||
urlrewriter,
|
|
||||||
cookie_rewriter))
|
|
||||||
|
|
||||||
# note: since chunk encoding may/may not be valid,
|
|
||||||
# the approach taken here is to *always* attempt
|
|
||||||
# to dechunk if 'transfer-encoding: chunked' is present
|
|
||||||
#
|
|
||||||
# an alternative may be to serve chunked unless
|
|
||||||
# content rewriting is needed
|
|
||||||
# todo: possible revisit this approach
|
|
||||||
|
|
||||||
if (rewritten_headers.
|
|
||||||
contains_removed_header('transfer-encoding', 'chunked')):
|
|
||||||
|
|
||||||
stream = ChunkedDataReader(stream)
|
|
||||||
|
|
||||||
return (rewritten_headers, stream)
|
|
||||||
|
|
||||||
def _decoding_stream(self, rewritten_headers, stream):
|
|
||||||
for decomp_type in BufferedReader.get_supported_decompressors():
|
|
||||||
matched, stream = self._check_encoding(rewritten_headers,
|
|
||||||
stream,
|
|
||||||
decomp_type)
|
|
||||||
if matched:
|
|
||||||
break
|
|
||||||
|
|
||||||
return stream
|
|
||||||
|
|
||||||
def _check_encoding(self, rewritten_headers, stream, enc):
|
|
||||||
matched = False
|
|
||||||
if (rewritten_headers.
|
|
||||||
contains_removed_header('content-encoding', enc)):
|
|
||||||
|
|
||||||
#optimize: if already a ChunkedDataReader, add the encoding
|
|
||||||
if isinstance(stream, ChunkedDataReader):
|
|
||||||
stream.set_decomp(enc)
|
|
||||||
else:
|
|
||||||
stream = DecompressingBufferedReader(stream, decomp_type=enc)
|
|
||||||
|
|
||||||
rewritten_headers.status_headers.remove_header('content-length')
|
|
||||||
matched = True
|
|
||||||
|
|
||||||
return matched, stream
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def rewrite_content(self, urlrewriter, status_headers, stream,
                    head_insert_func=None, urlkey='',
                    cdx=None, cookie_rewriter=None, env=None):
    """
    Rewrite response headers and, for text content, the response body.

    Returns a ``(status_headers, generator, flag)`` tuple. ``flag`` is
    True only when the body is passed through a text rewriter; it is
    False for identity/banner-only/pass-through responses.

    :param urlrewriter: url rewriter; its ``wburl`` selects the mode
    :param status_headers: original status and headers
    :param stream: response body stream
    :param head_insert_func: optional callable ``(rule, cdx)`` returning
        the text to insert after the html head tag
    :param urlkey: key used to look up the rewrite rule
    :param cdx: optional index record; ``is_live`` marks live content
    :param cookie_rewriter: optional explicit cookie rewriter
    :param env: passed through to ``handle_custom_rewrite``
    """
    wb_url = urlrewriter.wburl

    # identity (or banner-only with nothing to insert): no rewriting
    if (wb_url.is_identity or
            (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(status_headers,
                                                       stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if urlrewriter and cdx and cdx.get('is_live'):
        urlrewriter.rewrite_opts['is_live'] = True

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        status_headers,
                                                        stream,
                                                        urlkey,
                                                        cookie_rewriter)

    res = self.handle_custom_rewrite(rewritten_headers,
                                     stream,
                                     urlrewriter,
                                     wb_url.mod,
                                     env)
    if res:
        return res

    # Handle text content rewriting
    # ====================================================================
    # special case -- need to ungzip the body

    status_headers = rewritten_headers.status_headers
    text_type = rewritten_headers.text_type

    # if a known js/css modifier is specified, it may override the
    # default text_type
    mod = wb_url.mod

    first_buff = b''

    stream = self._decoding_stream(rewritten_headers, stream)

    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js',
                                                    text_type,
                                                    stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css',
                                                    text_type,
                                                    stream)

    # for proxy mode: use special js_proxy rewriter
    # which may be none rewriter + custom rules (if any)
    if text_type == 'js' and not urlrewriter.prefix:
        rewriter_class = rule.rewriters['js_proxy']
    else:
        rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff,
                                                 status_headers)

        if head_insert_func and not wb_url.is_url_rewrite_only:
            head_insert_orig = head_insert_func(rule, cdx)

            if charset:
                try:
                    head_insert_str = webencodings.encode(head_insert_orig,
                                                          charset)
                except Exception:
                    # best effort: unknown label or unencodable text,
                    # fall back to utf-8 below
                    pass

            if not head_insert_str:
                charset = 'utf-8'
                head_insert_str = head_insert_orig.encode(charset)

            # keep the encoded bytes for banner-only insertion; the
            # rewriter receives the same bytes mapped 1:1 to text
            head_insert_buf = head_insert_str
            head_insert_str = head_insert_str.decode('iso-8859-1')

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_buf,
                                             stream,
                                             first_buff)

            content_len = status_headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            # account for the inserted bytes in Content-Length
            if content_len is not None and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length',
                                              content_len)

            return (status_headers, gen, False)

        # if proxy, use js_proxy rewriter
        if not urlrewriter.prefix:
            js_rewriter_class = rule.rewriters['js_proxy']
        else:
            js_rewriter_class = rule.rewriters['js']

        css_rewriter_class = rule.rewriters['css']

        if wb_url.is_url_rewrite_only:
            js_rewriter_class = JSNoneRewriter

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=js_rewriter_class,
                                  css_rewriter_class=css_rewriter_class,
                                  head_insert=head_insert_str,
                                  url=wb_url.url,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # url-only rewriting of JS: use the link-only JS rewriter
        if wb_url.is_url_rewrite_only and text_type == 'js':
            rewriter_class = JSLinkOnlyRewriter

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = (text_type != 'html')

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff,
                                          align_to_line=align)

    return (status_headers, gen, True)
|
|
||||||
|
|
||||||
def handle_custom_rewrite(self, rewritten_headers, stream,
                          urlrewriter, mod, env):
    """
    Decide whether the body can be streamed without text rewriting.

    Returns ``(status_headers, generator, False)`` when no text type was
    detected, for html served with the 'is_ajax' rewrite option, or for
    plain text outside the 'js_'/'cs_' modifiers (the last two also
    re-add the headers removed by rewriting). Returns None otherwise.
    """
    text_type = rewritten_headers.text_type
    status_headers = rewritten_headers.status_headers

    # use rewritten headers, but no further rewriting needed
    if text_type is None:
        return (status_headers, self.stream_to_gen(stream), False)

    if text_type == 'html':
        passthrough = urlrewriter.rewrite_opts.get('is_ajax')
    elif text_type == 'plain':
        passthrough = mod not in ('js_', 'cs_')
    else:
        passthrough = False

    if not passthrough:
        return None

    rewritten_headers.readd_rewrite_removed()
    return (status_headers, self.stream_to_gen(stream), False)
|
|
||||||
|
|
||||||
@staticmethod
def _extract_html_charset(buff, status_headers):
    """
    Return the charset matched by CHARSET_REGEX in ``buff`` as a native
    string, or None if there is no match.

    ``status_headers`` is accepted but not used.
    """
    match = RewriteContent.CHARSET_REGEX.search(buff)
    if not match:
        return None

    return to_native_str(match.group(1))
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _resolve_text_type(mod, text_type, stream):
|
|
||||||
if text_type == 'css' and mod == 'js':
|
|
||||||
return 'css', stream
|
|
||||||
|
|
||||||
# only attempt to resolve between html and other text types
|
|
||||||
if text_type != 'html':
|
|
||||||
return mod, stream
|
|
||||||
|
|
||||||
buff = stream.read(128)
|
|
||||||
|
|
||||||
wrapped_stream = BufferedReader(stream, starting_data=buff)
|
|
||||||
|
|
||||||
# check if starts with a tag, then likely html
|
|
||||||
if RewriteContent.TAG_REGEX.match(buff):
|
|
||||||
mod = 'html'
|
|
||||||
|
|
||||||
return mod, wrapped_stream
|
|
||||||
|
|
||||||
def _head_insert_only_gen(self, insert_str, stream, first_buff=b''):
    """
    Yield the stream content with ``insert_str`` added after the head tag.

    Up to 1024 bytes (including ``first_buff``) are buffered and searched
    with HEAD_REGEX. If the tag is found, ``insert_str`` is emitted right
    after it; otherwise ``insert_str`` is emitted before the buffered
    content. The remainder of the stream follows unmodified.
    """
    buff = first_buff
    max_len = 1024 - len(first_buff)

    while max_len > 0:
        curr = stream.read(max_len)
        if not curr:
            break

        # count only the bytes just read; subtracting the whole buffer
        # would miscount the remaining search window on short reads
        max_len -= len(curr)
        buff += curr

    matcher = self.HEAD_REGEX.search(buff)

    if matcher:
        yield buff[:matcher.end()]
        yield insert_str
        yield buff[matcher.end():]
    else:
        yield insert_str
        yield buff

    for buff in self.stream_to_gen(stream):
        yield buff
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _decode_buff(buff, stream, encoding): # pragma: no coverage
|
|
||||||
try:
|
|
||||||
buff = buff.decode(encoding)
|
|
||||||
except UnicodeDecodeError as e:
|
|
||||||
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
|
|
||||||
for i in range(3):
|
|
||||||
buff += stream.read(1)
|
|
||||||
try:
|
|
||||||
buff = buff.decode(encoding)
|
|
||||||
break
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
return buff
|
|
||||||
|
|
||||||
@staticmethod
def stream_to_gen(stream):
    """
    Iterate over ``stream`` in reads of BUFF_SIZE bytes.

    The final empty read is also yielded. The stream is closed when the
    generator finishes or is closed.
    """
    try:
        chunk = stream.read(RewriteContent.BUFF_SIZE)
        while chunk:
            yield chunk
            chunk = stream.read(RewriteContent.BUFF_SIZE)

        # emit the terminating empty read as well
        yield chunk

    finally:
        stream.close()
|
|
||||||
|
|
||||||
@staticmethod
def rewrite_text_stream_to_gen(stream, rewrite_func,
                               final_read_func, first_buff,
                               align_to_line):
    """
    Convert stream to a generator by applying ``rewrite_func``
    to each portion of the stream.

    Each portion is decoded as iso-8859-1 before rewriting and encoded
    back afterwards. If ``align_to_line`` is set, each read is extended
    to the end of the current line. The result of ``final_read_func`` is
    emitted last, and the stream is always closed.
    """
    try:
        can_check_closed = hasattr(stream, 'closed')
        chunk = first_buff

        while True:
            if chunk:
                rewritten = rewrite_func(chunk.decode('iso-8859-1'))
                yield rewritten.encode('iso-8859-1')

            chunk = stream.read(RewriteContent.BUFF_SIZE)
            if not chunk:
                break

            # on 2.6, readline() (but not read()) throws an exception
            # if stream already closed, so check stream.closed if present
            if align_to_line and not (can_check_closed and stream.closed):
                chunk += stream.readline()

        # For adding a tail/handling final buffer
        tail = final_read_func()
        if tail:
            yield tail.encode('iso-8859-1')

    finally:
        stream.close()
|
|
||||||
|
|
||||||
|
|
@ -1,315 +0,0 @@
|
|||||||
"""
|
|
||||||
Fetch a url from live web and apply rewriting rules
|
|
||||||
"""
|
|
||||||
|
|
||||||
from requests import request as live_request
|
|
||||||
|
|
||||||
import mimetypes
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit
|
|
||||||
import six
|
|
||||||
|
|
||||||
from warcio.timeutils import timestamp_now
|
|
||||||
from warcio.statusandheaders import StatusAndHeaders
|
|
||||||
|
|
||||||
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
|
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
|
||||||
from pywb.utils.canonicalize import canonicalize
|
|
||||||
|
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class LiveRewriter(object):
    """
    Fetch a url (over http, or from a local file) and pass the response
    through RewriteContent.
    """

    def __init__(self, is_framed_replay=False, proxies=None):
        """
        :param is_framed_replay: passed through to RewriteContent
        :param proxies: None, a proxy url string (used for both http and
            https), or a proxies dict handed to the request function
        """
        self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)

        self.proxies = proxies

        self.live_request = live_request

        if self.proxies:
            logging.debug('Live Rewrite via proxy ' + str(proxies))

            if isinstance(proxies, str):
                self.proxies = {'http': proxies,
                                'https': proxies}

        else:
            logging.debug('Live Rewrite Direct (no proxy)')

    def is_recording(self):
        """Return True if a proxies setting is present."""
        return self.proxies is not None

    def fetch_local_file(self, uri):
        """
        Load a local file and return ``(status_headers, stream)`` with
        synthesized '200 OK' headers and a guessed Content-Type.
        """
        fh = LocalFileLoader().load(uri)

        content_type, _ = mimetypes.guess_type(uri)

        # create fake headers for local file
        status_headers = StatusAndHeaders('200 OK',
                                          [('Content-Type', content_type)])
        stream = fh

        return (status_headers, stream)

    def translate_headers(self, url, urlkey, env):
        """
        Build outgoing request headers for ``url`` from a WSGI-style
        ``env`` mapping.

        Host, Origin and X-Forwarded-Proto are derived from ``url``;
        referer and the pywb requested-with header are dropped; cookies
        go through the rule's request cookie rewrite. Entries with an
        empty value are omitted.
        """
        headers = {}

        splits = urlsplit(url)
        has_cookies = False

        for name, value in six.iteritems(env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (splits.scheme + '://' + splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                # prefer the token stored in the client cookie, if any
                cookie_val = extract_client_cookie(env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_REFERER':
                continue

            elif name == 'HTTP_X_PYWB_REQUESTED_WITH':
                continue

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(urlkey, value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            elif name == 'REL_REFERER':
                name = 'Referer'
            else:
                value = None

            if value:
                headers[name] = value

        # rules may add cookies even if the client sent none
        if not has_cookies:
            value = self._req_cookie_rewrite(urlkey, '')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, urlkey, value):
        """
        Apply the matching rule's ``req_cookie_rewrite`` substitutions to
        the cookie ``value``. Entries missing 'rx' or 'replace' are skipped.
        """
        rule = self.rewriter.ruleset.get_first_match(urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                pass

        return value

    def fetch_http(self, url,
                   urlkey=None,
                   env=None,
                   req_headers=None,
                   follow_redirects=False,
                   skip_recording=False,
                   verify=True):
        """
        Perform the http request and return ``(status_headers, stream)``,
        where ``stream`` is the raw response body.

        When ``env`` is given, the method, translated headers and (for
        POST/PUT) the request body are taken from it. Proxies are used
        unless ``skip_recording`` is set.
        """
        method = 'GET'
        data = None

        proxies = None
        if not skip_recording:
            proxies = self.proxies

        if not req_headers:
            req_headers = {}

        if env is not None:
            method = env['REQUEST_METHOD'].upper()
            input_ = env['wsgi.input']

            req_headers.update(self.translate_headers(url, urlkey, env))

            if method in ('POST', 'PUT'):
                len_ = env.get('CONTENT_LENGTH')
                if len_:
                    data = LimitReader(input_, int(len_))
                else:
                    data = input_

        response = self.live_request(method=method,
                                     url=url,
                                     data=data,
                                     headers=req_headers,
                                     allow_redirects=follow_redirects,
                                     proxies=proxies,
                                     stream=True,
                                     verify=verify)

        statusline = str(response.status_code) + ' ' + response.reason

        stream = response.raw

        # read the header list from the underlying response object
        try:  #pragma: no cover
            #PY 3
            headers = stream._original_response.headers._headers
        except AttributeError:  #pragma: no cover
            #PY 2
            headers = []
            resp_headers = stream._original_response.msg.headers
            for h in resp_headers:
                n, v = h.split(':', 1)
                n = n.strip()
                v = v.strip()
                headers.append((n, v))

        status_headers = StatusAndHeaders(statusline, headers)

        return (status_headers, stream)

    def fetch_request(self, url, urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers=None,
                      timestamp=None,
                      follow_redirects=False,
                      skip_recording=False,
                      verify=True,
                      remote_only=True):
        """
        Fetch ``url`` and return the result of ``rewrite_content`` for it.

        A cdx dict describing the fetch (marked ``is_live``) is passed to
        the rewriter and, if ``env`` is non-empty, stored under
        ``env['pywb.cdx']``. ``req_headers`` defaults to no extra headers.
        """
        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if remote_only or is_http(url):
            is_remote = True
        else:
            is_remote = False
            if not url.startswith('file:'):
                url = to_file_url(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if is_remote:
            (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                       req_headers,
                                                       follow_redirects,
                                                       skip_recording,
                                                       verify)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        if timestamp is None:
            timestamp = timestamp_now()

        cdx = {'urlkey': urlkey,
               'timestamp': timestamp,
               'url': url,
               'status': status_headers.get_statuscode(),
               'mime': status_headers.get_header('Content-Type'),
               'is_live': True,
               }

        result = (self.rewriter.
                  rewrite_content(urlrewriter,
                                  status_headers,
                                  stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=urlkey,
                                  cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result

    def fetch_async(self, url, headers):
        """Issue a GET through the proxies and close it without reading."""
        resp = self.live_request(method='GET',
                                 url=url,
                                 headers=headers,
                                 proxies=self.proxies,
                                 verify=False,
                                 stream=True)

        # don't actually read whole response,
        # proxy response for writing it
        resp.close()

    def add_metadata(self, url, headers, data):
        """Send ``data`` with the 'PUTMETA' method through the proxies."""
        return self.live_request(method='PUTMETA',
                                 url=url,
                                 data=data,
                                 headers=headers,
                                 proxies=self.proxies,
                                 verify=False)

    def get_rewritten(self, *args, **kwargs):
        """
        Call ``fetch_request`` and return ``(status_headers, body)`` with
        the generated content joined into a single bytes object.
        """
        result = self.fetch_request(*args, **kwargs)

        status_headers, gen, is_rewritten = result

        buff = b''.join(gen)

        return (status_headers, buff)

    def get_video_info(self, url):
        """Return the module-level youtube-dl wrapper's info for ``url``."""
        return youtubedl.extract_info(url)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class YoutubeDLWrapper(object):  #pragma: no cover
    """ YoutubeDL wrapper, inits youtube-dl if it is available
    """
    def __init__(self):
        try:
            from youtube_dl import YoutubeDL as YoutubeDL
        except ImportError:
            # youtube-dl not installed: wrapper stays inactive
            self.ydl = None
            return

        options = dict(simulate=True,
                       youtube_include_dash_manifest=False)
        self.ydl = YoutubeDL(options)
        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """Return youtube-dl's info for ``url``, or None if unavailable."""
        return self.ydl.extract_info(url) if self.ydl else None


#=================================================================
youtubedl = YoutubeDLWrapper()
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
|||||||
from pywb.webagg.inputrequest import DirectWSGIInputRequest
|
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
|
|
||||||
from six import iteritems
|
from six import iteritems
|
@ -1,80 +0,0 @@
|
|||||||
from pywb.utils.dsrules import BaseRule
|
|
||||||
|
|
||||||
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
|
||||||
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
|
||||||
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
|
|
||||||
|
|
||||||
from pywb.rewrite.header_rewriter import HeaderRewriter
|
|
||||||
from pywb.rewrite.html_rewriter import HTMLRewriter
|
|
||||||
|
|
||||||
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class RewriteRules(BaseRule):
    """
    Per-url-prefix rewriting rule: selects the rewriter class for each
    content type and holds cookie rewriting settings, all read from the
    rule's ``config`` dict.
    """

    def __init__(self, url_prefix, config=None):
        """
        :param url_prefix: prefix passed on to BaseRule
        :param config: rule settings dict; an empty dict is used if omitted
        """
        # avoid sharing one default dict instance between all rules
        if config is None:
            config = {}

        super(RewriteRules, self).__init__(url_prefix, config)

        self.rewriters = {}

        self.rewriters['header'] = config.get('header_class', HeaderRewriter)
        self.rewriters['css'] = config.get('css_class', CSSRewriter)
        self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
        self.rewriters['html'] = config.get('html_class', HTMLRewriter)
        self.rewriters['json'] = config.get('json_class', JSONPRewriter)

        self.parse_comments = config.get('parse_comments', False)

        # Custom handling for js rewriting, often the most complex
        self.js_rewrite_location = config.get('js_rewrite_location', 'location')

        # ability to toggle rewriting
        if self.js_rewrite_location == 'all':
            js_default_class = JSLinkAndLocationRewriter
        elif self.js_rewrite_location == 'location':
            js_default_class = JSLocationOnlyRewriter
        elif self.js_rewrite_location == 'none':
            js_default_class = JSNoneRewriter
        else:
            js_default_class = JSLinkOnlyRewriter

        # set js class, using either default or override from config
        self.rewriters['js'] = config.get('js_class', js_default_class)

        self.rewriters['js_proxy'] = JSNoneRewriter

        # add any regexs for js rewriter
        self._add_custom_regexs('js', 'js_regexs', config)
        self._add_custom_regexs('js_proxy', 'js_regexs', config)

        # cookie rewrite scope
        self.cookie_scope = config.get('cookie_scope', 'default')

        # precompile each request cookie rewrite pattern; the compiled
        # regex is stored on the config entry itself under 'rx'
        req_cookie_rewrite = config.get('req_cookie_rewrite', [])
        for rc in req_cookie_rewrite:
            rc['rx'] = re.compile(rc.get('match', ''))

        self.req_cookie_rewrite = req_cookie_rewrite

    def _add_custom_regexs(self, rw_id, field, config):
        """
        If ``config[field]`` is set, replace ``self.rewriters[rw_id]`` with
        a factory that builds the original rewriter class with the extra
        rules parsed from that config value.
        """
        regexs = config.get(field)
        if not regexs:
            return

        rewriter_cls = self.rewriters[rw_id]

        parse_rules_func = RegexRewriter.parse_rules_from_config(regexs)

        def extend_rewriter_with_regex(urlrewriter):
            rule_def_tuples = parse_rules_func(urlrewriter)
            return rewriter_cls(urlrewriter, rule_def_tuples)

        self.rewriters[rw_id] = extend_rewriter_with_regex
|
|
||||||
|
|
@ -1,271 +0,0 @@
|
|||||||
from pywb.rewrite.rewrite_live import LiveRewriter
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
|
||||||
|
|
||||||
from pywb.utils.loaders import to_native_str
|
|
||||||
|
|
||||||
from pywb import get_test_dir
|
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
# This module has some rewriting tests against the 'live web'
|
|
||||||
# As such, the content may change and the test may break
|
|
||||||
|
|
||||||
# url rewriter for regular replay of the sample page
urlrewriter = UrlRewriter(
    '20131226101010/http://example.com/some/path/index.html',
    '/pywb/')

# same page, with the banner-only ('bn_') modifier
bn_urlrewriter = UrlRewriter(
    '20131226101010bn_/http://example.com/some/path/index.html',
    '/pywb/')
|
|
||||||
|
|
||||||
def head_insert_func(rule, cdx):
    """Return the wombat script tag, or '' for rules with 'urls' js rewriting."""
    if rule.js_rewrite_location == 'urls':
        return ''

    return '<script src="/static/__pywb/wombat.js"> </script>'
|
|
||||||
|
|
||||||
def test_csrf_token_headers():
    """X-CSRFToken takes the value of the 'csrftoken' client cookie."""
    environ = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'}

    translated = LiveRewriter().translate_headers('http://example.com/',
                                                  'com,example)/',
                                                  environ)

    expected = {'X-CSRFToken': 'foobar', 'Cookie': 'csrftoken=foobar'}
    assert translated == expected
|
|
||||||
|
|
||||||
def test_forwarded_scheme():
    """X-Forwarded-Proto follows the target url scheme; unknown keys are dropped."""
    environ = {'HTTP_X_FORWARDED_PROTO': 'https', 'Other': 'Value'}

    translated = LiveRewriter().translate_headers('http://example.com/',
                                                  'com,example)/',
                                                  environ)

    expected = {'X-Forwarded-Proto': 'http'}
    assert translated == expected
|
|
||||||
|
|
||||||
def test_req_cookie_rewrite_1():
    """An unrelated cookie is kept and the rule's cookie is appended."""
    live = LiveRewriter()
    environ = {'HTTP_COOKIE': 'A=B'}

    translated = live.translate_headers('test.example.example/',
                                        'example,example,test)/',
                                        environ)

    expected = {'Cookie': 'A=B; FOO=&bar=1'}
    assert translated == expected
|
|
||||||
|
|
||||||
def test_req_cookie_rewrite_2():
    """An existing FOO cookie is rewritten by the rule."""
    live = LiveRewriter()
    environ = {'HTTP_COOKIE': 'FOO=goo'}

    translated = live.translate_headers('test.example.example/',
                                        'example,example,test)/',
                                        environ)

    expected = {'Cookie': 'FOO=&bar=1'}
    assert translated == expected
|
|
||||||
|
|
||||||
def test_req_cookie_rewrite_3():
    """With no client cookies at all, the rule still adds its cookie."""
    live = LiveRewriter()
    environ = {}

    translated = live.translate_headers('test.example.example/',
                                        'example,example,test)/',
                                        environ)

    expected = {'Cookie': '; FOO=&bar=1'}
    assert translated == expected
|
|
||||||
|
|
||||||
def test_local_1():
    """Local sample page: head insert plus JS location/link and html link rewriting."""
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,all)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff, buff

    # JS location and JS link rewritten
    # (raw string: the expected text contains literal backslashes)
    assert r'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff

    # link rewritten
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
|
||||||
|
|
||||||
|
|
||||||
def test_local_no_head():
    """Sample page without a head tag still gets the insert and rewriting."""
    sample = get_test_dir() + 'text_content/sample_no_head.html'
    status_headers, buff = get_rewritten(sample,
                                         urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    wombat_tag = '<script src="/static/__pywb/wombat.js"> </script>'
    rewritten_link = '"/pywb/20131226101010/http://example.com/some/path/another.html"'

    # wombat insert added
    assert wombat_tag in buff, buff

    # location rewritten
    assert 'window.WB_wombat_location = "/other.html"' in buff, buff

    # link rewritten
    assert rewritten_link in buff, buff
|
|
||||||
|
|
||||||
def test_local_no_head_only_title():
    """Second no-head sample page still receives the wombat insert."""
    sample = get_test_dir() + 'text_content/sample_no_head_2.html'
    status_headers, buff = get_rewritten(sample,
                                         urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    # wombat insert added
    wombat_tag = '<script src="/static/__pywb/wombat.js"> </script>'
    assert wombat_tag in buff
|
|
||||||
|
|
||||||
|
|
||||||
def test_local_no_head_banner_only():
    """Banner-only mode on a no-head page: insert added, nothing rewritten."""
    sample = get_test_dir() + 'text_content/sample_no_head.html'
    status_headers, buff = get_rewritten(sample,
                                         bn_urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    # wombat insert added
    wombat_tag = '<script src="/static/__pywb/wombat.js"> </script>'
    assert wombat_tag in buff

    # location NOT rewritten
    assert 'window.location = "/other.html"' in buff

    # link NOT rewritten
    assert '"/some/path/another.html"' in buff
|
|
||||||
|
|
||||||
def test_local_banner_only_no_rewrite():
    """Banner-only mode: head insert is added but JS and links stay untouched."""
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         bn_urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location NOT rewritten, JS link NOT rewritten
    # (raw string: the expected text contains literal backslashes)
    assert r'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff

    # link NOT rewritten
    assert '"/some/path/another.html"' in buff
|
|
||||||
|
|
||||||
def test_local_2_link_only_rewrite():
    """Link-only rule: no head insert, JS link rewritten, JS location kept."""
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test)/nolocation_rewrite')

    # no wombat insert
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' not in buff

    # JS location NOT rewritten, JS link rewritten
    # (raw string: the expected text contains literal backslashes)
    assert r'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
|
||||||
|
|
||||||
|
|
||||||
def test_local_2_js_loc_only_rewrite():
    """Location-only rule: JS location rewritten, JS link kept, html link rewritten."""
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,loconly)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location rewritten, JS link NOT rewritten
    # (raw string: the expected text contains literal backslashes)
    assert r'window.WB_wombat_location = "http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
|
||||||
|
|
||||||
def test_local_2_no_rewrite():
    """No-JS-rewrite rule: JS untouched, html link still rewritten."""
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,norewrite)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location NOT rewritten, JS link NOT rewritten
    # (raw string: the expected text contains literal backslashes)
    assert r'window.location = "http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
|
||||||
|
|
||||||
def test_local_unclosed_script():
    """Rewrite sample_unclosed_script.html and check the output still has a closing script tag."""
    sample_path = get_test_dir() + 'text_content/sample_unclosed_script.html'
    status_headers, body = get_rewritten(sample_path,
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,all)/')

    # the wombat script must have been inserted right after <head>
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in body, body

    # both the JS location and the JS link are rewritten
    assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html";' in body, body

    assert '</script>' in body, body
|
|
||||||
|
|
||||||
|
|
||||||
def test_example_1():
    """Rewrite http://example.com/ and check the resulting headers and links."""
    request_headers = {'Connection': 'close', 'Accept-Encoding': 'identity'}
    status_headers, body = get_rewritten('http://example.com/',
                                         urlrewriter,
                                         req_headers=request_headers)

    # header rewriting: the original content-length is kept under the prefix
    assert status_headers.get_header('x-archive-orig-content-length') == '1270', status_headers

    # utf-8 charset detection
    assert status_headers.get_header('content-type') == 'text/html'

    assert '/pywb/20131226101010/http://www.iana.org/domains/example' in body, body
|
|
||||||
|
|
||||||
def test_example_2_redirect():
    """A redirecting URL yields a 302 status and an empty body."""
    redirect_url = 'http://httpbin.org/redirect-to?url=http://example.com/'
    status_headers, body = get_rewritten(redirect_url, urlrewriter)

    # a redirect carries no content
    assert status_headers.get_statuscode() == '302'
    assert len(body) == 0
|
|
||||||
|
|
||||||
|
|
||||||
def test_example_3_rel():
    """A scheme-relative URL ('//example.com/') is fetched with a 200 status."""
    scheme_relative_url = '//example.com/'
    status_headers, body = get_rewritten(scheme_relative_url, urlrewriter)
    assert status_headers.get_statuscode() == '200'
|
|
||||||
|
|
||||||
|
|
||||||
def test_example_4_rewrite_err():
    """A URL with a stray '///' after the host is still fetched with a 200 status."""
    # may occur in case of rewrite mismatch, the /// gets stripped off
    mismatched_url = 'http://localhost:8080///example.com/'
    status_headers, body = get_rewritten(mismatched_url, urlrewriter)
    assert status_headers.get_statuscode() == '200'
|
|
||||||
|
|
||||||
def test_example_domain_specific_3():
    """If Bootloader.configurePage appears in the rewritten page, it must be commented out."""
    status_headers, body = get_rewritten('http://facebook.com/digitalpreservation',
                                         urlrewriter,
                                         follow_redirects=True)

    # Bootloader.configurePage is either absent, or present only in commented-out form
    assert 'Bootloader.configurePage' not in body or '/* Bootloader.configurePage' in body
|
|
||||||
|
|
||||||
def test_wombat_top():
    """Rewrite toptest.js and check that the WB_wombat_top comparison appears in the output."""
    #status_headers, buff = get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', urlrewriter)
    script_path = get_test_dir() + 'text_content/toptest.js'
    status_headers, body = get_rewritten(script_path, urlrewriter)

    assert 'WB_wombat_top!==window' in body
|
|
||||||
|
|
||||||
def test_post():
    """Send a small POST body to httpbin.org/post and expect a 200 status."""
    # minimal WSGI environ carrying the POST payload as 'wsgi.input'
    env = {
        'REQUEST_METHOD': 'POST',
        'HTTP_ORIGIN': 'http://httpbin.org',
        'HTTP_HOST': 'httpbin.org',
        'wsgi.input': BytesIO(b'ABC=DEF'),
    }

    status_headers, resp_buff = get_rewritten('http://httpbin.org/post',
                                              urlrewriter,
                                              env=env)
    assert status_headers.get_statuscode() == '200', status_headers
|
|
||||||
|
|
||||||
def test_multiple_set_cookies():
    """Each cookie set by the response appears as its own rewritten Set-Cookie header."""
    status_headers, body = get_rewritten('http://httpbin.org/cookies/set?A=B&C=D',
                                         urlrewriter)

    assert status_headers.get_statuscode() == '302'

    print(status_headers.headers)

    expected_cookies = (
        ('Set-Cookie', 'A=B; Path=/pywb/20131226101010/http://example.com/'),
        ('Set-Cookie', 'C=D; Path=/pywb/20131226101010/http://example.com/'),
    )
    for cookie_header in expected_cookies:
        assert cookie_header in status_headers.headers
|
|
||||||
|
|
||||||
|
|
||||||
def get_rewritten(*args, **kwargs):
    """
    Run LiveRewriter.get_rewritten() with remote_only=False.

    All positional and keyword arguments are forwarded unchanged.
    Returns a (status_headers, body) tuple, with the body converted
    via to_native_str().
    """
    rewriter = LiveRewriter()
    status_headers, raw_body = rewriter.get_rewritten(*args, remote_only=False, **kwargs)
    native_body = to_native_str(raw_body)
    return status_headers, native_body
|
|
@ -1,100 +0,0 @@
|
|||||||
from warcio.statusandheaders import StatusAndHeaders
|
|
||||||
from warcio.timeutils import datetime_to_http_date
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
|
||||||
class PrefixHeaderRewriter(object):
    """
    Rewrite the HTTP headers of a record according to a per-header rule table.

    ``header_rules`` maps lower-cased header names to a rule name. Headers
    not listed there use ``default_rule``, which is 'prefix' when
    ``rwinfo.is_url_rw()`` is true and 'keep' otherwise.

    Rules handled by ``rewrite_header``:

    - 'keep': header is returned unchanged
    - 'url-rewrite': value is passed through ``rwinfo.url_rewriter.rewrite``
    - 'keep-if-no-content-rewrite': kept unless the content is rewritten
    - 'content-length': kept if the value is '0', or if the content is not
      rewritten and the value parses as a non-negative integer
    - 'cookie': result of ``rwinfo.cookie_rewriter.rewrite(value)`` when a
      cookie rewriter is set, otherwise kept
    - anything else (including 'prefix'): name is prefixed with
      ``header_prefix``
    """

    header_rules = {
        'content-type': 'keep',
        'content-disposition': 'keep',
        'content-range': 'keep',
        'accept-ranges': 'keep',
        'www-authenticate': 'keep',
        'proxy-authenticate': 'keep',

        'location': 'url-rewrite',
        'content-location': 'url-rewrite',
        'content-base': 'url-rewrite',

        'transfer-encoding': 'prefix',
        'connection': 'prefix',

        'content-encoding': 'keep-if-no-content-rewrite',
        'content-length': 'content-length',

        'set-cookie': 'cookie',
        'cookie': 'cookie',
    }

    def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
        """
        :param rwinfo: rewrite info object; this class reads its ``record``,
            ``url_rewriter``, ``cookie_rewriter`` and ``is_content_rw``
            attributes and calls ``is_url_rw()``
        :param header_prefix: string prepended to the names of headers that
            fall under the 'prefix' rule
        """
        self.header_prefix = header_prefix
        self.rwinfo = rwinfo
        self.http_headers = rwinfo.record.http_headers

        # unknown headers are hidden behind the prefix only when urls are rewritten
        self.default_rule = 'prefix' if rwinfo.is_url_rw() else 'keep'

    def __call__(self):
        """
        Apply the rules to every header of the record.

        :return: a new StatusAndHeaders with the same status line and
            protocol and the rewritten header list
        """
        new_headers_list = []
        for name, value in self.http_headers.headers:
            rule = self.header_rules.get(name.lower(), self.default_rule)
            new_header = self.rewrite_header(name, value, rule)
            if not new_header:
                # empty result: the header is dropped
                continue

            # a rewrite may yield a list of headers in place of a single tuple
            if isinstance(new_header, list):
                new_headers_list.extend(new_header)
            else:
                new_headers_list.append(new_header)

        return StatusAndHeaders(self.http_headers.statusline,
                                headers=new_headers_list,
                                protocol=self.http_headers.protocol)

    def rewrite_header(self, name, value, rule):
        """
        Rewrite a single header according to ``rule``.

        :param name: header name, with its original casing
        :param value: header value
        :param rule: rule name, see the class docstring
        :return: a ``(name, value)`` tuple, or whatever the cookie rewriter
            returns for the 'cookie' rule
        """
        if rule == 'keep':
            return (name, value)

        elif rule == 'url-rewrite':
            return (name, self.rwinfo.url_rewriter.rewrite(value))

        elif rule == 'keep-if-no-content-rewrite':
            # NOTE(review): is_content_rw is read as an attribute here, while
            # is_url_rw is called as a method in __init__ -- confirm intended
            if not self.rwinfo.is_content_rw:
                return (name, value)

        elif rule == 'content-length':
            if value == '0':
                return (name, value)

            if not self.rwinfo.is_content_rw:
                try:
                    if int(value) >= 0:
                        return (name, value)
                except (TypeError, ValueError):
                    # unparsable length: fall through and prefix the header
                    pass

        elif rule == 'cookie':
            if self.rwinfo.cookie_rewriter:
                return self.rwinfo.cookie_rewriter.rewrite(value)
            else:
                return (name, value)

        # default 'prefix', also reached when a conditional rule did not apply
        return (self.header_prefix + name, value)

    def _add_cache_headers(self, new_headers, http_cache):
        """
        Append caching headers to ``new_headers`` in place.

        :param new_headers: list of ``(name, value)`` tuples to extend
        :param http_cache: cache lifetime in seconds; a value that is not
            an integer, or is <= 0, disables caching
        """
        try:
            age = int(http_cache)
        except (TypeError, ValueError):
            age = 0

        if age <= 0:
            # Cache-Control directives are comma-separated
            new_headers.append(('Cache-Control', 'no-cache, no-store'))
        else:
            dt = datetime.utcnow()
            dt = dt + timedelta(seconds=age)
            new_headers.append(('Cache-Control', 'max-age=' + str(age)))
            new_headers.append(('Expires', datetime_to_http_date(dt)))
|
|
||||||
|
|
||||||
|
|
@ -1,18 +0,0 @@
|
|||||||
[uwsgi]
|
|
||||||
if-not-env = PORT
|
|
||||||
http-socket = :8090
|
|
||||||
endif =
|
|
||||||
|
|
||||||
master = true
|
|
||||||
buffer-size = 65536
|
|
||||||
die-on-term = true
|
|
||||||
|
|
||||||
if-env = VIRTUAL_ENV
|
|
||||||
venv = $(VIRTUAL_ENV)
|
|
||||||
endif =
|
|
||||||
|
|
||||||
gevent = 100
|
|
||||||
|
|
||||||
wsgi = urlrewrite.test.simpleapp
|
|
||||||
|
|
||||||
|
|
@ -1,81 +0,0 @@
|
|||||||
import pkgutil
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
DEFAULT_RULES_FILE = 'pywb/rules.yaml'
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class RuleSet(object):
    """
    Ordered collection of domain specific rules loaded from a yaml rules file.
    """

    DEFAULT_KEY = ''

    def __init__(self, rule_cls, fieldname, **kwargs):
        """
        A domain specific rules block, loaded from a yaml rules file.

        The file is taken from the ``ds_rules_file`` keyword argument,
        falling back to DEFAULT_RULES_FILE when it is not given.

        The file's ``rules`` entry is iterated as a sequence of mappings.
        For every mapping that has a non-empty ``fieldname`` entry, a
        ``rule_cls(url_prefix, rules_def)`` is created and stored, in order.

        If ``default_rule_config`` is passed (not None) and no loaded rule
        used DEFAULT_KEY as its url_prefix, a default rule is appended
        at the end.
        """

        self.rules = []

        default_rule_config = kwargs.get('default_rule_config')

        ds_rules_file = kwargs.get('ds_rules_file') or DEFAULT_RULES_FILE

        config = load_yaml_config(ds_rules_file)

        # an empty config and a config without a 'rules' entry both mean
        # "no rules"; without the fallback a missing key would yield None
        # and the loop below would raise TypeError
        rulesmap = (config.get('rules') if config else None) or []

        def_key_found = False

        # iterate over master rules file
        for value in rulesmap:
            url_prefix = value.get('url_prefix')
            rules_def = value.get(fieldname)
            if not rules_def:
                continue

            if url_prefix == self.DEFAULT_KEY:
                def_key_found = True

            self.rules.append(rule_cls(url_prefix, rules_def))

        # if default_rule_config provided, always init a default ruleset
        if not def_key_found and default_rule_config is not None:
            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))

    def iter_matching(self, urlkey):
        """
        Iterate over all matching rules for given urlkey
        """
        for rule in self.rules:
            if rule.applies(urlkey):
                yield rule

    def get_first_match(self, urlkey):
        """
        Return the first rule that applies to urlkey, or None if none does
        """
        return next(self.iter_matching(urlkey), None)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class BaseRule(object):
    """
    Common base for rules keyed by one or more url prefixes.

    Subclasses add the handling of the actual rule definition; this
    class only stores the prefixes and matches url keys against them.
    """
    def __init__(self, url_prefix, rules):
        # always hold the prefixes as a list, wrapping a single value
        if isinstance(url_prefix, list):
            self.url_prefix = url_prefix
        else:
            self.url_prefix = [url_prefix]

    def applies(self, urlkey):
        """Return True if urlkey starts with at least one stored prefix."""
        for prefix in self.url_prefix:
            if urlkey.startswith(prefix):
                return True
        return False
|
|
@ -52,43 +52,6 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
|
|||||||
|
|
||||||
>>> extract_client_cookie({}, 'y')
|
>>> extract_client_cookie({}, 'y')
|
||||||
|
|
||||||
# append_post_query
|
|
||||||
>>> append_post_query('http://example.com/?abc=def', 'foo=bar')
|
|
||||||
'http://example.com/?abc=def&foo=bar'
|
|
||||||
|
|
||||||
>>> append_post_query('http://example.com/', '')
|
|
||||||
'http://example.com/'
|
|
||||||
|
|
||||||
>>> append_post_query('http://example.com/', 'foo=bar')
|
|
||||||
'http://example.com/?foo=bar'
|
|
||||||
|
|
||||||
# extract_post_query tests
|
|
||||||
|
|
||||||
# correct POST data
|
|
||||||
>>> post_data = b'foo=bar&dir=%2Fbaz'
|
|
||||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
|
||||||
'foo=bar&dir=/baz'
|
|
||||||
|
|
||||||
# unsupported method
|
|
||||||
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
|
||||||
|
|
||||||
# base64 encode
|
|
||||||
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
|
|
||||||
'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
|
||||||
|
|
||||||
# invalid length
|
|
||||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
|
|
||||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data))
|
|
||||||
|
|
||||||
# length too short
|
|
||||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data))
|
|
||||||
'foo=bar&dir=%2'
|
|
||||||
|
|
||||||
# length too long
|
|
||||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
|
|
||||||
'foo=bar&dir=/baz'
|
|
||||||
|
|
||||||
|
|
||||||
# test read_last_line
|
# test read_last_line
|
||||||
>>> print_str(read_last_line(BytesIO(b'A\nB\nC')))
|
>>> print_str(read_last_line(BytesIO(b'A\nB\nC')))
|
||||||
'C'
|
'C'
|
||||||
@ -119,8 +82,8 @@ from io import BytesIO
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||||
from pywb.utils.loaders import extract_client_cookie, extract_post_query
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
from pywb.utils.loaders import append_post_query, read_last_line
|
from pywb.utils.loaders import read_last_line
|
||||||
|
|
||||||
from warcio.bufferedreaders import DecompressingBufferedReader
|
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||||
|
|
||||||
|
@ -86,10 +86,11 @@ class DirectWSGIInputRequest(object):
|
|||||||
buffered_stream=buffered_stream,
|
buffered_stream=buffered_stream,
|
||||||
environ=self.env)
|
environ=self.env)
|
||||||
|
|
||||||
if post_query.append_post_query(url) != url:
|
new_url = post_query.append_post_query(url)
|
||||||
|
if new_url != url:
|
||||||
self.env['wsgi.input'] = buffered_stream
|
self.env['wsgi.input'] = buffered_stream
|
||||||
|
|
||||||
return url
|
return new_url
|
||||||
|
|
||||||
def get_full_request_uri(self):
|
def get_full_request_uri(self):
|
||||||
req_uri = self.env.get('REQUEST_URI')
|
req_uri = self.env.get('REQUEST_URI')
|
||||||
@ -246,7 +247,7 @@ class PostQueryExtractor(object):
|
|||||||
else:
|
else:
|
||||||
post_query = base64.b64encode(post_query)
|
post_query = base64.b64encode(post_query)
|
||||||
post_query = to_native_str(post_query)
|
post_query = to_native_str(post_query)
|
||||||
post_query = '&__wb_post_data=' + post_query
|
post_query = '__wb_post_data=' + post_query
|
||||||
|
|
||||||
self.post_query = post_query
|
self.post_query = post_query
|
||||||
|
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, PostQueryExtractor
|
||||||
from werkzeug.routing import Map, Rule
|
from werkzeug.routing import Map, Rule
|
||||||
|
|
||||||
import webtest
|
import webtest
|
||||||
import traceback
|
import traceback
|
||||||
from six.moves.urllib.parse import parse_qsl
|
from six.moves.urllib.parse import parse_qsl
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -76,3 +77,61 @@ Foo: Bar\r\n\
|
|||||||
\r\n\
|
\r\n\
|
||||||
'
|
'
|
||||||
|
|
||||||
|
|
||||||
|
class TestPostQueryExtract(object):
    """Checks PostQueryExtractor.append_post_query() for several methods, content types and lengths."""

    FORM_TYPE = 'application/x-www-form-urlencoded'

    @classmethod
    def setup_class(cls):
        cls.post_data = b'foo=bar&dir=%2Fbaz'

    def _extractor(self, method, content_type, length):
        # every test feeds the same payload through a fresh stream
        return PostQueryExtractor(method, content_type, length,
                                  BytesIO(self.post_data))

    def test_post_extract_1(self):
        pq = self._extractor('POST', self.FORM_TYPE, len(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'

        assert pq.append_post_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'

    def test_post_extract_wrong_method(self):
        pq = self._extractor('PUT', self.FORM_TYPE, len(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

    def test_post_extract_non_form_data_1(self):
        pq = self._extractor('POST', 'application/octet-stream', len(self.post_data))

        # payload is appended base64 encoded
        assert pq.append_post_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'

    def test_post_extract_non_form_data_2(self):
        pq = self._extractor('POST', 'text/plain', len(self.post_data))

        # payload is appended base64 encoded
        assert pq.append_post_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'

    def test_post_extract_length_invalid_ignore(self):
        # a zero length leaves the url unchanged
        pq = self._extractor('POST', self.FORM_TYPE, 0)

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

        # so does a non-numeric length
        pq = self._extractor('POST', self.FORM_TYPE, 'abc')

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

    def test_post_extract_length_too_short(self):
        pq = self._extractor('POST', self.FORM_TYPE, len(self.post_data) - 4)

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'

    def test_post_extract_length_too_long(self):
        pq = self._extractor('POST', self.FORM_TYPE, len(self.post_data) + 4)

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,9 +3,9 @@ from gevent import monkey; monkey.patch_all(thread=False)
|
|||||||
import pytest
|
import pytest
|
||||||
import webtest
|
import webtest
|
||||||
|
|
||||||
from pywb.webagg.test.testutils import BaseTestClass
|
from pywb.warcserver.test.testutils import BaseTestClass
|
||||||
|
|
||||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
from pywb.apps.frontendapp import FrontEndApp
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
@ -17,16 +17,16 @@ from pytest import raises
|
|||||||
from mock import patch
|
from mock import patch
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
from pywb.webagg.test.testutils import TempDirTests, BaseTestClass
|
from pywb.warcserver.test.testutils import TempDirTests, BaseTestClass
|
||||||
|
|
||||||
from pywb.manager.manager import main
|
from pywb.manager.manager import main
|
||||||
|
|
||||||
import pywb.manager.autoindex
|
import pywb.manager.autoindex
|
||||||
|
|
||||||
from pywb.warc.cdxindexer import main as cdxindexer_main
|
from pywb.indexer.cdxindexer import main as cdxindexer_main
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
from pywb.urlrewrite.frontendapp import FrontEndApp
|
from pywb.apps.frontendapp import FrontEndApp
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
|
@ -8,10 +8,10 @@ import webtest
|
|||||||
|
|
||||||
from six.moves.urllib.parse import urlencode
|
from six.moves.urllib.parse import urlencode
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
from pywb.webagg.test.testutils import BaseTestClass
|
from pywb.warcserver.test.testutils import BaseTestClass
|
||||||
from pywb.webagg.autoapp import AutoConfigApp
|
from pywb.warcserver.warcserver import WarcServer
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -20,7 +20,7 @@ class TestCDXApp(BaseTestClass):
|
|||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
super(TestCDXApp, cls).setup_class()
|
super(TestCDXApp, cls).setup_class()
|
||||||
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config_test.yaml')
|
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config_test.yaml')
|
||||||
cls.testapp = webtest.TestApp(AutoConfigApp(config_file=config_file))
|
cls.testapp = webtest.TestApp(WarcServer(config_file=config_file))
|
||||||
|
|
||||||
def query(self, url, is_error=False, **params):
|
def query(self, url, is_error=False, **params):
|
||||||
params['url'] = url
|
params['url'] = url
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from .base_config_test import BaseConfigTest, fmod
|
from .base_config_test import BaseConfigTest, fmod
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
Loading…
x
Reference in New Issue
Block a user