diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 952451ef..9ff4a2db 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -6,7 +6,7 @@ import logging #============================================================================= def webagg(args=None): - WebaggCli(args=args, + WarcServerCli(args=args, default_port=8070, desc='pywb Web Aggregator Server').run() @@ -103,18 +103,18 @@ class ReplayCli(BaseCli): #============================================================================= -class WebaggCli(BaseCli): +class WarcServerCli(BaseCli): def load(self): - from pywb.webagg.autoapp import AutoConfigApp + from pywb.warcserver.warcserver import WarcServer - super(WebaggCli, self).load() - return AutoConfigApp(custom_config=self.extra_config) + super(WarcServerCli, self).load() + return WarcServer(custom_config=self.extra_config) #============================================================================= class WaybackCli(ReplayCli): def load(self): - from pywb.urlrewrite.frontendapp import FrontEndApp + from pywb.apps.frontendapp import FrontEndApp super(WaybackCli, self).load() return FrontEndApp(custom_config=self.extra_config) @@ -123,7 +123,7 @@ class WaybackCli(ReplayCli): #============================================================================= class LiveCli(BaseCli): def load(self): - from pywb.urlrewrite.frontendapp import FrontEndApp + from pywb.apps.frontendapp import FrontEndApp self.r.live = True diff --git a/pywb/urlrewrite/frontendapp.py b/pywb/apps/frontendapp.py similarity index 81% rename from pywb/urlrewrite/frontendapp.py rename to pywb/apps/frontendapp.py index 7215534c..5987aa12 100644 --- a/pywb/urlrewrite/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -8,16 +8,15 @@ from six.moves.urllib.parse import urljoin from six import iteritems from pywb.utils.loaders import load_yaml_config, to_native_str +from pywb.utils.geventserver import GeventServer -from pywb.webagg.autoapp import AutoConfigApp -from pywb.webapp.handlers import StaticHandler +from pywb.warcserver.warcserver import WarcServer -from pywb.framework.wbrequestresponse import WbResponse +from pywb.rewrite.templateview import BaseInsertView -from pywb.urlrewrite.geventserver import GeventServer -from pywb.urlrewrite.templateview import BaseInsertView - -from pywb.urlrewrite.rewriterapp import RewriterApp, UpstreamException +from pywb.apps.static_handler import StaticHandler +from pywb.apps.rewriterapp import RewriterApp, UpstreamException +from pywb.apps.wbrequestresponse import WbResponse import os import traceback @@ -27,14 +26,14 @@ import traceback class FrontEndApp(object): def __init__(self, config_file='./config.yaml', custom_config=None): self.debug = True - self.webagg = AutoConfigApp(config_file=config_file, - custom_config=custom_config) + self.warcserver = WarcServer(config_file=config_file, + custom_config=custom_config) - framed_replay = self.webagg.config.get('framed_replay', True) + framed_replay = self.warcserver.config.get('framed_replay', True) - self.rewriterapp = RewriterApp(framed_replay, config=self.webagg.config) + self.rewriterapp = RewriterApp(framed_replay, config=self.warcserver.config) - self.webagg_server = GeventServer(self.webagg, port=0) + self.warcserver_server = GeventServer(self.warcserver, port=0) self.static_handler = StaticHandler('pywb/static/') @@ -46,12 +45,12 @@ class FrontEndApp(object): self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing)) self.url_map.add(Rule('/', endpoint=self.serve_home)) - self.rewriterapp.paths = self.get_upstream_paths(self.webagg_server.port) + self.rewriterapp.paths = self.get_upstream_paths(self.warcserver_server.port) - self.templates_dir = self.webagg.config.get('templates_dir', 'templates') - self.static_dir = self.webagg.config.get('static_dir', 'static') + self.templates_dir = self.warcserver.config.get('templates_dir', 'templates') + self.static_dir = self.warcserver.config.get('static_dir', 'static') - metadata_templ = os.path.join(self.webagg.root_dir, '{coll}', 'metadata.yaml') + metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml') self.metadata_cache = MetadataCache(metadata_templ) def get_upstream_paths(self, port): @@ -61,8 +60,8 @@ class FrontEndApp(object): def serve_home(self, environ): home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html') - fixed_routes = self.webagg.list_fixed_routes() - dynamic_routes = self.webagg.list_dynamic_routes() + fixed_routes = self.warcserver.list_fixed_routes() + dynamic_routes = self.warcserver.list_dynamic_routes() routes = fixed_routes + dynamic_routes @@ -76,7 +75,7 @@ class FrontEndApp(object): def serve_static(self, environ, coll='', filepath=''): if coll: - path = os.path.join(self.webagg.root_dir, coll, self.static_dir) + path = os.path.join(self.warcserver.root_dir, coll, self.static_dir) else: path = self.static_dir @@ -116,7 +115,7 @@ class FrontEndApp(object): kwargs = {'coll': coll} - if coll in self.webagg.list_fixed_routes(): + if coll in self.warcserver.list_fixed_routes(): kwargs['type'] = 'replay-fixed' else: kwargs['type'] = 'replay-dyn' @@ -131,23 +130,23 @@ class FrontEndApp(object): def setup_paths(self, environ, coll): pop_path_info(environ) - if not coll or not self.webagg.root_dir: + if not coll or not self.warcserver.root_dir: return - environ['pywb.templates_dir'] = os.path.join(self.webagg.root_dir, + environ['pywb.templates_dir'] = os.path.join(self.warcserver.root_dir, coll, self.templates_dir) def serve_listing(self, environ): - result = {'fixed': self.webagg.list_fixed_routes(), - 'dynamic': self.webagg.list_dynamic_routes() + result = {'fixed': self.warcserver.list_fixed_routes(), + 'dynamic': self.warcserver.list_dynamic_routes() } return WbResponse.json_response(result) def is_valid_coll(self, coll): - return (coll in self.webagg.list_fixed_routes() or - coll in self.webagg.list_dynamic_routes()) + return (coll in self.warcserver.list_fixed_routes() or + coll in self.warcserver.list_dynamic_routes()) def raise_not_found(self, environ, msg): raise NotFound(response=self.rewriterapp._error_response(environ, msg)) diff --git a/pywb/apps/live.py b/pywb/apps/live.py index acad0ce6..05acbaed 100644 --- a/pywb/apps/live.py +++ b/pywb/apps/live.py @@ -1,5 +1,5 @@ from gevent.monkey import patch_all; patch_all() -from pywb.urlrewrite.frontendapp import FrontEndApp +from pywb.apps.frontendapp import FrontEndApp application = FrontEndApp(config_file=None, custom_config={'collections': {'live': '$live'}}) diff --git a/pywb/urlrewrite/rewriterapp.py b/pywb/apps/rewriterapp.py similarity index 97% rename from pywb/urlrewrite/rewriterapp.py rename to pywb/apps/rewriterapp.py index a8687c5e..a4741f62 100644 --- a/pywb/urlrewrite/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -1,9 +1,12 @@ import requests +from werkzeug.http import HTTP_STATUS_CODES +from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit + #from pywb.rewrite.rewrite_amf import RewriteAMFMixin #from pywb.rewrite.rewrite_dash import RewriteDASHMixin #from pywb.rewrite.rewrite_content import RewriteContent -from pywb.urlrewrite.rewriter import DefaultRewriter +from pywb.rewrite.default_rewriter import DefaultRewriter from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter @@ -16,18 +19,14 @@ from warcio.timeutils import http_date_to_timestamp from warcio.bufferedreaders import BufferedReader from warcio.recordloader import ArcWarcRecordLoader -from pywb.webagg.utils import BUFF_SIZE +from pywb.warcserver.index.cdxobject import CDXObject +from pywb.apps.wbrequestresponse import WbResponse -from pywb.cdx.cdxobject import CDXObject -from pywb.framework.wbrequestresponse import WbResponse +from pywb.warcserver.utils import BUFF_SIZE +from pywb.warcserver.utils import MementoUtils -from pywb.webagg.utils import MementoUtils - -from werkzeug.http import HTTP_STATUS_CODES -from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit - -from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest -from pywb.urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView +from pywb.rewrite.rewriteinputreq import RewriteInputRequest +from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView from io import BytesIO @@ -71,7 +70,7 @@ class RewriterApp(object): #frame_type = 'inverse' if framed_replay else False #self.content_rewriter = Rewriter(is_framed_replay=frame_type) - self.content_rw = DefaultRewriter('pkg://pywb/rules.yaml', self.replay_mod) + self.content_rw = DefaultRewriter(replay_mod=self.replay_mod) if not jinja_env: jinja_env = JinjaEnv(globals={'static_path': 'static'}) diff --git a/pywb/webapp/handlers.py b/pywb/apps/static_handler.py similarity index 96% rename from pywb/webapp/handlers.py rename to pywb/apps/static_handler.py index 9a61fcc2..26fdf976 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/apps/static_handler.py @@ -3,7 +3,7 @@ import os from pywb.utils.loaders import LocalFileLoader -from pywb.framework.wbrequestresponse import WbResponse +from pywb.apps.wbrequestresponse import WbResponse #================================================================= diff --git a/pywb/urlrewrite/test/test_rewriter.py b/pywb/apps/test/test_rewriter.py similarity index 80% rename from pywb/urlrewrite/test/test_rewriter.py rename to pywb/apps/test/test_rewriter.py index 2b723674..834dc161 100644 --- a/pywb/urlrewrite/test/test_rewriter.py +++ b/pywb/apps/test/test_rewriter.py @@ -1,9 +1,9 @@ from gevent import monkey; monkey.patch_all(thread=False) -from pywb.webagg.test.testutils import LiveServerTests, BaseTestClass -from pywb.webagg.test.testutils import FakeRedisTests +from pywb.warcserver.test.testutils import LiveServerTests, BaseTestClass +from pywb.warcserver.test.testutils import FakeRedisTests -from pywb.urlrewrite.frontendapp import FrontEndApp +from pywb.apps.frontendapp import FrontEndApp import os import webtest @@ -12,10 +12,10 @@ import webtest LIVE_CONFIG = {'collections': {'live': '$live'}} -class TestRewriter(FakeRedisTests, BaseTestClass): +class TestRewriterApp(FakeRedisTests, BaseTestClass): @classmethod def setup_class(cls): - super(TestRewriter, cls).setup_class() + super(TestRewriterApp, cls).setup_class() #cls.app = RWApp.create_app(replay_port=cls.server.port) #cls.testapp = webtest.TestApp(cls.app.app) diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/apps/test/test_wbrequestresponse.py similarity index 96% rename from pywb/framework/test/test_wbrequestresponse.py rename to pywb/apps/test/test_wbrequestresponse.py index 4937f8f1..44dc2468 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/apps/test/test_wbrequestresponse.py @@ -1,4 +1,4 @@ -from pywb.framework.wbrequestresponse import WbResponse +from pywb.apps.wbrequestresponse import WbResponse from warcio.statusandheaders import StatusAndHeaders diff --git a/pywb/apps/warcserverapp.py b/pywb/apps/warcserverapp.py new file mode 100644 index 00000000..35374cc6 --- /dev/null +++ b/pywb/apps/warcserverapp.py @@ -0,0 +1,7 @@ +from gevent.monkey import patch_all; patch_all() +from pywb.warcserver.warcserver import WarcServer + +application = WarcServer(custom_config={'collections': {'live': '$live'}}) + + + diff --git a/pywb/apps/wayback.py b/pywb/apps/wayback.py index 2febc1f4..d767f56a 100644 --- a/pywb/apps/wayback.py +++ b/pywb/apps/wayback.py @@ -1,5 +1,5 @@ from gevent.monkey import patch_all; patch_all() -from pywb.urlrewrite.frontendapp import FrontEndApp +from pywb.apps.frontendapp import FrontEndApp application = FrontEndApp() diff --git a/pywb/framework/wbrequestresponse.py b/pywb/apps/wbrequestresponse.py similarity index 100% rename from pywb/framework/wbrequestresponse.py rename to pywb/apps/wbrequestresponse.py diff --git a/pywb/apps/webagg.py b/pywb/apps/webagg.py deleted file mode 100644 index f393533a..00000000 --- a/pywb/apps/webagg.py +++ /dev/null @@ -1,7 +0,0 @@ -from gevent.monkey import patch_all; patch_all() -from pywb.webagg.autoapp import AutoConfigApp - -application = AutoConfigApp(custom_config={'collections': {'live': '$live'}}) - - - diff --git a/pywb/framework/__init__.py b/pywb/framework/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index ebc18f5f..d2ea744f 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -122,7 +122,7 @@ directory structure expected by pywb self._cdx_index(cdx_file, [self.archive_dir]) def _cdx_index(self, out, input_, rel_root=None): - from pywb.warc.cdxindexer import write_multi_cdx_index + from pywb.indexer.cdxindexer import write_multi_cdx_index options = dict(append_post=True, cdxj=True, diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 8e61ea60..1a364ab9 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -9,7 +9,7 @@ import re import webencodings import tempfile -from pywb.webagg.utils import StreamIter, BUFF_SIZE +from pywb.warcserver.utils import StreamIter, BUFF_SIZE from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter from pywb.utils.loaders import load_yaml_config @@ -277,8 +277,9 @@ class RewriteInfo(object): self.cookie_rewriter = cookie_rewriter - self._fill_text_type_and_charset() - self._resolve_text_type() + if self.record: + self._fill_text_type_and_charset() + self._resolve_text_type() def _fill_text_type_and_charset(self): content_type = self.record.http_headers.get_header('Content-Type') diff --git a/pywb/urlrewrite/cookies.py b/pywb/rewrite/cookies.py similarity index 100% rename from pywb/urlrewrite/cookies.py rename to pywb/rewrite/cookies.py diff --git a/pywb/urlrewrite/rewriter.py b/pywb/rewrite/default_rewriter.py similarity index 89% rename from pywb/urlrewrite/rewriter.py rename to pywb/rewrite/default_rewriter.py index c8de6fc7..3496c401 100644 --- a/pywb/urlrewrite/rewriter.py +++ b/pywb/rewrite/default_rewriter.py @@ -7,7 +7,7 @@ from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter -from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter +from pywb.rewrite.header_rewriter import PrefixHeaderRewriter from pywb.rewrite.jsonp_rewriter import JSONPRewriter @@ -75,6 +75,10 @@ class DefaultRewriter(BaseContentRewriter): 'text/plain': 'plain', } + def __init__(self, rules_file=None, replay_mod=''): + rules_file = rules_file or 'pkg://pywb/rules.yaml' + super(DefaultRewriter, self).__init__(rules_file, replay_mod) + def init_js_regex(self, regexs): return RegexRewriter.parse_rules_from_config(regexs) diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index f5656170..394ea0bc 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -1,102 +1,87 @@ from warcio.statusandheaders import StatusAndHeaders from warcio.timeutils import datetime_to_http_date from datetime import datetime, timedelta -import six -#================================================================= -class RewrittenStatusAndHeaders(object): - def __init__(self, statusline, headers, - removed_header_dict, text_type, charset): +#============================================================================= +class PrefixHeaderRewriter(object): + header_rules = { + 'content-type': 'keep', + 'content-disposition': 'keep', + 'content-range': 'keep', + 'accept-rangees': 'keep', + 'www-authenticate': 'keep', + 'proxy-authenticate': 'keep', - self.status_headers = StatusAndHeaders(statusline, headers) - self.removed_header_dict = removed_header_dict - self.text_type = text_type - self.charset = charset + 'location': 'url-rewrite', + 'content-location': 'url-rewrite', + 'content-base': 'url-rewrite', - def contains_removed_header(self, name, value): - return self.removed_header_dict.get(name) == value + 'transfer-encoding': 'prefix', + 'connection': 'prefix', - def readd_rewrite_removed(self): - for name in HeaderRewriter.KEEP_NO_REWRITE_HEADERS: - value = self.removed_header_dict.get(name) - if value is not None: - self.status_headers.headers.append((name, value)) + 'content-encoding': 'keep-if-no-content-rewrite', + 'content-length': 'content-length', - -#================================================================= -class HeaderRewriter(object): - REWRITE_TYPES = { - 'html': ['text/html', - 'application/xhtml', - 'application/xhtml+xml'], - - 'css': ['text/css'], - - 'js': ['text/javascript', - 'application/javascript', - 'application/x-javascript'], - - 'json': ['application/json'], - - 'hls': ['application/x-mpegURL'], - - 'xml': ['/xml', '+xml', '.xml', '.rss'], - - 'plain': ['text/plain'], + 'set-cookie': 'cookie', + 'cookie': 'cookie', } - PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range', - 'accept-ranges', 'www-authenticate', 'proxy-authenticate'] - - URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base'] - - REMOVE_ALWAYS_HEADERS = ['transfer-encoding'] - - KEEP_PROXY_HEADERS = ['content-security-policy', 'strict-transport-security'] - - KEEP_NO_REWRITE_HEADERS = ['content-length', 'content-encoding'] - - COOKIE_HEADERS = ['set-cookie', 'cookie'] - - CACHE_HEADERS = ['cache-control', 'expires', 'etag', 'last-modified'] - - - def __init__(self, header_prefix='X-Archive-Orig-'): + def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'): self.header_prefix = header_prefix + self.rwinfo = rwinfo + self.http_headers = rwinfo.record.http_headers - def rewrite(self, status_headers, urlrewriter, cookie_rewriter): - content_type = status_headers.get_header('Content-Type') - text_type = None - charset = None - content_modified = False - http_cache = None - if urlrewriter: - http_cache = urlrewriter.rewrite_opts.get('http_cache') + if rwinfo.is_url_rw(): + self.default_rule = 'prefix' + else: + self.default_rule = 'keep' - if content_type: - text_type = self._extract_text_type(content_type) - if text_type: - charset = self._extract_char_set(content_type) - content_modified = True + def __call__(self): + new_headers_list = [] + for name, value in self.http_headers.headers: + rule = self.header_rules.get(name.lower(), self.default_rule) + new_header = self.rewrite_header(name, value, rule) + if new_header: + if isinstance(new_header, list): + new_headers_list.extend(new_header) + else: + new_headers_list.append(new_header) - result = self._rewrite_headers(status_headers.headers, - urlrewriter, - cookie_rewriter, - content_modified, - http_cache) + return StatusAndHeaders(self.http_headers.statusline, + headers=new_headers_list, + protocol=self.http_headers.protocol) - new_headers = result[0] - removed_header_dict = result[1] + def rewrite_header(self, name, value, rule): + if rule == 'keep': + return (name, value) - if http_cache != None and http_cache != 'pass': - self._add_cache_headers(new_headers, http_cache) + elif rule == 'url-rewrite': + return (name, self.rwinfo.url_rewriter.rewrite(value)) - return RewrittenStatusAndHeaders(status_headers.statusline, - new_headers, - removed_header_dict, - text_type, - charset) + elif rule == 'keep-if-no-content-rewrite': + if not self.rwinfo.is_content_rw: + return (name, value) + + elif rule == 'content-length': + if value == '0': + return (name, value) + + if not self.rwinfo.is_content_rw: + try: + if int(value) >= 0: + return (name, value) + except: + pass + + elif rule == 'cookie': + if self.rwinfo.cookie_rewriter: + return self.rwinfo.cookie_rewriter.rewrite(value) + else: + return (name, value) + + # default 'prefix' + return (self.header_prefix + name, value) def _add_cache_headers(self, new_headers, http_cache): try: @@ -112,76 +97,4 @@ class HeaderRewriter(object): new_headers.append(('Cache-Control', 'max-age=' + str(age))) new_headers.append(('Expires', datetime_to_http_date(dt))) - def _extract_text_type(self, content_type): - for ctype, mimelist in six.iteritems(self.REWRITE_TYPES): - if any((mime in content_type) for mime in mimelist): - return ctype - return None - - def _extract_char_set(self, content_type): - CHARSET_TOKEN = 'charset=' - idx = content_type.find(CHARSET_TOKEN) - if idx < 0: - return None - - return content_type[idx + len(CHARSET_TOKEN):].lower() - - def _rewrite_headers(self, headers, urlrewriter, - cookie_rewriter, - content_modified, - http_cache): - - new_headers = [] - removed_header_dict = {} - - def add_header(name, value): - new_headers.append((name, value)) - - def add_prefixed_header(name, value): - new_headers.append((self.header_prefix + name, value)) - - for (name, value) in headers: - lowername = name.lower() - - if lowername in self.PROXY_HEADERS: - add_header(name, value) - - elif urlrewriter and urlrewriter.prefix and lowername in self.URL_REWRITE_HEADERS: - new_headers.append((name, urlrewriter.rewrite(value))) - - elif lowername in self.KEEP_NO_REWRITE_HEADERS: - if content_modified and value != '0': - removed_header_dict[lowername] = value - add_prefixed_header(name, value) - else: - add_header(name, value) - - elif lowername in self.KEEP_PROXY_HEADERS: - if urlrewriter.prefix: - removed_header_dict[lowername] = value - add_prefixed_header(name, value) - else: - add_header(name, value) - - elif lowername in self.REMOVE_ALWAYS_HEADERS: - removed_header_dict[lowername] = value - add_prefixed_header(name, value) - - elif (lowername in self.COOKIE_HEADERS and - cookie_rewriter): - cookie_list = cookie_rewriter.rewrite(value) - new_headers.extend(cookie_list) - - elif (lowername in self.CACHE_HEADERS): - if http_cache == 'pass': - add_header(name, value) - else: - add_prefixed_header(name, value) - - elif urlrewriter and urlrewriter.prefix: - add_prefixed_header(name, value) - else: - add_header(name, value) - - return (new_headers, removed_header_dict) diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py deleted file mode 100644 index de239644..00000000 --- a/pywb/rewrite/rewrite_content.py +++ /dev/null @@ -1,403 +0,0 @@ -#import chardet -import pkgutil -import webencodings -import yaml -import re - -#from chardet.universaldetector import UniversalDetector -from io import BytesIO - -from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders - -from pywb.rewrite.rewriterules import RewriteRules - -from pywb.utils.dsrules import RuleSet - -from warcio.statusandheaders import StatusAndHeaders -from warcio.bufferedreaders import DecompressingBufferedReader -from warcio.bufferedreaders import ChunkedDataReader, BufferedReader -from warcio.utils import to_native_str - -from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter - - -#================================================================= -class RewriteContent(object): - HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I) - - TAG_REGEX = re.compile(b'^\s*\<') - - CHARSET_REGEX = re.compile(b']*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)') - - BUFF_SIZE = 16384 - - def __init__(self, ds_rules_file=None, is_framed_replay=False): - self.ruleset = RuleSet(RewriteRules, 'rewrite', - default_rule_config={}, - ds_rules_file=ds_rules_file) - - if is_framed_replay == 'inverse': - self.defmod = 'mp_' - else: - self.defmod = '' - - def sanitize_content(self, status_headers, stream): - # remove transfer encoding chunked and wrap in a dechunking stream - if (status_headers.remove_header('transfer-encoding')): - stream = ChunkedDataReader(stream) - - return (status_headers, stream) - - def _rewrite_headers(self, urlrewriter, rule, status_headers, stream, - urlkey='', cookie_rewriter=None): - - header_rewriter_class = rule.rewriters['header'] - - if urlrewriter and not cookie_rewriter: - cookie_rewriter = urlrewriter.get_cookie_rewriter(rule) - - rewritten_headers = (header_rewriter_class(). - rewrite(status_headers, - urlrewriter, - cookie_rewriter)) - - # note: since chunk encoding may/may not be valid, - # the approach taken here is to *always* attempt - # to dechunk if 'transfer-encoding: chunked' is present - # - # an alternative may be to serve chunked unless - # content rewriting is needed - # todo: possible revisit this approach - - if (rewritten_headers. - contains_removed_header('transfer-encoding', 'chunked')): - - stream = ChunkedDataReader(stream) - - return (rewritten_headers, stream) - - def _decoding_stream(self, rewritten_headers, stream): - for decomp_type in BufferedReader.get_supported_decompressors(): - matched, stream = self._check_encoding(rewritten_headers, - stream, - decomp_type) - if matched: - break - - return stream - - def _check_encoding(self, rewritten_headers, stream, enc): - matched = False - if (rewritten_headers. - contains_removed_header('content-encoding', enc)): - - #optimize: if already a ChunkedDataReader, add the encoding - if isinstance(stream, ChunkedDataReader): - stream.set_decomp(enc) - else: - stream = DecompressingBufferedReader(stream, decomp_type=enc) - - rewritten_headers.status_headers.remove_header('content-length') - matched = True - - return matched, stream - - - - def rewrite_content(self, urlrewriter, status_headers, stream, - head_insert_func=None, urlkey='', - cdx=None, cookie_rewriter=None, env=None): - - wb_url = urlrewriter.wburl - - if (wb_url.is_identity or - (not head_insert_func and wb_url.is_banner_only)): - status_headers, stream = self.sanitize_content(status_headers, - stream) - return (status_headers, self.stream_to_gen(stream), False) - - if urlrewriter and cdx and cdx.get('is_live'): - urlrewriter.rewrite_opts['is_live'] = True - - rule = self.ruleset.get_first_match(urlkey) - - (rewritten_headers, stream) = self._rewrite_headers(urlrewriter, - rule, - status_headers, - stream, - urlkey, - cookie_rewriter) - - res = self.handle_custom_rewrite(rewritten_headers, - stream, - urlrewriter, - wb_url.mod, - env) - if res: - return res - - # Handle text content rewriting - # ==================================================================== - # special case -- need to ungzip the body - - status_headers = rewritten_headers.status_headers - text_type = rewritten_headers.text_type - - # see known js/css modifier specified, the context should run - # default text_type - mod = wb_url.mod - - stream_raw = False - encoding = None - first_buff = b'' - - stream = self._decoding_stream(rewritten_headers, stream) - - if mod == 'js_': - text_type, stream = self._resolve_text_type('js', - text_type, - stream) - elif mod == 'cs_': - text_type, stream = self._resolve_text_type('css', - text_type, - stream) - - # for proxy mode: use special js_proxy rewriter - # which may be none rewriter + custom rules (if any) - if text_type == 'js' and not urlrewriter.prefix: - rewriter_class = rule.rewriters['js_proxy'] - else: - rewriter_class = rule.rewriters[text_type] - - # for html, need to perform header insert, supply js, css, xml - # rewriters - if text_type == 'html': - head_insert_str = '' - charset = rewritten_headers.charset - - # if no charset set, attempt to extract from first 1024 - if not rewritten_headers.charset: - first_buff = stream.read(1024) - charset = self._extract_html_charset(first_buff, - status_headers) - - if head_insert_func and not wb_url.is_url_rewrite_only: - head_insert_orig = head_insert_func(rule, cdx) - - if charset: - try: - head_insert_str = webencodings.encode(head_insert_orig, charset) - except: - pass - - if not head_insert_str: - charset = 'utf-8' - head_insert_str = head_insert_orig.encode(charset) - - head_insert_buf = head_insert_str - #head_insert_str = to_native_str(head_insert_str) - head_insert_str = head_insert_str.decode('iso-8859-1') - - - if wb_url.is_banner_only: - gen = self._head_insert_only_gen(head_insert_buf, - stream, - first_buff) - - content_len = status_headers.get_header('Content-Length') - try: - content_len = int(content_len) - except Exception: - content_len = None - - if content_len is not None and content_len >= 0: - content_len = str(content_len + len(head_insert_str)) - status_headers.replace_header('Content-Length', - content_len) - - return (status_headers, gen, False) - - # if proxy, use js_proxy rewriter - if not urlrewriter.prefix: - js_rewriter_class = rule.rewriters['js_proxy'] - else: - js_rewriter_class = rule.rewriters['js'] - - css_rewriter_class = rule.rewriters['css'] - - if wb_url.is_url_rewrite_only: - js_rewriter_class = JSNoneRewriter - - rewriter = rewriter_class(urlrewriter, - js_rewriter_class=js_rewriter_class, - css_rewriter_class=css_rewriter_class, - head_insert=head_insert_str, - url=wb_url.url, - defmod=self.defmod, - parse_comments=rule.parse_comments) - - else: - if wb_url.is_banner_only: - return (status_headers, self.stream_to_gen(stream), False) - - # url-only rewriter, but not rewriting urls in JS, so return - if wb_url.is_url_rewrite_only and text_type == 'js': - #return (status_headers, self.stream_to_gen(stream), False) - rewriter_class = JSLinkOnlyRewriter - - # apply one of (js, css, xml) rewriters - rewriter = rewriter_class(urlrewriter) - - - # align to line end for all non-html rewriting - align = (text_type != 'html') - - # Create rewriting generator - gen = self.rewrite_text_stream_to_gen(stream, - rewrite_func=rewriter.rewrite, - final_read_func=rewriter.close, - first_buff=first_buff, - align_to_line=align) - - return (status_headers, gen, True) - - def handle_custom_rewrite(self, rewritten_headers, stream, - urlrewriter, mod, env): - - text_type = rewritten_headers.text_type - status_headers = rewritten_headers.status_headers - - # use rewritten headers, but no further rewriting needed - if text_type is None: - return (status_headers, self.stream_to_gen(stream), False) - - if ((text_type == 'html' and urlrewriter.rewrite_opts.get('is_ajax')) or - (text_type == 'plain' and not mod in ('js_', 'cs_'))): - rewritten_headers.readd_rewrite_removed() - return (status_headers, self.stream_to_gen(stream), False) - - @staticmethod - def _extract_html_charset(buff, status_headers): - charset = None - m = RewriteContent.CHARSET_REGEX.search(buff) - if m: - charset = m.group(1) - charset = to_native_str(charset) - # content_type = 'text/html; charset=' + charset - # status_headers.replace_header('content-type', content_type) - - return charset - - @staticmethod - def _resolve_text_type(mod, text_type, stream): - if text_type == 'css' and mod == 'js': - return 'css', stream - - # only attempt to resolve between html and other text types - if text_type != 'html': - return mod, stream - - buff = stream.read(128) - - wrapped_stream = BufferedReader(stream, starting_data=buff) - - # check if starts with a tag, then likely html - if RewriteContent.TAG_REGEX.match(buff): - mod = 'html' - - return mod, wrapped_stream - - def _head_insert_only_gen(self, insert_str, stream, first_buff=b''): - buff = first_buff - max_len = 1024 - len(first_buff) - while max_len > 0: - curr = stream.read(max_len) - if not curr: - break - - max_len -= len(buff) - buff += curr - - matcher = self.HEAD_REGEX.search(buff) - - if matcher: - yield buff[:matcher.end()] - yield insert_str - yield buff[matcher.end():] - else: - yield insert_str - yield buff - - for buff in self.stream_to_gen(stream): - yield buff - - @staticmethod - def _decode_buff(buff, stream, encoding): # pragma: no coverage - try: - buff = buff.decode(encoding) - except UnicodeDecodeError as e: - # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry - for i in range(3): - buff += stream.read(1) - try: - buff = buff.decode(encoding) - break - except UnicodeDecodeError: - pass - else: - raise - - return buff - - @staticmethod - def stream_to_gen(stream): - """ - Convert stream to an iterator, reading BUFF_SIZE bytes - """ - try: - while True: - buff = stream.read(RewriteContent.BUFF_SIZE) - yield buff - if not buff: - break - - finally: - stream.close() - - @staticmethod - def rewrite_text_stream_to_gen(stream, rewrite_func, - final_read_func, first_buff, - align_to_line): - """ - Convert stream to generator using applying rewriting func - to each portion of the stream. - Align to line boundaries if needed. - """ - try: - has_closed = hasattr(stream, 'closed') - buff = first_buff - - while True: - if buff: - buff = rewrite_func(buff.decode('iso-8859-1')) - yield buff.encode('iso-8859-1') - - buff = stream.read(RewriteContent.BUFF_SIZE) - # on 2.6, readline() (but not read()) throws an exception - # if stream already closed, so check stream.closed if present - if (buff and align_to_line and - (not has_closed or not stream.closed)): - buff += stream.readline() - - if not buff: - break - - # For adding a tail/handling final buffer - buff = final_read_func() - if buff: - yield buff.encode('iso-8859-1') - - finally: - stream.close() - - diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py deleted file mode 100644 index 38ea8b87..00000000 --- a/pywb/rewrite/rewrite_live.py +++ /dev/null @@ -1,315 +0,0 @@ -""" -Fetch a url from live web and apply rewriting rules -""" - -from requests import request as live_request - -import mimetypes -import logging -import os - -from six.moves.urllib.parse import urlsplit -import six - -from warcio.timeutils import timestamp_now -from warcio.statusandheaders import StatusAndHeaders - -from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url -from pywb.utils.loaders import extract_client_cookie -from pywb.utils.canonicalize import canonicalize - -from pywb.rewrite.rewrite_content import RewriteContent - - -#================================================================= -class LiveRewriter(object): - def __init__(self, is_framed_replay=False, proxies=None): - self.rewriter = RewriteContent(is_framed_replay=is_framed_replay) - - self.proxies = proxies - - self.live_request = live_request - - if self.proxies: - logging.debug('Live Rewrite via proxy ' + str(proxies)) - - if isinstance(proxies, str): - self.proxies = {'http': proxies, - 'https': proxies} - - else: - logging.debug('Live Rewrite Direct (no proxy)') - - def is_recording(self): - return self.proxies is not None - - def fetch_local_file(self, uri): - #fh = open(uri) - fh = LocalFileLoader().load(uri) - - content_type, _ = mimetypes.guess_type(uri) - - # create fake headers for local file - status_headers = StatusAndHeaders('200 OK', - [('Content-Type', content_type)]) - stream = fh - - return (status_headers, stream) - - def translate_headers(self, url, urlkey, env): - headers = {} - - splits = urlsplit(url) - has_cookies = False - - for name, value in six.iteritems(env): - if name == 'HTTP_HOST': - name = 'Host' - value = splits.netloc - - elif name == 'HTTP_ORIGIN': - name = 'Origin' - value = (splits.scheme + '://' + splits.netloc) - - elif name == 'HTTP_X_CSRFTOKEN': - name = 'X-CSRFToken' - cookie_val = extract_client_cookie(env, 'csrftoken') - if cookie_val: - value = cookie_val - - elif name == 'HTTP_REFERER': - continue - - elif name == 'HTTP_X_PYWB_REQUESTED_WITH': - continue - - elif name == 'HTTP_X_FORWARDED_PROTO': - name = 'X-Forwarded-Proto' - value = splits.scheme - - elif name == 'HTTP_COOKIE': - name = 'Cookie' - value = self._req_cookie_rewrite(urlkey, value) - has_cookies = True - - elif name.startswith('HTTP_'): - name = name[5:].title().replace('_', '-') - - elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'): - name = name.title().replace('_', '-') - - elif name == 'REL_REFERER': - name = 'Referer' - else: - value = None - - if value: - headers[name] = value - - if not has_cookies: - value = self._req_cookie_rewrite(urlkey, '') - if value: - headers['Cookie'] = value - - return headers - - def _req_cookie_rewrite(self, urlkey, value): - rule = self.rewriter.ruleset.get_first_match(urlkey) - if not rule or not rule.req_cookie_rewrite: - return value - - for cr in rule.req_cookie_rewrite: - try: - value = cr['rx'].sub(cr['replace'], value) - except KeyError: - pass - - return value - - def fetch_http(self, url, - urlkey=None, - env=None, - req_headers=None, - follow_redirects=False, - skip_recording=False, - verify=True): - - method = 'GET' - data = None - - proxies = None - if not skip_recording: - proxies = self.proxies - - if not req_headers: - req_headers = {} - - if env is not None: - method = env['REQUEST_METHOD'].upper() - input_ = env['wsgi.input'] - - req_headers.update(self.translate_headers(url, urlkey, env)) - - if method in ('POST', 'PUT'): - len_ = env.get('CONTENT_LENGTH') - if len_: - data = LimitReader(input_, int(len_)) - else: - data = input_ - - response = self.live_request(method=method, - url=url, - data=data, - headers=req_headers, - allow_redirects=follow_redirects, - proxies=proxies, - stream=True, - verify=verify) - - statusline = str(response.status_code) + ' ' + response.reason - - headers = response.headers.items() - - stream = response.raw - - try: #pragma: no cover - #PY 3 - headers = stream._original_response.headers._headers - except: #pragma: no cover - #PY 2 - headers = [] - resp_headers = stream._original_response.msg.headers - for h in resp_headers: - n, v = h.split(':', 1) - n = n.strip() - v = v.strip() - headers.append((n, v)) - - status_headers = StatusAndHeaders(statusline, headers) - - return (status_headers, stream) - - def fetch_request(self, url, urlrewriter, - head_insert_func=None, - urlkey=None, - env=None, - req_headers={}, - timestamp=None, - follow_redirects=False, - skip_recording=False, - verify=True, - remote_only=True): - - ts_err = url.split('///') - - # fixup for accidental erroneous rewrite which has /// - # (unless file:///) - if len(ts_err) > 1 and ts_err[0] != 'file:': - url = 'http://' + ts_err[1] - - if url.startswith('//'): - url = 'http:' + url - - if remote_only or is_http(url): - is_remote = True - else: - is_remote = False - if not url.startswith('file:'): - url = to_file_url(url) - - # explicit urlkey may be passed in (say for testing) - if not urlkey: - urlkey = canonicalize(url) - - if is_remote: - (status_headers, stream) = self.fetch_http(url, urlkey, env, - req_headers, - follow_redirects, - skip_recording, - verify) - else: - (status_headers, stream) = self.fetch_local_file(url) - - if timestamp is None: - timestamp = timestamp_now() - - cdx = {'urlkey': urlkey, - 'timestamp': timestamp, - 'url': url, - 'status': status_headers.get_statuscode(), - 'mime': status_headers.get_header('Content-Type'), - 'is_live': True, - } - - result = (self.rewriter. - rewrite_content(urlrewriter, - status_headers, - stream, - head_insert_func=head_insert_func, - urlkey=urlkey, - cdx=cdx)) - - if env: - env['pywb.cdx'] = cdx - - return result - - def fetch_async(self, url, headers): - resp = self.live_request(method='GET', - url=url, - headers=headers, - proxies=self.proxies, - verify=False, - stream=True) - - # don't actually read whole response, - # proxy response for writing it - resp.close() - - def add_metadata(self, url, headers, data): - return self.live_request(method='PUTMETA', - url=url, - data=data, - headers=headers, - proxies=self.proxies, - verify=False) - - def get_rewritten(self, *args, **kwargs): - result = self.fetch_request(*args, **kwargs) - - status_headers, gen, is_rewritten = result - - buff = b''.join(gen) - - return (status_headers, buff) - - def get_video_info(self, url): - return youtubedl.extract_info(url) - - -#================================================================= -class YoutubeDLWrapper(object): #pragma: no cover - """ YoutubeDL wrapper, inits youtubee-dl if it is available - """ - def __init__(self): - try: - from youtube_dl import YoutubeDL as YoutubeDL - except ImportError: - self.ydl = None - return - - self.ydl = YoutubeDL(dict(simulate=True, - youtube_include_dash_manifest=False)) - self.ydl.add_default_info_extractors() - - def extract_info(self, url): - if not self.ydl: - return None - - info = self.ydl.extract_info(url) - return info - - -#================================================================= -youtubedl = YoutubeDLWrapper() - diff --git a/pywb/urlrewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py similarity index 98% rename from pywb/urlrewrite/rewriteinputreq.py rename to pywb/rewrite/rewriteinputreq.py index 84569358..c43c785c 100644 --- a/pywb/urlrewrite/rewriteinputreq.py +++ b/pywb/rewrite/rewriteinputreq.py @@ -1,4 +1,4 @@ -from pywb.webagg.inputrequest import DirectWSGIInputRequest +from pywb.warcserver.inputrequest import DirectWSGIInputRequest from pywb.utils.loaders import extract_client_cookie from six import iteritems diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py deleted file mode 100644 index 6ec33280..00000000 --- a/pywb/rewrite/rewriterules.py +++ /dev/null @@ -1,80 +0,0 @@ -from pywb.utils.dsrules import BaseRule - -from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter -from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter -from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter - -from pywb.rewrite.header_rewriter import HeaderRewriter -from pywb.rewrite.html_rewriter import HTMLRewriter - -from pywb.rewrite.jsonp_rewriter import JSONPRewriter - -import re - - -#================================================================= -class RewriteRules(BaseRule): - def __init__(self, url_prefix, config={}): - super(RewriteRules, self).__init__(url_prefix, config) - - self.rewriters = {} - - #self._script_head_inserts = config.get('script_head_inserts', {}) - - self.rewriters['header'] = config.get('header_class', HeaderRewriter) - self.rewriters['css'] = config.get('css_class', CSSRewriter) - self.rewriters['xml'] = config.get('xml_class', XMLRewriter) - self.rewriters['html'] = config.get('html_class', HTMLRewriter) - self.rewriters['json'] = config.get('json_class', JSONPRewriter) - - self.parse_comments = config.get('parse_comments', False) - - # Custom handling for js rewriting, often the most complex - self.js_rewrite_location = config.get('js_rewrite_location', 'location') - - # ability to toggle rewriting - if self.js_rewrite_location == 'all': - js_default_class = JSLinkAndLocationRewriter - elif self.js_rewrite_location == 'location': - js_default_class = JSLocationOnlyRewriter -# self.rewriters['json'] = JSNoneRewriter - elif self.js_rewrite_location == 'none': - js_default_class = JSNoneRewriter -# self.rewriters['json'] = JSNoneRewriter - else: - js_default_class = JSLinkOnlyRewriter - - # set js class, using either default or override from config - self.rewriters['js'] = config.get('js_class', js_default_class) - - self.rewriters['js_proxy'] = JSNoneRewriter - - # add any regexs for js rewriter - self._add_custom_regexs('js', 'js_regexs', config) - self._add_custom_regexs('js_proxy', 'js_regexs', config) - - # cookie rewrite scope - self.cookie_scope = config.get('cookie_scope', 'default') - - req_cookie_rewrite = config.get('req_cookie_rewrite', []) - for rc in req_cookie_rewrite: - rc['rx'] = re.compile(rc.get('match', '')) - - self.req_cookie_rewrite = req_cookie_rewrite - - def _add_custom_regexs(self, rw_id, field, config): - regexs = config.get(field) - if not regexs: - return - - rewriter_cls = self.rewriters[rw_id] - - #rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs) - parse_rules_func = RegexRewriter.parse_rules_from_config(regexs) - - def extend_rewriter_with_regex(urlrewriter): - rule_def_tuples = parse_rules_func(urlrewriter) - return rewriter_cls(urlrewriter, rule_def_tuples) - - self.rewriters[rw_id] = extend_rewriter_with_regex - diff --git a/pywb/urlrewrite/templateview.py b/pywb/rewrite/templateview.py similarity index 100% rename from pywb/urlrewrite/templateview.py rename to pywb/rewrite/templateview.py diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py deleted file mode 100644 index 62fa3bf9..00000000 --- a/pywb/rewrite/test/test_rewrite_live.py +++ /dev/null @@ -1,271 +0,0 @@ -from pywb.rewrite.rewrite_live import LiveRewriter -from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.rewrite.wburl import WbUrl - -from pywb.utils.loaders import to_native_str - -from pywb import get_test_dir - -from io import BytesIO - -# This module has some rewriting tests against the 'live web' -# As such, the content may change and the test may break - -urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') -bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/') - -def head_insert_func(rule, cdx): - if rule.js_rewrite_location != 'urls': - return '' - else: - return '' - -def test_csrf_token_headers(): - rewriter = LiveRewriter() - env = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'} - - req_headers = rewriter.translate_headers('http://example.com/', 'com,example)/', env) - - assert req_headers == {'X-CSRFToken': 'foobar', 'Cookie': 'csrftoken=foobar'} - -def test_forwarded_scheme(): - rewriter = LiveRewriter() - env = {'HTTP_X_FORWARDED_PROTO': 'https', 'Other': 'Value'} - - req_headers = rewriter.translate_headers('http://example.com/', 'com,example)/', env) - - assert req_headers == {'X-Forwarded-Proto': 'http'} - -def test_req_cookie_rewrite_1(): - rewriter = LiveRewriter() - env = {'HTTP_COOKIE': 'A=B'} - - urlkey = 'example,example,test)/' - url = 'test.example.example/' - - req_headers = rewriter.translate_headers(url, urlkey, env) - - assert req_headers == {'Cookie': 'A=B; FOO=&bar=1'} - -def test_req_cookie_rewrite_2(): - rewriter = LiveRewriter() - env = {'HTTP_COOKIE': 'FOO=goo'} - - urlkey = 'example,example,test)/' - url = 'test.example.example/' - - req_headers = rewriter.translate_headers(url, urlkey, env) - - assert req_headers == {'Cookie': 'FOO=&bar=1'} - -def test_req_cookie_rewrite_3(): - rewriter = LiveRewriter() - env = {} - - urlkey = 'example,example,test)/' - url = 'test.example.example/' - - req_headers = rewriter.translate_headers(url, urlkey, env) - - assert req_headers == {'Cookie': '; FOO=&bar=1'} - -def test_local_1(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', - urlrewriter, - head_insert_func, - 'example,example,test,all)/') - - # wombat insert added - assert '' in buff, buff - - # JS location and JS link rewritten - assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff - - # link rewritten - assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff - - -def test_local_no_head(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html', - urlrewriter, - head_insert_func, - 'com,example,test)/') - - # wombat insert added - assert '' in buff, buff - - # location rewritten - assert 'window.WB_wombat_location = "/other.html"' in buff, buff - - # link rewritten - assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff, buff - -def test_local_no_head_only_title(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head_2.html', - urlrewriter, - head_insert_func, - 'com,example,test)/') - - # wombat insert added - assert '' in buff - - -def test_local_no_head_banner_only(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html', - bn_urlrewriter, - head_insert_func, - 'com,example,test)/') - - # wombat insert added - assert '' in buff - - # location NOT rewritten - assert 'window.location = "/other.html"' in buff - - # link NOT rewritten - assert '"/some/path/another.html"' in buff - -def test_local_banner_only_no_rewrite(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', - bn_urlrewriter, - head_insert_func, - 'com,example,test)/') - - # wombat insert added - assert '' in buff - - # JS location NOT rewritten, JS link NOT rewritten - assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff - - # link NOT rewritten - assert '"/some/path/another.html"' in buff - -def test_local_2_link_only_rewrite(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', - urlrewriter, - head_insert_func, - 'example,example,test)/nolocation_rewrite') - - # no wombat insert - assert '' not in buff - - # JS location NOT rewritten, JS link rewritten - assert 'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff - - # still link rewrite - assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff - - -def test_local_2_js_loc_only_rewrite(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', - urlrewriter, - head_insert_func, - 'example,example,test,loconly)/') - - # wombat insert added - assert '' in buff - - # JS location rewritten, JS link NOT rewritten - assert 'window.WB_wombat_location = "http:\/\/example.com/dynamic_page.html"' in buff - - # still link rewrite in HTML - assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff - -def test_local_2_no_rewrite(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', - urlrewriter, - head_insert_func, - 'example,example,test,norewrite)/') - - # wombat insert added - assert '' in buff - - # JS location NOT rewritten, JS link NOT rewritten - assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff - - # still link rewrite in HTML - assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff - -def test_local_unclosed_script(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_unclosed_script.html', - urlrewriter, - head_insert_func, - 'example,example,test,all)/') - - # wombat insert added - assert '' in buff, buff - - # JS location and JS link rewritten - assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html";' in buff, buff - - assert '' in buff, buff - - -def test_example_1(): - status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close', 'Accept-Encoding': 'identity'}) - - # verify header rewriting - assert status_headers.get_header('x-archive-orig-content-length') == '1270', status_headers - - - # verify utf-8 charset detection - assert status_headers.get_header('content-type') == 'text/html' - - assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff - -def test_example_2_redirect(): - status_headers, buff = get_rewritten('http://httpbin.org/redirect-to?url=http://example.com/', urlrewriter) - - # redirect, no content - assert status_headers.get_statuscode() == '302' - assert len(buff) == 0 - - -def test_example_3_rel(): - status_headers, buff = get_rewritten('//example.com/', urlrewriter) - assert status_headers.get_statuscode() == '200' - - -def test_example_4_rewrite_err(): - # may occur in case of rewrite mismatch, the /// gets stripped off - status_headers, buff = get_rewritten('http://localhost:8080///example.com/', urlrewriter) - assert status_headers.get_statuscode() == '200' - -def test_example_domain_specific_3(): - status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter, follow_redirects=True) - - # comment out Bootloader.configurePage, if it is still there - if 'Bootloader.configurePage' in buff: - assert '/* Bootloader.configurePage' in buff - -def test_wombat_top(): - #status_headers, buff = get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', urlrewriter) - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/toptest.js', urlrewriter) - - assert 'WB_wombat_top!==window' in buff - -def test_post(): - buff = BytesIO(b'ABC=DEF') - - env = {'REQUEST_METHOD': 'POST', - 'HTTP_ORIGIN': 'http://httpbin.org', - 'HTTP_HOST': 'httpbin.org', - 'wsgi.input': buff} - - status_headers, resp_buff = get_rewritten('http://httpbin.org/post', urlrewriter, env=env) - assert status_headers.get_statuscode() == '200', status_headers - -def test_multiple_set_cookies(): - status_headers, buff = get_rewritten('http://httpbin.org/cookies/set?A=B&C=D', urlrewriter) - - assert status_headers.get_statuscode() == '302' - - print(status_headers.headers) - - assert ('Set-Cookie', 'A=B; Path=/pywb/20131226101010/http://example.com/') in status_headers.headers - assert ('Set-Cookie', 'C=D; Path=/pywb/20131226101010/http://example.com/') in status_headers.headers - - -def get_rewritten(*args, **kwargs): - status_headers, buff = LiveRewriter().get_rewritten(remote_only=False, *args, **kwargs) - return status_headers, to_native_str(buff) diff --git a/pywb/urlrewrite/__init__.py b/pywb/urlrewrite/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pywb/urlrewrite/header_rewriter.py b/pywb/urlrewrite/header_rewriter.py deleted file mode 100644 index 394ea0bc..00000000 --- a/pywb/urlrewrite/header_rewriter.py +++ /dev/null @@ -1,100 +0,0 @@ -from warcio.statusandheaders import StatusAndHeaders -from warcio.timeutils import datetime_to_http_date -from datetime import datetime, timedelta - - -#============================================================================= -class PrefixHeaderRewriter(object): - header_rules = { - 'content-type': 'keep', - 'content-disposition': 'keep', - 'content-range': 'keep', - 'accept-rangees': 'keep', - 'www-authenticate': 'keep', - 'proxy-authenticate': 'keep', - - 'location': 'url-rewrite', - 'content-location': 'url-rewrite', - 'content-base': 'url-rewrite', - - 'transfer-encoding': 'prefix', - 'connection': 'prefix', - - 'content-encoding': 'keep-if-no-content-rewrite', - 'content-length': 'content-length', - - 'set-cookie': 'cookie', - 'cookie': 'cookie', - } - - def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'): - self.header_prefix = header_prefix - self.rwinfo = rwinfo - self.http_headers = rwinfo.record.http_headers - - if rwinfo.is_url_rw(): - self.default_rule = 'prefix' - else: - self.default_rule = 'keep' - - def __call__(self): - new_headers_list = [] - for name, value in self.http_headers.headers: - rule = self.header_rules.get(name.lower(), self.default_rule) - new_header = self.rewrite_header(name, value, rule) - if new_header: - if isinstance(new_header, list): - new_headers_list.extend(new_header) - else: - new_headers_list.append(new_header) - - return StatusAndHeaders(self.http_headers.statusline, - headers=new_headers_list, - protocol=self.http_headers.protocol) - - def rewrite_header(self, name, value, rule): - if rule == 'keep': - return (name, value) - - elif rule == 'url-rewrite': - return (name, self.rwinfo.url_rewriter.rewrite(value)) - - elif rule == 'keep-if-no-content-rewrite': - if not self.rwinfo.is_content_rw: - return (name, value) - - elif rule == 'content-length': - if value == '0': - return (name, value) - - if not self.rwinfo.is_content_rw: - try: - if int(value) >= 0: - return (name, value) - except: - pass - - elif rule == 'cookie': - if self.rwinfo.cookie_rewriter: - return self.rwinfo.cookie_rewriter.rewrite(value) - else: - return (name, value) - - # default 'prefix' - return (self.header_prefix + name, value) - - def _add_cache_headers(self, new_headers, http_cache): - try: - age = int(http_cache) - except: - age = 0 - - if age <= 0: - new_headers.append(('Cache-Control', 'no-cache; no-store')) - else: - dt = datetime.utcnow() - dt = dt + timedelta(seconds=age) - new_headers.append(('Cache-Control', 'max-age=' + str(age))) - new_headers.append(('Expires', datetime_to_http_date(dt))) - - diff --git a/pywb/urlrewrite/test/__init__.py b/pywb/urlrewrite/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pywb/urlrewrite/test/uwsgi.ini b/pywb/urlrewrite/test/uwsgi.ini deleted file mode 100644 index 7acd4f0b..00000000 --- a/pywb/urlrewrite/test/uwsgi.ini +++ /dev/null @@ -1,18 +0,0 @@ -[uwsgi] -if-not-env = PORT -http-socket = :8090 -endif = - -master = true -buffer-size = 65536 -die-on-term = true - -if-env = VIRTUAL_ENV -venv = $(VIRTUAL_ENV) -endif = - -gevent = 100 - -wsgi = urlrewrite.test.simpleapp - - diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py deleted file mode 100644 index af509987..00000000 --- a/pywb/utils/dsrules.py +++ /dev/null @@ -1,81 +0,0 @@ -import pkgutil -from pywb.utils.loaders import load_yaml_config - - -#================================================================= -DEFAULT_RULES_FILE = 'pywb/rules.yaml' - - -#================================================================= -class RuleSet(object): - DEFAULT_KEY = '' - - def __init__(self, rule_cls, fieldname, **kwargs): - """ - A domain specific rules block, inited via config map. - If config map not specified, it is loaded from default location. - - The rules are represented as a map by domain. - Each rules configuration will load is own field type - from the list and given a specified rule_cls. - """ - - self.rules = [] - - default_rule_config = kwargs.get('default_rule_config') - - ds_rules_file = kwargs.get('ds_rules_file') - - if not ds_rules_file: - ds_rules_file = DEFAULT_RULES_FILE - - config = load_yaml_config(ds_rules_file) - - # load rules dict or init to empty - rulesmap = config.get('rules') if config else {} - - def_key_found = False - - # iterate over master rules file - for value in rulesmap: - url_prefix = value.get('url_prefix') - rules_def = value.get(fieldname) - if not rules_def: - continue - - if url_prefix == self.DEFAULT_KEY: - def_key_found = True - - self.rules.append(rule_cls(url_prefix, rules_def)) - - # if default_rule_config provided, always init a default ruleset - if not def_key_found and default_rule_config is not None: - self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config)) - - def iter_matching(self, urlkey): - """ - Iterate over all matching rules for given urlkey - """ - for rule in self.rules: - if rule.applies(urlkey): - yield rule - - def get_first_match(self, urlkey): - for rule in self.rules: - if rule.applies(urlkey): - return rule - - -#================================================================= -class BaseRule(object): - """ - Base rule class -- subclassed to handle specific - rules for given url_prefix key - """ - def __init__(self, url_prefix, rules): - self.url_prefix = url_prefix - if not isinstance(self.url_prefix, list): - self.url_prefix = [self.url_prefix] - - def applies(self, urlkey): - return any(urlkey.startswith(x) for x in self.url_prefix) diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 9842c7a3..d62e9626 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -52,43 +52,6 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_' >>> extract_client_cookie({}, 'y') -# append_post_query ->>> append_post_query('http://example.com/?abc=def', 'foo=bar') -'http://example.com/?abc=def&foo=bar' - ->>> append_post_query('http://example.com/', '') -'http://example.com/' - ->>> append_post_query('http://example.com/', 'foo=bar') -'http://example.com/?foo=bar' - -# extract_post_query tests - -# correct POST data ->>> post_data = b'foo=bar&dir=%2Fbaz' ->>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data)) -'foo=bar&dir=/baz' - -# unsupported method ->>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data)) - -# base64 encode ->>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data)) -'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' - -# invalid length ->>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data)) ->>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data)) - -# length too short ->>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data)) -'foo=bar&dir=%2' - -# length too long ->>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data)) -'foo=bar&dir=/baz' - - # test read_last_line >>> print_str(read_last_line(BytesIO(b'A\nB\nC'))) 'C' @@ -119,8 +82,8 @@ from io import BytesIO import requests from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url -from pywb.utils.loaders import extract_client_cookie, extract_post_query -from pywb.utils.loaders import append_post_query, read_last_line +from pywb.utils.loaders import extract_client_cookie +from pywb.utils.loaders import read_last_line from warcio.bufferedreaders import DecompressingBufferedReader diff --git a/pywb/warcserver/inputrequest.py b/pywb/warcserver/inputrequest.py index 58e44a35..491e60b1 100644 --- a/pywb/warcserver/inputrequest.py +++ b/pywb/warcserver/inputrequest.py @@ -86,10 +86,11 @@ class DirectWSGIInputRequest(object): buffered_stream=buffered_stream, environ=self.env) - if post_query.append_post_query(url) != url: + new_url = post_query.append_post_query(url) + if new_url != url: self.env['wsgi.input'] = buffered_stream - return url + return new_url def get_full_request_uri(self): req_uri = self.env.get('REQUEST_URI') @@ -246,7 +247,7 @@ class PostQueryExtractor(object): else: post_query = base64.b64encode(post_query) post_query = to_native_str(post_query) - post_query = '&__wb_post_data=' + post_query + post_query = '__wb_post_data=' + post_query self.post_query = post_query diff --git a/pywb/warcserver/test/test_inputreq.py b/pywb/warcserver/test/test_inputreq.py index edd55eb3..0857e897 100644 --- a/pywb/warcserver/test/test_inputreq.py +++ b/pywb/warcserver/test/test_inputreq.py @@ -1,9 +1,10 @@ -from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest +from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, PostQueryExtractor from werkzeug.routing import Map, Rule import webtest import traceback from six.moves.urllib.parse import parse_qsl +from io import BytesIO #============================================================================= @@ -76,3 +77,61 @@ Foo: Bar\r\n\ \r\n\ ' + +class TestPostQueryExtract(object): + @classmethod + def setup_class(cls): + cls.post_data = b'foo=bar&dir=%2Fbaz' + + def test_post_extract_1(self): + pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + len(self.post_data), BytesIO(self.post_data)) + + assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz' + + assert pq.append_post_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz' + + def test_post_extract_wrong_method(self): + pq = PostQueryExtractor('PUT', 'application/x-www-form-urlencoded', + len(self.post_data), BytesIO(self.post_data)) + + assert pq.append_post_query('http://example.com/') == 'http://example.com/' + + def test_post_extract_non_form_data_1(self): + pq = PostQueryExtractor('POST', 'application/octet-stream', + len(self.post_data), BytesIO(self.post_data)) + + #base64 encoded data + assert pq.append_post_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' + + def test_post_extract_non_form_data_2(self): + pq = PostQueryExtractor('POST', 'text/plain', + len(self.post_data), BytesIO(self.post_data)) + + #base64 encoded data + assert pq.append_post_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' + + def test_post_extract_length_invalid_ignore(self): + pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + 0, BytesIO(self.post_data)) + + assert pq.append_post_query('http://example.com/') == 'http://example.com/' + + pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + 'abc', BytesIO(self.post_data)) + + assert pq.append_post_query('http://example.com/') == 'http://example.com/' + + def test_post_extract_length_too_short(self): + pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + len(self.post_data) - 4, BytesIO(self.post_data)) + + assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2' + + def test_post_extract_length_too_long(self): + pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded', + len(self.post_data) + 4, BytesIO(self.post_data)) + + assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz' + + diff --git a/pywb/webapp/__init__.py b/pywb/webapp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/base_config_test.py b/tests/base_config_test.py index 0a4ddb7d..db17a204 100644 --- a/tests/base_config_test.py +++ b/tests/base_config_test.py @@ -3,9 +3,9 @@ from gevent import monkey; monkey.patch_all(thread=False) import pytest import webtest -from pywb.webagg.test.testutils import BaseTestClass +from pywb.warcserver.test.testutils import BaseTestClass -from pywb.urlrewrite.frontendapp import FrontEndApp +from pywb.apps.frontendapp import FrontEndApp import os diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index 47dbc182..8708c1c2 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -17,16 +17,16 @@ from pytest import raises from mock import patch from pywb import get_test_dir -from pywb.webagg.test.testutils import TempDirTests, BaseTestClass +from pywb.warcserver.test.testutils import TempDirTests, BaseTestClass from pywb.manager.manager import main import pywb.manager.autoindex -from pywb.warc.cdxindexer import main as cdxindexer_main -from pywb.cdx.cdxobject import CDXObject +from pywb.indexer.cdxindexer import main as cdxindexer_main +from pywb.warcserver.index.cdxobject import CDXObject -from pywb.urlrewrite.frontendapp import FrontEndApp +from pywb.apps.frontendapp import FrontEndApp #============================================================================= diff --git a/tests/test_cdx_server_app.py b/tests/test_cdx_server_app.py index 2e9bb126..762efb62 100644 --- a/tests/test_cdx_server_app.py +++ b/tests/test_cdx_server_app.py @@ -8,10 +8,10 @@ import webtest from six.moves.urllib.parse import urlencode -from pywb.cdx.cdxobject import CDXObject +from pywb.warcserver.index.cdxobject import CDXObject -from pywb.webagg.test.testutils import BaseTestClass -from pywb.webagg.autoapp import AutoConfigApp +from pywb.warcserver.test.testutils import BaseTestClass +from pywb.warcserver.warcserver import WarcServer # ============================================================================ @@ -20,7 +20,7 @@ class TestCDXApp(BaseTestClass): def setup_class(cls): super(TestCDXApp, cls).setup_class() config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config_test.yaml') - cls.testapp = webtest.TestApp(AutoConfigApp(config_file=config_file)) + cls.testapp = webtest.TestApp(WarcServer(config_file=config_file)) def query(self, url, is_error=False, **params): params['url'] = url diff --git a/tests/test_integration.py b/tests/test_integration.py index 571165eb..3342daf7 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,6 @@ from .base_config_test import BaseConfigTest, fmod -from pywb.cdx.cdxobject import CDXObject +from pywb.warcserver.index.cdxobject import CDXObject # ============================================================================ diff --git a/pywb/warcserver/test/live.py b/tests_disabled/live.py similarity index 100% rename from pywb/warcserver/test/live.py rename to tests_disabled/live.py diff --git a/pywb/rewrite/test/test_header_rewriter.py b/tests_disabled/test_header_rewriter.py similarity index 100% rename from pywb/rewrite/test/test_header_rewriter.py rename to tests_disabled/test_header_rewriter.py diff --git a/pywb/rewrite/test/test_rewrite_content.py b/tests_disabled/test_rewrite_content.py similarity index 100% rename from pywb/rewrite/test/test_rewrite_content.py rename to tests_disabled/test_rewrite_content.py