1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor:

- merge pywb.urlrewrite -> pywb.rewrite, remove obsolete stuff (rewrite_content.py, rewrite_live.py, dsrules.py)
- move wbrequestresponse -> pywb.apps
- move pywb.webapp.handlers -> pywb.apps.static_handler
- remove pywb.webapp, pywb.framework packages
- disable old header_rewriter, content_rewriter tests
- finish renaming from previous warcserver refactor
- all other tests passing!
This commit is contained in:
Ilya Kreymer 2017-05-23 19:08:29 -07:00
parent 2907ed01c8
commit 97182b71b7
39 changed files with 213 additions and 1542 deletions

View File

@ -6,7 +6,7 @@ import logging
#=============================================================================
def webagg(args=None):
WebaggCli(args=args,
WarcServerCli(args=args,
default_port=8070,
desc='pywb Web Aggregator Server').run()
@ -103,18 +103,18 @@ class ReplayCli(BaseCli):
#=============================================================================
class WebaggCli(BaseCli):
class WarcServerCli(BaseCli):
def load(self):
from pywb.webagg.autoapp import AutoConfigApp
from pywb.warcserver.warcserver import WarcServer
super(WebaggCli, self).load()
return AutoConfigApp(custom_config=self.extra_config)
super(WarcServerCli, self).load()
return WarcServer(custom_config=self.extra_config)
#=============================================================================
class WaybackCli(ReplayCli):
def load(self):
from pywb.urlrewrite.frontendapp import FrontEndApp
from pywb.apps.frontendapp import FrontEndApp
super(WaybackCli, self).load()
return FrontEndApp(custom_config=self.extra_config)
@ -123,7 +123,7 @@ class WaybackCli(ReplayCli):
#=============================================================================
class LiveCli(BaseCli):
def load(self):
from pywb.urlrewrite.frontendapp import FrontEndApp
from pywb.apps.frontendapp import FrontEndApp
self.r.live = True

View File

@ -8,16 +8,15 @@ from six.moves.urllib.parse import urljoin
from six import iteritems
from pywb.utils.loaders import load_yaml_config, to_native_str
from pywb.utils.geventserver import GeventServer
from pywb.webagg.autoapp import AutoConfigApp
from pywb.webapp.handlers import StaticHandler
from pywb.warcserver.warcserver import WarcServer
from pywb.framework.wbrequestresponse import WbResponse
from pywb.rewrite.templateview import BaseInsertView
from pywb.urlrewrite.geventserver import GeventServer
from pywb.urlrewrite.templateview import BaseInsertView
from pywb.urlrewrite.rewriterapp import RewriterApp, UpstreamException
from pywb.apps.static_handler import StaticHandler
from pywb.apps.rewriterapp import RewriterApp, UpstreamException
from pywb.apps.wbrequestresponse import WbResponse
import os
import traceback
@ -27,14 +26,14 @@ import traceback
class FrontEndApp(object):
def __init__(self, config_file='./config.yaml', custom_config=None):
self.debug = True
self.webagg = AutoConfigApp(config_file=config_file,
self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config)
framed_replay = self.webagg.config.get('framed_replay', True)
framed_replay = self.warcserver.config.get('framed_replay', True)
self.rewriterapp = RewriterApp(framed_replay, config=self.webagg.config)
self.rewriterapp = RewriterApp(framed_replay, config=self.warcserver.config)
self.webagg_server = GeventServer(self.webagg, port=0)
self.warcserver_server = GeventServer(self.warcserver, port=0)
self.static_handler = StaticHandler('pywb/static/')
@ -46,12 +45,12 @@ class FrontEndApp(object):
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
self.url_map.add(Rule('/', endpoint=self.serve_home))
self.rewriterapp.paths = self.get_upstream_paths(self.webagg_server.port)
self.rewriterapp.paths = self.get_upstream_paths(self.warcserver_server.port)
self.templates_dir = self.webagg.config.get('templates_dir', 'templates')
self.static_dir = self.webagg.config.get('static_dir', 'static')
self.templates_dir = self.warcserver.config.get('templates_dir', 'templates')
self.static_dir = self.warcserver.config.get('static_dir', 'static')
metadata_templ = os.path.join(self.webagg.root_dir, '{coll}', 'metadata.yaml')
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
self.metadata_cache = MetadataCache(metadata_templ)
def get_upstream_paths(self, port):
@ -61,8 +60,8 @@ class FrontEndApp(object):
def serve_home(self, environ):
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
fixed_routes = self.webagg.list_fixed_routes()
dynamic_routes = self.webagg.list_dynamic_routes()
fixed_routes = self.warcserver.list_fixed_routes()
dynamic_routes = self.warcserver.list_dynamic_routes()
routes = fixed_routes + dynamic_routes
@ -76,7 +75,7 @@ class FrontEndApp(object):
def serve_static(self, environ, coll='', filepath=''):
if coll:
path = os.path.join(self.webagg.root_dir, coll, self.static_dir)
path = os.path.join(self.warcserver.root_dir, coll, self.static_dir)
else:
path = self.static_dir
@ -116,7 +115,7 @@ class FrontEndApp(object):
kwargs = {'coll': coll}
if coll in self.webagg.list_fixed_routes():
if coll in self.warcserver.list_fixed_routes():
kwargs['type'] = 'replay-fixed'
else:
kwargs['type'] = 'replay-dyn'
@ -131,23 +130,23 @@ class FrontEndApp(object):
def setup_paths(self, environ, coll):
pop_path_info(environ)
if not coll or not self.webagg.root_dir:
if not coll or not self.warcserver.root_dir:
return
environ['pywb.templates_dir'] = os.path.join(self.webagg.root_dir,
environ['pywb.templates_dir'] = os.path.join(self.warcserver.root_dir,
coll,
self.templates_dir)
def serve_listing(self, environ):
result = {'fixed': self.webagg.list_fixed_routes(),
'dynamic': self.webagg.list_dynamic_routes()
result = {'fixed': self.warcserver.list_fixed_routes(),
'dynamic': self.warcserver.list_dynamic_routes()
}
return WbResponse.json_response(result)
def is_valid_coll(self, coll):
return (coll in self.webagg.list_fixed_routes() or
coll in self.webagg.list_dynamic_routes())
return (coll in self.warcserver.list_fixed_routes() or
coll in self.warcserver.list_dynamic_routes())
def raise_not_found(self, environ, msg):
raise NotFound(response=self.rewriterapp._error_response(environ, msg))

View File

@ -1,5 +1,5 @@
from gevent.monkey import patch_all; patch_all()
from pywb.urlrewrite.frontendapp import FrontEndApp
from pywb.apps.frontendapp import FrontEndApp
application = FrontEndApp(config_file=None,
custom_config={'collections': {'live': '$live'}})

View File

@ -1,9 +1,12 @@
import requests
from werkzeug.http import HTTP_STATUS_CODES
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
#from pywb.rewrite.rewrite_amf import RewriteAMFMixin
#from pywb.rewrite.rewrite_dash import RewriteDASHMixin
#from pywb.rewrite.rewrite_content import RewriteContent
from pywb.urlrewrite.rewriter import DefaultRewriter
from pywb.rewrite.default_rewriter import DefaultRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
@ -16,18 +19,14 @@ from warcio.timeutils import http_date_to_timestamp
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.webagg.utils import BUFF_SIZE
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.apps.wbrequestresponse import WbResponse
from pywb.cdx.cdxobject import CDXObject
from pywb.framework.wbrequestresponse import WbResponse
from pywb.warcserver.utils import BUFF_SIZE
from pywb.warcserver.utils import MementoUtils
from pywb.webagg.utils import MementoUtils
from werkzeug.http import HTTP_STATUS_CODES
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest
from pywb.urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
from io import BytesIO
@ -71,7 +70,7 @@ class RewriterApp(object):
#frame_type = 'inverse' if framed_replay else False
#self.content_rewriter = Rewriter(is_framed_replay=frame_type)
self.content_rw = DefaultRewriter('pkg://pywb/rules.yaml', self.replay_mod)
self.content_rw = DefaultRewriter(replay_mod=self.replay_mod)
if not jinja_env:
jinja_env = JinjaEnv(globals={'static_path': 'static'})

View File

@ -3,7 +3,7 @@ import os
from pywb.utils.loaders import LocalFileLoader
from pywb.framework.wbrequestresponse import WbResponse
from pywb.apps.wbrequestresponse import WbResponse
#=================================================================

View File

@ -1,9 +1,9 @@
from gevent import monkey; monkey.patch_all(thread=False)
from pywb.webagg.test.testutils import LiveServerTests, BaseTestClass
from pywb.webagg.test.testutils import FakeRedisTests
from pywb.warcserver.test.testutils import LiveServerTests, BaseTestClass
from pywb.warcserver.test.testutils import FakeRedisTests
from pywb.urlrewrite.frontendapp import FrontEndApp
from pywb.apps.frontendapp import FrontEndApp
import os
import webtest
@ -12,10 +12,10 @@ import webtest
LIVE_CONFIG = {'collections': {'live': '$live'}}
class TestRewriter(FakeRedisTests, BaseTestClass):
class TestRewriterApp(FakeRedisTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestRewriter, cls).setup_class()
super(TestRewriterApp, cls).setup_class()
#cls.app = RWApp.create_app(replay_port=cls.server.port)
#cls.testapp = webtest.TestApp(cls.app.app)

View File

@ -1,4 +1,4 @@
from pywb.framework.wbrequestresponse import WbResponse
from pywb.apps.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders

View File

@ -0,0 +1,7 @@
from gevent.monkey import patch_all; patch_all()
from pywb.warcserver.warcserver import WarcServer
application = WarcServer(custom_config={'collections': {'live': '$live'}})

View File

@ -1,5 +1,5 @@
from gevent.monkey import patch_all; patch_all()
from pywb.urlrewrite.frontendapp import FrontEndApp
from pywb.apps.frontendapp import FrontEndApp
application = FrontEndApp()

View File

@ -1,7 +0,0 @@
from gevent.monkey import patch_all; patch_all()
from pywb.webagg.autoapp import AutoConfigApp
application = AutoConfigApp(custom_config={'collections': {'live': '$live'}})

View File

@ -122,7 +122,7 @@ directory structure expected by pywb
self._cdx_index(cdx_file, [self.archive_dir])
def _cdx_index(self, out, input_, rel_root=None):
from pywb.warc.cdxindexer import write_multi_cdx_index
from pywb.indexer.cdxindexer import write_multi_cdx_index
options = dict(append_post=True,
cdxj=True,

View File

@ -9,7 +9,7 @@ import re
import webencodings
import tempfile
from pywb.webagg.utils import StreamIter, BUFF_SIZE
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
from pywb.utils.loaders import load_yaml_config
@ -277,6 +277,7 @@ class RewriteInfo(object):
self.cookie_rewriter = cookie_rewriter
if self.record:
self._fill_text_type_and_charset()
self._resolve_text_type()

View File

@ -7,7 +7,7 @@ from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter
from pywb.rewrite.header_rewriter import PrefixHeaderRewriter
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
@ -75,6 +75,10 @@ class DefaultRewriter(BaseContentRewriter):
'text/plain': 'plain',
}
def __init__(self, rules_file=None, replay_mod=''):
rules_file = rules_file or 'pkg://pywb/rules.yaml'
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
def init_js_regex(self, regexs):
return RegexRewriter.parse_rules_from_config(regexs)

View File

@ -1,102 +1,87 @@
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_http_date
from datetime import datetime, timedelta
import six
#=================================================================
class RewrittenStatusAndHeaders(object):
def __init__(self, statusline, headers,
removed_header_dict, text_type, charset):
#=============================================================================
class PrefixHeaderRewriter(object):
header_rules = {
'content-type': 'keep',
'content-disposition': 'keep',
'content-range': 'keep',
'accept-rangees': 'keep',
'www-authenticate': 'keep',
'proxy-authenticate': 'keep',
self.status_headers = StatusAndHeaders(statusline, headers)
self.removed_header_dict = removed_header_dict
self.text_type = text_type
self.charset = charset
'location': 'url-rewrite',
'content-location': 'url-rewrite',
'content-base': 'url-rewrite',
def contains_removed_header(self, name, value):
return self.removed_header_dict.get(name) == value
'transfer-encoding': 'prefix',
'connection': 'prefix',
def readd_rewrite_removed(self):
for name in HeaderRewriter.KEEP_NO_REWRITE_HEADERS:
value = self.removed_header_dict.get(name)
if value is not None:
self.status_headers.headers.append((name, value))
'content-encoding': 'keep-if-no-content-rewrite',
'content-length': 'content-length',
#=================================================================
class HeaderRewriter(object):
REWRITE_TYPES = {
'html': ['text/html',
'application/xhtml',
'application/xhtml+xml'],
'css': ['text/css'],
'js': ['text/javascript',
'application/javascript',
'application/x-javascript'],
'json': ['application/json'],
'hls': ['application/x-mpegURL'],
'xml': ['/xml', '+xml', '.xml', '.rss'],
'plain': ['text/plain'],
'set-cookie': 'cookie',
'cookie': 'cookie',
}
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range',
'accept-ranges', 'www-authenticate', 'proxy-authenticate']
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
REMOVE_ALWAYS_HEADERS = ['transfer-encoding']
KEEP_PROXY_HEADERS = ['content-security-policy', 'strict-transport-security']
KEEP_NO_REWRITE_HEADERS = ['content-length', 'content-encoding']
COOKIE_HEADERS = ['set-cookie', 'cookie']
CACHE_HEADERS = ['cache-control', 'expires', 'etag', 'last-modified']
def __init__(self, header_prefix='X-Archive-Orig-'):
def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
self.header_prefix = header_prefix
self.rwinfo = rwinfo
self.http_headers = rwinfo.record.http_headers
def rewrite(self, status_headers, urlrewriter, cookie_rewriter):
content_type = status_headers.get_header('Content-Type')
text_type = None
charset = None
content_modified = False
http_cache = None
if urlrewriter:
http_cache = urlrewriter.rewrite_opts.get('http_cache')
if rwinfo.is_url_rw():
self.default_rule = 'prefix'
else:
self.default_rule = 'keep'
if content_type:
text_type = self._extract_text_type(content_type)
if text_type:
charset = self._extract_char_set(content_type)
content_modified = True
def __call__(self):
new_headers_list = []
for name, value in self.http_headers.headers:
rule = self.header_rules.get(name.lower(), self.default_rule)
new_header = self.rewrite_header(name, value, rule)
if new_header:
if isinstance(new_header, list):
new_headers_list.extend(new_header)
else:
new_headers_list.append(new_header)
result = self._rewrite_headers(status_headers.headers,
urlrewriter,
cookie_rewriter,
content_modified,
http_cache)
return StatusAndHeaders(self.http_headers.statusline,
headers=new_headers_list,
protocol=self.http_headers.protocol)
new_headers = result[0]
removed_header_dict = result[1]
def rewrite_header(self, name, value, rule):
if rule == 'keep':
return (name, value)
if http_cache != None and http_cache != 'pass':
self._add_cache_headers(new_headers, http_cache)
elif rule == 'url-rewrite':
return (name, self.rwinfo.url_rewriter.rewrite(value))
return RewrittenStatusAndHeaders(status_headers.statusline,
new_headers,
removed_header_dict,
text_type,
charset)
elif rule == 'keep-if-no-content-rewrite':
if not self.rwinfo.is_content_rw:
return (name, value)
elif rule == 'content-length':
if value == '0':
return (name, value)
if not self.rwinfo.is_content_rw:
try:
if int(value) >= 0:
return (name, value)
except:
pass
elif rule == 'cookie':
if self.rwinfo.cookie_rewriter:
return self.rwinfo.cookie_rewriter.rewrite(value)
else:
return (name, value)
# default 'prefix'
return (self.header_prefix + name, value)
def _add_cache_headers(self, new_headers, http_cache):
try:
@ -112,76 +97,4 @@ class HeaderRewriter(object):
new_headers.append(('Cache-Control', 'max-age=' + str(age)))
new_headers.append(('Expires', datetime_to_http_date(dt)))
def _extract_text_type(self, content_type):
for ctype, mimelist in six.iteritems(self.REWRITE_TYPES):
if any((mime in content_type) for mime in mimelist):
return ctype
return None
def _extract_char_set(self, content_type):
CHARSET_TOKEN = 'charset='
idx = content_type.find(CHARSET_TOKEN)
if idx < 0:
return None
return content_type[idx + len(CHARSET_TOKEN):].lower()
def _rewrite_headers(self, headers, urlrewriter,
cookie_rewriter,
content_modified,
http_cache):
new_headers = []
removed_header_dict = {}
def add_header(name, value):
new_headers.append((name, value))
def add_prefixed_header(name, value):
new_headers.append((self.header_prefix + name, value))
for (name, value) in headers:
lowername = name.lower()
if lowername in self.PROXY_HEADERS:
add_header(name, value)
elif urlrewriter and urlrewriter.prefix and lowername in self.URL_REWRITE_HEADERS:
new_headers.append((name, urlrewriter.rewrite(value)))
elif lowername in self.KEEP_NO_REWRITE_HEADERS:
if content_modified and value != '0':
removed_header_dict[lowername] = value
add_prefixed_header(name, value)
else:
add_header(name, value)
elif lowername in self.KEEP_PROXY_HEADERS:
if urlrewriter.prefix:
removed_header_dict[lowername] = value
add_prefixed_header(name, value)
else:
add_header(name, value)
elif lowername in self.REMOVE_ALWAYS_HEADERS:
removed_header_dict[lowername] = value
add_prefixed_header(name, value)
elif (lowername in self.COOKIE_HEADERS and
cookie_rewriter):
cookie_list = cookie_rewriter.rewrite(value)
new_headers.extend(cookie_list)
elif (lowername in self.CACHE_HEADERS):
if http_cache == 'pass':
add_header(name, value)
else:
add_prefixed_header(name, value)
elif urlrewriter and urlrewriter.prefix:
add_prefixed_header(name, value)
else:
add_header(name, value)
return (new_headers, removed_header_dict)

View File

@ -1,403 +0,0 @@
#import chardet
import pkgutil
import webencodings
import yaml
import re
#from chardet.universaldetector import UniversalDetector
from io import BytesIO
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
from pywb.rewrite.rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet
from warcio.statusandheaders import StatusAndHeaders
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.bufferedreaders import ChunkedDataReader, BufferedReader
from warcio.utils import to_native_str
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
#=================================================================
class RewriteContent(object):
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
TAG_REGEX = re.compile(b'^\s*\<')
CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
BUFF_SIZE = 16384
def __init__(self, ds_rules_file=None, is_framed_replay=False):
self.ruleset = RuleSet(RewriteRules, 'rewrite',
default_rule_config={},
ds_rules_file=ds_rules_file)
if is_framed_replay == 'inverse':
self.defmod = 'mp_'
else:
self.defmod = ''
def sanitize_content(self, status_headers, stream):
# remove transfer encoding chunked and wrap in a dechunking stream
if (status_headers.remove_header('transfer-encoding')):
stream = ChunkedDataReader(stream)
return (status_headers, stream)
def _rewrite_headers(self, urlrewriter, rule, status_headers, stream,
urlkey='', cookie_rewriter=None):
header_rewriter_class = rule.rewriters['header']
if urlrewriter and not cookie_rewriter:
cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)
rewritten_headers = (header_rewriter_class().
rewrite(status_headers,
urlrewriter,
cookie_rewriter))
# note: since chunk encoding may/may not be valid,
# the approach taken here is to *always* attempt
# to dechunk if 'transfer-encoding: chunked' is present
#
# an alternative may be to serve chunked unless
# content rewriting is needed
# todo: possible revisit this approach
if (rewritten_headers.
contains_removed_header('transfer-encoding', 'chunked')):
stream = ChunkedDataReader(stream)
return (rewritten_headers, stream)
def _decoding_stream(self, rewritten_headers, stream):
for decomp_type in BufferedReader.get_supported_decompressors():
matched, stream = self._check_encoding(rewritten_headers,
stream,
decomp_type)
if matched:
break
return stream
def _check_encoding(self, rewritten_headers, stream, enc):
matched = False
if (rewritten_headers.
contains_removed_header('content-encoding', enc)):
#optimize: if already a ChunkedDataReader, add the encoding
if isinstance(stream, ChunkedDataReader):
stream.set_decomp(enc)
else:
stream = DecompressingBufferedReader(stream, decomp_type=enc)
rewritten_headers.status_headers.remove_header('content-length')
matched = True
return matched, stream
def rewrite_content(self, urlrewriter, status_headers, stream,
head_insert_func=None, urlkey='',
cdx=None, cookie_rewriter=None, env=None):
wb_url = urlrewriter.wburl
if (wb_url.is_identity or
(not head_insert_func and wb_url.is_banner_only)):
status_headers, stream = self.sanitize_content(status_headers,
stream)
return (status_headers, self.stream_to_gen(stream), False)
if urlrewriter and cdx and cdx.get('is_live'):
urlrewriter.rewrite_opts['is_live'] = True
rule = self.ruleset.get_first_match(urlkey)
(rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
rule,
status_headers,
stream,
urlkey,
cookie_rewriter)
res = self.handle_custom_rewrite(rewritten_headers,
stream,
urlrewriter,
wb_url.mod,
env)
if res:
return res
# Handle text content rewriting
# ====================================================================
# special case -- need to ungzip the body
status_headers = rewritten_headers.status_headers
text_type = rewritten_headers.text_type
# see known js/css modifier specified, the context should run
# default text_type
mod = wb_url.mod
stream_raw = False
encoding = None
first_buff = b''
stream = self._decoding_stream(rewritten_headers, stream)
if mod == 'js_':
text_type, stream = self._resolve_text_type('js',
text_type,
stream)
elif mod == 'cs_':
text_type, stream = self._resolve_text_type('css',
text_type,
stream)
# for proxy mode: use special js_proxy rewriter
# which may be none rewriter + custom rules (if any)
if text_type == 'js' and not urlrewriter.prefix:
rewriter_class = rule.rewriters['js_proxy']
else:
rewriter_class = rule.rewriters[text_type]
# for html, need to perform header insert, supply js, css, xml
# rewriters
if text_type == 'html':
head_insert_str = ''
charset = rewritten_headers.charset
# if no charset set, attempt to extract from first 1024
if not rewritten_headers.charset:
first_buff = stream.read(1024)
charset = self._extract_html_charset(first_buff,
status_headers)
if head_insert_func and not wb_url.is_url_rewrite_only:
head_insert_orig = head_insert_func(rule, cdx)
if charset:
try:
head_insert_str = webencodings.encode(head_insert_orig, charset)
except:
pass
if not head_insert_str:
charset = 'utf-8'
head_insert_str = head_insert_orig.encode(charset)
head_insert_buf = head_insert_str
#head_insert_str = to_native_str(head_insert_str)
head_insert_str = head_insert_str.decode('iso-8859-1')
if wb_url.is_banner_only:
gen = self._head_insert_only_gen(head_insert_buf,
stream,
first_buff)
content_len = status_headers.get_header('Content-Length')
try:
content_len = int(content_len)
except Exception:
content_len = None
if content_len is not None and content_len >= 0:
content_len = str(content_len + len(head_insert_str))
status_headers.replace_header('Content-Length',
content_len)
return (status_headers, gen, False)
# if proxy, use js_proxy rewriter
if not urlrewriter.prefix:
js_rewriter_class = rule.rewriters['js_proxy']
else:
js_rewriter_class = rule.rewriters['js']
css_rewriter_class = rule.rewriters['css']
if wb_url.is_url_rewrite_only:
js_rewriter_class = JSNoneRewriter
rewriter = rewriter_class(urlrewriter,
js_rewriter_class=js_rewriter_class,
css_rewriter_class=css_rewriter_class,
head_insert=head_insert_str,
url=wb_url.url,
defmod=self.defmod,
parse_comments=rule.parse_comments)
else:
if wb_url.is_banner_only:
return (status_headers, self.stream_to_gen(stream), False)
# url-only rewriter, but not rewriting urls in JS, so return
if wb_url.is_url_rewrite_only and text_type == 'js':
#return (status_headers, self.stream_to_gen(stream), False)
rewriter_class = JSLinkOnlyRewriter
# apply one of (js, css, xml) rewriters
rewriter = rewriter_class(urlrewriter)
# align to line end for all non-html rewriting
align = (text_type != 'html')
# Create rewriting generator
gen = self.rewrite_text_stream_to_gen(stream,
rewrite_func=rewriter.rewrite,
final_read_func=rewriter.close,
first_buff=first_buff,
align_to_line=align)
return (status_headers, gen, True)
def handle_custom_rewrite(self, rewritten_headers, stream,
urlrewriter, mod, env):
text_type = rewritten_headers.text_type
status_headers = rewritten_headers.status_headers
# use rewritten headers, but no further rewriting needed
if text_type is None:
return (status_headers, self.stream_to_gen(stream), False)
if ((text_type == 'html' and urlrewriter.rewrite_opts.get('is_ajax')) or
(text_type == 'plain' and not mod in ('js_', 'cs_'))):
rewritten_headers.readd_rewrite_removed()
return (status_headers, self.stream_to_gen(stream), False)
@staticmethod
def _extract_html_charset(buff, status_headers):
charset = None
m = RewriteContent.CHARSET_REGEX.search(buff)
if m:
charset = m.group(1)
charset = to_native_str(charset)
# content_type = 'text/html; charset=' + charset
# status_headers.replace_header('content-type', content_type)
return charset
@staticmethod
def _resolve_text_type(mod, text_type, stream):
if text_type == 'css' and mod == 'js':
return 'css', stream
# only attempt to resolve between html and other text types
if text_type != 'html':
return mod, stream
buff = stream.read(128)
wrapped_stream = BufferedReader(stream, starting_data=buff)
# check if starts with a tag, then likely html
if RewriteContent.TAG_REGEX.match(buff):
mod = 'html'
return mod, wrapped_stream
def _head_insert_only_gen(self, insert_str, stream, first_buff=b''):
buff = first_buff
max_len = 1024 - len(first_buff)
while max_len > 0:
curr = stream.read(max_len)
if not curr:
break
max_len -= len(buff)
buff += curr
matcher = self.HEAD_REGEX.search(buff)
if matcher:
yield buff[:matcher.end()]
yield insert_str
yield buff[matcher.end():]
else:
yield insert_str
yield buff
for buff in self.stream_to_gen(stream):
yield buff
@staticmethod
def _decode_buff(buff, stream, encoding): # pragma: no coverage
try:
buff = buff.decode(encoding)
except UnicodeDecodeError as e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
return buff
@staticmethod
def stream_to_gen(stream):
"""
Convert stream to an iterator, reading BUFF_SIZE bytes
"""
try:
while True:
buff = stream.read(RewriteContent.BUFF_SIZE)
yield buff
if not buff:
break
finally:
stream.close()
@staticmethod
def rewrite_text_stream_to_gen(stream, rewrite_func,
final_read_func, first_buff,
align_to_line):
"""
Convert stream to generator using applying rewriting func
to each portion of the stream.
Align to line boundaries if needed.
"""
try:
has_closed = hasattr(stream, 'closed')
buff = first_buff
while True:
if buff:
buff = rewrite_func(buff.decode('iso-8859-1'))
yield buff.encode('iso-8859-1')
buff = stream.read(RewriteContent.BUFF_SIZE)
# on 2.6, readline() (but not read()) throws an exception
# if stream already closed, so check stream.closed if present
if (buff and align_to_line and
(not has_closed or not stream.closed)):
buff += stream.readline()
if not buff:
break
# For adding a tail/handling final buffer
buff = final_read_func()
if buff:
yield buff.encode('iso-8859-1')
finally:
stream.close()

View File

@ -1,315 +0,0 @@
"""
Fetch a url from live web and apply rewriting rules
"""
from requests import request as live_request
import mimetypes
import logging
import os
from six.moves.urllib.parse import urlsplit
import six
from warcio.timeutils import timestamp_now
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.canonicalize import canonicalize
from pywb.rewrite.rewrite_content import RewriteContent
#=================================================================
class LiveRewriter(object):
def __init__(self, is_framed_replay=False, proxies=None):
self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)
self.proxies = proxies
self.live_request = live_request
if self.proxies:
logging.debug('Live Rewrite via proxy ' + str(proxies))
if isinstance(proxies, str):
self.proxies = {'http': proxies,
'https': proxies}
else:
logging.debug('Live Rewrite Direct (no proxy)')
def is_recording(self):
return self.proxies is not None
def fetch_local_file(self, uri):
#fh = open(uri)
fh = LocalFileLoader().load(uri)
content_type, _ = mimetypes.guess_type(uri)
# create fake headers for local file
status_headers = StatusAndHeaders('200 OK',
[('Content-Type', content_type)])
stream = fh
return (status_headers, stream)
def translate_headers(self, url, urlkey, env):
headers = {}
splits = urlsplit(url)
has_cookies = False
for name, value in six.iteritems(env):
if name == 'HTTP_HOST':
name = 'Host'
value = splits.netloc
elif name == 'HTTP_ORIGIN':
name = 'Origin'
value = (splits.scheme + '://' + splits.netloc)
elif name == 'HTTP_X_CSRFTOKEN':
name = 'X-CSRFToken'
cookie_val = extract_client_cookie(env, 'csrftoken')
if cookie_val:
value = cookie_val
elif name == 'HTTP_REFERER':
continue
elif name == 'HTTP_X_PYWB_REQUESTED_WITH':
continue
elif name == 'HTTP_X_FORWARDED_PROTO':
name = 'X-Forwarded-Proto'
value = splits.scheme
elif name == 'HTTP_COOKIE':
name = 'Cookie'
value = self._req_cookie_rewrite(urlkey, value)
has_cookies = True
elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-')
elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
name = name.title().replace('_', '-')
elif name == 'REL_REFERER':
name = 'Referer'
else:
value = None
if value:
headers[name] = value
if not has_cookies:
value = self._req_cookie_rewrite(urlkey, '')
if value:
headers['Cookie'] = value
return headers
def _req_cookie_rewrite(self, urlkey, value):
rule = self.rewriter.ruleset.get_first_match(urlkey)
if not rule or not rule.req_cookie_rewrite:
return value
for cr in rule.req_cookie_rewrite:
try:
value = cr['rx'].sub(cr['replace'], value)
except KeyError:
pass
return value
def fetch_http(self, url,
               urlkey=None,
               env=None,
               req_headers=None,
               follow_redirects=False,
               skip_recording=False,
               verify=True):
    """Perform a live HTTP(S) fetch of *url* and return the raw response.

    If a WSGI *env* is given, the request method, body (for POST/PUT) and
    translated headers are taken from it.  Returns a tuple of
    ``(StatusAndHeaders, raw_stream)``; the body stream is NOT read here.

    :param urlkey: canonicalized url key, used for header translation rules
    :param skip_recording: when True, self.proxies is bypassed
                           (presumably the recording proxy -- confirm)
    :param verify: passed through to requests' TLS certificate verification
    """
    method = 'GET'
    data = None

    # proxies are only used when the fetch should be recorded
    proxies = None
    if not skip_recording:
        proxies = self.proxies

    if not req_headers:
        req_headers = {}

    if env is not None:
        method = env['REQUEST_METHOD'].upper()
        input_ = env['wsgi.input']

        # merge in headers translated from the incoming WSGI request
        req_headers.update(self.translate_headers(url, urlkey, env))

        if method in ('POST', 'PUT'):
            len_ = env.get('CONTENT_LENGTH')
            if len_:
                # bound the body read to the declared Content-Length
                data = LimitReader(input_, int(len_))
            else:
                data = input_

    response = self.live_request(method=method,
                                 url=url,
                                 data=data,
                                 headers=req_headers,
                                 allow_redirects=follow_redirects,
                                 proxies=proxies,
                                 stream=True,
                                 verify=verify)

    statusline = str(response.status_code) + ' ' + response.reason

    headers = response.headers.items()

    stream = response.raw

    # Extract the *original* (uncombined, possibly duplicated) headers from
    # the underlying httplib/http.client response, since requests' view
    # folds duplicate headers (e.g. multiple Set-Cookie) together.
    try:  #pragma: no cover
        #PY 3
        headers = stream._original_response.headers._headers
    except:  #pragma: no cover
        #PY 2
        headers = []
        resp_headers = stream._original_response.msg.headers
        for h in resp_headers:
            n, v = h.split(':', 1)
            n = n.strip()
            v = v.strip()
            headers.append((n, v))

    status_headers = StatusAndHeaders(statusline, headers)

    return (status_headers, stream)
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None,
                  urlkey=None,
                  env=None,
                  req_headers=None,
                  timestamp=None,
                  follow_redirects=False,
                  skip_recording=False,
                  verify=True,
                  remote_only=True):
    """Fetch *url* (live http(s) or local ``file:``) and rewrite the response.

    Builds a synthetic cdx record for the live capture and passes the
    response through ``self.rewriter.rewrite_content``.  Returns whatever
    the content rewriter returns (status_headers, body generator, flag).

    :param urlrewriter: url rewriter applied to the response content
    :param head_insert_func: optional callable producing the <head> insert
    :param urlkey: explicit canonicalized key (computed if not given)
    :param env: WSGI environ; when set, ``env['pywb.cdx']`` is populated
    :param remote_only: when False, non-http urls are fetched as local files
    """
    # BUG FIX: req_headers previously defaulted to a shared mutable dict
    # ({}); fetch_http() calls req_headers.update(...) on it, so header
    # state leaked between unrelated requests.  Use a None sentinel.
    if req_headers is None:
        req_headers = {}

    ts_err = url.split('///')

    # fixup for accidental erroneous rewrite which has ///
    # (unless file:///)
    if len(ts_err) > 1 and ts_err[0] != 'file:':
        url = 'http://' + ts_err[1]

    # protocol-relative url: assume http
    if url.startswith('//'):
        url = 'http:' + url

    if remote_only or is_http(url):
        is_remote = True
    else:
        is_remote = False
        if not url.startswith('file:'):
            url = to_file_url(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    if is_remote:
        (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                   req_headers,
                                                   follow_redirects,
                                                   skip_recording,
                                                   verify)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    # live captures are stamped with the current time unless overridden
    if timestamp is None:
        timestamp = timestamp_now()

    cdx = {'urlkey': urlkey,
           'timestamp': timestamp,
           'url': url,
           'status': status_headers.get_statuscode(),
           'mime': status_headers.get_header('Content-Type'),
           'is_live': True,
          }

    result = (self.rewriter.
              rewrite_content(urlrewriter,
                              status_headers,
                              stream,
                              head_insert_func=head_insert_func,
                              urlkey=urlkey,
                              cdx=cdx))

    # expose the synthetic cdx to downstream WSGI handlers
    if env:
        env['pywb.cdx'] = cdx

    return result
def fetch_async(self, url, headers):
    """Issue a GET for *url* purely for its side effect on the proxy.

    The response body is intentionally never read -- the recording proxy
    handles writing it -- so the connection is closed immediately.
    """
    response = self.live_request(method='GET',
                                 url=url,
                                 headers=headers,
                                 stream=True,
                                 verify=False,
                                 proxies=self.proxies)

    # don't actually read whole response, proxy response for writing it
    response.close()
def add_metadata(self, url, headers, data):
    """Send *data* to *url* via the custom 'PUTMETA' method through the proxy.

    Returns the requests response object.
    """
    return self.live_request(url=url,
                             method='PUTMETA',
                             headers=headers,
                             data=data,
                             verify=False,
                             proxies=self.proxies)
def get_rewritten(self, *args, **kwargs):
    """Fetch and rewrite, fully buffering the rewritten body.

    Convenience wrapper over ``fetch_request`` that drains the body
    generator into a single bytestring.  Returns (status_headers, bytes).
    """
    status_headers, body_iter, _is_rewritten = self.fetch_request(*args, **kwargs)
    return (status_headers, b''.join(body_iter))
def get_video_info(self, url):
    """Return the youtube-dl info dict for *url* (None when youtube-dl is not installed)."""
    # delegates to the module-level YoutubeDLWrapper singleton defined below
    return youtubedl.extract_info(url)
#=================================================================
class YoutubeDLWrapper(object):  #pragma: no cover
    """Thin wrapper around youtube-dl.

    Initializes a simulate-only YoutubeDL instance if the package is
    importable; otherwise all extraction calls return None.
    """

    def __init__(self):
        try:
            from youtube_dl import YoutubeDL
        except ImportError:
            # youtube-dl not installed: video info extraction disabled
            self.ydl = None
            return

        opts = dict(simulate=True,
                    youtube_include_dash_manifest=False)
        self.ydl = YoutubeDL(opts)
        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """Return youtube-dl's info dict for *url*, or None if unavailable."""
        if not self.ydl:
            return None

        return self.ydl.extract_info(url)


#=================================================================
youtubedl = YoutubeDLWrapper()

View File

@ -1,4 +1,4 @@
from pywb.webagg.inputrequest import DirectWSGIInputRequest
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
from pywb.utils.loaders import extract_client_cookie
from six import iteritems

View File

@ -1,80 +0,0 @@
from pywb.utils.dsrules import BaseRule
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
import re
#=================================================================
class RewriteRules(BaseRule):
    """Per-url-prefix rewrite configuration.

    Holds the rewriter classes (header/css/xml/html/json/js) selected for a
    given url prefix, optionally overridden or extended via the *config*
    mapping, plus cookie-scope and request-cookie rewrite rules.
    """

    def __init__(self, url_prefix, config=None):
        # BUG FIX: default was a shared mutable dict ({}); use a None
        # sentinel so callers can never observe cross-instance state.
        config = config if config is not None else {}

        super(RewriteRules, self).__init__(url_prefix, config)

        self.rewriters = {}

        #self._script_head_inserts = config.get('script_head_inserts', {})

        self.rewriters['header'] = config.get('header_class', HeaderRewriter)
        self.rewriters['css'] = config.get('css_class', CSSRewriter)
        self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
        self.rewriters['html'] = config.get('html_class', HTMLRewriter)
        self.rewriters['json'] = config.get('json_class', JSONPRewriter)

        self.parse_comments = config.get('parse_comments', False)

        # Custom handling for js rewriting, often the most complex
        self.js_rewrite_location = config.get('js_rewrite_location', 'location')

        # ability to toggle rewriting
        if self.js_rewrite_location == 'all':
            js_default_class = JSLinkAndLocationRewriter
        elif self.js_rewrite_location == 'location':
            js_default_class = JSLocationOnlyRewriter
            # self.rewriters['json'] = JSNoneRewriter
        elif self.js_rewrite_location == 'none':
            js_default_class = JSNoneRewriter
            # self.rewriters['json'] = JSNoneRewriter
        else:
            js_default_class = JSLinkOnlyRewriter

        # set js class, using either default or override from config
        self.rewriters['js'] = config.get('js_class', js_default_class)

        # proxy mode never rewrites js
        self.rewriters['js_proxy'] = JSNoneRewriter

        # add any regexs for js rewriter
        self._add_custom_regexs('js', 'js_regexs', config)
        self._add_custom_regexs('js_proxy', 'js_regexs', config)

        # cookie rewrite scope
        self.cookie_scope = config.get('cookie_scope', 'default')

        # precompile request-cookie rewrite patterns
        req_cookie_rewrite = config.get('req_cookie_rewrite', [])
        for rc in req_cookie_rewrite:
            rc['rx'] = re.compile(rc.get('match', ''))

        self.req_cookie_rewrite = req_cookie_rewrite

    def _add_custom_regexs(self, rw_id, field, config):
        """Wrap rewriter *rw_id* so custom regex rules from *config* are applied."""
        regexs = config.get(field)
        if not regexs:
            return

        rewriter_cls = self.rewriters[rw_id]

        parse_rules_func = RegexRewriter.parse_rules_from_config(regexs)

        def extend_rewriter_with_regex(urlrewriter):
            # rules are parsed per-urlrewriter, then fed to the base rewriter
            rule_def_tuples = parse_rules_func(urlrewriter)
            return rewriter_cls(urlrewriter, rule_def_tuples)

        self.rewriters[rw_id] = extend_rewriter_with_regex

View File

@ -1,271 +0,0 @@
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.utils.loaders import to_native_str
from pywb import get_test_dir
from io import BytesIO
# This module has some rewriting tests against the 'live web'
# As such, the content may change and the test may break
# Shared fixtures: a standard replay-mode rewriter and a banner-only ('bn_')
# rewriter anchored at the same capture url, used by the tests below.
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')

bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/')
def head_insert_func(rule, cdx):
    """Return the wombat <script> head insert, unless the matching rule
    sets js_rewrite_location to 'urls' (no location rewriting => no wombat)."""
    if rule.js_rewrite_location == 'urls':
        return ''

    return '<script src="/static/__pywb/wombat.js"> </script>'
def test_csrf_token_headers():
    # the X-CSRFToken header value is replaced from the csrftoken cookie
    rewriter = LiveRewriter()
    env = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'}
    req_headers = rewriter.translate_headers('http://example.com/', 'com,example)/', env)
    assert req_headers == {'X-CSRFToken': 'foobar', 'Cookie': 'csrftoken=foobar'}


def test_forwarded_scheme():
    # X-Forwarded-Proto is reset to the scheme of the url being fetched
    # (http here), regardless of the incoming header value
    rewriter = LiveRewriter()
    env = {'HTTP_X_FORWARDED_PROTO': 'https', 'Other': 'Value'}
    req_headers = rewriter.translate_headers('http://example.com/', 'com,example)/', env)
    assert req_headers == {'X-Forwarded-Proto': 'http'}


def test_req_cookie_rewrite_1():
    # a domain-specific rule (presumably from the pywb rules file -- see
    # ruleset lookup in translate_headers) appends 'FOO=&bar=1' to cookies
    rewriter = LiveRewriter()
    env = {'HTTP_COOKIE': 'A=B'}
    urlkey = 'example,example,test)/'
    url = 'test.example.example/'
    req_headers = rewriter.translate_headers(url, urlkey, env)
    assert req_headers == {'Cookie': 'A=B; FOO=&bar=1'}


def test_req_cookie_rewrite_2():
    # an existing FOO cookie is rewritten in place by the same rule
    rewriter = LiveRewriter()
    env = {'HTTP_COOKIE': 'FOO=goo'}
    urlkey = 'example,example,test)/'
    url = 'test.example.example/'
    req_headers = rewriter.translate_headers(url, urlkey, env)
    assert req_headers == {'Cookie': 'FOO=&bar=1'}


def test_req_cookie_rewrite_3():
    # with no incoming cookie, the rule still injects a Cookie header
    rewriter = LiveRewriter()
    env = {}
    urlkey = 'example,example,test)/'
    url = 'test.example.example/'
    req_headers = rewriter.translate_headers(url, urlkey, env)
    assert req_headers == {'Cookie': '; FOO=&bar=1'}
def test_local_1():
    # default rewrite mode: wombat insert + js location + link rewriting
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,all)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff, buff

    # JS location and JS link rewritten
    assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff

    # link rewritten
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff


def test_local_no_head():
    # page without a <head>: insert still added, rewriting still applies
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff, buff

    # location rewritten
    assert 'window.WB_wombat_location = "/other.html"' in buff, buff

    # link rewritten
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff, buff


def test_local_no_head_only_title():
    # minimal page (only a <title>): insert is still added
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head_2.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff


def test_local_no_head_banner_only():
    # banner-only ('bn_') mode: insert added but content left unrewritten
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html',
                                         bn_urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff

    # location NOT rewritten
    assert 'window.location = "/other.html"' in buff

    # link NOT rewritten
    assert '"/some/path/another.html"' in buff


def test_local_banner_only_no_rewrite():
    # banner-only mode on a page with a <head>
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         bn_urlrewriter,
                                         head_insert_func,
                                         'com,example,test)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location NOT rewritten, JS link NOT rewritten
    assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff

    # link NOT rewritten
    assert '"/some/path/another.html"' in buff


def test_local_2_link_only_rewrite():
    # urlkey matching a 'nolocation_rewrite' rule: links rewritten, no wombat
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test)/nolocation_rewrite')

    # no wombat insert
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' not in buff

    # JS location NOT rewritten, JS link rewritten
    assert 'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff


def test_local_2_js_loc_only_rewrite():
    # 'loconly' rule: js location rewritten, js links left alone
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,loconly)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location rewritten, JS link NOT rewritten
    assert 'window.WB_wombat_location = "http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff


def test_local_2_no_rewrite():
    # 'norewrite' rule: neither js location nor js links rewritten
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,norewrite)/')

    # wombat insert added
    assert '<script src="/static/__pywb/wombat.js"> </script>' in buff

    # JS location NOT rewritten, JS link NOT rewritten
    assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff

    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff


def test_local_unclosed_script():
    # an unclosed <script> tag must still be closed in rewritten output
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_unclosed_script.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,all)/')

    # wombat insert added
    assert '<head><script src="/static/__pywb/wombat.js"> </script>' in buff, buff

    # JS location and JS link rewritten
    assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html";' in buff, buff

    assert '</script>' in buff, buff
# NOTE(review): the tests below hit the live web (example.com, httpbin.org,
# facebook.com) and may break if remote content changes.

def test_example_1():
    status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close', 'Accept-Encoding': 'identity'})

    # verify header rewriting
    assert status_headers.get_header('x-archive-orig-content-length') == '1270', status_headers

    # verify utf-8 charset detection
    assert status_headers.get_header('content-type') == 'text/html'

    assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff


def test_example_2_redirect():
    status_headers, buff = get_rewritten('http://httpbin.org/redirect-to?url=http://example.com/', urlrewriter)

    # redirect, no content
    assert status_headers.get_statuscode() == '302'
    assert len(buff) == 0


def test_example_3_rel():
    # protocol-relative url is fetched as http
    status_headers, buff = get_rewritten('//example.com/', urlrewriter)
    assert status_headers.get_statuscode() == '200'


def test_example_4_rewrite_err():
    # may occur in case of rewrite mismatch, the /// gets stripped off
    status_headers, buff = get_rewritten('http://localhost:8080///example.com/', urlrewriter)
    assert status_headers.get_statuscode() == '200'


def test_example_domain_specific_3():
    status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter, follow_redirects=True)

    # comment out Bootloader.configurePage, if it is still there
    if 'Bootloader.configurePage' in buff:
        assert '/* Bootloader.configurePage' in buff


def test_wombat_top():
    # 'top' references in js are rewritten to WB_wombat_top
    #status_headers, buff = get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', urlrewriter)
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/toptest.js', urlrewriter)

    assert 'WB_wombat_top!==window' in buff


def test_post():
    # POST body from the WSGI env is forwarded to the live request
    buff = BytesIO(b'ABC=DEF')

    env = {'REQUEST_METHOD': 'POST',
           'HTTP_ORIGIN': 'http://httpbin.org',
           'HTTP_HOST': 'httpbin.org',
           'wsgi.input': buff}

    status_headers, resp_buff = get_rewritten('http://httpbin.org/post', urlrewriter, env=env)
    assert status_headers.get_statuscode() == '200', status_headers


def test_multiple_set_cookies():
    # duplicate Set-Cookie headers must be preserved (not folded together)
    # and their Path rewritten to the replay prefix
    status_headers, buff = get_rewritten('http://httpbin.org/cookies/set?A=B&C=D', urlrewriter)

    assert status_headers.get_statuscode() == '302'

    print(status_headers.headers)
    assert ('Set-Cookie', 'A=B; Path=/pywb/20131226101010/http://example.com/') in status_headers.headers
    assert ('Set-Cookie', 'C=D; Path=/pywb/20131226101010/http://example.com/') in status_headers.headers


def get_rewritten(*args, **kwargs):
    # helper: run through LiveRewriter, decode the body to a native str
    status_headers, buff = LiveRewriter().get_rewritten(remote_only=False, *args, **kwargs)
    return status_headers, to_native_str(buff)

View File

@ -1,100 +0,0 @@
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_http_date
from datetime import datetime, timedelta
#=============================================================================
class PrefixHeaderRewriter(object):
    """Rewrite response HTTP headers for replay.

    Each header is looked up (case-insensitively) in ``header_rules``;
    headers with no rule get ``self.default_rule``: 'prefix' when url
    rewriting is active, else 'keep'.  'prefix' renames the header with
    ``header_prefix`` (default 'X-Archive-Orig-') so the original value is
    preserved but not interpreted by the browser.
    """

    header_rules = {
        'content-type': 'keep',
        'content-disposition': 'keep',
        'content-range': 'keep',
        # BUG FIX: key was misspelled 'accept-rangees', so Accept-Ranges
        # headers silently fell through to the default rule
        'accept-ranges': 'keep',

        'www-authenticate': 'keep',
        'proxy-authenticate': 'keep',

        'location': 'url-rewrite',
        'content-location': 'url-rewrite',
        'content-base': 'url-rewrite',

        'transfer-encoding': 'prefix',
        'connection': 'prefix',

        'content-encoding': 'keep-if-no-content-rewrite',

        'content-length': 'content-length',

        'set-cookie': 'cookie',
        'cookie': 'cookie',
    }

    def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
        """
        :param rwinfo: rewrite-info object providing record.http_headers,
                       is_url_rw(), is_content_rw, url_rewriter and
                       cookie_rewriter
        :param header_prefix: prefix applied to headers under the 'prefix' rule
        """
        self.header_prefix = header_prefix
        self.rwinfo = rwinfo
        self.http_headers = rwinfo.record.http_headers

        # only neutralize unknown headers when urls are being rewritten
        if rwinfo.is_url_rw():
            self.default_rule = 'prefix'
        else:
            self.default_rule = 'keep'

    def __call__(self):
        """Return a new StatusAndHeaders with every header rule applied."""
        new_headers_list = []
        for name, value in self.http_headers.headers:
            rule = self.header_rules.get(name.lower(), self.default_rule)
            new_header = self.rewrite_header(name, value, rule)
            if new_header:
                # cookie rewriting may expand one header into several
                if isinstance(new_header, list):
                    new_headers_list.extend(new_header)
                else:
                    new_headers_list.append(new_header)

        return StatusAndHeaders(self.http_headers.statusline,
                                headers=new_headers_list,
                                protocol=self.http_headers.protocol)

    def rewrite_header(self, name, value, rule):
        """Apply *rule* to a single header.

        Returns a (name, value) tuple, a list of tuples (cookie rewrite),
        or falls through to the prefixed form for any unmatched case.
        """
        if rule == 'keep':
            return (name, value)

        elif rule == 'url-rewrite':
            return (name, self.rwinfo.url_rewriter.rewrite(value))

        elif rule == 'keep-if-no-content-rewrite':
            # content rewriting invalidates the original encoding/length,
            # so only keep when the body passes through unmodified
            if not self.rwinfo.is_content_rw:
                return (name, value)

        elif rule == 'content-length':
            if value == '0':
                return (name, value)

            if not self.rwinfo.is_content_rw:
                try:
                    if int(value) >= 0:
                        return (name, value)
                except:
                    pass

        elif rule == 'cookie':
            if self.rwinfo.cookie_rewriter:
                return self.rwinfo.cookie_rewriter.rewrite(value)
            else:
                return (name, value)

        # default 'prefix'
        return (self.header_prefix + name, value)

    def _add_cache_headers(self, new_headers, http_cache):
        """Append Cache-Control/Expires derived from *http_cache* seconds
        (non-positive or unparseable => no-cache)."""
        try:
            age = int(http_cache)
        except:
            age = 0

        if age <= 0:
            new_headers.append(('Cache-Control', 'no-cache; no-store'))
        else:
            dt = datetime.utcnow()
            dt = dt + timedelta(seconds=age)
            new_headers.append(('Cache-Control', 'max-age=' + str(age)))
            new_headers.append(('Expires', datetime_to_http_date(dt)))

View File

@ -1,18 +0,0 @@
[uwsgi]
if-not-env = PORT
http-socket = :8090
endif =
master = true
buffer-size = 65536
die-on-term = true
if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =
gevent = 100
wsgi = urlrewrite.test.simpleapp

View File

@ -1,81 +0,0 @@
import pkgutil
from pywb.utils.loaders import load_yaml_config
#=================================================================
DEFAULT_RULES_FILE = 'pywb/rules.yaml'
#=================================================================
class RuleSet(object):
    DEFAULT_KEY = ''

    def __init__(self, rule_cls, fieldname, **kwargs):
        """
        A domain specific rules block, inited via config map.
        If config map not specified, it is loaded from default location.

        The rules are represented as a map by domain.
        Each rules configuration will load its own field type
        from the list and given a specified rule_cls.
        """
        self.rules = []

        default_rule_config = kwargs.get('default_rule_config')
        rules_file = kwargs.get('ds_rules_file') or DEFAULT_RULES_FILE

        config = load_yaml_config(rules_file)

        # load rules dict or init to empty
        rulesmap = config.get('rules') if config else {}

        seen_default = False

        # iterate over master rules file
        for entry in rulesmap:
            prefix = entry.get('url_prefix')
            rules_def = entry.get(fieldname)
            if not rules_def:
                continue

            if prefix == self.DEFAULT_KEY:
                seen_default = True

            self.rules.append(rule_cls(prefix, rules_def))

        # if default_rule_config provided, always init a default ruleset
        if default_rule_config is not None and not seen_default:
            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))

    def iter_matching(self, urlkey):
        """
        Iterate over all matching rules for given urlkey
        """
        return (rule for rule in self.rules if rule.applies(urlkey))

    def get_first_match(self, urlkey):
        """Return the first rule matching urlkey, or None."""
        return next((rule for rule in self.rules if rule.applies(urlkey)), None)
#=================================================================
class BaseRule(object):
    """
    Base rule class -- subclassed to handle specific
    rules for given url_prefix key
    """
    def __init__(self, url_prefix, rules):
        # normalize to a list of prefixes; 'rules' is parsed by subclasses
        if isinstance(url_prefix, list):
            self.url_prefix = url_prefix
        else:
            self.url_prefix = [url_prefix]

    def applies(self, urlkey):
        """True if *urlkey* starts with any configured prefix."""
        return any(urlkey.startswith(prefix) for prefix in self.url_prefix)

View File

@ -52,43 +52,6 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
>>> extract_client_cookie({}, 'y')
# append_post_query
>>> append_post_query('http://example.com/?abc=def', 'foo=bar')
'http://example.com/?abc=def&foo=bar'
>>> append_post_query('http://example.com/', '')
'http://example.com/'
>>> append_post_query('http://example.com/', 'foo=bar')
'http://example.com/?foo=bar'
# extract_post_query tests
# correct POST data
>>> post_data = b'foo=bar&dir=%2Fbaz'
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
'foo=bar&dir=/baz'
# unsupported method
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
# base64 encode
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
# invalid length
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data))
# length too short
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data))
'foo=bar&dir=%2'
# length too long
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
'foo=bar&dir=/baz'
# test read_last_line
>>> print_str(read_last_line(BytesIO(b'A\nB\nC')))
'C'
@ -119,8 +82,8 @@ from io import BytesIO
import requests
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import extract_client_cookie, extract_post_query
from pywb.utils.loaders import append_post_query, read_last_line
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.loaders import read_last_line
from warcio.bufferedreaders import DecompressingBufferedReader

View File

@ -86,10 +86,11 @@ class DirectWSGIInputRequest(object):
buffered_stream=buffered_stream,
environ=self.env)
if post_query.append_post_query(url) != url:
new_url = post_query.append_post_query(url)
if new_url != url:
self.env['wsgi.input'] = buffered_stream
return url
return new_url
def get_full_request_uri(self):
req_uri = self.env.get('REQUEST_URI')
@ -246,7 +247,7 @@ class PostQueryExtractor(object):
else:
post_query = base64.b64encode(post_query)
post_query = to_native_str(post_query)
post_query = '&__wb_post_data=' + post_query
post_query = '__wb_post_data=' + post_query
self.post_query = post_query

View File

@ -1,9 +1,10 @@
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest, PostQueryExtractor
from werkzeug.routing import Map, Rule
import webtest
import traceback
from six.moves.urllib.parse import parse_qsl
from io import BytesIO
#=============================================================================
@ -76,3 +77,61 @@ Foo: Bar\r\n\
\r\n\
'
class TestPostQueryExtract(object):
    """Unit tests for PostQueryExtractor: converting a POST body into a
    query string appended to the request url."""

    @classmethod
    def setup_class(cls):
        # form-encoded body used by all tests below
        cls.post_data = b'foo=bar&dir=%2Fbaz'

    def test_post_extract_1(self):
        # form-encoded POST: body is decoded and appended as query params
        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                len(self.post_data), BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'

        assert pq.append_post_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'

    def test_post_extract_wrong_method(self):
        # non-POST methods are ignored: url unchanged
        pq = PostQueryExtractor('PUT', 'application/x-www-form-urlencoded',
                                len(self.post_data), BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

    def test_post_extract_non_form_data_1(self):
        # non-form content-type: body is carried as base64 in __wb_post_data
        pq = PostQueryExtractor('POST', 'application/octet-stream',
                                len(self.post_data), BytesIO(self.post_data))

        #base64 encoded data
        assert pq.append_post_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'

    def test_post_extract_non_form_data_2(self):
        pq = PostQueryExtractor('POST', 'text/plain',
                                len(self.post_data), BytesIO(self.post_data))

        #base64 encoded data
        assert pq.append_post_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'

    def test_post_extract_length_invalid_ignore(self):
        # zero or non-numeric Content-Length: extraction skipped entirely
        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                0, BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                'abc', BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/'

    def test_post_extract_length_too_short(self):
        # declared length shorter than the body: body is truncated
        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                len(self.post_data) - 4, BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'

    def test_post_extract_length_too_long(self):
        # declared length longer than the body: whole body is used
        pq = PostQueryExtractor('POST', 'application/x-www-form-urlencoded',
                                len(self.post_data) + 4, BytesIO(self.post_data))

        assert pq.append_post_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'

View File

@ -3,9 +3,9 @@ from gevent import monkey; monkey.patch_all(thread=False)
import pytest
import webtest
from pywb.webagg.test.testutils import BaseTestClass
from pywb.warcserver.test.testutils import BaseTestClass
from pywb.urlrewrite.frontendapp import FrontEndApp
from pywb.apps.frontendapp import FrontEndApp
import os

View File

@ -17,16 +17,16 @@ from pytest import raises
from mock import patch
from pywb import get_test_dir
from pywb.webagg.test.testutils import TempDirTests, BaseTestClass
from pywb.warcserver.test.testutils import TempDirTests, BaseTestClass
from pywb.manager.manager import main
import pywb.manager.autoindex
from pywb.warc.cdxindexer import main as cdxindexer_main
from pywb.cdx.cdxobject import CDXObject
from pywb.indexer.cdxindexer import main as cdxindexer_main
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.urlrewrite.frontendapp import FrontEndApp
from pywb.apps.frontendapp import FrontEndApp
#=============================================================================

View File

@ -8,10 +8,10 @@ import webtest
from six.moves.urllib.parse import urlencode
from pywb.cdx.cdxobject import CDXObject
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.webagg.test.testutils import BaseTestClass
from pywb.webagg.autoapp import AutoConfigApp
from pywb.warcserver.test.testutils import BaseTestClass
from pywb.warcserver.warcserver import WarcServer
# ============================================================================
@ -20,7 +20,7 @@ class TestCDXApp(BaseTestClass):
def setup_class(cls):
super(TestCDXApp, cls).setup_class()
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config_test.yaml')
cls.testapp = webtest.TestApp(AutoConfigApp(config_file=config_file))
cls.testapp = webtest.TestApp(WarcServer(config_file=config_file))
def query(self, url, is_error=False, **params):
params['url'] = url

View File

@ -1,6 +1,6 @@
from .base_config_test import BaseConfigTest, fmod
from pywb.cdx.cdxobject import CDXObject
from pywb.warcserver.index.cdxobject import CDXObject
# ============================================================================