mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
new rewriting system!
- new header rewriter - new extensible content rewriter in urlrewrite.rewriter!
This commit is contained in:
parent
331320b17a
commit
db9d0ae41a
@ -1,4 +1,4 @@
|
||||
FROM python:3.5.2
|
||||
FROM python:3.5.3
|
||||
|
||||
MAINTAINER Ilya Kreymer <ikreymer at gmail.com>
|
||||
|
||||
|
@ -88,8 +88,10 @@ class HTMLRewriterMixin(object):
|
||||
# ===========================
|
||||
def __init__(self, url_rewriter,
|
||||
head_insert=None,
|
||||
js_rewriter_class=JSRewriter,
|
||||
css_rewriter_class=CSSRewriter,
|
||||
js_rewriter_class=None,
|
||||
js_rewriter=None,
|
||||
css_rewriter=None,
|
||||
css_rewriter_class=None,
|
||||
url = '',
|
||||
defmod='',
|
||||
parse_comments=False):
|
||||
@ -97,8 +99,19 @@ class HTMLRewriterMixin(object):
|
||||
self.url_rewriter = url_rewriter
|
||||
self._wb_parse_context = None
|
||||
|
||||
self.js_rewriter = js_rewriter_class(url_rewriter)
|
||||
self.css_rewriter = css_rewriter_class(url_rewriter)
|
||||
if js_rewriter:
|
||||
self.js_rewriter = js_rewriter
|
||||
elif js_rewriter_class:
|
||||
self.js_rewriter = js_rewriter_class(url_rewriter)
|
||||
else:
|
||||
self.js_rewriter = JSRewriter(url_rewriter)
|
||||
|
||||
if css_rewriter:
|
||||
self.css_rewriter = css_rewriter
|
||||
elif css_rewriter_class:
|
||||
self.css_rewriter = css_rewriter_class(url_rewriter)
|
||||
else:
|
||||
self.css_rewriter = CSSRewriter(url_rewriter)
|
||||
|
||||
self.head_insert = head_insert
|
||||
self.parse_comments = parse_comments
|
||||
|
102
pywb/urlrewrite/header_rewriter.py
Normal file
102
pywb/urlrewrite/header_rewriter.py
Normal file
@ -0,0 +1,102 @@
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
from warcio.timeutils import datetime_to_http_date
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class PrefixHeaderRewriter(object):
    """Rewrite the HTTP headers of an archived record for replay.

    Every header name is lower-cased and looked up in ``header_rules``;
    names without an entry use ``default_rule``.  The rule names handled by
    ``rewrite_header`` are:

    * ``keep`` -- header is passed through unchanged.
    * ``url-rewrite`` -- value is passed through
      ``rwinfo.url_rewriter.rewrite``.
    * ``keep-if-no-content-rewrite`` -- kept only when
      ``rwinfo.is_content_rw()`` is false, otherwise prefixed.
    * ``content-length`` -- kept when the value is ``'0'``, or when it is a
      non-negative integer and the content is not rewritten; otherwise
      prefixed.
    * ``cookie`` -- delegated to ``rwinfo.cookie_rewriter`` when one is set.
    * any other name (e.g. ``prefix``) -- header name is prefixed with
      ``header_prefix``.
    """

    header_rules = {
        'content-type': 'keep',
        'content-disposition': 'keep',
        'content-range': 'keep',
        # was misspelled 'accept-rangees', so Accept-Ranges never matched
        'accept-ranges': 'keep',
        'www-authenticate': 'keep',
        'proxy-authenticate': 'keep',

        'location': 'url-rewrite',
        'content-location': 'url-rewrite',
        'content-base': 'url-rewrite',

        'content-encoding': 'keep-if-no-content-rewrite',
        'content-length': 'content-length',

        'set-cookie': 'cookie',
        'cookie': 'cookie',
    }

    default_rule = 'prefix'

    def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
        """Store the rewrite context and the prefix for non-kept headers.

        :param rwinfo: object exposing ``record.http_headers``,
            ``url_rewriter``, ``cookie_rewriter`` and ``is_content_rw()``
        :param header_prefix: string prepended to the name of any header
            handled by the default ``prefix`` rule
        """
        self.header_prefix = header_prefix
        self.rwinfo = rwinfo
        self.http_headers = rwinfo.record.http_headers

    def __call__(self):
        """Apply the rules to every header and build a new header block.

        :return: a ``StatusAndHeaders`` with the original status line and
            protocol and the rewritten header list
        """
        new_headers_list = []
        for name, value in self.http_headers.headers:
            rule = self.header_rules.get(name.lower(), self.default_rule)
            new_header = self.rewrite_header(name, value, rule)
            if new_header:
                # a rule may expand one header into several
                if isinstance(new_header, list):
                    new_headers_list.extend(new_header)
                else:
                    new_headers_list.append(new_header)

        return StatusAndHeaders(self.http_headers.statusline,
                                headers=new_headers_list,
                                protocol=self.http_headers.protocol)

    def rewrite_header(self, name, value, rule):
        """Rewrite a single header according to ``rule``.

        :param name: header name as it appears in the record
        :param value: header value
        :param rule: rule name (see the class docstring)
        :return: a ``(name, value)`` tuple, or whatever
            ``rwinfo.cookie_rewriter.rewrite`` returns for cookie headers
        """
        if rule == 'keep':
            return (name, value)

        elif rule == 'url-rewrite':
            return (name, self.rwinfo.url_rewriter.rewrite(value))

        elif rule == 'keep-if-no-content-rewrite':
            if not self.rwinfo.is_content_rw():
                return (name, value)

        elif rule == 'content-length':
            if value == '0':
                return (name, value)

            if not self.rwinfo.is_content_rw():
                try:
                    if int(value) >= 0:
                        return (name, value)
                except (TypeError, ValueError):
                    # not a usable length: fall through and prefix it
                    pass

        elif rule == 'cookie':
            if self.rwinfo.cookie_rewriter:
                return self.rwinfo.cookie_rewriter.rewrite(value)
            else:
                return (name, value)

        # default 'prefix' -- also reached when a rule above declined
        return (self.header_prefix + name, value)

    def _add_cache_headers(self, new_headers, http_cache):
        """Append caching headers to ``new_headers`` in place.

        :param new_headers: list of ``(name, value)`` tuples to extend
        :param http_cache: cache lifetime in seconds; anything that cannot
            be converted to an int, or is not positive, disables caching
        """
        try:
            age = int(http_cache)
        except (TypeError, ValueError):
            age = 0

        if age <= 0:
            # Cache-Control directives are comma-separated
            new_headers.append(('Cache-Control', 'no-cache, no-store'))
        else:
            dt = datetime.utcnow()
            dt = dt + timedelta(seconds=age)
            new_headers.append(('Cache-Control', 'max-age=' + str(age)))
            new_headers.append(('Expires', datetime_to_http_date(dt)))
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class ProxyHeaderRewriter(PrefixHeaderRewriter):
    """Header rewriter that keeps headers by default.

    Only ``transfer-encoding`` and ``connection`` are given the ``prefix``
    rule; every other header falls back to ``default_rule`` (``keep``).
    """

    # the two headers that must not be passed through as-is
    header_rules = dict.fromkeys(('transfer-encoding', 'connection'),
                                 'prefix')

    default_rule = 'keep'
|
@ -89,6 +89,8 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
return headers
|
||||
|
||||
def _req_cookie_rewrite(self, value):
|
||||
return value
|
||||
|
||||
rule = self.rewriter.ruleset.get_first_match(self.urlkey)
|
||||
if not rule or not rule.req_cookie_rewrite:
|
||||
return value
|
||||
|
363
pywb/urlrewrite/rewriter.py
Normal file
363
pywb/urlrewrite/rewriter.py
Normal file
@ -0,0 +1,363 @@
|
||||
from warcio.utils import to_native_str
|
||||
from warcio.bufferedreaders import BufferedReader
|
||||
|
||||
import webencodings
|
||||
import re
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
|
||||
from pywb.rewrite.html_rewriter import HTMLRewriter
|
||||
|
||||
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
|
||||
|
||||
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
|
||||
|
||||
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter, ProxyHeaderRewriter
|
||||
|
||||
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
||||
|
||||
from pywb.webagg.utils import StreamIter, BUFF_SIZE
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class Rewriter(object):
    """Pick and apply header and content rewriters for an archived record.

    ``rewrite_types`` maps a response MIME type to a short text type, and
    ``all_rewriters`` maps a rewriter name to the class implementing it.
    Per-URL-prefix overrides are loaded from a YAML rules file.
    """

    # raw bytes literal so the regex escapes reach ``re`` untouched
    CHARSET_REGEX = re.compile(
        rb'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')

    all_rewriters = {
        'header': PrefixHeaderRewriter,
        'header-proxy': ProxyHeaderRewriter,

        'html': HTMLRewriter,

        'css': CSSRewriter,

        'js': JSLocationOnlyRewriter,
        'js-proxy': JSNoneRewriter,

        'json': JSONPRewriter,

        'xml': XMLRewriter,
    }

    rewrite_types = {
        # HTML
        'text/html': 'html',
        'application/xhtml': 'html',
        'application/xhtml+xml': 'html',

        # CSS
        'text/css': 'css',

        # JS
        'text/javascript': 'js',
        'application/javascript': 'js',
        'application/x-javascript': 'js',

        # JSON
        'application/json': 'json',

        # HLS
        'application/x-mpegURL': 'hls',

        # DASH
        'application/dash+xml': 'dash',

        # XML
        'text/xml': 'xml',
        'application/xml': 'xml',
        'application/rss+xml': 'xml',

        # PLAIN
        'text/plain': 'plain',
    }

    def __init__(self, rules_file, replay_mod=''):
        """Load the rewrite rules.

        :param rules_file: location passed to ``load_yaml_config``
        :param replay_mod: modifier handed to the HTML rewriter as
            ``defmod``
        """
        self.rules = []
        self.load_rules(rules_file)
        self.replay_mod = replay_mod

    def add_rewriter(self, rw):
        """Register ``rw`` under ``rw.name``.

        NOTE: ``all_rewriters`` is a class attribute, so this registration
        is visible to every instance sharing that dict.
        """
        self.all_rewriters[rw.name] = rw

    def get_rewriter(self, url, text_type):
        """Return the rewriter class registered for ``text_type``, or None.

        ``url`` is accepted but not used by this implementation.
        """
        return self.all_rewriters.get(text_type)

    def load_rules(self, filename):
        """Parse ``filename`` and append every rule that has a ``rewrite``
        section to ``self.rules``."""
        config = load_yaml_config(filename)
        # tolerate a config without a 'rules' key
        for rule in config.get('rules') or []:
            rule = self.parse_rewrite_rule(rule)
            if rule:
                self.rules.append(rule)

    def parse_rewrite_rule(self, config):
        """Turn one rule entry into a rewrite rule dict.

        :return: the entry's ``rewrite`` dict, extended with a
            ``url_prefix`` list and (when ``js_regexs`` is present) a
            ``js_regex_func``; ``None`` when there is no ``rewrite`` section
        """
        rw_config = config.get('rewrite')
        if not rw_config:
            return

        rule = rw_config
        url_prefix = config.get('url_prefix')
        if not isinstance(url_prefix, list):
            url_prefix = [url_prefix]

        rule['url_prefix'] = url_prefix

        regexs = rule.get('js_regexs')
        if regexs:
            parse_rules_func = RegexRewriter.parse_rules_from_config(regexs)
            rule['js_regex_func'] = parse_rules_func

        return rule

    def get_rule(self, cdx):
        """Return the first rule whose prefix matches ``cdx['urlkey']``,
        or an empty dict when none matches."""
        urlkey = to_native_str(cdx['urlkey'])

        for rule in self.rules:
            if any((urlkey.startswith(prefix) for prefix in rule['url_prefix'])):
                return rule

        return {}

    def get_rw_class(self, rule, text_type, rwinfo):
        """Resolve the rewriter name and class for ``text_type``.

        JS is switched to ``js-proxy`` when URLs are not being rewritten,
        and ``rule`` may remap a text type to another rewriter name.

        :return: ``(rw_type, rw_class)``; ``rw_class`` is ``None`` when no
            rewriter is registered under ``rw_type``
        """
        if text_type == 'js' and not rwinfo.is_url_rw():
            text_type = 'js-proxy'

        rw_type = rule.get(text_type, text_type)
        rw_class = self.all_rewriters.get(rw_type)

        return rw_type, rw_class

    def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
        """Instantiate the content rewriter for ``text_type``.

        :return: a rewriter instance, or ``None`` when no rewriter class is
            registered for the resolved type (the caller then streams the
            content through unmodified)
        """
        rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo)

        # e.g. 'hls', 'dash' and 'plain' have no registered class
        if rw_class is None:
            return None

        # must match the 'js-proxy' key used by get_rw_class/all_rewriters
        if rw_type in ('js', 'js-proxy'):
            extra_rules = []
            if 'js_regex_func' in rule:
                extra_rules = rule['js_regex_func'](rwinfo.url_rewriter)

            return rw_class(rwinfo.url_rewriter, extra_rules)

        elif rw_type != 'html':
            return rw_class(rwinfo.url_rewriter)

        # HTML Rewriter
        head_insert_str = self.get_head_insert(rwinfo, rule, head_insert_func, cdx)

        js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx)
        css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx)

        rw = rw_class(rwinfo.url_rewriter,
                      js_rewriter=js_rewriter,
                      css_rewriter=css_rewriter,
                      head_insert=head_insert_str,
                      url=cdx['url'],
                      defmod=self.replay_mod,
                      parse_comments=rule.get('parse_comments', False))

        return rw

    def get_head_insert(self, rwinfo, rule, head_insert_func, cdx):
        """Build the head-insert text, encoded to match the page charset.

        The charset comes from ``rwinfo.charset`` or, failing that, from a
        ``<meta>`` tag in the first 1024 bytes of content.  The insert is
        encoded with that charset (utf-8 when that is unknown or fails) and
        the bytes are then decoded as iso-8859-1.

        :return: the insert string, or ``''`` without ``head_insert_func``
        """
        head_insert_str = ''
        charset = rwinfo.charset

        # if no charset set, attempt to extract from first 1024
        if not charset:
            first_buff = rwinfo.read_and_keep(1024)
            charset = self.extract_html_charset(first_buff)

        if head_insert_func:
            head_insert_orig = head_insert_func(rule, cdx)

            if charset:
                try:
                    head_insert_str = webencodings.encode(head_insert_orig, charset)
                except Exception:
                    # best effort: unknown label or unencodable text,
                    # fall back to utf-8 below
                    pass

            if not head_insert_str:
                charset = 'utf-8'
                head_insert_str = head_insert_orig.encode(charset)

            head_insert_str = head_insert_str.decode('iso-8859-1')

        return head_insert_str

    def extract_html_charset(self, buff):
        """Return the charset named in a ``<meta>`` tag within ``buff``
        (bytes), as a native string, or ``None`` when none is found."""
        charset = None
        m = self.CHARSET_REGEX.search(buff)
        if m:
            charset = m.group(1)
            charset = to_native_str(charset)

        return charset

    def rewrite_headers(self, rwinfo):
        """Rewrite the record headers into ``rwinfo.rw_http_headers``,
        using the ``header`` rewriter when URLs are rewritten and
        ``header-proxy`` otherwise."""
        if rwinfo.is_url_rw():
            header_rw_name = 'header'
        else:
            header_rw_name = 'header-proxy'

        header_rw_class = self.all_rewriters.get(header_rw_name)
        rwinfo.rw_http_headers = header_rw_class(rwinfo)()

    def __call__(self, record, url_rewriter, cookie_rewriter,
                 head_insert_func=None,
                 cdx=None):
        """Rewrite the headers and, when applicable, the content of
        ``record``.

        :return: ``(headers, body_iter, is_rewritten)`` where
            ``is_rewritten`` is ``False`` when the content is passed
            through unmodified
        """
        rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)

        self.rewrite_headers(rwinfo)

        content_rewriter = None
        if rwinfo.is_content_rw():
            rule = self.get_rule(cdx)
            content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)

        if not content_rewriter:
            return rwinfo.rw_http_headers, StreamIter(rwinfo.content_stream), False

        # align to line end for all non-html rewriting
        align = (rwinfo.text_type != 'html')

        # Create rewriting generator
        gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream,
                                              rewrite_func=content_rewriter.rewrite,
                                              final_read_func=content_rewriter.close,
                                              align_to_line=align)

        return rwinfo.rw_http_headers, gen, True

    @staticmethod
    def rewrite_text_stream_to_gen(stream,
                                   rewrite_func,
                                   final_read_func,
                                   align_to_line):
        """
        Convert stream to generator by applying the rewriting func
        to each portion of the stream.
        Align to line boundaries if needed.

        Chunks are decoded as iso-8859-1 before rewriting and encoded the
        same way afterwards.  ``final_read_func`` is called once at the end
        and any text it returns is yielded last.  The stream is always
        closed.
        """
        try:
            while True:
                buff = stream.read(BUFF_SIZE)
                if not buff:
                    break

                if align_to_line:
                    buff += stream.readline()

                buff = rewrite_func(buff.decode('iso-8859-1'))
                yield buff.encode('iso-8859-1')

            # For adding a tail/handling final buffer
            buff = final_read_func()
            if buff:
                yield buff.encode('iso-8859-1')

        finally:
            stream.close()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RewriteInfo(object):
    """Per-record rewrite state: the record, its content stream, the
    detected text type and charset, and the URL and cookie rewriters."""

    # content that starts (after optional whitespace) with '<'
    TAG_REGEX = re.compile(rb'^\s*<')

    def __init__(self, record, rewriter, url_rewriter, cookie_rewriter):
        """Collect the rewrite context and detect the text type.

        :param record: record exposing ``http_headers`` and
            ``content_stream()``
        :param rewriter: object exposing the ``rewrite_types`` mapping
        :param url_rewriter: URL rewriter exposing ``wburl.mod``,
            ``prefix`` and ``rewrite_opts``
        :param cookie_rewriter: cookie rewriter; when falsy an
            ``ExactPathCookieRewriter`` is created
        """
        self.record = record

        self.rw_http_headers = record.http_headers
        self.content_stream = record.content_stream()

        self.rewriter = rewriter

        self.text_type = None
        self.charset = None

        self.url_rewriter = url_rewriter

        if not cookie_rewriter:
            cookie_rewriter = ExactPathCookieRewriter(url_rewriter)

        self.cookie_rewriter = cookie_rewriter

        self._fill_text_type_and_charset()
        self._resolve_text_type()

    def _fill_text_type_and_charset(self):
        """Set ``text_type`` and ``charset`` from the Content-Type header.

        ``charset`` is only filled in when the MIME type maps to a known
        text type; it is lower-cased, and surrounding quotes and any
        following parameters are dropped.
        """
        content_type = self.record.http_headers.get_header('Content-Type')
        if not content_type:
            return

        mime, _, params = content_type.partition(';')

        # tolerate whitespace around the media type ('text/html ; ...')
        self.text_type = self.rewriter.rewrite_types.get(mime.strip())
        if not self.text_type:
            return

        _, found, charset = params.lower().partition('charset=')
        if found:
            # keep only the charset token: cut later parameters and quotes
            charset = charset.split(';', 1)[0].strip().strip('"\'')
            self.charset = charset or None

    def _resolve_text_type(self):
        """Reconcile an ``html`` text type with a ``js_``/``cs_`` request.

        Only content detected as ``html`` is re-examined, and only when it
        was requested with the ``js_`` or ``cs_`` modifier: if the first
        128 bytes do not start with a tag the content is treated as ``js``
        or ``css`` respectively.  All other text types are left unchanged.
        """
        mod = self.url_rewriter.wburl.mod

        # only attempt to resolve between html and other text types
        if self.text_type != 'html':
            return

        if mod != 'js_' and mod != 'cs_':
            return

        buff = self.read_and_keep(128)

        # check if starts with a tag, then likely html
        if self.TAG_REGEX.match(buff):
            return

        # NOTE(review): previously nothing was reassigned here, so this
        # method never changed text_type; reclassifying by modifier is the
        # behavior the comments above describe -- confirm against replay
        self.text_type = 'js' if mod == 'js_' else 'css'

    def read_and_keep(self, size):
        """Read up to ``size`` bytes and push them back, by re-wrapping
        ``content_stream`` in a ``BufferedReader`` seeded with those bytes.

        :return: the bytes that were read
        """
        buff = self.content_stream.read(size)
        self.content_stream = BufferedReader(self.content_stream, starting_data=buff)
        return buff

    def is_content_rw(self):
        """Return ``True`` when the record content should be rewritten.

        Content is left alone when there is no rewrite prefix, for the
        ``id_`` modifier, for html under the ``is_ajax`` option, for plain
        text not requested as ``js_``/``cs_``, and when no text type was
        detected.
        """
        if not self.url_rewriter.prefix:
            return False

        if self.url_rewriter.wburl.mod == 'id_':
            return False

        if self.text_type == 'html':
            if self.url_rewriter.rewrite_opts.get('is_ajax'):
                return False

        elif self.text_type == 'plain':
            if self.url_rewriter.wburl.mod not in ('js_', 'cs_'):
                return False

        elif not self.text_type:
            return False

        return True

    def is_url_rw(self):
        """Return ``True`` when URLs should be rewritten: a URL rewriter is
        set and the modifier is not ``id_``."""
        if not self.url_rewriter:
            return False

        if self.url_rewriter.wburl.mod == 'id_':
            return False

        return True
|
||||
|
||||
|
@ -1,8 +1,9 @@
|
||||
import requests
|
||||
|
||||
from pywb.rewrite.rewrite_amf import RewriteAMFMixin
|
||||
from pywb.rewrite.rewrite_dash import RewriteDASHMixin
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
#from pywb.rewrite.rewrite_amf import RewriteAMFMixin
|
||||
#from pywb.rewrite.rewrite_dash import RewriteDASHMixin
|
||||
#from pywb.rewrite.rewrite_content import RewriteContent
|
||||
from pywb.urlrewrite.rewriter import Rewriter
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
|
||||
@ -44,8 +45,8 @@ class UpstreamException(WbException):
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
|
||||
pass
|
||||
#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
|
||||
# pass
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -67,9 +68,10 @@ class RewriterApp(object):
|
||||
self.frame_mod = None
|
||||
self.replay_mod = ''
|
||||
|
||||
frame_type = 'inverse' if framed_replay else False
|
||||
#frame_type = 'inverse' if framed_replay else False
|
||||
|
||||
self.content_rewriter = Rewriter(is_framed_replay=frame_type)
|
||||
#self.content_rewriter = Rewriter(is_framed_replay=frame_type)
|
||||
self.content_rw = Rewriter('pkg://pywb/rules.yaml', self.replay_mod)
|
||||
|
||||
if not jinja_env:
|
||||
jinja_env = JinjaEnv(globals={'static_path': 'static'})
|
||||
@ -149,7 +151,7 @@ class RewriterApp(object):
|
||||
urlkey = canonicalize(wb_url.url)
|
||||
|
||||
inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
|
||||
self.content_rewriter)
|
||||
self.content_rw)
|
||||
|
||||
inputreq.include_post_query(wb_url.url)
|
||||
|
||||
@ -267,14 +269,15 @@ class RewriterApp(object):
|
||||
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
|
||||
cookie_key)
|
||||
|
||||
result = self.content_rewriter.rewrite_content(urlrewriter,
|
||||
record.http_headers,
|
||||
record.raw_stream,
|
||||
head_insert_func,
|
||||
urlkey,
|
||||
cdx,
|
||||
cookie_rewriter,
|
||||
environ)
|
||||
#result = self.content_rewriter.rewrite_content(urlrewriter,
|
||||
# record.http_headers,
|
||||
# record.raw_stream,
|
||||
# head_insert_func,
|
||||
# urlkey,
|
||||
# cdx,
|
||||
# cookie_rewriter,
|
||||
# environ)
|
||||
result = self.content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)
|
||||
|
||||
status_headers, gen, is_rw = result
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user