mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

new rewriting system!

- new header rewriter
- new extensible content rewriter in urlrewrite.rewriter!
Ilya Kreymer 2017-05-08 19:17:09 -07:00
parent 331320b17a
commit db9d0ae41a
6 changed files with 504 additions and 21 deletions
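
Note: the core of the change is that content rewriting is now driven by a single callable Rewriter configured from rules.yaml. A minimal sketch of the new usage, based on the rewriterapp.py changes at the bottom of this diff (the record, urlrewriter, cookie_rewriter, head_insert_func and cdx objects are assumed to come from the existing replay pipeline):

    from pywb.urlrewrite.rewriter import Rewriter

    # construct once, with the packaged rules and the replay modifier
    content_rw = Rewriter('pkg://pywb/rules.yaml', replay_mod='')

    # per response: rewrite the headers and, when applicable, wrap the payload
    # in a rewriting generator; is_rw reports whether content was rewritten
    status_headers, gen, is_rw = content_rw(record, urlrewriter, cookie_rewriter,
                                            head_insert_func, cdx)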


@@ -1,4 +1,4 @@
FROM python:3.5.2
FROM python:3.5.3
MAINTAINER Ilya Kreymer <ikreymer at gmail.com>


@@ -88,8 +88,10 @@ class HTMLRewriterMixin(object):
    # ===========================
    def __init__(self, url_rewriter,
                 head_insert=None,
                 js_rewriter_class=JSRewriter,
                 css_rewriter_class=CSSRewriter,
                 js_rewriter_class=None,
                 js_rewriter=None,
                 css_rewriter=None,
                 css_rewriter_class=None,
                 url='',
                 defmod='',
                 parse_comments=False):

@@ -97,8 +99,19 @@ class HTMLRewriterMixin(object):
        self.url_rewriter = url_rewriter
        self._wb_parse_context = None

        self.js_rewriter = js_rewriter_class(url_rewriter)
        self.css_rewriter = css_rewriter_class(url_rewriter)

        if js_rewriter:
            self.js_rewriter = js_rewriter
        elif js_rewriter_class:
            self.js_rewriter = js_rewriter_class(url_rewriter)
        else:
            self.js_rewriter = JSRewriter(url_rewriter)

        if css_rewriter:
            self.css_rewriter = css_rewriter
        elif css_rewriter_class:
            self.css_rewriter = css_rewriter_class(url_rewriter)
        else:
            self.css_rewriter = CSSRewriter(url_rewriter)

        self.head_insert = head_insert
        self.parse_comments = parse_comments
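
Note: with this change HTMLRewriterMixin accepts pre-built js/css rewriter instances as well as classes. A quick sketch of the three construction paths now supported (url_rewriter is assumed to be an existing UrlRewriter instance; HTMLRewriter is the concrete subclass used elsewhere in this commit):

    from pywb.rewrite.html_rewriter import HTMLRewriter
    from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter

    # 1) pass ready-made rewriter instances (how urlrewrite.rewriter uses it)
    rw = HTMLRewriter(url_rewriter,
                      js_rewriter=JSRewriter(url_rewriter),
                      css_rewriter=CSSRewriter(url_rewriter))

    # 2) pass classes, as before
    rw = HTMLRewriter(url_rewriter, js_rewriter_class=JSRewriter)

    # 3) pass nothing and fall back to the JSRewriter/CSSRewriter defaults
    rw = HTMLRewriter(url_rewriter)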

pywb/urlrewrite/header_rewriter.py (new file)

@@ -0,0 +1,102 @@
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_http_date

from datetime import datetime, timedelta


#=============================================================================
class PrefixHeaderRewriter(object):
    header_rules = {
        'content-type': 'keep',
        'content-disposition': 'keep',
        'content-range': 'keep',
        'accept-ranges': 'keep',

        'www-authenticate': 'keep',
        'proxy-authenticate': 'keep',

        'location': 'url-rewrite',
        'content-location': 'url-rewrite',
        'content-base': 'url-rewrite',

        'content-encoding': 'keep-if-no-content-rewrite',
        'content-length': 'content-length',

        'set-cookie': 'cookie',
        'cookie': 'cookie',
    }
    default_rule = 'prefix'

    def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
        self.header_prefix = header_prefix
        self.rwinfo = rwinfo
        self.http_headers = rwinfo.record.http_headers

    def __call__(self):
        new_headers_list = []
        for name, value in self.http_headers.headers:
            rule = self.header_rules.get(name.lower(), self.default_rule)
            new_header = self.rewrite_header(name, value, rule)
            if new_header:
                if isinstance(new_header, list):
                    new_headers_list.extend(new_header)
                else:
                    new_headers_list.append(new_header)

        return StatusAndHeaders(self.http_headers.statusline,
                                headers=new_headers_list,
                                protocol=self.http_headers.protocol)

    def rewrite_header(self, name, value, rule):
        if rule == 'keep':
            return (name, value)

        elif rule == 'url-rewrite':
            return (name, self.rwinfo.url_rewriter.rewrite(value))

        elif rule == 'keep-if-no-content-rewrite':
            if not self.rwinfo.is_content_rw():
                return (name, value)

        elif rule == 'content-length':
            if value == '0':
                return (name, value)

            if not self.rwinfo.is_content_rw():
                try:
                    if int(value) >= 0:
                        return (name, value)
                except:
                    pass

        elif rule == 'cookie':
            if self.rwinfo.cookie_rewriter:
                return self.rwinfo.cookie_rewriter.rewrite(value)
            else:
                return (name, value)

        # default 'prefix'
        return (self.header_prefix + name, value)

    def _add_cache_headers(self, new_headers, http_cache):
        try:
            age = int(http_cache)
        except:
            age = 0

        if age <= 0:
            new_headers.append(('Cache-Control', 'no-cache; no-store'))
        else:
            dt = datetime.utcnow()
            dt = dt + timedelta(seconds=age)
            new_headers.append(('Cache-Control', 'max-age=' + str(age)))
            new_headers.append(('Expires', datetime_to_http_date(dt)))


#=============================================================================
class ProxyHeaderRewriter(PrefixHeaderRewriter):
    header_rules = {
        'transfer-encoding': 'prefix',
        'connection': 'prefix',
    }

    default_rule = 'keep'
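
Note: a rough, self-contained sketch of the prefixing behaviour; FakeRecord and FakeRwInfo below are stand-ins for illustration only, not pywb classes:

    from warcio.statusandheaders import StatusAndHeaders
    from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter


    class FakeRecord(object):
        http_headers = StatusAndHeaders('200 OK', [
            ('Content-Type', 'text/html; charset=utf-8'),
            ('Content-Length', '1234'),
            ('X-Powered-By', 'Example'),
        ])


    class FakeRwInfo(object):
        record = FakeRecord()
        cookie_rewriter = None

        def is_content_rw(self):
            return True    # pretend the body will be rewritten


    print(PrefixHeaderRewriter(FakeRwInfo())())
    # Content-Type is kept verbatim ('keep' rule); Content-Length and the
    # unknown X-Powered-By header come back as X-Archive-Orig-* headers,
    # since the rewritten payload length may change.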


@@ -89,6 +89,8 @@ class RewriteInputRequest(DirectWSGIInputRequest):
        return headers

    def _req_cookie_rewrite(self, value):
        return value
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

pywb/urlrewrite/rewriter.py (new file, 363 lines)

@@ -0,0 +1,363 @@
from warcio.utils import to_native_str
from warcio.bufferedreaders import BufferedReader

import webencodings
import re

from pywb.utils.loaders import load_yaml_config

from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter

from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter

from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter, ProxyHeaderRewriter

from pywb.rewrite.jsonp_rewriter import JSONPRewriter

from pywb.webagg.utils import StreamIter, BUFF_SIZE


# ============================================================================
class Rewriter(object):
    CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')

    all_rewriters = {
        'header': PrefixHeaderRewriter,
        'header-proxy': ProxyHeaderRewriter,

        'html': HTMLRewriter,

        'css': CSSRewriter,

        'js': JSLocationOnlyRewriter,
        'js-proxy': JSNoneRewriter,

        'json': JSONPRewriter,

        'xml': XMLRewriter,
    }

    rewrite_types = {
        # HTML
        'text/html': 'html',
        'application/xhtml': 'html',
        'application/xhtml+xml': 'html',

        # CSS
        'text/css': 'css',

        # JS
        'text/javascript': 'js',
        'application/javascript': 'js',
        'application/x-javascript': 'js',

        # JSON
        'application/json': 'json',

        # HLS
        'application/x-mpegURL': 'hls',

        # DASH
        'application/dash+xml': 'dash',

        # XML
        'text/xml': 'xml',
        'application/xml': 'xml',
        'application/rss+xml': 'xml',

        # PLAIN
        'text/plain': 'plain',
    }

    def __init__(self, rules_file, replay_mod=''):
        self.rules = []
        self.load_rules(rules_file)
        self.replay_mod = replay_mod

        #for rw in self.known_rewriters:
        #    self.all_rewriters[rw.name] = rw

    def add_rewriter(self, rw):
        self.all_rewriters[rw.name] = rw

    def get_rewriter(self, url, text_type):
        return self.all_rewriters.get(text_type)

    def load_rules(self, filename):
        config = load_yaml_config(filename)
        for rule in config.get('rules'):
            rule = self.parse_rewrite_rule(rule)
            if rule:
                self.rules.append(rule)

    def parse_rewrite_rule(self, config):
        rw_config = config.get('rewrite')
        if not rw_config:
            return

        rule = rw_config
        url_prefix = config.get('url_prefix')
        if not isinstance(url_prefix, list):
            url_prefix = [url_prefix]

        rule['url_prefix'] = url_prefix

        regexs = rule.get('js_regexs')
        if regexs:
            parse_rules_func = RegexRewriter.parse_rules_from_config(regexs)
            rule['js_regex_func'] = parse_rules_func

        return rule

    def get_rule(self, cdx):
        urlkey = to_native_str(cdx['urlkey'])

        for rule in self.rules:
            if any((urlkey.startswith(prefix) for prefix in rule['url_prefix'])):
                return rule

        return {}

    def get_rw_class(self, rule, text_type, rwinfo):
        if text_type == 'js' and not rwinfo.is_url_rw():
            text_type = 'js-proxy'

        rw_type = rule.get(text_type, text_type)
        rw_class = self.all_rewriters.get(rw_type)
        return rw_type, rw_class
    def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
        rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo)

        if rw_type in ('js', 'js-proxy'):
            extra_rules = []
            if 'js_regex_func' in rule:
                extra_rules = rule['js_regex_func'](rwinfo.url_rewriter)

            return rw_class(rwinfo.url_rewriter, extra_rules)

        elif rw_type != 'html':
            return rw_class(rwinfo.url_rewriter)

        # HTML Rewriter
        head_insert_str = self.get_head_insert(rwinfo, rule, head_insert_func, cdx)

        js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx)
        css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx)

        rw = rw_class(rwinfo.url_rewriter,
                      js_rewriter=js_rewriter,
                      css_rewriter=css_rewriter,
                      head_insert=head_insert_str,
                      url=cdx['url'],
                      defmod=self.replay_mod,
                      parse_comments=rule.get('parse_comments', False))

        return rw
    def get_head_insert(self, rwinfo, rule, head_insert_func, cdx):
        head_insert_str = ''
        charset = rwinfo.charset

        # if no charset set, attempt to extract from first 1024
        if not charset:
            first_buff = rwinfo.read_and_keep(1024)
            charset = self.extract_html_charset(first_buff)

        if head_insert_func:
            head_insert_orig = head_insert_func(rule, cdx)

            if charset:
                try:
                    head_insert_str = webencodings.encode(head_insert_orig, charset)
                except:
                    pass

            if not head_insert_str:
                charset = 'utf-8'
                head_insert_str = head_insert_orig.encode(charset)

            head_insert_str = head_insert_str.decode('iso-8859-1')

        return head_insert_str

    def extract_html_charset(self, buff):
        charset = None
        m = self.CHARSET_REGEX.search(buff)
        if m:
            charset = m.group(1)
            charset = to_native_str(charset)

        return charset

    def rewrite_headers(self, rwinfo):
        if rwinfo.is_url_rw():
            header_rw_name = 'header'
        else:
            header_rw_name = 'header-proxy'

        header_rw_class = self.all_rewriters.get(header_rw_name)
        rwinfo.rw_http_headers = header_rw_class(rwinfo)()

    def __call__(self, record, url_rewriter, cookie_rewriter,
                 head_insert_func=None,
                 cdx=None):

        rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
        self.rewrite_headers(rwinfo)

        content_rewriter = None

        if rwinfo.is_content_rw():
            rule = self.get_rule(cdx)
            content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)

        if not content_rewriter:
            return rwinfo.rw_http_headers, StreamIter(rwinfo.content_stream), False

        #rwinfo.rw_http_headers.status_headers.remove_header('content-length')

        # align to line end for all non-html rewriting
        align = (rwinfo.text_type != 'html')

        # Create rewriting generator
        gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream,
                                              rewrite_func=content_rewriter.rewrite,
                                              final_read_func=content_rewriter.close,
                                              align_to_line=align)

        return rwinfo.rw_http_headers, gen, True
    @staticmethod
    def rewrite_text_stream_to_gen(stream,
                                   rewrite_func,
                                   final_read_func,
                                   align_to_line):
        """
        Convert a stream to a generator by applying the rewriting func
        to each portion of the stream.
        Align to line boundaries if needed.
        """
        try:
            buff = ''

            while True:
                buff = stream.read(BUFF_SIZE)
                if not buff:
                    break

                if align_to_line:
                    buff += stream.readline()

                buff = rewrite_func(buff.decode('iso-8859-1'))
                yield buff.encode('iso-8859-1')

            # For adding a tail/handling final buffer
            buff = final_read_func()
            if buff:
                yield buff.encode('iso-8859-1')

        finally:
            stream.close()
# ============================================================================
class RewriteInfo(object):
    TAG_REGEX = re.compile(b'^\s*\<')

    def __init__(self, record, rewriter, url_rewriter, cookie_rewriter):
        self.record = record

        self.rw_http_headers = record.http_headers
        self.content_stream = record.content_stream()

        self.rewriter = rewriter

        self.text_type = None
        self.charset = None

        self.url_rewriter = url_rewriter

        if not cookie_rewriter:
            cookie_rewriter = ExactPathCookieRewriter(url_rewriter)

        self.cookie_rewriter = cookie_rewriter

        self._fill_text_type_and_charset()
        self._resolve_text_type()

    def _fill_text_type_and_charset(self):
        content_type = self.record.http_headers.get_header('Content-Type')
        if not content_type:
            return

        parts = content_type.split(';', 1)
        mime = parts[0]

        self.text_type = self.rewriter.rewrite_types.get(mime)
        if not self.text_type:
            return

        if len(parts) == 2:
            parts = parts[1].lower().split('charset=', 1)
            if len(parts) == 2:
                self.charset = parts[1].strip()

    def _resolve_text_type(self):
        mod = self.url_rewriter.wburl.mod

        if self.text_type == 'css' and mod == 'js_':
            self.text_type = 'css'

        # only attempt to resolve between html and other text types
        if self.text_type != 'html':
            return

        if mod != 'js_' and mod != 'cs_':
            return

        buff = self.read_and_keep(128)

        # check if starts with a tag, then likely html
        if self.TAG_REGEX.match(buff):
            self.text_type = 'html'

    def read_and_keep(self, size):
        buff = self.content_stream.read(size)
        self.content_stream = BufferedReader(self.content_stream, starting_data=buff)
        return buff

    def is_content_rw(self):
        if not self.url_rewriter.prefix:
            return False

        if self.url_rewriter.wburl.mod == 'id_':
            return False

        if self.text_type == 'html':
            if self.url_rewriter.rewrite_opts.get('is_ajax'):
                return False

        elif self.text_type == 'plain':
            if self.url_rewriter.wburl.mod not in ('js_', 'cs_'):
                return False

        elif not self.text_type:
            return False

        return True

    def is_url_rw(self):
        if not self.url_rewriter:
            return False

        if self.url_rewriter.wburl.mod == 'id_':
            return False

        return True
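
Note: all_rewriters is an extensible registry, and a rule can remap a text type to any registered name. A short sketch of plugging in a custom JS rewriter; CustomJSRewriter and the 'js-custom' name are hypothetical, only add_rewriter() and the rule lookup in get_rw_class() come from the code above:

    from pywb.urlrewrite.rewriter import Rewriter
    from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter


    # hypothetical rewriter; it reuses the (url_rewriter, extra_rules)
    # constructor shape create_rewriter expects for 'js' types
    class CustomJSRewriter(JSLocationOnlyRewriter):
        name = 'js-custom'


    rw = Rewriter('pkg://pywb/rules.yaml')
    rw.add_rewriter(CustomJSRewriter)

    # a rule containing {'js': 'js-custom'} would now resolve to this class:
    # get_rw_class() maps the text type through the rule, then looks the
    # result up in all_rewriters.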


@@ -1,8 +1,9 @@
import requests

from pywb.rewrite.rewrite_amf import RewriteAMFMixin
from pywb.rewrite.rewrite_dash import RewriteDASHMixin
from pywb.rewrite.rewrite_content import RewriteContent
#from pywb.rewrite.rewrite_amf import RewriteAMFMixin
#from pywb.rewrite.rewrite_dash import RewriteDASHMixin
#from pywb.rewrite.rewrite_content import RewriteContent
from pywb.urlrewrite.rewriter import Rewriter

from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter

@@ -44,8 +45,8 @@ class UpstreamException(WbException):

# ============================================================================
class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
    pass
#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
#    pass

# ============================================================================

@@ -67,9 +68,10 @@ class RewriterApp(object):
        self.frame_mod = None
        self.replay_mod = ''

        frame_type = 'inverse' if framed_replay else False
        #frame_type = 'inverse' if framed_replay else False

        self.content_rewriter = Rewriter(is_framed_replay=frame_type)
        #self.content_rewriter = Rewriter(is_framed_replay=frame_type)
        self.content_rw = Rewriter('pkg://pywb/rules.yaml', self.replay_mod)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static'})

@@ -149,7 +151,7 @@ class RewriterApp(object):
        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                       self.content_rewriter)
                                       self.content_rw)

        inputreq.include_post_query(wb_url.url)

@@ -267,14 +269,15 @@ class RewriterApp(object):
                cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                                   cookie_key)

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.http_headers,
                                                       record.raw_stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx,
                                                       cookie_rewriter,
                                                       environ)
        #result = self.content_rewriter.rewrite_content(urlrewriter,
        #                                               record.http_headers,
        #                                               record.raw_stream,
        #                                               head_insert_func,
        #                                               urlkey,
        #                                               cdx,
        #                                               cookie_rewriter,
        #                                               environ)

        result = self.content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

        status_headers, gen, is_rw = result
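
Note: both return paths of the new rewriter yield the same (status_headers, gen, is_rw) shape, so downstream code only has to stream the generator. A minimal sketch of the consuming side (the WSGI start_response wiring here is assumed boilerplate, not code from this commit):

    def send_rewritten(result, start_response):
        status_headers, gen, is_rw = result

        start_response(status_headers.statusline, status_headers.headers)

        # gen yields bytes in both cases: a StreamIter wrapping the original
        # payload when nothing was rewritten, or the rewriting generator
        return gen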