diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 2de16fa6..80469e5c 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -24,7 +24,7 @@ from pywb.warcserver.warcserver import WarcServer from pywb.rewrite.templateview import BaseInsertView from pywb.apps.static_handler import StaticHandler -from pywb.apps.rewriterapp import RewriterApp +from pywb.apps.rewriterapp import RewriterApp, UpstreamException from pywb.apps.wbrequestresponse import WbResponse import os @@ -441,6 +441,13 @@ class FrontEndApp(object): coll in self.warcserver.list_dynamic_routes()) def raise_not_found(self, environ, err_type, url): + """Utility function for raising a werkzeug.exceptions.NotFound exception with the supplied WSGI environment + and message. + + :param dict environ: The WSGI environment dictionary for the request + :param str err_type: The identifier for the type of error that occurred + :param str url: The url of the archived page that was requested + """ raise AppPageNotFound(err_type, url) def _check_refer_redirect(self, environ): diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 92c4e8e0..bf28bf25 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -56,7 +56,7 @@ class HTMLRewriterMixin(StreamingRewriter): 'archive': 'oe_'}, 'area': {'href': defmod}, 'audio': {'src': 'oe_'}, - 'base': {'href': 'ba_'}, + 'base': {'href': defmod}, 'blockquote': {'cite': defmod}, 'body': {'background': 'im_'}, 'button': {'formaction': defmod}, diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 2bcf42d9..9403bb1d 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -25,8 +25,8 @@ class RxRules(object): return lambda _, _2: string @staticmethod - def archival_rewrite(): - return lambda string, rewriter: rewriter.rewrite(string) + def archival_rewrite(mod=None): + return lambda string, rewriter: rewriter.rewrite(string, mod) @staticmethod 
def add_prefix(prefix): @@ -327,13 +327,12 @@ class JSReplaceFuzzy(object): class CSSRules(RxRules): CSS_URL_REGEX = "url\\s*\\(\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*([^)'\"]+)\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*\\)" - CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" + - "(?!url[\\s\\(])([\w.:/\\\\-]+)") + CSS_IMPORT_REGEX = ("@import\\s+(?:url\\s*)?\\(?\\s*['\"]?([\w.:/\\\\-]+)") def __init__(self): rules = [ - (self.CSS_URL_REGEX, self.archival_rewrite(), 1), - (self.CSS_IMPORT_NO_URL_REGEX, self.archival_rewrite(), 1), + (self.CSS_URL_REGEX, self.archival_rewrite('oe_'), 1), + (self.CSS_IMPORT_REGEX, self.archival_rewrite('cs_'), 1), ] super(CSSRules, self).__init__(rules) diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 26f12248..cd7ab24b 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -25,23 +25,23 @@ r""" # Base Tests -- w/ rewrite (default) >>> parse('') - + # Full Path >>> parse('', urlrewriter=full_path_urlrewriter) - + # Full Path Scheme Rel Base >>> parse('', urlrewriter=full_path_urlrewriter) - + # Rel Base >>> parse('', urlrewriter=full_path_urlrewriter) - + # Rel Base + example >>> parse('', urlrewriter=full_path_urlrewriter) - + # Rel Base >>> parse('', urlrewriter=full_path_urlrewriter) @@ -53,7 +53,7 @@ r""" # ensure trailing slash added >>> parse('') - + # Base Tests -- no rewrite >>> parse('', urlrewriter=no_base_canon_rewriter) @@ -244,29 +244,29 @@ r"""
>>> parse('
') -
+
>>> parse('') - + >>> parse('') - + >>> parse('') - + >>> parse('') >>> parse("") - + #>>> parse('') # Style >>> parse('') - + # Unterminated style tag, handle and auto-terminate >>> parse('