2013-12-23 15:52:33 -08:00
import re
import sys
2013-12-23 23:57:13 -08:00
import itertools
2013-12-23 15:52:33 -08:00
2014-01-28 19:37:37 -08:00
from url_rewriter import UrlRewriter
2013-12-23 15:52:33 -08:00
2014-01-03 13:03:03 -08:00
#=================================================================
2013-12-23 23:57:13 -08:00
class RegexRewriter:
    """
    Apply a list of regex rules to a string in one substitution pass.

    Each rule is a ``(regex, op, count)`` tuple:

    * ``regex`` -- pattern source. It is wrapped in its own capture group
      and all rules are OR-ed together into a single compiled pattern.
    * ``op`` -- either a callable that receives the matched text and
      returns the replacement, or a plain string, in which case the string
      is prepended to the matched text (see ``DEFAULT_OP``).
    * ``count`` -- number of capture groups contained in ``regex``. The
      group index is advanced by ``1 + count`` per rule, so this must be
      accurate for later rules to line up. When non-zero, only the last of
      those groups is handed to ``op``; the rest of the rule's match is
      kept verbatim around the result.

    Rules are tried in list order; the first rule whose target group is
    non-empty produces the replacement.

    # Test https->http converter (other tests below in subclasses)
    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
    'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
    """

    @staticmethod
    def comment_out(string):
        """Return ``string`` wrapped in ``/*`` and ``*/``."""
        return '/*' + string + '*/'

    @staticmethod
    def remove_https(string):
        """Return ``string`` with every ``https`` replaced by ``http``."""
        return string.replace("https", "http")

    @staticmethod
    def add_prefix(prefix):
        """Return an op that prepends ``prefix`` to the matched text."""
        return lambda string: prefix + string

    @staticmethod
    def archival_rewrite(rewriter):
        """Return an op that passes the matched text to ``rewriter.rewrite``."""
        return lambda x: rewriter.rewrite(x)

    @staticmethod
    def replacer(string):
        """Return an op that ignores the matched text and yields ``string``."""
        return lambda x: string

    # http:// or https:// followed by host-like characters; each slash may
    # be preceded by a backslash (e.g. ``http:\/\/host``).
    HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'

    # Op factory applied when a rule's ``op`` is not callable.
    DEFAULT_OP = add_prefix

    def __init__(self, rules):
        """
        Compile ``rules`` (a list of ``(regex, op, count)`` tuples) into a
        single pattern stored on ``self.regex``; the list itself is kept
        on ``self.rules``.
        """
        # Build regexstr, concatenating regex list; each rule gets its own
        # wrapping capture group so replace() can tell which rule matched.
        regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])

        # ensure it's not middle of a word, wrap in non-capture group
        regex_str = r'(?<!\w)(?:' + regex_str + ')'

        self.regex = re.compile(regex_str, re.M)
        self.rules = rules

    def filter(self, m):
        """
        Hook for subclasses: return False to leave match ``m`` untouched.
        The base implementation accepts every match.
        """
        return True

    def rewrite(self, string):
        """Return ``string`` with every rule match replaced."""
        return self.regex.sub(self.replace, string)

    def close(self):
        """Return an empty string; the base rewriter has nothing to flush."""
        return ''

    def replace(self, m):
        """
        Compute the replacement text for match object ``m``.

        Walks the rules in order, tracking the capture-group index that
        belongs to each one, and applies the op of the first rule whose
        target group matched non-empty text. If ``self.filter(m)`` rejects
        the match, or no rule has a non-empty target group, the matched
        text is returned unchanged.
        """
        i = 0
        for _, op, count in self.rules:
            # Group wrapping the whole rule
            i += 1
            full_m = i

            # Skip over the rule's inner groups; the last one is the target
            if count > 0:
                i += count

            if not m.group(i):
                continue

            # Optional filter to skip matches
            if not self.filter(m):
                return m.group(0)

            # A non-callable op is treated as a prefix string
            if not callable(op):
                op = RegexRewriter.DEFAULT_OP(op)

            result = op(m.group(i))

            # if extracting partial match, keep the text of the rule's
            # match that surrounds the target group
            if i != full_m:
                result = (m.string[m.start(full_m):m.start(i)] +
                          result +
                          m.string[m.end(i):m.end(full_m)])

            return result

        # No rule had a non-empty target group. Returning None here would
        # make re.sub drop the matched text (CPython behaviour), so hand
        # the match back unchanged instead.
        return m.group(0)
2013-12-23 15:52:33 -08:00
2014-01-03 13:03:03 -08:00
#=================================================================
2013-12-23 23:57:13 -08:00
class JSRewriter(RegexRewriter):
    r"""
    Regex rewriter for JavaScript text.

    Prefixes http(s) URLs with the value of ``rewriter.get_abs_url()`` and
    prepends ``WB_wombat_`` to ``location`` and to ``domain`` when it
    directly follows ``document.``. Additional ``(regex, op, count)``
    rules may be supplied through ``extra``; they are appended after the
    built-in ones.

    >>> test_js('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_js('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

    >>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
    'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

    # custom rules added
    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [(r'some_func\(\).*', RegexRewriter.comment_out, 0)])
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
    """

    def __init__(self, rewriter, extra=None):
        """
        ``rewriter`` must provide ``get_abs_url()``; its return value is
        used as the URL prefix. ``extra`` is an optional iterable of
        additional rules (``None`` or empty means no extra rules).
        """
        rules = self._create_rules(rewriter.get_abs_url())
        if extra:
            rules.extend(extra)

        RegexRewriter.__init__(self, rules)

    def _create_rules(self, http_prefix):
        """Return the built-in rule list; string ops act as prefixes."""
        return [
            (RegexRewriter.HTTPX_MATCH_STR, http_prefix, 0),
            ('location', 'WB_wombat_', 0),
            # only rewrite 'domain' when directly preceded by 'document.'
            ('(?<=document\\.)domain', 'WB_wombat_', 0),
        ]
2013-12-23 15:52:33 -08:00
2014-01-03 13:03:03 -08:00
#=================================================================
class XMLRewriter(RegexRewriter):
    """
    Regex rewriter for XML text.

    Prefixes http(s) URLs with the value of ``rewriter.get_abs_url()``,
    except when the URL is the value of an attribute whose name starts
    with ``xmlns``. Additional ``(regex, op, count)`` rules passed via
    ``extra`` are appended after the built-in rule.

    >>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
    '<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr="http://example.com"></tag>')
    '<tag xmlns:xsi="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
    '<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'

    >>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef="http://example.com"/> http://example.com </main>')
    '<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef="http://example.com"/> /web/20131010im_/http://example.com </main>'
    """

    def __init__(self, rewriter, extra=None):
        """
        ``rewriter`` must provide ``get_abs_url()``; its return value is
        used as the URL prefix. ``extra`` is an optional iterable of
        additional rules (``None`` or empty means no extra rules).
        """
        rules = self._create_rules(rewriter.get_abs_url())
        # Honour caller-supplied rules, consistent with JSRewriter
        if extra:
            rules.extend(extra)

        RegexRewriter.__init__(self, rules)

    # custom filter to reject 'xmlns' attr
    def filter(self, m):
        """
        Return False when the built-in rule matched an ``xmlns...``
        attribute, so that namespace URIs are left untouched.
        """
        # Group 1 is the full text matched by the first (built-in) rule;
        # it begins with the attribute name when one was captured. It is
        # None when a different rule produced the match.
        attr = m.group(1)
        if attr and attr.startswith('xmlns'):
            return False

        return True

    def _create_rules(self, http_prefix):
        """
        Return the built-in rule: an optional ``name=`` / ``name `` lead-in,
        optional quotes or whitespace, then the URL. The rule declares two
        inner groups, so only the URL (the second) gets the prefix.
        """
        return [
            ('([A-Za-z:]+[\\s=]+)?["\'\\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
        ]
#=================================================================
2013-12-23 23:57:13 -08:00
class CSSRewriter(RegexRewriter):
    r"""
    Regex rewriter for CSS text.

    Passes the target of ``url(...)`` values, and of ``@import``
    statements that do not use ``url(...)``, to ``rewriter.rewrite`` and
    substitutes the result in place.

    >>> test_css("background: url('/some/path.html')")
    "background: url('/web/20131010im_/http://example.com/some/path.html')"

    >>> test_css("background: url('../path.html')")
    "background: url('/web/20131010im_/http://example.com/path.html')"

    >>> test_css("background: url(\"http://domain.com/path.html\")")
    'background: url("/web/20131010im_/http://domain.com/path.html")'

    >>> test_css("background: url(file.jpeg)")
    'background: url(/web/20131010im_/http://example.com/file.jpeg)'

    >>> test_css("background: url('')")
    "background: url('')"

    >>> test_css("background: url (\"weirdpath\')")
    'background: url ("/web/20131010im_/http://example.com/weirdpath\')'

    >>> test_css("@import url ('path.css')")
    "@import url ('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import url('path.css')")
    "@import url('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import ('path.css')")
    "@import ('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import \"path.css\"")
    '@import "/web/20131010im_/http://example.com/path.css"'

    >>> test_css("@import ('../path.css\"")
    '@import (\'/web/20131010im_/http://example.com/path.css"'

    >>> test_css("@import ('../url.css\"")
    '@import (\'/web/20131010im_/http://example.com/url.css"'

    >>> test_css("@import (\"url.css\")")
    '@import ("/web/20131010im_/http://example.com/url.css")'

    >>> test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n@import url(/and_a_third.css)")
    '@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n@import url(/web/20131010im_/http://example.com/and_a_third.css)'
    """

    # url( ... ): optional whitespace, then any mix of backslashes/quotes
    # around the captured target, up to the closing parenthesis.
    CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"

    # @import not followed by url(...): optional '(' and opening quote,
    # then the captured target.
    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\\w.:/\\\\-]+)"

    def __init__(self, rewriter):
        """``rewriter`` must provide ``rewrite(url)``."""
        rules = self._create_rules(rewriter)

        RegexRewriter.__init__(self, rules)

    def _create_rules(self, rewriter):
        """
        Return the two CSS rules. Each declares one inner group, so only
        the captured target is passed to ``rewriter.rewrite``.
        """
        rewrite_op = RegexRewriter.archival_rewrite(rewriter)
        return [
            (CSSRewriter.CSS_URL_REGEX, rewrite_op, 1),
            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, rewrite_op, 1),
        ]
2014-01-22 15:28:01 -08:00
import utils
# Doctest harness: defines the helpers referenced by the class docstrings
# above and runs the doctests. Active when executed as a script or when
# utils.enable_doctests() returns a true value.
# NOTE(review): enable_doctests() is defined in utils, not here -- confirm
# under which conditions it enables this block.
if __name__ == "__main__" or utils.enable_doctests():
    # Shared URL rewriter used by every helper below
    arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/')

    def test_js(string, extra=()):
        """Rewrite ``string`` with a JSRewriter, optionally with extra rules."""
        return JSRewriter(arcrw, extra).rewrite(string)

    def test_xml(string):
        """Rewrite ``string`` with an XMLRewriter."""
        return XMLRewriter(arcrw).rewrite(string)

    def test_css(string):
        """Rewrite ``string`` with a CSSRewriter."""
        return CSSRewriter(arcrw).rewrite(string)

    import doctest
    doctest.testmod()