mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
new unified config system, via rules.yaml!
Contains configs for cdx canonicalization, fuzzy matching, and rewriting. Rewriting: ability to add custom regexes per domain; also, ability to toggle js rewriting and to set a custom rewriting file (default is wombat.js).
This commit is contained in:
parent
349a1a7a3a
commit
5a41f59f39
@ -2,6 +2,9 @@
|
|||||||
omit =
|
omit =
|
||||||
*/test/*
|
*/test/*
|
||||||
*/tests/*
|
*/tests/*
|
||||||
|
*.html
|
||||||
|
*.js
|
||||||
|
*.css
|
||||||
|
|
||||||
[report]
|
[report]
|
||||||
exclude_lines =
|
exclude_lines =
|
||||||
|
@ -3,31 +3,38 @@ import re
|
|||||||
import logging
|
import logging
|
||||||
import pkgutil
|
import pkgutil
|
||||||
|
|
||||||
|
from pywb.utils.dsrules import BaseRule, RuleSet
|
||||||
|
|
||||||
from canonicalize import unsurt, UrlCanonicalizer
|
from canonicalize import unsurt, UrlCanonicalizer
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def load_domain_specific_cdx_rules(filename, surt_ordered):
|
def load_domain_specific_cdx_rules(filename, surt_ordered):
|
||||||
fh = pkgutil.get_data(__package__, filename)
|
#fh = pkgutil.get_data(__package__, filename)
|
||||||
config = yaml.load(fh)
|
#config = yaml.load(fh)
|
||||||
|
|
||||||
|
canon = None
|
||||||
|
fuzzy = None
|
||||||
|
|
||||||
# Load Canonicalizer Rules
|
# Load Canonicalizer Rules
|
||||||
rules = StartsWithRule.load_rules(config.get('canon_rules'),
|
rules = RuleSet(CDXDomainSpecificRule, 'canonicalize')
|
||||||
surt_ordered)
|
|
||||||
|
if not surt_ordered:
|
||||||
|
for rule in rules:
|
||||||
|
rule.unsurt()
|
||||||
|
|
||||||
if rules:
|
if rules:
|
||||||
canon = CustomUrlCanonicalizer(rules, surt_ordered)
|
canon = CustomUrlCanonicalizer(rules, surt_ordered)
|
||||||
else:
|
|
||||||
canon = None
|
|
||||||
|
|
||||||
# Load Fuzzy Lookup Rules
|
# Load Fuzzy Lookup Rules
|
||||||
rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
|
rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup')
|
||||||
surt_ordered)
|
|
||||||
|
if not surt_ordered:
|
||||||
|
for rule in rules:
|
||||||
|
rule.unsurt()
|
||||||
|
|
||||||
if rules:
|
if rules:
|
||||||
fuzzy = FuzzyQuery(rules)
|
fuzzy = FuzzyQuery(rules)
|
||||||
else:
|
|
||||||
fuzzy = None
|
|
||||||
|
|
||||||
logging.debug('CANON: ' + str(canon))
|
logging.debug('CANON: ' + str(canon))
|
||||||
logging.debug('FUZZY: ' + str(fuzzy))
|
logging.debug('FUZZY: ' + str(fuzzy))
|
||||||
@ -43,10 +50,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer):
|
|||||||
def __call__(self, url):
|
def __call__(self, url):
|
||||||
urlkey = super(CustomUrlCanonicalizer, self).__call__(url)
|
urlkey = super(CustomUrlCanonicalizer, self).__call__(url)
|
||||||
|
|
||||||
for rule in self.rules:
|
for rule in self.rules.iter_matching(urlkey):
|
||||||
if not any(urlkey.startswith(x) for x in rule.starts):
|
|
||||||
continue
|
|
||||||
|
|
||||||
m = rule.regex.match(urlkey)
|
m = rule.regex.match(urlkey)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
@ -68,10 +72,7 @@ class FuzzyQuery:
|
|||||||
urlkey = params['key']
|
urlkey = params['key']
|
||||||
url = params['url']
|
url = params['url']
|
||||||
|
|
||||||
for rule in self.rules:
|
for rule in self.rules.iter_matching(urlkey):
|
||||||
if not any(urlkey.startswith(x) for x in rule.starts):
|
|
||||||
continue
|
|
||||||
|
|
||||||
m = rule.regex.search(urlkey)
|
m = rule.regex.search(urlkey)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
@ -96,20 +97,29 @@ class FuzzyQuery:
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class StartsWithRule:
|
class CDXDomainSpecificRule(BaseRule):
|
||||||
def __init__(self, config, surt_ordered=True):
|
def __init__(self, name, config):
|
||||||
self.starts = config.get('startswith')
|
super(CDXDomainSpecificRule, self).__init__(name, config)
|
||||||
if not isinstance(self.starts, list):
|
|
||||||
self.starts = [self.starts]
|
|
||||||
|
|
||||||
self.regex = re.compile(config.get('matches'))
|
if isinstance(config, basestring):
|
||||||
self.replace = config.get('replace')
|
self.regex = re.compile(config)
|
||||||
|
self.replace = None
|
||||||
|
else:
|
||||||
|
self.regex = re.compile(config.get('match'))
|
||||||
|
self.replace = config.get('replace')
|
||||||
|
|
||||||
def unsurt(self):
|
def unsurt(self):
|
||||||
# must convert to non-surt form
|
"""
|
||||||
self.starts = map(unsurt, self.starts)
|
urlkey is assumed to be in surt format by default
|
||||||
self.regex = unsurt(self.regex)
|
In the case of non-surt format, this method is called
|
||||||
self.replace = unsurt(self.replace)
|
to desurt any urls
|
||||||
|
"""
|
||||||
|
self.url_prefix = map(unsurt, self.url_prefix)
|
||||||
|
if self.regex:
|
||||||
|
self.regex = unsurt(self.regex)
|
||||||
|
|
||||||
|
if self.replace:
|
||||||
|
self.replace = unsurt(self.replace)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def load_rules(rules_config, surt_ordered=True):
|
def load_rules(rules_config, surt_ordered=True):
|
||||||
|
@ -1,24 +0,0 @@
|
|||||||
|
|
||||||
fuzzy_lookup_rules:
|
|
||||||
- startswith: 'com,twitter)/i/profiles/show/'
|
|
||||||
matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
|
|
||||||
|
|
||||||
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
|
|
||||||
matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
|
|
||||||
|
|
||||||
- startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
|
|
||||||
matches: '([^/]+(?:\.css|\.js))'
|
|
||||||
|
|
||||||
# matches all urls
|
|
||||||
- startswith: ''
|
|
||||||
matches: '[&?](?:_|uncache)=[\d]+[&]?'
|
|
||||||
|
|
||||||
canon_rules:
|
|
||||||
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
|
|
||||||
matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
|
|
||||||
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -4,11 +4,16 @@ import itertools
|
|||||||
|
|
||||||
from url_rewriter import UrlRewriter
|
from url_rewriter import UrlRewriter
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RegexRewriter(object):
|
class RegexRewriter(object):
|
||||||
|
#@staticmethod
|
||||||
|
#def comment_out(string):
|
||||||
|
# return '/*' + string + '*/'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def comment_out(string):
|
def format(template):
|
||||||
return '/*' + string + '*/'
|
return lambda string: template.format(string)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def remove_https(string):
|
def remove_https(string):
|
||||||
@ -20,19 +25,16 @@ class RegexRewriter(object):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def archival_rewrite(rewriter):
|
def archival_rewrite(rewriter):
|
||||||
return lambda x: rewriter.rewrite(x)
|
return lambda string: rewriter.rewrite(string)
|
||||||
|
|
||||||
@staticmethod
|
#@staticmethod
|
||||||
def replacer(string):
|
#def replacer(other):
|
||||||
return lambda x: string
|
# return lambda m, string: other
|
||||||
|
|
||||||
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
|
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_OP = add_prefix
|
DEFAULT_OP = add_prefix
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, rules):
|
def __init__(self, rules):
|
||||||
#rules = self.create_rules(http_prefix)
|
#rules = self.create_rules(http_prefix)
|
||||||
|
|
||||||
@ -76,52 +78,68 @@ class RegexRewriter(object):
|
|||||||
op = RegexRewriter.DEFAULT_OP(op)
|
op = RegexRewriter.DEFAULT_OP(op)
|
||||||
|
|
||||||
result = op(m.group(i))
|
result = op(m.group(i))
|
||||||
|
final_str = result
|
||||||
|
|
||||||
# if extracting partial match
|
# if extracting partial match
|
||||||
if i != full_m:
|
if i != full_m:
|
||||||
result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
|
final_str = m.string[m.start(full_m):m.start(i)]
|
||||||
|
final_str += result
|
||||||
|
final_str += m.string[m.end(i):m.end(full_m)]
|
||||||
|
|
||||||
|
return final_str
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_rules_from_config(config):
|
||||||
|
def parse_rule(obj):
|
||||||
|
match = obj.get('match')
|
||||||
|
replace = RegexRewriter.format(obj.get('replace', '{0}'))
|
||||||
|
group = obj.get('group', 0)
|
||||||
|
result = (match, replace, group)
|
||||||
return result
|
return result
|
||||||
|
return map(parse_rule, config)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class JSLinkRewriter(RegexRewriter):
|
class JSLinkOnlyRewriter(RegexRewriter):
|
||||||
"""
|
"""
|
||||||
JS Rewriter which rewrites absolute http://, https:// and // urls
|
JS Rewriter which rewrites absolute http://, https:// and // urls
|
||||||
at the beginning of a string
|
at the beginning of a string
|
||||||
"""
|
"""
|
||||||
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
|
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
|
||||||
|
|
||||||
def __init__(self, rewriter, rules = []):
|
def __init__(self, rewriter, rules=[]):
|
||||||
rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
|
rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
|
||||||
super(JSLinkRewriter, self).__init__(rules)
|
super(JSLinkOnlyRewriter, self).__init__(rules)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class JSLocationAndLinkRewriter(JSLinkRewriter):
|
class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
|
||||||
"""
|
"""
|
||||||
JS Rewriter which also rewrites location and domain to the
|
JS Rewriter which also rewrites location and domain to the
|
||||||
specified prefix (default: 'WB_wombat_')
|
specified prefix (default: 'WB_wombat_')
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'):
|
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
|
||||||
rules = rules + [
|
rules = rules + [
|
||||||
(r'(?<!/)\blocation\b', prefix, 0),
|
(r'(?<!/)\blocation\b', prefix, 0),
|
||||||
(r'(?<=document\.)domain', prefix, 0),
|
(r'(?<=document\.)domain', prefix, 0),
|
||||||
]
|
]
|
||||||
super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules)
|
#import sys
|
||||||
|
#sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
|
||||||
|
super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Set 'default' JSRewriter
|
# Set 'default' JSRewriter
|
||||||
JSRewriter = JSLocationAndLinkRewriter
|
JSRewriter = JSLinkAndLocationRewriter
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class XMLRewriter(RegexRewriter):
|
class XMLRewriter(RegexRewriter):
|
||||||
def __init__(self, rewriter, extra = []):
|
def __init__(self, rewriter, extra=[]):
|
||||||
rules = self._create_rules(rewriter.get_abs_url())
|
rules = self._create_rules(rewriter.get_abs_url())
|
||||||
|
|
||||||
RegexRewriter.__init__(self, rules)
|
super(XMLRewriter, self).__init__(rules)
|
||||||
|
|
||||||
# custom filter to reject 'xmlns' attr
|
# custom filter to reject 'xmlns' attr
|
||||||
def filter(self, m):
|
def filter(self, m):
|
||||||
@ -133,24 +151,28 @@ class XMLRewriter(RegexRewriter):
|
|||||||
|
|
||||||
def _create_rules(self, http_prefix):
|
def _create_rules(self, http_prefix):
|
||||||
return [
|
return [
|
||||||
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
|
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
|
||||||
|
RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CSSRewriter(RegexRewriter):
|
class CSSRewriter(RegexRewriter):
|
||||||
|
|
||||||
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
|
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
|
||||||
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
|
|
||||||
|
CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
|
||||||
|
"(?!url[\\s\\(])([\w.:/\\\\-]+)")
|
||||||
|
|
||||||
def __init__(self, rewriter):
|
def __init__(self, rewriter):
|
||||||
rules = self._create_rules(rewriter)
|
rules = self._create_rules(rewriter)
|
||||||
|
super(CSSRewriter, self).__init__(rules)
|
||||||
RegexRewriter.__init__(self, rules)
|
|
||||||
|
|
||||||
|
|
||||||
def _create_rules(self, rewriter):
|
def _create_rules(self, rewriter):
|
||||||
return [
|
return [
|
||||||
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
|
(CSSRewriter.CSS_URL_REGEX,
|
||||||
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
|
RegexRewriter.archival_rewrite(rewriter), 1),
|
||||||
|
|
||||||
|
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
|
||||||
|
RegexRewriter.archival_rewrite(rewriter), 1),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,30 +1,24 @@
|
|||||||
import chardet
|
import chardet
|
||||||
|
import pkgutil
|
||||||
|
import yaml
|
||||||
|
|
||||||
from url_rewriter import UrlRewriter
|
from header_rewriter import RewrittenStatusAndHeaders
|
||||||
from html_rewriter import HTMLRewriter
|
|
||||||
from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
|
|
||||||
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
|
|
||||||
|
|
||||||
|
from rewriterules import RewriteRules
|
||||||
|
|
||||||
|
from pywb.utils.dsrules import RuleSet
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
|
||||||
|
|
||||||
|
|
||||||
class RewriteContent:
|
class RewriteContent:
|
||||||
|
def __init__(self, config=None):
|
||||||
|
self.ruleset = RuleSet(RewriteRules, 'rewrite', config, {})
|
||||||
|
|
||||||
DEFAULT_CONTENT_REWRITERS = {
|
def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
|
||||||
'header': HeaderRewriter,
|
header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']
|
||||||
'js': JSRewriter,
|
|
||||||
'css': CSSRewriter,
|
|
||||||
'xml': XMLRewriter,
|
|
||||||
'html': HTMLRewriter
|
|
||||||
}
|
|
||||||
|
|
||||||
|
rewritten_headers = header_rewriter_class().rewrite(status_headers, urlrewriter)
|
||||||
def __init__(self, rewriters = {}):
|
|
||||||
self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())
|
|
||||||
|
|
||||||
|
|
||||||
def rewrite_headers(self, urlrewriter, status_headers, stream):
|
|
||||||
rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)
|
|
||||||
|
|
||||||
# note: since chunking may be broken, approach taken here is to *always* attempt
|
# note: since chunking may be broken, approach taken here is to *always* attempt
|
||||||
# to dechunk if transfer-encoding: chunked is present
|
# to dechunk if transfer-encoding: chunked is present
|
||||||
@ -37,7 +31,8 @@ class RewriteContent:
|
|||||||
|
|
||||||
return (rewritten_headers, stream)
|
return (rewritten_headers, stream)
|
||||||
|
|
||||||
def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None):
|
def rewrite_content(self, urlrewriter, headers, stream, head_insert_str=None, urlkey=''):
|
||||||
|
|
||||||
# see if we've already rewritten headers
|
# see if we've already rewritten headers
|
||||||
if isinstance(headers, RewrittenStatusAndHeaders):
|
if isinstance(headers, RewrittenStatusAndHeaders):
|
||||||
rewritten_headers = headers
|
rewritten_headers = headers
|
||||||
@ -50,9 +45,11 @@ class RewriteContent:
|
|||||||
return (status_headers, gen)
|
return (status_headers, gen)
|
||||||
|
|
||||||
status_headers = rewritten_headers.status_headers
|
status_headers = rewritten_headers.status_headers
|
||||||
|
|
||||||
# Handle text content rewriting
|
# Handle text content rewriting
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# special case -- need to ungzip the body
|
# special case -- need to ungzip the body
|
||||||
|
|
||||||
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
|
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
|
||||||
stream = DecompressingBufferedReader(stream, decomp_type='gzip')
|
stream = DecompressingBufferedReader(stream, decomp_type='gzip')
|
||||||
|
|
||||||
@ -68,13 +65,25 @@ class RewriteContent:
|
|||||||
|
|
||||||
text_type = rewritten_headers.text_type
|
text_type = rewritten_headers.text_type
|
||||||
|
|
||||||
rewriter_class = self.rewriters.get(text_type)
|
#rewriter_class = self.rewriters.get(text_type)
|
||||||
if not rewriter_class:
|
rule = self.ruleset.get_first_match(urlkey)
|
||||||
|
|
||||||
|
try:
|
||||||
|
rewriter_class = rule.rewriters[text_type]
|
||||||
|
except KeyError:
|
||||||
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
|
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
|
||||||
|
|
||||||
|
#import sys
|
||||||
|
#sys.stderr.write(str(vars(self.ruleset.get_first_match(urlkey))))
|
||||||
|
|
||||||
if text_type == 'html':
|
if text_type == 'html':
|
||||||
rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str)
|
head_insert_str = rule.create_head_inserts() + head_insert_str
|
||||||
|
|
||||||
|
rewriter = rewriter_class(urlrewriter,
|
||||||
|
outstream=None,
|
||||||
|
js_rewriter_class=rule.rewriters['js'],
|
||||||
|
css_rewriter_class=rule.rewriters['css'],
|
||||||
|
head_insert=head_insert_str)
|
||||||
else:
|
else:
|
||||||
rewriter = rewriter_class(urlrewriter)
|
rewriter = rewriter_class(urlrewriter)
|
||||||
|
|
||||||
|
@ -2,12 +2,17 @@ import urllib2
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import datetime
|
import datetime
|
||||||
|
import mimetypes
|
||||||
|
|
||||||
|
from pywb.utils.loaders import is_http
|
||||||
from pywb.utils.timeutils import datetime_to_timestamp
|
from pywb.utils.timeutils import datetime_to_timestamp
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
|
|
||||||
|
from pywb.cdx.canonicalize import canonicalize
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Fetch a url from live web and apply rewriting rules
|
Fetch a url from live web and apply rewriting rules
|
||||||
"""
|
"""
|
||||||
@ -26,10 +31,33 @@ def get_status_and_stream(url):
|
|||||||
return (status_headers, stream)
|
return (status_headers, stream)
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def get_rewritten(url, urlrewriter):
|
def get_local_file(uri):
|
||||||
(status_headers, stream) = get_status_and_stream(url)
|
fh = open(uri)
|
||||||
|
|
||||||
status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)
|
content_type, _ = mimetypes.guess_type(uri)
|
||||||
|
|
||||||
|
# create fake headers for local file
|
||||||
|
status_headers = StatusAndHeaders('200 OK', [('Content-Type', content_type)])
|
||||||
|
stream = fh
|
||||||
|
|
||||||
|
return (status_headers, stream)
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def get_rewritten(url, urlrewriter, urlkey=None):
|
||||||
|
if is_http(url):
|
||||||
|
(status_headers, stream) = get_status_and_stream(url)
|
||||||
|
else:
|
||||||
|
(status_headers, stream) = get_local_file(url)
|
||||||
|
|
||||||
|
# explicit urlkey may be passed in (say for testing)
|
||||||
|
if not urlkey:
|
||||||
|
urlkey = canonicalize(url)
|
||||||
|
|
||||||
|
status_headers, gen = RewriteContent().rewrite_content(urlrewriter,
|
||||||
|
status_headers,
|
||||||
|
stream,
|
||||||
|
head_insert_str='',
|
||||||
|
urlkey=urlkey)
|
||||||
|
|
||||||
buff = ''
|
buff = ''
|
||||||
for x in gen:
|
for x in gen:
|
||||||
|
@ -121,7 +121,7 @@ r"""
|
|||||||
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
|
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
|
||||||
|
|
||||||
# custom rules added
|
# custom rules added
|
||||||
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
|
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
|
||||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
|
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
|
||||||
|
|
||||||
# scheme-agnostic
|
# scheme-agnostic
|
||||||
|
@ -1,12 +1,39 @@
|
|||||||
from pywb.rewrite.rewrite_live import get_rewritten
|
from pywb.rewrite.rewrite_live import get_rewritten
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
|
||||||
|
from pywb import get_test_dir
|
||||||
|
|
||||||
# This module has some rewriting tests against the 'live web'
|
# This module has some rewriting tests against the 'live web'
|
||||||
# As such, the content may change and the test may break
|
# As such, the content may change and the test may break
|
||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_1():
|
||||||
|
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/')
|
||||||
|
|
||||||
|
# wombat insert added
|
||||||
|
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
|
||||||
|
|
||||||
|
# location rewritten
|
||||||
|
assert 'window.WB_wombat_location = "/other.html"' in buff
|
||||||
|
|
||||||
|
# link rewritten
|
||||||
|
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_2_no_js_location_rewrite():
|
||||||
|
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite')
|
||||||
|
|
||||||
|
# no wombat insert
|
||||||
|
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
|
||||||
|
|
||||||
|
# no location rewrite
|
||||||
|
assert 'window.location = "/other.html"' in buff
|
||||||
|
|
||||||
|
# still link rewrite
|
||||||
|
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
||||||
|
|
||||||
def test_example_1():
|
def test_example_1():
|
||||||
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
|
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
|
||||||
|
|
||||||
@ -24,9 +51,10 @@ def test_example_2():
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
#def test_example_3():
|
def test_example_domain_specific_3():
|
||||||
# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
|
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
|
||||||
|
|
||||||
# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
|
# comment out bootloader
|
||||||
|
assert '/* Bootloader.configurePage' in buff, buff
|
||||||
|
|
||||||
|
|
||||||
|
49
pywb/rules.yaml
Normal file
49
pywb/rules.yaml
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
# twitter rules
|
||||||
|
#=================================================================
|
||||||
|
- url_prefix: 'com,twitter)/i/profiles/show/'
|
||||||
|
|
||||||
|
fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
|
||||||
|
|
||||||
|
|
||||||
|
# facebook rules
|
||||||
|
#=================================================================
|
||||||
|
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
|
||||||
|
|
||||||
|
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
|
||||||
|
|
||||||
|
canonicalize:
|
||||||
|
match: 'com,facebook\)/.*[?&]data=([^&]+).*'
|
||||||
|
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
|
||||||
|
|
||||||
|
|
||||||
|
- url_prefix: 'com,facebook)/'
|
||||||
|
rewrite:
|
||||||
|
js_regexs:
|
||||||
|
- match: 'Bootloader\.configurePage.*'
|
||||||
|
replace: '/* {0} */'
|
||||||
|
|
||||||
|
|
||||||
|
# yahoo rules
|
||||||
|
#=================================================================
|
||||||
|
- url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
|
||||||
|
|
||||||
|
fuzzy_lookup: '([^/]+(?:\.css|\.js))'
|
||||||
|
|
||||||
|
|
||||||
|
# testing rules -- not for valid domain
|
||||||
|
#=================================================================
|
||||||
|
# this rule block is a non-existent prefix merely for testing
|
||||||
|
- url_prefix: 'example,example,test)/nolocation_rewrite'
|
||||||
|
|
||||||
|
rewrite:
|
||||||
|
js_rewrite_location: False
|
||||||
|
|
||||||
|
|
||||||
|
# all domain rules -- fallback to this dataset
|
||||||
|
#=================================================================
|
||||||
|
# Applies to all urls -- should be last
|
||||||
|
- url_prefix: ''
|
||||||
|
fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?'
|
@ -9,6 +9,7 @@ import urllib2
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
def is_http(filename):
|
def is_http(filename):
|
||||||
return any(filename.startswith(x) for x in ['http://', 'https://'])
|
return any(filename.startswith(x) for x in ['http://', 'https://'])
|
||||||
|
|
||||||
|
14
sample_archive/text_content/sample.html
Normal file
14
sample_archive/text_content/sample.html
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Sample Page For Rewrite Test</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<script>
|
||||||
|
var some_val = false;
|
||||||
|
if (some_val) {
|
||||||
|
window.location = "/other.html";
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
Test Content
|
||||||
|
<a href="another.html">Some Link</a>
|
||||||
|
</body>
|
3
setup.py
3
setup.py
@ -15,7 +15,8 @@ setuptools.setup(name='pywb',
|
|||||||
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||||
package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
|
package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
|
||||||
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
|
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
|
||||||
|
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
|
||||||
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
|
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
|
||||||
# tests_require=['WebTest', 'pytest'],
|
# tests_require=['WebTest', 'pytest'],
|
||||||
zip_safe=False)
|
zip_safe=False)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user