mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge remote-tracking branch 'origin/master' into cdx-server

Conflicts:
	pywb/cdx/cdxdomainspecific.py
	pywb/cdx/cdxserver.py
	pywb/cdx/test/cdxserver_test.py
	setup.py
	tests/test_integration.py

commit 1f65eff828
@@ -2,6 +2,9 @@
 omit =
     */test/*
     */tests/*
+    *.html
+    *.js
+    *.css
 
 [report]
 exclude_lines =
@@ -3,9 +3,8 @@ python:
   - "2.7"
 # command to install dependencies
 install:
-  - "python setup.py -q install"
-  - "pip install python-coveralls"
-  - "pip install pytest-cov"
+  - python setup.py -q install
+  - pip install coverage pytest-cov coveralls --use-mirrors
 # command to run tests
 #script: nosetests --with-doctest
 #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
@@ -2,6 +2,7 @@ PyWb 0.2 Beta
 ==============
 
 [](https://travis-ci.org/ikreymer/pywb)
+[](https://coveralls.io/r/ikreymer/pywb?branch=master)
 
 pywb is a Python re-implementation of the Wayback Machine software.
 
@@ -50,7 +50,10 @@ class Route:
 
     def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
         self.path = regex
-        self.regex = re.compile(regex + lookahead)
+        if regex:
+            self.regex = re.compile(regex + lookahead)
+        else:
+            self.regex = re.compile('')
         self.handler = handler
         # collection id from regex group (default 0)
         self.coll_group = coll_group
@@ -70,7 +73,6 @@ class Route:
             return None
 
         matched_str = matcher.group(0)
-
         if matched_str:
             rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
             wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
@@ -3,34 +3,43 @@ import re
 import logging
 import pkg_resources
 
-from canonicalize import unsurt, UrlCanonicalizer
+from pywb.utils.dsrules import BaseRule, RuleSet
 
+from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
 
 
 #=================================================================
-def load_domain_specific_cdx_rules(filename, surt_ordered):
-    fh = pkg_resources.resource_string(__name__, filename)
-    config = yaml.load(fh)
+def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
+    #fh = pkg_resources.resource_string(__name__, filename)
+    #config = yaml.load(fh)
 
+    canon = None
+    fuzzy = None
+
     # Load Canonicalizer Rules
-    rules = StartsWithRule.load_rules(config.get('canon_rules'),
-                                      surt_ordered)
+    rules = RuleSet(CDXDomainSpecificRule, 'canonicalize',
+                    ds_rules_file=ds_rules_file)
+
+    if not surt_ordered:
+        for rule in rules:
+            rule.unsurt()
 
     if rules:
         canon = CustomUrlCanonicalizer(rules, surt_ordered)
-    else:
-        canon = None
 
     # Load Fuzzy Lookup Rules
-    rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
-                                      surt_ordered)
+    rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup',
+                    ds_rules_file=ds_rules_file)
+
+    if not surt_ordered:
+        for rule in rules:
+            rule.unsurt()
 
     if rules:
         fuzzy = FuzzyQuery(rules)
-    else:
-        fuzzy = None
 
-    logging.debug('CANON: ' + str(canon))
-    logging.debug('FUZZY: ' + str(fuzzy))
+    logging.debug('CustomCanonilizer? ' + str(bool(canon)))
+    logging.debug('FuzzyMatcher? ' + str(bool(canon)))
     return (canon, fuzzy)
 
 
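For orientation, the rewritten loader is now keyed by a rules-file path rather than a parsed yaml blob; a minimal usage sketch (both return values are None when no matching rule blocks exist):

    canon, fuzzy = load_domain_specific_cdx_rules('rules.yaml', surt_ordered=True)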
@@ -43,10 +52,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer):
     def __call__(self, url):
         urlkey = super(CustomUrlCanonicalizer, self).__call__(url)
 
-        for rule in self.rules:
-            if not any(urlkey.startswith(x) for x in rule.starts):
-                continue
-
+        for rule in self.rules.iter_matching(urlkey):
             m = rule.regex.match(urlkey)
             if not m:
                 continue
@@ -67,11 +73,10 @@ class FuzzyQuery:
 
         urlkey = params['key']
         url = params['url']
+        filter_ = params.get('filter', [])
+        output = params.get('output')
 
-        for rule in self.rules:
-            if not any(urlkey.startswith(x) for x in rule.starts):
-                continue
-
+        for rule in self.rules.iter_matching(urlkey):
             m = rule.regex.search(urlkey)
             if not m:
                 continue
@@ -79,7 +84,7 @@ class FuzzyQuery:
             matched_rule = rule
 
             if len(m.groups()) == 1:
-                params['filter'] = '=urlkey:' + m.group(1)
+                filter_.append('~urlkey:' + m.group(1))
 
             break
 
@@ -88,28 +93,40 @@ class FuzzyQuery:
 
         inx = url.find('?')
         if inx > 0:
-            params['url'] = url[:inx + 1]
+            url = url[:inx + 1]
 
+        params = {'url': url,
+                  'matchType': 'prefix',
+                  'filter': filter_,
+                  'output': output}
 
-        params['matchType'] = 'prefix'
-        params['key'] = None
         return params
 
 
 #=================================================================
-class StartsWithRule:
-    def __init__(self, config, surt_ordered=True):
-        self.starts = config.get('startswith')
-        if not isinstance(self.starts, list):
-            self.starts = [self.starts]
+class CDXDomainSpecificRule(BaseRule):
+    def __init__(self, name, config):
+        super(CDXDomainSpecificRule, self).__init__(name, config)
 
-        self.regex = re.compile(config.get('matches'))
-        self.replace = config.get('replace')
+        if isinstance(config, basestring):
+            self.regex = re.compile(config)
+            self.replace = None
+        else:
+            self.regex = re.compile(config.get('match'))
+            self.replace = config.get('replace')
 
     def unsurt(self):
-        # must convert to non-surt form
-        self.starts = map(unsurt, self.starts)
-        self.regex = unsurt(self.regex)
-        self.replace = unsurt(self.replace)
+        """
+        urlkey is assumed to be in surt format by default
+        In the case of non-surt format, this method is called
+        to desurt any urls
+        """
+        self.url_prefix = map(unsurt, self.url_prefix)
+        if self.regex:
+            self.regex = unsurt(self.regex)
+
+        if self.replace:
+            self.replace = unsurt(self.replace)
 
     @staticmethod
     def load_rules(rules_config, surt_ordered=True):
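The rule class now accepts either a bare regex string or a dict with match/replace keys, mirroring the two shapes a rule body can take in the rules file. An illustrative construction of each form:

    # string form: regex only (as used for a fuzzy_lookup value)
    rule = CDXDomainSpecificRule('com,twitter)/i/profiles/show/',
                                 '/profiles/show/.*with_replies\?.*(max_id=[^&]+)')

    # dict form: match + replace (as used for a canonicalize block)
    rule = CDXDomainSpecificRule(
        'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet',
        {'match': 'com,facebook\)/.*[?&]data=([^&]+).*',
         'replace': 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\\1'})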
@@ -151,9 +151,15 @@ def cdx_filter(cdx_iter, filter_strings):
             if self.invert:
                 string = string[1:]
 
-            self.exact = string.startswith('=')
-            if self.exact:
+            # exact match
+            if string.startswith('='):
                 string = string[1:]
+                self.compare_func = self.exact
+            elif string.startswith('~'):
+                string = string[1:]
+                self.compare_func = self.contains
+            else:
+                self.compare_func = self.regex
 
             parts = string.split(':', 1)
             # no field set, apply filter to entire cdx
@@ -164,19 +170,28 @@ def cdx_filter(cdx_iter, filter_strings):
                 self.field = parts[0]
                 string = parts[1]
 
-            if self.exact:
-                self.exact_str = string
-            else:
-                self.regex = re.compile(string)
+            # make regex if regex mode
+            if self.compare_func == self.regex:
+                self.regex = re.compile(string)
+            else:
+                self.filter_str = string
 
         def __call__(self, cdx):
             val = cdx[self.field] if self.field else str(cdx)
-            if self.exact:
-                matched = (self.exact_str == val)
-            else:
-                matched = self.regex.match(val) is not None
+            matched = self.compare_func(val)
             return matched ^ self.invert
 
+        def exact(self, val):
+            return (self.filter_str == val)
+
+        def contains(self, val):
+            return (self.filter_str in val)
+
+        def regex(self, val):
+            return self.regex.match(val) is not None
+
     filters = map(Filter, filter_strings)
 
     for cdx in cdx_iter:
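Taken together, the filter grammar after this change is: an optional '!' inverts the result, then '=' selects exact match, '~' selects substring match, and a bare expression is treated as a regex; an optional 'field:' prefix limits the comparison to one cdx field. A self-contained sketch of these semantics (illustrative, not pywb's actual Filter class; it assumes a field is always given):

    import re

    def parse_filter(string):
        invert = string.startswith('!')
        if invert:
            string = string[1:]

        if string.startswith('='):        # exact
            op = lambda expected, val: expected == val
            string = string[1:]
        elif string.startswith('~'):      # contains
            op = lambda expected, val: expected in val
            string = string[1:]
        else:                             # regex (default)
            op = lambda expected, val: re.match(expected, val) is not None

        field, _, expected = string.partition(':')
        return lambda cdx: op(expected, cdx[field]) ^ invert

    f = parse_filter('!~urlkey:example=1')
    print(f({'urlkey': 'com,example)/?example=1'}))  # False -- filtered out
    print(f({'urlkey': 'com,example)/'}))            # True  -- kept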
@@ -1,4 +1,4 @@
-from canonicalize import UrlCanonicalizer, calc_search_range
+from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
 
 from cdxops import cdx_load
 from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
@@ -17,13 +17,13 @@ import urlparse
 #=================================================================
 class BaseCDXServer(object):
     def __init__(self, **kwargs):
-        ds_rules = kwargs.get('ds_rules')
+        ds_rules_file = kwargs.get('ds_rules_file')
         surt_ordered = kwargs.get('surt_ordered', True)
 
         # load from domain-specific rules
-        if ds_rules:
+        if ds_rules_file:
             self.url_canon, self.fuzzy_query = (
-                load_domain_specific_cdx_rules(ds_rules, surt_ordered))
+                load_domain_specific_cdx_rules(ds_rules_file, surt_ordered))
         # or custom passed in canonicalizer
         else:
             self.url_canon = kwargs.get('url_canon')
@@ -50,14 +50,14 @@ class BaseCDXServer(object):
 
         url = params['url']
 
-        if self.fuzzy_query and params.get('allowFuzzy'):
-            if not 'key' in params:
-                params['key'] = self.url_canon(url)
+        # check if fuzzy is allowed and ensure that its an
+        # exact match
+        if (self.fuzzy_query and params.get('allowFuzzy') and
+            params.get('matchType', 'exact') == 'exact'):
 
-            params = self.fuzzy_query(params)
-            if params:
-                params['allowFuzzy'] = False
-                return self.load_cdx(**params)
+            fuzzy_params = self.fuzzy_query(params)
+            if fuzzy_params:
+                return self.load_cdx(**fuzzy_params)
 
         msg = 'No Captures found for: ' + url
         raise CaptureNotFoundException(msg)
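Concretely, the fuzzy fallback rewrites a failed exact query into a broader prefix query plus a contains-filter built from the matched regex group. A worked illustration using the twitter fuzzy rule from the rules file introduced later in this commit (the values are hypothetical):

    # exact query that returned no captures:
    #   url       = 'http://twitter.com/i/profiles/show/user/timeline/with_replies?max_id=12345'
    #   matchType = 'exact'
    #
    # FuzzyQuery's rule regex '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
    # captures 'max_id=12345', so the retried query becomes:
    #   url       = 'http://twitter.com/i/profiles/show/user/timeline/with_replies?'
    #   matchType = 'prefix'
    #   filter    = ['~urlkey:max_id=12345']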
@@ -98,7 +98,6 @@ class CDXServer(BaseCDXServer):
             msg = 'A url= param must be specified to query the cdx server'
             raise CDXException(msg)
 
-        #params['key'] = self.url_canon(url)
         match_type = params.get('matchType', 'exact')
 
         key, end_key = calc_search_range(url=url,
@@ -159,7 +158,7 @@ class CDXServer(BaseCDXServer):
         if filename.endswith('.cdx'):
             return CDXFile(filename)
 
-        if filename.endswith('.summary'):
+        if filename.endswith(('.summary', '.idx')):
             return ZipNumCluster(filename, config)
 
         logging.warn('skipping unrecognized URI:%s', filename)
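This works because str.endswith accepts a tuple of suffixes, so both zipnum index extensions are matched in one call; a quick check:

    filename = 'zipnum-sample.idx'
    print(filename.endswith(('.summary', '.idx')))  # True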
@@ -218,7 +217,7 @@ def create_cdx_server(config, ds_rules_file=None):
     return server_cls(paths,
                       config=pass_config,
                       surt_ordered=surt_ordered,
-                      ds_rules=ds_rules_file,
+                      ds_rules_file=ds_rules_file,
                       perms_checker=perms_checker)
 
 
 #=================================================================
@@ -1,6 +1,8 @@
 from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader
 
+from cdxobject import AccessException
+
 import urllib
 import urllib2
 import itertools
@@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource):
         self.key_prefix = self.DEFAULT_KEY_PREFIX
         if config:
             self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
 
 
     def load_cdx(self, params):
         """
@@ -1,7 +1,7 @@
 
 
 #=================================================================
-class AllowAllPerms:
+class AllowAllPerms(object):
     """
     Sample Perm Checker which allows all
     """
@@ -1,24 +0,0 @@
-
-fuzzy_lookup_rules:
-    - startswith: 'com,twitter)/i/profiles/show/'
-      matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
-
-    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
-      matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
-
-    - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
-      matches: '([^/]+(?:\.css|\.js))'
-
-    # matches all urls
-    - startswith: ''
-      matches: '[&?](?:_|uncache)=[\d]+[&]?'
-
-canon_rules:
-    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
-      matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
-      replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
-
-
-
-
-
@@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
 
+# Filter contains
+>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+
+# Filter contains invert
+>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
+com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
+com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
+
 # Collapse by timestamp
 # unresolved revisits, different statuscode results in an extra repeat
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
@@ -131,9 +141,9 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 ('offset', '334'),
 ('filename', 'dupes.warc.gz')]
 
-# NOTE: external dependency -- need self-contained test
-#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
-#>>> pprint.pprint(x.next().items())
+# NOTE: external dependency -- need self-contained test TODO
+>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
+>>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
 ('timestamp', '20020120142510'),
 ('original', 'http://example.com:80/'),
@@ -142,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
 ('length', '1792')]
 
+
+>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
+Traceback (most recent call last):
+AccessException: Blocked By Robots
 """
 
 #=================================================================
@@ -169,7 +183,8 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
     results = server.load_cdx(**kwparams)
 
     for x in results:
-        sys.stdout.write(x.to_text(fields))
+        l = x.to_text(fields).replace('\t', '    ')
+        sys.stdout.write(l)
 
 #================================================================
 
pywb/cdx/test/zipnum_test.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+"""
+>>> zip_ops_test(url = 'http://iana.org')
+org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
+org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
+org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
+
+# test idx index (tabs replacad with 4 spaces)
+>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
+org,iana)/dnssec 20140126201307 zipnum 8511 373
+org,iana)/domains/int 20140126201239 zipnum 8884 353
+org,iana)/domains/root/servers 20140126201227 zipnum 9237 386
+
+>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
+org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
+org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
+org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
+org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
+org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
+org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
+org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
+
+"""
+
+
+from cdxserver_test import cdx_ops_test
+
+from pywb import get_test_dir
+test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'
+print test_zipnum
+
+def zip_ops_test(url, **kwargs):
+    sources = test_zipnum
+    cdx_ops_test(url, sources, **kwargs)
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
@@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
     return file
 
 #=================================================================
-def create_wb_handler(cdx_server, config):
+def create_wb_handler(cdx_server, config, ds_rules_file=None):
 
     record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
     paths = config.get('archive_paths')
 
-    resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
+    resolving_loader = ResolvingLoader(paths=paths,
+                                       cdx_server=cdx_server,
+                                       record_loader=record_loader)
 
     replayer = replay_views.ReplayView(
         content_loader = resolving_loader,
 
-        content_rewriter = RewriteContent(),
+        content_rewriter = RewriteContent(ds_rules_file=ds_rules_file),
 
         head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
 
@@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}):
 
         route_config = DictChain(value, config)
 
-        ds_rules = route_config.get('domain_specific_rules', None)
-        cdx_server = IndexReader(route_config, ds_rules)
+        ds_rules_file = route_config.get('domain_specific_rules', None)
+        cdx_server = IndexReader(route_config, ds_rules_file)
 
         wb_handler = config_utils.create_wb_handler(
-            cdx_server = cdx_server,
-            config = route_config,
+            cdx_server=cdx_server,
+            config=route_config,
+            ds_rules_file=ds_rules_file,
         )
 
         logging.debug('Adding Collection: ' + name)
@@ -7,6 +7,8 @@ from wbrequestresponse import WbResponse
 from wbexceptions import CaptureException, InternalRedirect
 from pywb.warc.recordloader import ArchiveLoadFailed
 
+from pywb.utils.loaders import LimitReader
+
 #=================================================================
 class ReplayView:
     def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@@ -53,10 +55,21 @@ class ReplayView:
 
         response = None
 
+        # if Content-Length for payload is present, ensure we don't read past it
+        content_len = status_headers.get_header('content-length')
+        try:
+            content_len = int(content_len)
+            if content_len > 0:
+                stream = LimitReader(stream, content_len)
+        except ValueError:
+            pass
+
         if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
             response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
         else:
             (status_headers, stream) = self.sanitize_content(status_headers, stream)
+            #status_headers.remove_header('content-length')
+
             response_iter = self.stream_to_iter(stream)
             response = WbResponse(status_headers, response_iter)
 
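LimitReader caps reads at the declared payload length so replay never reads past the record body into the next WARC record. pywb's real implementation lives in pywb.utils.loaders and may differ; a minimal sketch of the idea:

    class LimitReader(object):
        """Wrap a stream so that at most 'limit' bytes can be read."""
        def __init__(self, stream, limit):
            self.stream = stream
            self.limit = limit

        def read(self, length=None):
            if self.limit <= 0:
                return ''
            if length is None:
                length = self.limit
            buff = self.stream.read(min(length, self.limit))
            self.limit -= len(buff)
            return buff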
@@ -99,20 +112,34 @@ class ReplayView:
     def rewrite_content(self, wbrequest, cdx, status_headers, stream):
         urlrewriter = wbrequest.urlrewriter
 
-        (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream)
+        result = self.content_rewriter.rewrite_headers(urlrewriter,
+                                                       status_headers,
+                                                       stream,
+                                                       cdx['urlkey'])
+        (rewritten_headers, stream) = result
 
         # no rewriting needed!
         if rewritten_headers.text_type is None:
             response_iter = self.stream_to_iter(stream)
             return WbResponse(rewritten_headers.status_headers, response_iter)
 
+        def make_head_insert(rule):
+            return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
+                                                           cdx=cdx,
+                                                           rule=rule))
         # do head insert
         if self.head_insert_view:
-            head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx)
+            head_insert_func = make_head_insert
         else:
-            head_insert_str = None
+            head_insert_func = None
 
-        (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str)
+        result = self.content_rewriter.rewrite_content(urlrewriter,
+                                                       rewritten_headers,
+                                                       stream,
+                                                       head_insert_func,
+                                                       cdx['urlkey'])
+
+        (status_headers, response_gen) = result
 
         if self.buffer_response:
             if wbrequest.wb_url.mod == 'id_':
@@ -4,11 +4,16 @@ import itertools
 
 from url_rewriter import UrlRewriter
 
 
 #=================================================================
 class RegexRewriter(object):
+    #@staticmethod
+    #def comment_out(string):
+    #    return '/*' + string + '*/'
+
     @staticmethod
-    def comment_out(string):
-        return '/*' + string + '*/'
+    def format(template):
+        return lambda string: template.format(string)
 
     @staticmethod
     def remove_https(string):
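The new format() helper generalizes one-off callables like comment_out into a template factory: any '{0}'-style template becomes a rewrite op. For example:

    comment_out = RegexRewriter.format('/*{0}*/')
    print(comment_out('some_func();'))   # -> '/*some_func();*/'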
@@ -20,19 +25,16 @@ class RegexRewriter(object):
 
     @staticmethod
     def archival_rewrite(rewriter):
-        return lambda x: rewriter.rewrite(x)
+        return lambda string: rewriter.rewrite(string)
 
-    @staticmethod
-    def replacer(string):
-        return lambda x: string
+    #@staticmethod
+    #def replacer(other):
+    #    return lambda m, string: other
 
     HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
 
 
     DEFAULT_OP = add_prefix
 
 
     def __init__(self, rules):
         #rules = self.create_rules(http_prefix)
 
@@ -76,52 +78,68 @@ class RegexRewriter(object):
             op = RegexRewriter.DEFAULT_OP(op)
 
         result = op(m.group(i))
+        final_str = result
 
         # if extracting partial match
         if i != full_m:
-            result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
+            final_str = m.string[m.start(full_m):m.start(i)]
+            final_str += result
+            final_str += m.string[m.end(i):m.end(full_m)]
 
-        return result
+        return final_str
+
+    @staticmethod
+    def parse_rules_from_config(config):
+        def parse_rule(obj):
+            match = obj.get('match')
+            replace = RegexRewriter.format(obj.get('replace', '{0}'))
+            group = obj.get('group', 0)
+            result = (match, replace, group)
+            return result
+        return map(parse_rule, config)
 
 
 #=================================================================
-class JSLinkRewriter(RegexRewriter):
+class JSLinkOnlyRewriter(RegexRewriter):
     """
     JS Rewriter which rewrites absolute http://, https:// and // urls
     at the beginning of a string
    """
    JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
 
-    def __init__(self, rewriter, rules = []):
+    def __init__(self, rewriter, rules=[]):
         rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
-        super(JSLinkRewriter, self).__init__(rules)
+        super(JSLinkOnlyRewriter, self).__init__(rules)
 
 
 #=================================================================
-class JSLocationAndLinkRewriter(JSLinkRewriter):
+class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
     """
     JS Rewriter which also rewrites location and domain to the
     specified prefix (default: 'WB_wombat_')
     """
 
-    def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'):
+    def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
         rules = rules + [
             (r'(?<!/)\blocation\b', prefix, 0),
             (r'(?<=document\.)domain', prefix, 0),
         ]
-        super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules)
+        #import sys
+        #sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
+        super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules)
 
 
 #=================================================================
 # Set 'default' JSRewriter
-JSRewriter = JSLocationAndLinkRewriter
+JSRewriter = JSLinkAndLocationRewriter
 
 
 #=================================================================
 class XMLRewriter(RegexRewriter):
-    def __init__(self, rewriter, extra = []):
+    def __init__(self, rewriter, extra=[]):
         rules = self._create_rules(rewriter.get_abs_url())
 
-        RegexRewriter.__init__(self, rules)
+        super(XMLRewriter, self).__init__(rules)
 
     # custom filter to reject 'xmlns' attr
     def filter(self, m):
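parse_rules_from_config (added above) turns the js_regexs entries from the rules file into the (match, replace_op, group) tuples RegexRewriter consumes; roughly:

    config = [{'match': 'Bootloader\.configurePage.*',
               'replace': '/* {0} */'}]

    match, replace_op, group = RegexRewriter.parse_rules_from_config(config)[0]
    print(replace_op('Bootloader.configurePage(x);'))
    # -> '/* Bootloader.configurePage(x); */'
    print(group)  # 0 -- the default when no 'group' key is given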
@@ -133,24 +151,28 @@ class XMLRewriter(RegexRewriter):
 
     def _create_rules(self, http_prefix):
         return [
-            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
+            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
+             RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
         ]
 
 
 #=================================================================
 class CSSRewriter(RegexRewriter):
 
     CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
-    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
+
+    CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
+                               "(?!url[\\s\\(])([\w.:/\\\\-]+)")
 
     def __init__(self, rewriter):
         rules = self._create_rules(rewriter)
-
-        RegexRewriter.__init__(self, rules)
+        super(CSSRewriter, self).__init__(rules)
 
 
     def _create_rules(self, rewriter):
         return [
-            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
-            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
+            (CSSRewriter.CSS_URL_REGEX,
+             RegexRewriter.archival_rewrite(rewriter), 1),
+
+            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
+             RegexRewriter.archival_rewrite(rewriter), 1),
         ]
 
 
@@ -1,30 +1,27 @@
 import chardet
+import pkgutil
+import yaml
 
-from url_rewriter import UrlRewriter
-from html_rewriter import HTMLRewriter
-from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
-from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
+from header_rewriter import RewrittenStatusAndHeaders
 
+from rewriterules import RewriteRules
+
+from pywb.utils.dsrules import RuleSet
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
 
 
+#=================================================================
 class RewriteContent:
+    def __init__(self, ds_rules_file=None):
+        self.ruleset = RuleSet(RewriteRules, 'rewrite',
+                               default_rule_config={},
+                               ds_rules_file=ds_rules_file)
 
-    DEFAULT_CONTENT_REWRITERS = {
-        'header': HeaderRewriter,
-        'js': JSRewriter,
-        'css': CSSRewriter,
-        'xml': XMLRewriter,
-        'html': HTMLRewriter
-    }
+    def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
+        header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']
 
-    def __init__(self, rewriters = {}):
-        self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())
-
-    def rewrite_headers(self, urlrewriter, status_headers, stream):
-        rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)
+        rewritten_headers = header_rewriter_class().rewrite(status_headers, urlrewriter)
 
         # note: since chunking may be broken, approach taken here is to *always* attempt
         # to dechunk if transfer-encoding: chunked is present
@@ -37,7 +34,8 @@ class RewriteContent:
 
         return (rewritten_headers, stream)
 
-    def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None):
+    def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''):
+
         # see if we've already rewritten headers
         if isinstance(headers, RewrittenStatusAndHeaders):
             rewritten_headers = headers
@@ -50,9 +48,11 @@ class RewriteContent:
             return (status_headers, gen)
 
         status_headers = rewritten_headers.status_headers
+
         # Handle text content rewriting
         # =========================================================================
         # special case -- need to ungzip the body
+
         if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
             stream = DecompressingBufferedReader(stream, decomp_type='gzip')
 
@@ -68,13 +68,27 @@ class RewriteContent:
 
         text_type = rewritten_headers.text_type
 
-        rewriter_class = self.rewriters.get(text_type)
-        if not rewriter_class:
+        rule = self.ruleset.get_first_match(urlkey)
+
+        try:
+            rewriter_class = rule.rewriters[text_type]
+        except KeyError:
             raise Exception('Unknown Text Type for Rewrite: ' + text_type)
 
+        #import sys
+        #sys.stderr.write(str(vars(rule)))
+
         if text_type == 'html':
-            rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str)
+            head_insert_str = ''
+
+            if head_insert_func:
+                head_insert_str = head_insert_func(rule)
+
+            rewriter = rewriter_class(urlrewriter,
+                                      outstream=None,
+                                      js_rewriter_class=rule.rewriters['js'],
+                                      css_rewriter_class=rule.rewriters['css'],
+                                      head_insert=head_insert_str)
         else:
             rewriter = rewriter_class(urlrewriter)
 
@@ -2,12 +2,17 @@ import urllib2
 import os
 import sys
 import datetime
+import mimetypes
+
+from pywb.utils.loaders import is_http
 from pywb.utils.timeutils import datetime_to_timestamp
 from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.utils.canonicalize import canonicalize
+
 from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.rewrite_content import RewriteContent
 
 
 """
 Fetch a url from live web and apply rewriting rules
 """
@@ -26,10 +31,37 @@ def get_status_and_stream(url):
     return (status_headers, stream)
 
 
 #=================================================================
-def get_rewritten(url, urlrewriter):
-    (status_headers, stream) = get_status_and_stream(url)
+def get_local_file(uri):
+    fh = open(uri)
 
-    status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)
+    content_type, _ = mimetypes.guess_type(uri)
+
+    # create fake headers for local file
+    status_headers = StatusAndHeaders('200 OK', [('Content-Type', content_type)])
+    stream = fh
+
+    return (status_headers, stream)
+
+
+#=================================================================
+def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
+    if is_http(url):
+        (status_headers, stream) = get_status_and_stream(url)
+    else:
+        (status_headers, stream) = get_local_file(url)
+
+    # explicit urlkey may be passed in (say for testing)
+    if not urlkey:
+        urlkey = canonicalize(url)
+
+    rewriter = RewriteContent()
+
+    result = rewriter.rewrite_content(urlrewriter,
+                                      status_headers,
+                                      stream,
+                                      head_insert_func=head_insert_func,
+                                      urlkey=urlkey)
+
+    status_headers, gen = result
 
     buff = ''
     for x in gen:
pywb/rewrite/rewriterules.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+from pywb.utils.dsrules import BaseRule
+
+from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
+from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
+from html_rewriter import HTMLRewriter
+from header_rewriter import HeaderRewriter
+
+import itertools
+
+class RewriteRules(BaseRule):
+    def __init__(self, url_prefix, config={}):
+        super(RewriteRules, self).__init__(url_prefix, config)
+
+        self.rewriters = {}
+
+        #self._script_head_inserts = config.get('script_head_inserts', {})
+
+        self.rewriters['header'] = config.get('header_class', HeaderRewriter)
+        self.rewriters['css'] = config.get('css_class', CSSRewriter)
+        self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
+        self.rewriters['html'] = config.get('html_class', HTMLRewriter)
+
+        # Custom handling for js rewriting, often the most complex
+        self.js_rewrite_location = config.get('js_rewrite_location', True)
+        self.js_rewrite_location = bool(self.js_rewrite_location)
+
+        # ability to toggle rewriting
+        if self.js_rewrite_location:
+            js_default_class = JSLinkAndLocationRewriter
+        else:
+            js_default_class = JSLinkOnlyRewriter
+
+        # set js class, using either default or override from config
+        self.rewriters['js'] = config.get('js_class', js_default_class)
+
+        # add any regexs for js rewriter
+        self._add_custom_regexs('js', config)
+
+    def _add_custom_regexs(self, field, config):
+        regexs = config.get(field + '_regexs')
+        if not regexs:
+            return
+
+        rewriter_cls = self.rewriters[field]
+
+        rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs)
+
+        def extend_rewriter_with_regex(urlrewriter):
+            #import sys
+            #sys.stderr.write('\n\nEXTEND: ' + str(rule_def_tuples))
+            return rewriter_cls(urlrewriter, rule_def_tuples)
+
+        self.rewriters[field] = extend_rewriter_with_regex
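The rewriters mapping is driven entirely by each rule's 'rewrite' config block; a rule with js_rewrite_location: False, for instance, selects the link-only JS rewriter. A minimal usage sketch (assuming BaseRule accepts the arguments as shown above):

    rr = RewriteRules('example,example,test)/nolocation_rewrite',
                      {'js_rewrite_location': False})
    print(rr.rewriters['js'])    # JSLinkOnlyRewriter
    print(rr.rewriters['html'])  # HTMLRewriter (the default)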
@@ -121,7 +121,7 @@ r"""
 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
 
 # custom rules added
->>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
+>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
 
 # scheme-agnostic
@@ -1,11 +1,50 @@
 from pywb.rewrite.rewrite_live import get_rewritten
 from pywb.rewrite.url_rewriter import UrlRewriter
 
+from pywb import get_test_dir
+
 # This module has some rewriting tests against the 'live web'
 # As such, the content may change and the test may break
 
 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
 
+def head_insert_func(rule):
+    if rule.js_rewrite_location == True:
+        return '<script src="/static/default/wombat.js"> </script>'
+    else:
+        return ''
+
+
+def test_local_1():
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
+                                         urlrewriter,
+                                         'com,example,test)/',
+                                         head_insert_func)
+
+    # wombat insert added
+    assert '<head><script src="/static/default/wombat.js"> </script>' in buff
+
+    # location rewritten
+    assert 'window.WB_wombat_location = "/other.html"' in buff
+
+    # link rewritten
+    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
+
+def test_local_2_no_js_location_rewrite():
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
+                                         urlrewriter,
+                                         'example,example,test)/nolocation_rewrite',
+                                         head_insert_func)
+
+    # no wombat insert
+    assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
+
+    # no location rewrite
+    assert 'window.location = "/other.html"' in buff
+
+    # still link rewrite
+    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
 def test_example_1():
     status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
@@ -24,9 +63,10 @@ def test_example_2():
 
 
 
-#def test_example_3():
-#    status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
+def test_example_domain_specific_3():
+    status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
 
-#    assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
+    # comment out bootloader
+    assert '/* Bootloader.configurePage' in buff
 
pywb/rules.yaml (new file, 50 lines)
@@ -0,0 +1,50 @@
+
+rules:
+
+    # twitter rules
+    #=================================================================
+    - url_prefix: 'com,twitter)/i/profiles/show/'
+
+      fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
+
+
+    # facebook rules
+    #=================================================================
+    - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
+
+      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
+
+      # not actually needed, fuzzy match is used instead here
+      # canonicalize:
+      #     match: 'com,facebook\)/.*[?&]data=([^&]+).*'
+      #     replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
+
+
+    - url_prefix: 'com,facebook)/'
+      rewrite:
+          js_regexs:
+              - match: 'Bootloader\.configurePage.*'
+                replace: '/* {0} */'
+
+
+    # yahoo rules
+    #=================================================================
+    - url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
+
+      fuzzy_lookup: '([^/]+(?:\.css|\.js))'
+
+
+    # testing rules -- not for valid domain
+    #=================================================================
+    # this rule block is a non-existent prefix merely for testing
+    - url_prefix: 'example,example,test)/nolocation_rewrite'
+
+      rewrite:
+          js_rewrite_location: False
+
+
+    # all domain rules -- fallback to this dataset
+    #=================================================================
+    # Applies to all urls -- should be last
+    - url_prefix: ''
+      fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?'
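The url_prefix keys are SURT-ordered prefixes: a rule applies to any canonicalized urlkey that starts with one of them, and the empty prefix matches everything (hence it comes last). An illustrative check using the canonicalize helper imported elsewhere in this commit:

    from pywb.utils.canonicalize import canonicalize

    key = canonicalize('http://twitter.com/i/profiles/show/someuser')
    # key == 'com,twitter)/i/profiles/show/someuser'
    print(key.startswith('com,twitter)/i/profiles/show/'))  # True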
@@ -1,18 +1,21 @@
-// Rewritten location and domain obj setup
-window.WB_wombat_location = window.location
-
-if (window.top != window) {
-    window.top.WB_wombat_location = window.top.location
-}
-
-if (window.opener) {
-    window.opener.WB_wombat_location = window.opener.location
-}
-
-document.WB_wombat_domain = document.domain
+/*
+Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
+
+This file is part of pywb.
+
+pywb is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+pywb is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with pywb. If not, see <http://www.gnu.org/licenses/>.
+*/
 
 function initBanner()
 {
219
pywb/static/wombat.js
Normal file
219
pywb/static/wombat.js
Normal file
@ -0,0 +1,219 @@
|
|||||||
|
/*
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.

This file is part of pywb.

pywb is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

pywb is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with pywb. If not, see <http://www.gnu.org/licenses/>.
*/

//============================================
// Wombat JS-Rewriting Library
//============================================

var WB_wombat_replayPrefix;
var WB_wombat_replayDatePrefix;
var WB_wombat_captureDatePart;
var WB_wombat_origHost;


function WB_StripPort(str)
{
    var hostWithPort = str.match(/^http:\/\/[\w\d@.-]+:\d+/);
    if (hostWithPort) {
        var hostName = hostWithPort[0].substr(0, hostWithPort[0].lastIndexOf(':'));
        return hostName + str.substr(hostWithPort[0].length);
    }

    return str;
}

function WB_IsHostUrl(str)
{
    // Good guess that it's a hostname
    if (str.indexOf("www.") == 0) {
        return true;
    }

    // hostname:port (port required)
    var matches = str.match(/^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/);
    if (matches && (matches[0].length < 64)) {
        return true;
    }

    // ip:port
    matches = str.match(/^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/);
    if (matches && (matches[0].length < 64)) {
        return true;
    }

    return false;
}

function WB_RewriteUrl(url)
{
    var httpPrefix = "http://";

    // If not dealing with a string, just return it
    if (!url || (typeof url) != "string") {
        return url;
    }

    // If starts with prefix, no rewriting needed
    // Only check replay prefix (no date) as date may be different for each capture
    if (url.indexOf(WB_wombat_replayPrefix) == 0) {
        return url;
    }

    // If server relative url, add prefix and original host
    if (url.charAt(0) == "/") {

        // Already a relative url, don't make any changes!
        if (url.indexOf(WB_wombat_captureDatePart) >= 0) {
            return url;
        }

        return WB_wombat_replayDatePrefix + WB_wombat_origHost + url;
    }

    // If full url starting with http://, add prefix
    if (url.indexOf(httpPrefix) == 0) {
        return WB_wombat_replayDatePrefix + url;
    }

    // May or may not be a hostname, call function to determine
    // If it is, add the prefix and make sure port is removed
    if (WB_IsHostUrl(url)) {
        return WB_wombat_replayDatePrefix + httpPrefix + url;
    }

    return url;
}

function WB_CopyObjectFields(obj)
{
    var newObj = {};

    for (prop in obj) {
        if ((typeof obj[prop]) != "function") {
            newObj[prop] = obj[prop];
        }
    }

    return newObj;
}

function WB_ExtractOrig(href)
{
    if (!href) {
        return "";
    }
    href = href.toString();
    var index = href.indexOf("/http", 1);
    if (index > 0) {
        return href.substr(index + 1);
    } else {
        return href;
    }
}

function WB_CopyLocationObj(loc)
{
    var newLoc = WB_CopyObjectFields(loc);

    newLoc._origLoc = loc;
    newLoc._origHref = loc.href;

    // Rewrite replace and assign functions
    newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); }
    newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); }
    newLoc.reload = loc.reload;
    newLoc.href = WB_ExtractOrig(newLoc._origHref);
    newLoc.toString = function() { return this.href; }

    return newLoc;
}

function WB_wombat_updateLoc(reqHref, origHref, location)
{
    if (reqHref && (WB_ExtractOrig(origHref) != WB_ExtractOrig(reqHref))) {
        var finalHref = WB_RewriteUrl(reqHref);

        location.href = finalHref;
    }
}

function WB_wombat_checkLocationChange(wbLoc, isTop)
{
    var locType = (typeof wbLoc);

    var location = (isTop ? window.top.location : window.location);

    // String has been assigned to location, so assign it
    if (locType == "string") {
        WB_wombat_updateLoc(wbLoc, location.href, location)

    } else if (locType == "object") {
        WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, location);
    }
}

var wombat_updating = false;

function WB_wombat_checkLocations()
{
    if (wombat_updating) {
        return false;
    }

    wombat_updating = true;

    WB_wombat_checkLocationChange(window.WB_wombat_location, false);

    if (window.self.location != window.top.location) {
        WB_wombat_checkLocationChange(window.top.WB_wombat_location, true);
    }

    wombat_updating = false;
}

function WB_wombat_Init(replayPrefix, captureDate, origHost)
{
    WB_wombat_replayPrefix = replayPrefix;
    WB_wombat_replayDatePrefix = replayPrefix + captureDate + "/";
    WB_wombat_captureDatePart = "/" + captureDate + "/";

    WB_wombat_origHost = "http://" + origHost;

    window.WB_wombat_location = WB_CopyLocationObj(window.self.location);

    if (window.self.location != window.top.location) {
        window.top.WB_wombat_location = WB_CopyLocationObj(window.top.location);
    }

    if (window.opener) {
        window.opener.WB_wombat_location = (window.opener ? WB_CopyLocationObj(window.opener.location) : null);
    }

    document.WB_wombat_domain = origHost;
}

// Check quickly after page load
setTimeout(WB_wombat_checkLocations, 100);

// Check periodically (every 500ms)
setInterval(WB_wombat_checkLocations, 500);
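For reference, the decision tree in WB_RewriteUrl above can be transcribed roughly into Python (a simplified sketch that omits the bare-hostname heuristic in WB_IsHostUrl; the JavaScript above is the authoritative logic):

# simplified Python transcription of WB_RewriteUrl, for illustration only
def rewrite_url(url, replay_prefix, capture_date, orig_host):
    replay_date_prefix = replay_prefix + capture_date + '/'
    capture_date_part = '/' + capture_date + '/'

    if not url or not isinstance(url, str):
        return url

    # already points at the replay prefix: leave unchanged
    if url.startswith(replay_prefix):
        return url

    # server-relative url: prepend prefix plus the original host
    if url.startswith('/'):
        if capture_date_part in url:
            return url
        return replay_date_prefix + 'http://' + orig_host + url

    # absolute http url: just prepend the replay prefix
    if url.startswith('http://'):
        return replay_date_prefix + url

    return url

# rewrite_url('/css/main.css', '/pywb/', '20140127171238', 'example.com')
#   ->  '/pywb/20140127171238/http://example.com/css/main.css'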
@@ -15,6 +15,13 @@
 'wb_prefix': 'https://localhost:8081/my_pywb/web/',
 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
 
+# route with no collection
+>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
+{'coll': '',
+ 'request_uri': 'http://example.com',
+ 'wb_prefix': '/pywb/',
+ 'wb_url': None}
+
 # not matching route -- skipped
 >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
@@ -67,6 +74,13 @@ False
 >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
 False
 
+# With no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
+'http://localhost:8080/2013/http://example.com/other.html'
+
+# With SCRIPT_NAME but no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
+'http://localhost:8080/pywb-access/http://example.com/other.html'
+
 """
@@ -1,7 +1,14 @@
 <!-- WB Insert -->
+{% if rule.js_rewrite_location %}
+<script src='{{ wbrequest.host_prefix }}/static/default/wombat.js'> </script>
+<script>
+WB_wombat_Init("{{wbrequest.wb_prefix}}", "{{cdx['timestamp']}}", "{{cdx['original'] | host}}");
+</script>
+{% endif %}
+
 <script>
 wbinfo = {}
 wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
 </script>
 <script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
 <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
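A minimal sketch of the conditional include this template performs, assuming a standard Jinja2 environment (the context values below are flat stand-ins for pywb's real rule/wbrequest objects):

from jinja2 import Template

# stand-in for the rule.js_rewrite_location check in the head insert
tmpl = Template("{% if js_rewrite_location %}"
                "<script src='{{ host_prefix }}/static/default/wombat.js'> </script>"
                "{% endif %}")

# rules with js_rewrite_location: False render nothing, skipping wombat.js
print(tmpl.render(js_rewrite_location=False, host_prefix='http://localhost:8080'))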
@@ -3,8 +3,6 @@
 
 import surt
 import urlparse
-from cdxobject import CDXException
-
 
 #=================================================================
 class UrlCanonicalizer(object):
@@ -15,6 +13,12 @@ class UrlCanonicalizer(object):
         return canonicalize(url, self.surt_ordered)
 
 
+#=================================================================
+class UrlCanonicalizeException(Exception):
+    def status(self):
+        return '400 Bad Request'
+
+
 #=================================================================
 def canonicalize(url, surt_ordered=True):
     """
@@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True):
     try:
         key = surt.surt(url)
     except Exception as e:
-        raise CDXException('Invalid Url: ' + url)
+        raise UrlCanonicalizeException('Invalid Url: ' + url)
 
     # if not surt, unsurt the surt to get canonicalized non-surt url
     if not surt_ordered:
@@ -114,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
     >>> calc_search_range('http://example.com/path/file.html', 'host', False)
     ('example.com/', 'example.com0')
 
-    # domain range not supported
+    # errors: domain range not supported
     >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
     Traceback (most recent call last):
-    Exception: matchType=domain unsupported for non-surt
+    UrlCanonicalizeException: matchType=domain unsupported for non-surt
+
+    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)
+    Traceback (most recent call last):
+    UrlCanonicalizeException: Invalid match_type: blah
+
     """
     def inc_last_char(x):
         return x[0:-1] + chr(ord(x[-1]) + 1)
@@ -155,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
 
     elif match_type == 'domain':
         if not surt_ordered:
-            raise Exception('matchType=domain unsupported for non-surt')
+            raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
 
         host = start_key.split(')/')[0]
 
@@ -168,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
 
         end_key = host + '-'
     else:
-        raise Exception('Invalid match_type: ' + match_type)
+        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
 
     return (start_key, end_key)
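A short sketch of how a caller can use the new exception type (assuming the pywb.utils.canonicalize module path introduced in this commit):

from pywb.utils.canonicalize import canonicalize, UrlCanonicalizeException

def safe_canon(url, surt_ordered=True):
    try:
        return canonicalize(url, surt_ordered)
    except UrlCanonicalizeException as e:
        # status() yields '400 Bad Request', matching wbapp's error handling
        return e.status()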
pywb/utils/dsrules.py (new file, 98 lines)
@@ -0,0 +1,98 @@
import yaml
import pkgutil


#=================================================================
DEFAULT_RULES_FILE = 'rules.yaml'
DEFAULT_RULES_PKG = 'pywb'


#=================================================================
class RuleSet(object):
    DEFAULT_KEY = ''

    def __init__(self, rule_cls, fieldname, **kwargs):
        """
        A domain specific rules block, inited via config map.
        If config map not specified, it is loaded from default location.

        The rules are represented as a map by domain.
        Each rules configuration will load its own field type
        from the list and is given the specified rule_cls.
        """
        self.rules = []

        ds_rules_file = kwargs.get('ds_rules_file')
        default_rule_config = kwargs.get('default_rule_config')

        config = self.load_default_rules(ds_rules_file)

        rulesmap = config.get('rules') if config else None

        # if default_rule_config provided, always init a default ruleset
        if not rulesmap and default_rule_config is not None:
            self.rules = [rule_cls(self.DEFAULT_KEY, default_rule_config)]
            return

        def_key_found = False

        # iterate over master rules file
        for value in rulesmap:
            url_prefix = value.get('url_prefix')
            rules_def = value.get(fieldname)
            if not rules_def:
                continue

            if url_prefix == self.DEFAULT_KEY:
                def_key_found = True

            self.rules.append(rule_cls(url_prefix, rules_def))

        # if default_rule_config provided, always init a default ruleset
        if not def_key_found and default_rule_config is not None:
            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))

    @staticmethod
    def load_default_rules(filename=None, pkg=None):
        config = None

        if not filename:
            filename = DEFAULT_RULES_FILE

        if not pkg:
            pkg = DEFAULT_RULES_PKG

        if filename:
            yaml_str = pkgutil.get_data(pkg, filename)
            config = yaml.load(yaml_str)

        return config

    def iter_matching(self, urlkey):
        """
        Iterate over all matching rules for given urlkey
        """
        for rule in self.rules:
            if rule.applies(urlkey):
                yield rule

    def get_first_match(self, urlkey):
        for rule in self.rules:
            if rule.applies(urlkey):
                return rule


#=================================================================
class BaseRule(object):
    """
    Base rule class -- subclassed to handle specific
    rules for given url_prefix key
    """
    def __init__(self, url_prefix, rules):
        self.url_prefix = url_prefix
        if not isinstance(self.url_prefix, list):
            self.url_prefix = [self.url_prefix]

    def applies(self, urlkey):
        return any(urlkey.startswith(x) for x in self.url_prefix)
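A minimal usage sketch for RuleSet (the FuzzyRule class below is hypothetical; it only records the regex string stored under the 'fuzzy_lookup' field):

from pywb.utils.dsrules import RuleSet, BaseRule

class FuzzyRule(BaseRule):
    def __init__(self, url_prefix, config):
        super(FuzzyRule, self).__init__(url_prefix, config)
        # for 'fuzzy_lookup' entries the config is just the regex string
        self.pattern = config

# loads the packaged rules.yaml and keeps only the 'fuzzy_lookup' blocks;
# the catch-all '' prefix rule matches when no domain-specific rule does
rules = RuleSet(FuzzyRule, 'fuzzy_lookup')
rule = rules.get_first_match('com,yimg,l)/g/combo/some.js')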
@@ -9,6 +9,7 @@ import urllib2
 import time
 
 
+#=================================================================
 def is_http(filename):
     return any(filename.startswith(x) for x in ['http://', 'https://'])
@@ -162,6 +162,10 @@ def timestamp_to_datetime(string):
     >>> timestamp_to_datetime('40001965252477')
     datetime.datetime(2999, 12, 31, 23, 24, 59)
 
+    # not a number!
+    >>> timestamp_to_datetime('2010abc')
+    datetime.datetime(2010, 12, 31, 23, 59, 59)
+
     """
 
     # pad to 6 digits
@@ -2,6 +2,7 @@ from wbexceptions import WbException, NotFoundException, InternalRedirect
 from wbrequestresponse import WbResponse, StatusAndHeaders
 
 from pywb.cdx.cdxserver import CDXException
+from pywb.utils.canonicalize import UrlCanonicalizeException
 from pywb.warc.recordloader import ArchiveLoadFailed
 
 import os
@@ -55,7 +56,8 @@ def create_wb_app(wb_router):
         except InternalRedirect as ir:
             response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
 
-        except (WbException, CDXException, ArchiveLoadFailed) as e:
+        except (WbException, CDXException,
+                UrlCanonicalizeException, ArchiveLoadFailed) as e:
             response = handle_exception(env, wb_router.error_view, e, False)
 
         except Exception as e:
sample_archive/text_content/sample.html (new file, 14 lines)
@@ -0,0 +1,14 @@
<html>
<head>
<title>Sample Page For Rewrite Test</title>
</head>
<body>
<script>
var some_val = false;
if (some_val) {
    window.location = "/other.html";
}
</script>
Test Content
<a href="another.html">Some Link</a>
</body>
sample_archive/zipcdx/zipnum-sample.cdx.gz (new binary file; contents not shown)
sample_archive/zipcdx/zipnum-sample.idx (new file, 38 lines)
@@ -0,0 +1,38 @@
com,example)/ 20140127171200 zipnum 0 276
org,iana)/ 20140127171238 zipnum 276 328
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289
org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208
org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207
org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276
org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210
org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211
org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281
org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298
org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213
org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216
org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270
org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215
org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209
org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210
org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410
org,iana)/dnssec 20140126201307 zipnum 8511 373
org,iana)/domains/int 20140126201239 zipnum 8884 353
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386
org,iana)/time-zones 20140126200737 zipnum 9623 145
sample_archive/zipcdx/zipnum-sample.loc (new file, 1 line)
@@ -0,0 +1 @@
zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz
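The .idx lines appear to follow the layout 'urlkey timestamp part offset length', with the .loc file mapping each part name to its gzipped cdx file. A rough sketch of resolving one block under that assumption (each block is presumed to be an independently gzipped run of cdx lines):

import zlib

def read_block(idx_line, loc_map):
    urlkey, timestamp, part, offset, length = idx_line.split()
    path = loc_map[part]
    with open(path, 'rb') as fh:
        fh.seek(int(offset))
        buff = fh.read(int(length))
    # decompress a single gzip member (16 + MAX_WBITS accepts gzip headers)
    return zlib.decompress(buff, 16 + zlib.MAX_WBITS)

# read_block('com,example)/ 20140127171200 zipnum 0 276',
#            {'zipnum': './sample_archive/zipcdx/zipnum-sample.cdx.gz'})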
setup.py (4 lines changed)
@@ -22,7 +22,9 @@ setup(
     },
     data_files = [
         ('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
-        ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))
+        ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
+        ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
+        ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))
     ],
     install_requires=[
         'uwsgi',
@@ -3,6 +3,8 @@ import pytest
 
 import yaml
 
+from pywb.cdx.perms import AllowAllPerms
+
 
 @pytest.fixture
 def testconfig():
     config = yaml.load(open('test_config.yaml'))
@@ -25,7 +27,7 @@ class PrintReporter:
     pass
 
 
 #================================================================
-class TestExclusionPerms:
+class TestExclusionPerms(AllowAllPerms):
     """
     Perm Checker fixture which can block one URL.
     """
@@ -37,20 +39,7 @@ class TestExclusionPerms:
         Return true/false if url or urlkey (canonicalized url)
         should be allowed
         """
-        print "allow_url_lookup:urlkey={}".format(urlkey)
         if urlkey == self.URLKEY_EXCLUDED:
             return False
 
-        return True
-
-    def allow_capture(self, cdx):
-        """
-        Return True if specified capture (cdx) is allowed.
-        """
-        return True
-
-    def filter_fields(self, cdx):
-        """
-        Filter out any forbidden cdx fields from cdx object
-        """
-        return cdx
+        return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
@@ -2,6 +2,7 @@ import webtest
 from pywb.pywb_init import pywb_config
 from pywb.wbapp import create_wb_app
 from pywb.cdx.cdxobject import CDXObject
+from pywb.cdx.perms import AllowAllPerms
 
 class TestWb:
     TEST_CONFIG = 'test_config.yaml'
@@ -75,7 +76,19 @@ class TestWb:
 
         assert 'Mon, Jan 27 2014 17:12:38' in resp.body
         assert 'wb.js' in resp.body
-        assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
+        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
+
+    def test_replay_identity_1(self):
+        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
+        #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
+        #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
+        #self._assert_basic_html(resp)
+
+        # no wb header insertion
+        assert 'wb.js' not in resp.body
+
+        # original unrewritten url present
+        assert '"http://www.iana.org/domains/example"' in resp.body
 
     def test_replay_content_length_1(self):
         # test larger file, rewritten file (svg!)