1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge remote-tracking branch 'origin/master' into cdx-server

Conflicts:
	pywb/cdx/cdxdomainspecific.py
	pywb/cdx/cdxserver.py
	pywb/cdx/test/cdxserver_test.py
	setup.py
	tests/test_integration.py
This commit is contained in:
Kenji Nagahashi 2014-02-28 19:47:24 +00:00
commit 1f65eff828
38 changed files with 931 additions and 203 deletions

View File

@ -2,6 +2,9 @@
omit = omit =
*/test/* */test/*
*/tests/* */tests/*
*.html
*.js
*.css
[report] [report]
exclude_lines = exclude_lines =

View File

@ -3,9 +3,8 @@ python:
- "2.7" - "2.7"
# command to install dependencies # command to install dependencies
install: install:
- "python setup.py -q install" - python setup.py -q install
- "pip install python-coveralls" - pip install coverage pytest-cov coveralls --use-mirrors
- "pip install pytest-cov"
# command to run tests # command to run tests
#script: nosetests --with-doctest #script: nosetests --with-doctest
#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py

View File

@ -2,6 +2,7 @@ PyWb 0.2 Beta
============== ==============
[![Build Status](https://travis-ci.org/ikreymer/pywb.png?branch=master)](https://travis-ci.org/ikreymer/pywb) [![Build Status](https://travis-ci.org/ikreymer/pywb.png?branch=master)](https://travis-ci.org/ikreymer/pywb)
[![Coverage Status](https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master)](https://coveralls.io/r/ikreymer/pywb?branch=master)
pywb is a Python re-implementation of the Wayback Machine software. pywb is a Python re-implementation of the Wayback Machine software.

View File

@ -50,7 +50,10 @@ class Route:
def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD): def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
self.path = regex self.path = regex
self.regex = re.compile(regex + lookahead) if regex:
self.regex = re.compile(regex + lookahead)
else:
self.regex = re.compile('')
self.handler = handler self.handler = handler
# collection id from regex group (default 0) # collection id from regex group (default 0)
self.coll_group = coll_group self.coll_group = coll_group
@ -70,7 +73,6 @@ class Route:
return None return None
matched_str = matcher.group(0) matched_str = matcher.group(0)
if matched_str: if matched_str:
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri

View File

@ -3,34 +3,43 @@ import re
import logging import logging
import pkg_resources import pkg_resources
from canonicalize import unsurt, UrlCanonicalizer from pywb.utils.dsrules import BaseRule, RuleSet
from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
#================================================================= #=================================================================
def load_domain_specific_cdx_rules(filename, surt_ordered): def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
fh = pkg_resources.resource_string(__name__, filename) #fh = pkg_resources.resource_string(__name__, filename)
config = yaml.load(fh) #config = yaml.load(fh)
canon = None
fuzzy = None
# Load Canonicalizer Rules # Load Canonicalizer Rules
rules = StartsWithRule.load_rules(config.get('canon_rules'), rules = RuleSet(CDXDomainSpecificRule, 'canonicalize',
surt_ordered) ds_rules_file=ds_rules_file)
if not surt_ordered:
for rule in rules:
rule.unsurt()
if rules: if rules:
canon = CustomUrlCanonicalizer(rules, surt_ordered) canon = CustomUrlCanonicalizer(rules, surt_ordered)
else:
canon = None
# Load Fuzzy Lookup Rules # Load Fuzzy Lookup Rules
rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'), rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup',
surt_ordered) ds_rules_file=ds_rules_file)
if not surt_ordered:
for rule in rules:
rule.unsurt()
if rules: if rules:
fuzzy = FuzzyQuery(rules) fuzzy = FuzzyQuery(rules)
else:
fuzzy = None
logging.debug('CANON: ' + str(canon)) logging.debug('CustomCanonilizer? ' + str(bool(canon)))
logging.debug('FUZZY: ' + str(fuzzy)) logging.debug('FuzzyMatcher? ' + str(bool(canon)))
return (canon, fuzzy) return (canon, fuzzy)
@ -43,10 +52,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer):
def __call__(self, url): def __call__(self, url):
urlkey = super(CustomUrlCanonicalizer, self).__call__(url) urlkey = super(CustomUrlCanonicalizer, self).__call__(url)
for rule in self.rules: for rule in self.rules.iter_matching(urlkey):
if not any(urlkey.startswith(x) for x in rule.starts):
continue
m = rule.regex.match(urlkey) m = rule.regex.match(urlkey)
if not m: if not m:
continue continue
@ -67,11 +73,10 @@ class FuzzyQuery:
urlkey = params['key'] urlkey = params['key']
url = params['url'] url = params['url']
filter_ = params.get('filter', [])
output = params.get('output')
for rule in self.rules: for rule in self.rules.iter_matching(urlkey):
if not any(urlkey.startswith(x) for x in rule.starts):
continue
m = rule.regex.search(urlkey) m = rule.regex.search(urlkey)
if not m: if not m:
continue continue
@ -79,7 +84,7 @@ class FuzzyQuery:
matched_rule = rule matched_rule = rule
if len(m.groups()) == 1: if len(m.groups()) == 1:
params['filter'] = '=urlkey:' + m.group(1) filter_.append('~urlkey:' + m.group(1))
break break
@ -88,28 +93,40 @@ class FuzzyQuery:
inx = url.find('?') inx = url.find('?')
if inx > 0: if inx > 0:
params['url'] = url[:inx + 1] url = url[:inx + 1]
params = {'url': url,
'matchType': 'prefix',
'filter': filter_,
'output': output}
params['matchType'] = 'prefix'
params['key'] = None
return params return params
#================================================================= #=================================================================
class StartsWithRule: class CDXDomainSpecificRule(BaseRule):
def __init__(self, config, surt_ordered=True): def __init__(self, name, config):
self.starts = config.get('startswith') super(CDXDomainSpecificRule, self).__init__(name, config)
if not isinstance(self.starts, list):
self.starts = [self.starts]
self.regex = re.compile(config.get('matches')) if isinstance(config, basestring):
self.replace = config.get('replace') self.regex = re.compile(config)
self.replace = None
else:
self.regex = re.compile(config.get('match'))
self.replace = config.get('replace')
def unsurt(self): def unsurt(self):
# must convert to non-surt form """
self.starts = map(unsurt, self.starts) urlkey is assumed to be in surt format by default
self.regex = unsurt(self.regex) In the case of non-surt format, this method is called
self.replace = unsurt(self.replace) to desurt any urls
"""
self.url_prefix = map(unsurt, self.url_prefix)
if self.regex:
self.regex = unsurt(self.regex)
if self.replace:
self.replace = unsurt(self.replace)
@staticmethod @staticmethod
def load_rules(rules_config, surt_ordered=True): def load_rules(rules_config, surt_ordered=True):

View File

@ -151,9 +151,15 @@ def cdx_filter(cdx_iter, filter_strings):
if self.invert: if self.invert:
string = string[1:] string = string[1:]
self.exact = string.startswith('=') # exact match
if self.exact: if string.startswith('='):
string = string[1:] string = string[1:]
self.compare_func = self.exact
elif string.startswith('~'):
string = string[1:]
self.compare_func = self.contains
else:
self.compare_func = self.regex
parts = string.split(':', 1) parts = string.split(':', 1)
# no field set, apply filter to entire cdx # no field set, apply filter to entire cdx
@ -164,19 +170,28 @@ def cdx_filter(cdx_iter, filter_strings):
self.field = parts[0] self.field = parts[0]
string = parts[1] string = parts[1]
if self.exact: # make regex if regex mode
self.exact_str = string if self.compare_func == self.regex:
else:
self.regex = re.compile(string) self.regex = re.compile(string)
else:
self.filter_str = string
def __call__(self, cdx): def __call__(self, cdx):
val = cdx[self.field] if self.field else str(cdx) val = cdx[self.field] if self.field else str(cdx)
if self.exact:
matched = (self.exact_str == val) matched = self.compare_func(val)
else:
matched = self.regex.match(val) is not None
return matched ^ self.invert return matched ^ self.invert
def exact(self, val):
return (self.filter_str == val)
def contains(self, val):
return (self.filter_str in val)
def regex(self, val):
return self.regex.match(val) is not None
filters = map(Filter, filter_strings) filters = map(Filter, filter_strings)
for cdx in cdx_iter: for cdx in cdx_iter:

View File

@ -1,4 +1,4 @@
from canonicalize import UrlCanonicalizer, calc_search_range from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
@ -17,13 +17,13 @@ import urlparse
#================================================================= #=================================================================
class BaseCDXServer(object): class BaseCDXServer(object):
def __init__(self, **kwargs): def __init__(self, **kwargs):
ds_rules = kwargs.get('ds_rules') ds_rules_file = kwargs.get('ds_rules_file')
surt_ordered = kwargs.get('surt_ordered', True) surt_ordered = kwargs.get('surt_ordered', True)
# load from domain-specific rules # load from domain-specific rules
if ds_rules: if ds_rules_file:
self.url_canon, self.fuzzy_query = ( self.url_canon, self.fuzzy_query = (
load_domain_specific_cdx_rules(ds_rules, surt_ordered)) load_domain_specific_cdx_rules(ds_rules_file, surt_ordered))
# or custom passed in canonicalizer # or custom passed in canonicalizer
else: else:
self.url_canon = kwargs.get('url_canon') self.url_canon = kwargs.get('url_canon')
@ -50,14 +50,14 @@ class BaseCDXServer(object):
url = params['url'] url = params['url']
if self.fuzzy_query and params.get('allowFuzzy'): # check if fuzzy is allowed and ensure that its an
if not 'key' in params: # exact match
params['key'] = self.url_canon(url) if (self.fuzzy_query and params.get('allowFuzzy') and
params.get('matchType', 'exact') == 'exact'):
params = self.fuzzy_query(params) fuzzy_params = self.fuzzy_query(params)
if params: if fuzzy_params:
params['allowFuzzy'] = False return self.load_cdx(**fuzzy_params)
return self.load_cdx(**params)
msg = 'No Captures found for: ' + url msg = 'No Captures found for: ' + url
raise CaptureNotFoundException(msg) raise CaptureNotFoundException(msg)
@ -98,7 +98,6 @@ class CDXServer(BaseCDXServer):
msg = 'A url= param must be specified to query the cdx server' msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg) raise CDXException(msg)
#params['key'] = self.url_canon(url)
match_type = params.get('matchType', 'exact') match_type = params.get('matchType', 'exact')
key, end_key = calc_search_range(url=url, key, end_key = calc_search_range(url=url,
@ -159,7 +158,7 @@ class CDXServer(BaseCDXServer):
if filename.endswith('.cdx'): if filename.endswith('.cdx'):
return CDXFile(filename) return CDXFile(filename)
if filename.endswith('.summary'): if filename.endswith(('.summary', '.idx')):
return ZipNumCluster(filename, config) return ZipNumCluster(filename, config)
logging.warn('skipping unrecognized URI:%s', filename) logging.warn('skipping unrecognized URI:%s', filename)
@ -218,7 +217,7 @@ def create_cdx_server(config, ds_rules_file=None):
return server_cls(paths, return server_cls(paths,
config=pass_config, config=pass_config,
surt_ordered=surt_ordered, surt_ordered=surt_ordered,
ds_rules=ds_rules_file, ds_rules_file=ds_rules_file,
perms_checker=perms_checker) perms_checker=perms_checker)
#================================================================= #=================================================================

View File

@ -1,6 +1,8 @@
from pywb.utils.binsearch import iter_range from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
from cdxobject import AccessException
import urllib import urllib
import urllib2 import urllib2
import itertools import itertools
@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource):
self.key_prefix = self.DEFAULT_KEY_PREFIX self.key_prefix = self.DEFAULT_KEY_PREFIX
if config: if config:
self.key_prefix = config.get('redis_key_prefix', self.key_prefix) self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, params): def load_cdx(self, params):
""" """

View File

@ -1,7 +1,7 @@
#================================================================= #=================================================================
class AllowAllPerms: class AllowAllPerms(object):
""" """
Sample Perm Checker which allows all Sample Perm Checker which allows all
""" """

View File

@ -1,24 +0,0 @@
fuzzy_lookup_rules:
- startswith: 'com,twitter)/i/profiles/show/'
matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
- startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
matches: '([^/]+(?:\.css|\.js))'
# matches all urls
- startswith: ''
matches: '[&?](?:_|uncache)=[\d]+[&]?'
canon_rules:
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'

View File

@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Filter contains
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Collapse by timestamp # Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat # unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
@ -131,9 +141,9 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('offset', '334'), ('offset', '334'),
('filename', 'dupes.warc.gz')] ('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test # NOTE: external dependency -- need self-contained test TODO
#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
#>>> pprint.pprint(x.next().items()) >>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'), [('urlkey', 'com,example)/'),
('timestamp', '20020120142510'), ('timestamp', '20020120142510'),
('original', 'http://example.com:80/'), ('original', 'http://example.com:80/'),
@ -142,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')] ('length', '1792')]
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
Traceback (most recent call last):
AccessException: Blocked By Robots
""" """
#================================================================= #=================================================================
@ -169,7 +183,8 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
results = server.load_cdx(**kwparams) results = server.load_cdx(**kwparams)
for x in results: for x in results:
sys.stdout.write(x.to_text(fields)) l = x.to_text(fields).replace('\t', ' ')
sys.stdout.write(l)
#================================================================ #================================================================

View File

@ -0,0 +1,44 @@
"""
>>> zip_ops_test(url = 'http://iana.org')
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# test idx index (tabs replaced with 4 spaces)
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
org,iana)/dnssec 20140126201307 zipnum 8511 373
org,iana)/domains/int 20140126201239 zipnum 8884 353
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
"""
from cdxserver_test import cdx_ops_test
from pywb import get_test_dir
test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'
print test_zipnum
def zip_ops_test(url, **kwargs):
    """
    Run cdx_ops_test for the given url using the sample zipnum index
    (test_zipnum) as the only source, forwarding any extra query
    params (matchType, showPagedIndex, ...) unchanged.
    """
    cdx_ops_test(url, test_zipnum, **kwargs)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
return file return file
#================================================================= #=================================================================
def create_wb_handler(cdx_server, config): def create_wb_handler(cdx_server, config, ds_rules_file=None):
record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
paths = config.get('archive_paths') paths = config.get('archive_paths')
resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) resolving_loader = ResolvingLoader(paths=paths,
cdx_server=cdx_server,
record_loader=record_loader)
replayer = replay_views.ReplayView( replayer = replay_views.ReplayView(
content_loader = resolving_loader, content_loader = resolving_loader,
content_rewriter = RewriteContent(), content_rewriter = RewriteContent(ds_rules_file=ds_rules_file),
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),

View File

@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}):
route_config = DictChain(value, config) route_config = DictChain(value, config)
ds_rules = route_config.get('domain_specific_rules', None) ds_rules_file = route_config.get('domain_specific_rules', None)
cdx_server = IndexReader(route_config, ds_rules) cdx_server = IndexReader(route_config, ds_rules_file)
wb_handler = config_utils.create_wb_handler( wb_handler = config_utils.create_wb_handler(
cdx_server = cdx_server, cdx_server=cdx_server,
config = route_config, config=route_config,
ds_rules_file=ds_rules_file,
) )
logging.debug('Adding Collection: ' + name) logging.debug('Adding Collection: ' + name)

View File

@ -7,6 +7,8 @@ from wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect from wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.loaders import LimitReader
#================================================================= #=================================================================
class ReplayView: class ReplayView:
def __init__(self, content_loader, content_rewriter, head_insert_view = None, def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@ -53,10 +55,21 @@ class ReplayView:
response = None response = None
# if Content-Length for payload is present, ensure we don't read past it
content_len = status_headers.get_header('content-length')
try:
content_len=int(content_len)
if content_len > 0:
stream = LimitReader(stream, content_len)
except ValueError:
pass
if self.content_rewriter and wbrequest.wb_url.mod != 'id_': if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
response = self.rewrite_content(wbrequest, cdx, status_headers, stream) response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
else: else:
(status_headers, stream) = self.sanitize_content(status_headers, stream) (status_headers, stream) = self.sanitize_content(status_headers, stream)
#status_headers.remove_header('content-length')
response_iter = self.stream_to_iter(stream) response_iter = self.stream_to_iter(stream)
response = WbResponse(status_headers, response_iter) response = WbResponse(status_headers, response_iter)
@ -99,20 +112,34 @@ class ReplayView:
def rewrite_content(self, wbrequest, cdx, status_headers, stream): def rewrite_content(self, wbrequest, cdx, status_headers, stream):
urlrewriter = wbrequest.urlrewriter urlrewriter = wbrequest.urlrewriter
(rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream) result = self.content_rewriter.rewrite_headers(urlrewriter,
status_headers,
stream,
cdx['urlkey'])
(rewritten_headers, stream) = result
# no rewriting needed! # no rewriting needed!
if rewritten_headers.text_type is None: if rewritten_headers.text_type is None:
response_iter = self.stream_to_iter(stream) response_iter = self.stream_to_iter(stream)
return WbResponse(rewritten_headers.status_headers, response_iter) return WbResponse(rewritten_headers.status_headers, response_iter)
# do head insert def make_head_insert(rule):
return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
cdx=cdx,
rule=rule))
# do head insert
if self.head_insert_view: if self.head_insert_view:
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) head_insert_func = make_head_insert
else: else:
head_insert_str = None head_insert_func = None
(status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str) result = self.content_rewriter.rewrite_content(urlrewriter,
rewritten_headers,
stream,
head_insert_func,
cdx['urlkey'])
(status_headers, response_gen) = result
if self.buffer_response: if self.buffer_response:
if wbrequest.wb_url.mod == 'id_': if wbrequest.wb_url.mod == 'id_':

View File

@ -4,11 +4,16 @@ import itertools
from url_rewriter import UrlRewriter from url_rewriter import UrlRewriter
#================================================================= #=================================================================
class RegexRewriter(object): class RegexRewriter(object):
#@staticmethod
#def comment_out(string):
# return '/*' + string + '*/'
@staticmethod @staticmethod
def comment_out(string): def format(template):
return '/*' + string + '*/' return lambda string: template.format(string)
@staticmethod @staticmethod
def remove_https(string): def remove_https(string):
@ -20,19 +25,16 @@ class RegexRewriter(object):
@staticmethod @staticmethod
def archival_rewrite(rewriter): def archival_rewrite(rewriter):
return lambda x: rewriter.rewrite(x) return lambda string: rewriter.rewrite(string)
@staticmethod #@staticmethod
def replacer(string): #def replacer(other):
return lambda x: string # return lambda m, string: other
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix DEFAULT_OP = add_prefix
def __init__(self, rules): def __init__(self, rules):
#rules = self.create_rules(http_prefix) #rules = self.create_rules(http_prefix)
@ -76,52 +78,68 @@ class RegexRewriter(object):
op = RegexRewriter.DEFAULT_OP(op) op = RegexRewriter.DEFAULT_OP(op)
result = op(m.group(i)) result = op(m.group(i))
final_str = result
# if extracting partial match # if extracting partial match
if i != full_m: if i != full_m:
result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)] final_str = m.string[m.start(full_m):m.start(i)]
final_str += result
final_str += m.string[m.end(i):m.end(full_m)]
return final_str
@staticmethod
def parse_rules_from_config(config):
def parse_rule(obj):
match = obj.get('match')
replace = RegexRewriter.format(obj.get('replace', '{0}'))
group = obj.get('group', 0)
result = (match, replace, group)
return result return result
return map(parse_rule, config)
#================================================================= #=================================================================
class JSLinkRewriter(RegexRewriter): class JSLinkOnlyRewriter(RegexRewriter):
""" """
JS Rewriter which rewrites absolute http://, https:// and // urls JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string at the beginning of a string
""" """
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, rules = []): def __init__(self, rewriter, rules=[]):
rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)] rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
super(JSLinkRewriter, self).__init__(rules) super(JSLinkOnlyRewriter, self).__init__(rules)
#================================================================= #=================================================================
class JSLocationAndLinkRewriter(JSLinkRewriter): class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
""" """
JS Rewriter which also rewrites location and domain to the JS Rewriter which also rewrites location and domain to the
specified prefix (default: 'WB_wombat_') specified prefix (default: 'WB_wombat_')
""" """
def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'): def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
rules = rules + [ rules = rules + [
(r'(?<!/)\blocation\b', prefix, 0), (r'(?<!/)\blocation\b', prefix, 0),
(r'(?<=document\.)domain', prefix, 0), (r'(?<=document\.)domain', prefix, 0),
] ]
super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules) #import sys
#sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules)
#================================================================= #=================================================================
# Set 'default' JSRewriter # Set 'default' JSRewriter
JSRewriter = JSLocationAndLinkRewriter JSRewriter = JSLinkAndLocationRewriter
#================================================================= #=================================================================
class XMLRewriter(RegexRewriter): class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra = []): def __init__(self, rewriter, extra=[]):
rules = self._create_rules(rewriter.get_abs_url()) rules = self._create_rules(rewriter.get_abs_url())
RegexRewriter.__init__(self, rules) super(XMLRewriter, self).__init__(rules)
# custom filter to reject 'xmlns' attr # custom filter to reject 'xmlns' attr
def filter(self, m): def filter(self, m):
@ -133,24 +151,28 @@ class XMLRewriter(RegexRewriter):
def _create_rules(self, http_prefix): def _create_rules(self, http_prefix):
return [ return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2), ('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
] ]
#================================================================= #=================================================================
class CSSRewriter(RegexRewriter): class CSSRewriter(RegexRewriter):
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)" CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
"(?!url[\\s\\(])([\w.:/\\\\-]+)")
def __init__(self, rewriter): def __init__(self, rewriter):
rules = self._create_rules(rewriter) rules = self._create_rules(rewriter)
super(CSSRewriter, self).__init__(rules)
RegexRewriter.__init__(self, rules)
def _create_rules(self, rewriter): def _create_rules(self, rewriter):
return [ return [
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1), (CSSRewriter.CSS_URL_REGEX,
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1), RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
RegexRewriter.archival_rewrite(rewriter), 1),
] ]

View File

@ -1,30 +1,27 @@
import chardet import chardet
import pkgutil
import yaml
from url_rewriter import UrlRewriter from header_rewriter import RewrittenStatusAndHeaders
from html_rewriter import HTMLRewriter
from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
#=================================================================
class RewriteContent: class RewriteContent:
def __init__(self, ds_rules_file=None):
self.ruleset = RuleSet(RewriteRules, 'rewrite',
default_rule_config={},
ds_rules_file=ds_rules_file)
DEFAULT_CONTENT_REWRITERS = { def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
'header': HeaderRewriter, header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']
'js': JSRewriter,
'css': CSSRewriter,
'xml': XMLRewriter,
'html': HTMLRewriter
}
rewritten_headers = header_rewriter_class().rewrite(status_headers, urlrewriter)
def __init__(self, rewriters = {}):
self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())
def rewrite_headers(self, urlrewriter, status_headers, stream):
rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)
# note: since chunking may be broken, approach taken here is to *always* attempt # note: since chunking may be broken, approach taken here is to *always* attempt
# to dechunk if transfer-encoding: chunked is present # to dechunk if transfer-encoding: chunked is present
@ -37,7 +34,8 @@ class RewriteContent:
return (rewritten_headers, stream) return (rewritten_headers, stream)
def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None): def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''):
# see if we've already rewritten headers # see if we've already rewritten headers
if isinstance(headers, RewrittenStatusAndHeaders): if isinstance(headers, RewrittenStatusAndHeaders):
rewritten_headers = headers rewritten_headers = headers
@ -50,9 +48,11 @@ class RewriteContent:
return (status_headers, gen) return (status_headers, gen)
status_headers = rewritten_headers.status_headers status_headers = rewritten_headers.status_headers
# Handle text content rewriting # Handle text content rewriting
# ========================================================================= # =========================================================================
# special case -- need to ungzip the body # special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')): if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = DecompressingBufferedReader(stream, decomp_type='gzip') stream = DecompressingBufferedReader(stream, decomp_type='gzip')
@ -68,13 +68,27 @@ class RewriteContent:
text_type = rewritten_headers.text_type text_type = rewritten_headers.text_type
rewriter_class = self.rewriters.get(text_type) rule = self.ruleset.get_first_match(urlkey)
if not rewriter_class:
try:
rewriter_class = rule.rewriters[text_type]
except KeyError:
raise Exception('Unknown Text Type for Rewrite: ' + text_type) raise Exception('Unknown Text Type for Rewrite: ' + text_type)
#import sys
#sys.stderr.write(str(vars(rule)))
if text_type == 'html': if text_type == 'html':
rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str) head_insert_str = ''
if head_insert_func:
head_insert_str = head_insert_func(rule)
rewriter = rewriter_class(urlrewriter,
outstream=None,
js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'],
head_insert=head_insert_str)
else: else:
rewriter = rewriter_class(urlrewriter) rewriter = rewriter_class(urlrewriter)

View File

@ -2,12 +2,17 @@ import urllib2
import os import os
import sys import sys
import datetime import datetime
import mimetypes
from pywb.utils.loaders import is_http
from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
""" """
Fetch a url from live web and apply rewriting rules Fetch a url from live web and apply rewriting rules
""" """
@ -26,10 +31,37 @@ def get_status_and_stream(url):
return (status_headers, stream) return (status_headers, stream)
#================================================================= #=================================================================
def get_rewritten(url, urlrewriter): def get_local_file(uri):
(status_headers, stream) = get_status_and_stream(url) fh = open(uri)
status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream) content_type, _ = mimetypes.guess_type(uri)
# create fake headers for local file
status_headers = StatusAndHeaders('200 OK', [('Content-Type', content_type)])
stream = fh
return (status_headers, stream)
#=================================================================
def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
if is_http(url):
(status_headers, stream) = get_status_and_stream(url)
else:
(status_headers, stream) = get_local_file(url)
# explicit urlkey may be passed in (say for testing)
if not urlkey:
urlkey = canonicalize(url)
rewriter = RewriteContent()
result = rewriter.rewrite_content(urlrewriter,
status_headers,
stream,
head_insert_func=head_insert_func,
urlkey=urlkey)
status_headers, gen = result
buff = '' buff = ''
for x in gen: for x in gen:

View File

@ -0,0 +1,53 @@
from pywb.utils.dsrules import BaseRule
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from html_rewriter import HTMLRewriter
from header_rewriter import HeaderRewriter
import itertools
class RewriteRules(BaseRule):
    """
    Rewrite settings for one url_prefix block of the rules config.

    Builds ``self.rewriters``, a map from content type ('header', 'css',
    'xml', 'html', 'js') to the rewriter class (or factory callable) to
    use for urls matching this rule.
    """
    def __init__(self, url_prefix, config=None):
        """
        :param url_prefix: prefix (or list of prefixes) this rule applies to
        :param config: optional dict of overrides; recognized keys are
            'header_class', 'css_class', 'xml_class', 'html_class',
            'js_class', 'js_rewrite_location' and 'js_regexs'
        """
        # avoid sharing a single mutable default dict between instances
        if config is None:
            config = {}

        super(RewriteRules, self).__init__(url_prefix, config)

        self.rewriters = {
            'header': config.get('header_class', HeaderRewriter),
            'css': config.get('css_class', CSSRewriter),
            'xml': config.get('xml_class', XMLRewriter),
            'html': config.get('html_class', HTMLRewriter),
        }

        # Custom handling for js rewriting, often the most complex:
        # the location rewriting can be toggled off per rule
        self.js_rewrite_location = bool(config.get('js_rewrite_location', True))

        if self.js_rewrite_location:
            js_default_class = JSLinkAndLocationRewriter
        else:
            js_default_class = JSLinkOnlyRewriter

        # set js class, using either default or override from config
        self.rewriters['js'] = config.get('js_class', js_default_class)

        # add any regexs for js rewriter
        self._add_custom_regexs('js', config)

    def _add_custom_regexs(self, field, config):
        """
        If config has a '<field>_regexs' entry, replace the rewriter for
        ``field`` with a factory that passes the extra rules to the
        original rewriter class as a second argument.
        """
        regexs = config.get(field + '_regexs')
        if not regexs:
            return

        rewriter_cls = self.rewriters[field]

        rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs)

        def extend_rewriter_with_regex(urlrewriter):
            # called in place of the rewriter class itself
            return rewriter_cls(urlrewriter, rule_def_tuples)

        self.rewriters[field] = extend_rewriter_with_regex

View File

@ -121,7 +121,7 @@ r"""
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added # custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)]) >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic # scheme-agnostic

View File

@ -1,11 +1,50 @@
from pywb.rewrite.rewrite_live import get_rewritten from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb import get_test_dir
# This module has some rewriting tests against the 'live web' # This module has some rewriting tests against the 'live web'
# As such, the content may change and the test may break # As such, the content may change and the test may break
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
def head_insert_func(rule):
    """
    Return the <head> insert for a matched rule: the wombat.js script tag
    when the rule has js location rewriting enabled, otherwise ''.
    """
    if rule.js_rewrite_location:
        return '<script src="/static/default/wombat.js"> </script>'

    return ''
def test_local_1():
    """
    Rewrite the local sample.html fixture using the explicit urlkey
    'com,example,test)/' and check that the wombat insert, the js
    location rewrite and the link rewrite are all present.
    """
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         'com,example,test)/',
                                         head_insert_func)

    # wombat insert added
    assert '<head><script src="/static/default/wombat.js"> </script>' in buff

    # location rewritten
    assert 'window.WB_wombat_location = "/other.html"' in buff

    # link rewritten
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_2_no_js_location_rewrite():
    """
    Rewrite the local sample.html fixture using the test-only urlkey
    'example,example,test)/nolocation_rewrite' and check that neither the
    wombat insert nor the js location rewrite is applied, while links are
    still rewritten.
    """
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         'example,example,test)/nolocation_rewrite',
                                         head_insert_func)

    # no wombat insert
    assert '<head><script src="/static/default/wombat.js"> </script>' not in buff

    # no location rewrite
    assert 'window.location = "/other.html"' in buff

    # still link rewrite
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_example_1(): def test_example_1():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter) status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
@ -24,9 +63,10 @@ def test_example_2():
#def test_example_3(): def test_example_domain_specific_3():
# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff # comment out bootloader
assert '/* Bootloader.configurePage' in buff

50
pywb/rules.yaml Normal file
View File

@ -0,0 +1,50 @@
rules:
# twitter rules
#=================================================================
- url_prefix: 'com,twitter)/i/profiles/show/'
fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
# facebook rules
#=================================================================
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
# not actually needed, fuzzy match is used instead here
# canonicalize:
# match: 'com,facebook\)/.*[?&]data=([^&]+).*'
# replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
- url_prefix: 'com,facebook)/'
rewrite:
js_regexs:
- match: 'Bootloader\.configurePage.*'
replace: '/* {0} */'
# yahoo rules
#=================================================================
- url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
fuzzy_lookup: '([^/]+(?:\.css|\.js))'
# testing rules -- not for valid domain
#=================================================================
# this rule block is a non-existent prefix merely for testing
- url_prefix: 'example,example,test)/nolocation_rewrite'
rewrite:
js_rewrite_location: False
# all domain rules -- fallback to this dataset
#=================================================================
# Applies to all urls -- should be last
- url_prefix: ''
fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?'

View File

@ -1,18 +1,21 @@
/*
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
This file is part of pywb.
// Rewritten location and domain obj setup pywb is free software: you can redistribute it and/or modify
window.WB_wombat_location = window.location it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
if (window.top != window) { pywb is distributed in the hope that it will be useful,
window.top.WB_wombat_location = window.top.location but WITHOUT ANY WARRANTY; without even the implied warranty of
} MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
if (window.opener) {
window.opener.WB_wombat_location = window.opener.location
}
document.WB_wombat_domain = document.domain
You should have received a copy of the GNU General Public License
along with pywb. If not, see <http://www.gnu.org/licenses/>.
*/
function initBanner() function initBanner()
{ {

219
pywb/static/wombat.js Normal file
View File

@ -0,0 +1,219 @@
/*
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
This file is part of pywb.
pywb is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pywb is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with pywb. If not, see <http://www.gnu.org/licenses/>.
*/
//============================================
// Wombat JS-Rewriting Library
//============================================
var WB_wombat_replayPrefix;
var WB_wombat_replayDatePrefix;
var WB_wombat_captureDatePart;
var WB_wombat_origHost;
function WB_StripPort(str)
{
    // Only an "http://host:port" prefix is recognized; anything else
    // is returned unchanged
    var found = str.match(/^http:\/\/[\w\d@.-]+:\d+/);

    if (!found) {
        return str;
    }

    var hostAndPort = found[0];
    var portColon = hostAndPort.lastIndexOf(':');

    // Keep everything before the port colon, then the rest of the url
    return hostAndPort.substr(0, portColon) + str.substr(hostAndPort.length);
}
function WB_IsHostUrl(str)
{
    // A leading "www." is taken as a good guess that this is a hostname
    if (str.indexOf("www.") == 0) {
        return true;
    }

    var hostPatterns = [
        // hostname:port (port required)
        /^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/,
        // ip with optional :port
        /^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/
    ];

    for (var i = 0; i < hostPatterns.length; i++) {
        var found = str.match(hostPatterns[i]);

        // Only accept reasonably short host matches
        if (found && (found[0].length < 64)) {
            return true;
        }
    }

    return false;
}
function WB_RewriteUrl(url)
{
    var HTTP_SCHEME = "http://";

    // Empty values and non-strings are passed through untouched
    if (!url || (typeof url) != "string") {
        return url;
    }

    // Already starts with the replay prefix: nothing to do.
    // The date is deliberately not checked, as it may differ per capture
    if (url.indexOf(WB_wombat_replayPrefix) == 0) {
        return url;
    }

    // Server-relative url
    if (url.charAt(0) == "/") {
        // Leave it alone if it already contains the capture date part,
        // otherwise add the dated prefix and the original host
        var hasCaptureDate = (url.indexOf(WB_wombat_captureDatePart) >= 0);
        return hasCaptureDate ? url : (WB_wombat_replayDatePrefix + WB_wombat_origHost + url);
    }

    // Full url starting with http://: add the dated prefix
    if (url.indexOf(HTTP_SCHEME) == 0) {
        return WB_wombat_replayDatePrefix + url;
    }

    // Looks like a bare hostname: add the dated prefix and the scheme
    if (WB_IsHostUrl(url)) {
        return WB_wombat_replayDatePrefix + HTTP_SCHEME + url;
    }

    return url;
}
function WB_CopyObjectFields(obj)
{
    // Shallow-copy every enumerable, non-function property of obj
    // (including inherited ones, as for..in visits them) into a new object
    var newObj = {};

    // 'prop' must be declared locally: an undeclared loop variable leaks
    // into the global scope and throws a ReferenceError in strict mode
    for (var prop in obj) {
        if ((typeof obj[prop]) != "function") {
            newObj[prop] = obj[prop];
        }
    }

    return newObj;
}
function WB_ExtractOrig(href)
{
    // No href at all: treat as empty
    if (!href) {
        return "";
    }

    var text = href.toString();

    // Look for an embedded "/http..." after the first character;
    // if present, the original url starts right after that slash
    var embedded = text.indexOf("/http", 1);

    return (embedded > 0) ? text.substr(embedded + 1) : text;
}
function WB_CopyLocationObj(loc)
{
    // Build a plain-object stand-in for a location object: non-function
    // fields are copied, href is replaced by the original (un-prefixed)
    // url, and navigation methods are forwarded to the real location
    var newLoc = WB_CopyObjectFields(loc);

    newLoc._origLoc = loc;
    newLoc._origHref = loc.href;

    // Rewrite replace and assign functions
    newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); };
    newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); };

    // Forward reload to the real location object. Copying loc.reload
    // directly would invoke it with the copy as 'this', which browsers
    // reject for host-object methods
    newLoc.reload = function(forceReload) { this._origLoc.reload(forceReload); };

    newLoc.href = WB_ExtractOrig(newLoc._origHref);
    newLoc.toString = function() { return this.href; };

    return newLoc;
}
function WB_wombat_updateLoc(reqHref, origHref, location)
{
    // Nothing was requested
    if (!reqHref) {
        return;
    }

    // Requested url is the same original url as the current one
    if (WB_ExtractOrig(origHref) == WB_ExtractOrig(reqHref)) {
        return;
    }

    // Navigate to the rewritten form of the requested url
    location.href = WB_RewriteUrl(reqHref);
}
function WB_wombat_checkLocationChange(wbLoc, isTop)
{
    var target = (isTop ? window.top.location : window.location);

    switch (typeof wbLoc) {
    case "string":
        // A string has been assigned in place of the location copy,
        // so treat it as the requested url
        WB_wombat_updateLoc(wbLoc, target.href, target);
        break;

    case "object":
        // Location copy: compare its current href with the href it
        // was created from
        WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, target);
        break;
    }
}
// Guard flag: true while a location check is in progress
var wombat_updating = false;

function WB_wombat_checkLocations()
{
    // Skip this round if a check is already running
    if (wombat_updating) {
        return false;
    }

    wombat_updating = true;

    try {
        WB_wombat_checkLocationChange(window.WB_wombat_location, false);

        // Also check the top frame when running inside a frame
        if (window.self.location != window.top.location) {
            WB_wombat_checkLocationChange(window.top.WB_wombat_location, true);
        }
    } finally {
        // Always release the guard: if a check throws and the flag stays
        // set, every later call returns early and checking stops for good
        wombat_updating = false;
    }
}
function WB_wombat_Init(replayPrefix, captureDate, origHost)
{
    // Shared settings read by the url rewriting helpers
    WB_wombat_replayPrefix = replayPrefix;
    WB_wombat_replayDatePrefix = replayPrefix + captureDate + "/";
    WB_wombat_captureDatePart = "/" + captureDate + "/";
    WB_wombat_origHost = "http://" + origHost;

    // Location copy for this window
    var ownLocation = window.self.location;
    window.WB_wombat_location = WB_CopyLocationObj(ownLocation);

    // Location copy for the top window, when framed
    var topLocation = window.top.location;
    if (ownLocation != topLocation) {
        window.top.WB_wombat_location = WB_CopyLocationObj(topLocation);
    }

    // Location copy for the opener window, when there is one
    var openerWindow = window.opener;
    if (openerWindow) {
        openerWindow.WB_wombat_location = WB_CopyLocationObj(openerWindow.location);
    }

    document.WB_wombat_domain = origHost;
}
// Check quickly after page load
setTimeout(WB_wombat_checkLocations, 100);
// Check periodically (every 500 ms)
setInterval(WB_wombat_checkLocations, 500);

View File

@ -15,6 +15,13 @@
'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'wb_prefix': 'https://localhost:8081/my_pywb/web/',
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')} 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
# route with no collection
>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
{'coll': '',
'request_uri': 'http://example.com',
'wb_prefix': '/pywb/',
'wb_url': None}
# not matching route -- skipped # not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
@ -67,6 +74,13 @@ False
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr') >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False False
# With no collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
'http://localhost:8080/2013/http://example.com/other.html'
# With SCRIPT_NAME but no collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
'http://localhost:8080/pywb-access/http://example.com/other.html'
""" """

View File

@ -1,7 +1,14 @@
<!-- WB Insert --> <!-- WB Insert -->
{% if rule.js_rewrite_location %}
<script src='{{ wbrequest.host_prefix }}/static/default/wombat.js'> </script>
<script> <script>
wbinfo = {} WB_wombat_Init("{{wbrequest.wb_prefix}}", "{{cdx['timestamp']}}", "{{cdx['original'] | host}}");
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}"; </script>
{% endif %}
<script>
wbinfo = {}
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
</script> </script>
<script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script> <script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/> <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>

View File

@ -3,8 +3,6 @@
import surt import surt
import urlparse import urlparse
from cdxobject import CDXException
#================================================================= #=================================================================
class UrlCanonicalizer(object): class UrlCanonicalizer(object):
@ -15,6 +13,12 @@ class UrlCanonicalizer(object):
return canonicalize(url, self.surt_ordered) return canonicalize(url, self.surt_ordered)
#=================================================================
class UrlCanonicalizeException(Exception):
    """
    Raised by this module for an invalid url, an invalid match_type,
    or a match type unsupported for non-surt ordering.
    """
    def status(self):
        """Return the HTTP status line to report for this error."""
        return '400 Bad Request'
#================================================================= #=================================================================
def canonicalize(url, surt_ordered=True): def canonicalize(url, surt_ordered=True):
""" """
@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True):
try: try:
key = surt.surt(url) key = surt.surt(url)
except Exception as e: except Exception as e:
raise CDXException('Invalid Url: ' + url) raise UrlCanonicalizeException('Invalid Url: ' + url)
# if not surt, unsurt the surt to get canonicalized non-surt url # if not surt, unsurt the surt to get canonicalized non-surt url
if not surt_ordered: if not surt_ordered:
@ -114,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
>>> calc_search_range('http://example.com/path/file.html', 'host', False) >>> calc_search_range('http://example.com/path/file.html', 'host', False)
('example.com/', 'example.com0') ('example.com/', 'example.com0')
# domain range not supported # errors: domain range not supported
>>> calc_search_range('http://example.com/path/file.html', 'domain', False) >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
Traceback (most recent call last): Traceback (most recent call last):
Exception: matchType=domain unsupported for non-surt UrlCanonicalizeException: matchType=domain unsupported for non-surt
>>> calc_search_range('http://example.com/path/file.html', 'blah', False)
Traceback (most recent call last):
UrlCanonicalizeException: Invalid match_type: blah
""" """
def inc_last_char(x): def inc_last_char(x):
return x[0:-1] + chr(ord(x[-1]) + 1) return x[0:-1] + chr(ord(x[-1]) + 1)
@ -155,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
elif match_type == 'domain': elif match_type == 'domain':
if not surt_ordered: if not surt_ordered:
raise Exception('matchType=domain unsupported for non-surt') raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
host = start_key.split(')/')[0] host = start_key.split(')/')[0]
@ -168,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
end_key = host + '-' end_key = host + '-'
else: else:
raise Exception('Invalid match_type: ' + match_type) raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
return (start_key, end_key) return (start_key, end_key)

98
pywb/utils/dsrules.py Normal file
View File

@ -0,0 +1,98 @@
import yaml
import pkgutil
#=================================================================
DEFAULT_RULES_FILE = 'rules.yaml'
DEFAULT_RULES_PKG = 'pywb'
#=================================================================
class RuleSet(object):
    """
    An ordered list of domain specific rules of a single type, built from
    the 'rules' list of a rules config file.
    """
    DEFAULT_KEY = ''

    def __init__(self, rule_cls, fieldname, **kwargs):
        """
        A domain specific rules block, inited via config map.
        If config map not specified, it is loaded from default location.

        The rules are represented as a list of entries, each with a
        'url_prefix'. Each rules configuration will load its own field
        type (fieldname) from each entry, wrapped in the given rule_cls.

        :param rule_cls: callable taking (url_prefix, rules_def)
        :param fieldname: key to read from each entry; entries without
            it (or with an empty value) are skipped
        :param ds_rules_file: (keyword) rules file name to load
        :param default_rule_config: (keyword) if not None, a rule for
            DEFAULT_KEY is appended with this config, unless one was
            already loaded from the file
        """
        self.rules = []

        ds_rules_file = kwargs.get('ds_rules_file')
        default_rule_config = kwargs.get('default_rule_config')

        config = self.load_default_rules(ds_rules_file)

        # a missing config or missing 'rules' entry means no rules at all
        # (iterating None here used to raise a TypeError when no
        # default_rule_config was provided)
        rulesmap = (config.get('rules') if config else None) or []

        def_key_found = False

        # iterate over master rules file
        for value in rulesmap:
            url_prefix = value.get('url_prefix')
            rules_def = value.get(fieldname)
            if not rules_def:
                continue

            if url_prefix == self.DEFAULT_KEY:
                def_key_found = True

            self.rules.append(rule_cls(url_prefix, rules_def))

        # if default_rule_config provided, always init a default ruleset
        if not def_key_found and default_rule_config is not None:
            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))

    @staticmethod
    def load_default_rules(filename=None, pkg=None):
        """
        Load and parse a yaml rules file from package data, falling back
        to DEFAULT_RULES_FILE and DEFAULT_RULES_PKG when not specified.
        """
        config = None

        if not filename:
            filename = DEFAULT_RULES_FILE

        if not pkg:
            pkg = DEFAULT_RULES_PKG

        if filename:
            yaml_str = pkgutil.get_data(pkg, filename)
            # NOTE(review): yaml.load can construct arbitrary objects;
            # consider yaml.safe_load if rules files may come from an
            # untrusted source
            config = yaml.load(yaml_str)

        return config

    def iter_matching(self, urlkey):
        """
        Iterate over all matching rules for given urlkey
        """
        for rule in self.rules:
            if rule.applies(urlkey):
                yield rule

    def get_first_match(self, urlkey):
        """
        Return the first rule matching the given urlkey, or None
        """
        return next(self.iter_matching(urlkey), None)
#=================================================================
class BaseRule(object):
    """
    Common base for rule classes: stores the url_prefix key(s) the rule
    is registered under and tests urlkeys against them.
    """
    def __init__(self, url_prefix, rules):
        # always keep the prefixes as a list, wrapping a single value
        if isinstance(url_prefix, list):
            self.url_prefix = url_prefix
        else:
            self.url_prefix = [url_prefix]

    def applies(self, urlkey):
        """
        Return True if urlkey starts with any of this rule's prefixes
        """
        for prefix in self.url_prefix:
            if urlkey.startswith(prefix):
                return True

        return False

View File

@ -9,6 +9,7 @@ import urllib2
import time import time
#=================================================================
def is_http(filename): def is_http(filename):
return any(filename.startswith(x) for x in ['http://', 'https://']) return any(filename.startswith(x) for x in ['http://', 'https://'])

View File

@ -162,6 +162,10 @@ def timestamp_to_datetime(string):
>>> timestamp_to_datetime('40001965252477') >>> timestamp_to_datetime('40001965252477')
datetime.datetime(2999, 12, 31, 23, 24, 59) datetime.datetime(2999, 12, 31, 23, 24, 59)
# not a number!
>>> timestamp_to_datetime('2010abc')
datetime.datetime(2010, 12, 31, 23, 59, 59)
""" """
# pad to 6 digits # pad to 6 digits

View File

@ -2,6 +2,7 @@ from wbexceptions import WbException, NotFoundException, InternalRedirect
from wbrequestresponse import WbResponse, StatusAndHeaders from wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.cdx.cdxserver import CDXException from pywb.cdx.cdxserver import CDXException
from pywb.utils.canonicalize import UrlCanonicalizeException
from pywb.warc.recordloader import ArchiveLoadFailed from pywb.warc.recordloader import ArchiveLoadFailed
import os import os
@ -55,7 +56,8 @@ def create_wb_app(wb_router):
except InternalRedirect as ir: except InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except (WbException, CDXException, ArchiveLoadFailed) as e: except (WbException, CDXException,
UrlCanonicalizeException, ArchiveLoadFailed) as e:
response = handle_exception(env, wb_router.error_view, e, False) response = handle_exception(env, wb_router.error_view, e, False)
except Exception as e: except Exception as e:

View File

@ -0,0 +1,14 @@
<html>
<head>
<title>Sample Page For Rewrite Test</title>
</head>
<body>
<script>
var some_val = false;
if (some_val) {
window.location = "/other.html";
}
</script>
Test Content
<a href="another.html">Some Link</a>
</body>

Binary file not shown.

View File

@ -0,0 +1,38 @@
com,example)/ 20140127171200 zipnum 0 276
org,iana)/ 20140127171238 zipnum 276 328
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289
org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208
org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207
org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276
org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210
org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211
org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281
org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298
org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213
org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216
org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270
org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215
org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209
org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210
org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410
org,iana)/dnssec 20140126201307 zipnum 8511 373
org,iana)/domains/int 20140126201239 zipnum 8884 353
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386
org,iana)/time-zones 20140126200737 zipnum 9623 145

View File

@ -0,0 +1 @@
zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz

View File

@ -22,7 +22,9 @@ setup(
}, },
data_files = [ data_files = [
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), ('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')) ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
], ],
install_requires=[ install_requires=[
'uwsgi', 'uwsgi',

View File

@ -3,6 +3,8 @@ import pytest
import yaml import yaml
from pywb.cdx.perms import AllowAllPerms
@pytest.fixture @pytest.fixture
def testconfig(): def testconfig():
config = yaml.load(open('test_config.yaml')) config = yaml.load(open('test_config.yaml'))
@ -25,7 +27,7 @@ class PrintReporter:
pass pass
#================================================================ #================================================================
class TestExclusionPerms: class TestExclusionPerms(AllowAllPerms):
""" """
Perm Checker fixture which can block one URL. Perm Checker fixture which can block one URL.
""" """
@ -37,20 +39,7 @@ class TestExclusionPerms:
Return true/false if url or urlkey (canonicalized url) Return true/false if url or urlkey (canonicalized url)
should be allowed should be allowed
""" """
print "allow_url_lookup:urlkey={}".format(urlkey)
if urlkey == self.URLKEY_EXCLUDED: if urlkey == self.URLKEY_EXCLUDED:
return False return False
return True return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
def allow_capture(self, cdx):
"""
Return True if specified capture (cdx) is allowed.
"""
return True
def filter_fields(self, cdx):
"""
Filter out any forbidden cdx fields from cdx object
"""
return cdx

View File

@ -2,6 +2,7 @@ import webtest
from pywb.pywb_init import pywb_config from pywb.pywb_init import pywb_config
from pywb.wbapp import create_wb_app from pywb.wbapp import create_wb_app
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.perms import AllowAllPerms
class TestWb: class TestWb:
TEST_CONFIG = 'test_config.yaml' TEST_CONFIG = 'test_config.yaml'
@ -75,7 +76,19 @@ class TestWb:
assert 'Mon, Jan 27 2014 17:12:38' in resp.body assert 'Mon, Jan 27 2014 17:12:38' in resp.body
assert 'wb.js' in resp.body assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
def test_replay_identity_1(self):
    """
    Request a capture with the 'id_' modifier and check that the body
    has no wb.js insert and still contains the original, unrewritten url.
    """
    resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
    #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
    #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
    #self._assert_basic_html(resp)

    # no wb header insertion
    assert 'wb.js' not in resp.body

    # original unrewritten url present
    assert '"http://www.iana.org/domains/example"' in resp.body
def test_replay_content_length_1(self): def test_replay_content_length_1(self):
# test larger file, rewritten file (svg!) # test larger file, rewritten file (svg!)