mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
warcserver: aggregator + live url rewriter
- add support for conditional RulesAggregator - add LiveRewriterIndexSource which can rewrite live web urls via regex
This commit is contained in:
parent
b7bf693885
commit
0bab36b26e
@ -128,7 +128,7 @@ class ResourceHandler(IndexHandler):
|
||||
return out_headers, resp, errs
|
||||
except (WbException, ArchiveLoadFailed) as e:
|
||||
last_exc = e
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
if True or logger.isEnabledFor(logging.DEBUG):
|
||||
traceback.print_exc()
|
||||
errs[str(loader)] = str(e)
|
||||
|
||||
|
@ -4,6 +4,7 @@ import gevent
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
import re
|
||||
|
||||
from warcio.timeutils import timestamp_now
|
||||
|
||||
@ -392,3 +393,33 @@ class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource):
|
||||
class RedisMultiKeyIndexSource(SeqAggMixin, BaseRedisMultiKeyIndexSource):
|
||||
pass
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class BaseRulesAggregator(BaseSourceListAggregator):
|
||||
def __init__(self, sources, **kwargs):
|
||||
super(BaseRulesAggregator, self).__init__(sources, **kwargs)
|
||||
rules = kwargs.get('rules', [])
|
||||
|
||||
self.rules = []
|
||||
|
||||
for rule in rules:
|
||||
match = rule['match']
|
||||
name = rule['name']
|
||||
self.rules.append((re.compile(match), name))
|
||||
|
||||
def get_all_sources(self, params):
|
||||
url = params['url']
|
||||
|
||||
for rx, name in self.rules:
|
||||
if rx.match(url):
|
||||
source = self.sources.get(name)
|
||||
if source:
|
||||
return {name: source}
|
||||
|
||||
return []
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class RulesAggregator(SeqAggMixin, BaseRulesAggregator):
|
||||
pass
|
||||
|
||||
|
@ -196,8 +196,7 @@ class RemoteIndexSource(BaseIndexSource):
|
||||
|
||||
#=============================================================================
|
||||
class LiveIndexSource(BaseIndexSource):
|
||||
def __init__(self, proxy_url='{url}'):
|
||||
self.proxy_url = proxy_url
|
||||
def __init__(self):
|
||||
self._init_sesh(DefaultAdapters.live_adapter)
|
||||
|
||||
def load_index(self, params):
|
||||
@ -209,7 +208,7 @@ class LiveIndexSource(BaseIndexSource):
|
||||
cdx['urlkey'] = params.get('key').decode('utf-8')
|
||||
cdx['timestamp'] = timestamp_now()
|
||||
cdx['url'] = params['url']
|
||||
cdx['load_url'] = res_template(self.proxy_url, params)
|
||||
cdx['load_url'] = self.get_load_url(params)
|
||||
cdx['is_live'] = 'true'
|
||||
|
||||
mime = params.get('content_type', '')
|
||||
@ -231,6 +230,9 @@ class LiveIndexSource(BaseIndexSource):
|
||||
|
||||
return iter([cdx])
|
||||
|
||||
def get_load_url(self, params):
|
||||
return params['url']
|
||||
|
||||
def __repr__(self):
|
||||
return '{0}()'.format(self.__class__.__name__)
|
||||
|
||||
@ -259,6 +261,26 @@ class LiveIndexSource(BaseIndexSource):
|
||||
return cls()
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class LiveRewriteIndexSource(LiveIndexSource):
|
||||
def __init__(self, match, replace):
|
||||
super(LiveRewriteIndexSource, self).__init__()
|
||||
self.rx = re.compile(match)
|
||||
self.replace = replace
|
||||
|
||||
def get_load_url(self, params):
|
||||
res = self.rx.sub(self.replace, params['url'])
|
||||
print(res)
|
||||
return res
|
||||
|
||||
@classmethod
|
||||
def init_from_config(cls, config):
|
||||
if config['type'] != 'live_rw':
|
||||
return
|
||||
|
||||
return cls(config['match'], config['replace'])
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class RedisIndexSource(BaseIndexSource):
|
||||
def __init__(self, redis_url=None, redis=None, key_template=None, **kwargs):
|
||||
@ -579,3 +601,4 @@ class WBMementoIndexSource(MementoIndexSource):
|
||||
@classmethod
|
||||
def _init_id(cls):
|
||||
return 'wb-memento'
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
|
||||
from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource
|
||||
from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource, LiveRewriteIndexSource
|
||||
|
||||
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||
from pywb.warcserver.index.aggregator import SimpleAggregator, RulesAggregator
|
||||
|
||||
from warcio.timeutils import timestamp_now
|
||||
|
||||
@ -144,6 +144,51 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410
|
||||
assert(key_ts_res(res, 'load_url') == expected)
|
||||
assert(errs == {})
|
||||
|
||||
def test_live_rewrite(self):
|
||||
url = 'http://example.com/some/path?A=B'
|
||||
source = LiveRewriteIndexSource(match='https?:\/\/example.com\/(.*)',
|
||||
replace=r'https://other-host.example.org/\1')
|
||||
|
||||
res, errs = self.query_single_source(source, dict(url=url))
|
||||
|
||||
assert list(res)[0]['load_url'] == 'https://other-host.example.org/some/path?A=B'
|
||||
|
||||
assert(errs == {})
|
||||
|
||||
def test_cond_aggregator_rewrite(self):
|
||||
match_url = 'https?:\/\/example.com\/(.*)'
|
||||
live_rw = LiveRewriteIndexSource(match=match_url,
|
||||
replace=r'https://other-host.example.org/\1')
|
||||
|
||||
rules = [dict(match=match_url, name='live_rw'),
|
||||
dict(match='.*', name='live')
|
||||
]
|
||||
|
||||
rules_agg = RulesAggregator(sources={'live': LiveIndexSource(),
|
||||
'live_rw': live_rw},
|
||||
rules=rules)
|
||||
|
||||
# Rewrite Matching url
|
||||
res, errs = rules_agg({'url': 'http://example.com/some/path?A=B'})
|
||||
cdx = list(res)[0]
|
||||
|
||||
# assert rewriting source used
|
||||
assert cdx['load_url'] == 'https://other-host.example.org/some/path?A=B'
|
||||
assert cdx['source'] == 'live_rw'
|
||||
|
||||
assert(errs == {})
|
||||
|
||||
# Don't rewrite other urls
|
||||
res, errs = rules_agg({'url': 'http://example.net/some/path?A=B'})
|
||||
cdx = list(res)[0]
|
||||
|
||||
# assert live source used
|
||||
assert cdx['load_url'] == 'http://example.net/some/path?A=B'
|
||||
assert cdx['source'] == 'live'
|
||||
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# Errors -- Not Found All
|
||||
def test_all_not_found(self, all_source):
|
||||
url = 'http://x-not-found-x.notfound/'
|
||||
|
@ -5,6 +5,7 @@ import os
|
||||
from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource
|
||||
from pywb.warcserver.index.indexsource import WBMementoIndexSource, FileIndexSource
|
||||
from pywb.warcserver.index.aggregator import BaseSourceListAggregator, DirectoryIndexSource
|
||||
from pywb.warcserver.index.aggregator import RulesAggregator
|
||||
from pywb.warcserver.handlers import ResourceHandler, HandlerSeq
|
||||
|
||||
|
||||
@ -50,7 +51,7 @@ class TestWarcServer(TempDirTests, BaseTestClass):
|
||||
return handler.index_source.sources
|
||||
|
||||
def test_list_static(self):
|
||||
assert len(self.loader.list_fixed_routes()) == 13
|
||||
assert len(self.loader.list_fixed_routes()) == 14
|
||||
|
||||
def test_list_dynamic(self):
|
||||
assert set(self.loader.list_dynamic_routes()) == set(['auto1', 'auto2'])
|
||||
@ -125,3 +126,18 @@ class TestWarcServer(TempDirTests, BaseTestClass):
|
||||
assert len(sources) == 1
|
||||
assert isinstance(sources['live'], LiveIndexSource)
|
||||
|
||||
def test_rules_agg(self):
|
||||
handler = self.loader.fixed_routes.get('many_rules')
|
||||
assert(handler)
|
||||
assert isinstance(handler.index_source, RulesAggregator)
|
||||
|
||||
res = handler.index_source.get_all_sources({'url': 'http://example.com/path'})
|
||||
assert len(res) == 1
|
||||
assert list(res.keys()) == ['local']
|
||||
|
||||
res = handler.index_source.get_all_sources({'url': 'http://httpbin.org/'})
|
||||
assert len(res) == 1
|
||||
assert list(res.keys()) == ['liveweb']
|
||||
|
||||
|
||||
|
||||
|
@ -1,3 +1,5 @@
|
||||
debug: true
|
||||
|
||||
collections:
|
||||
|
||||
# Live Index
|
||||
@ -45,6 +47,23 @@ collections:
|
||||
|
||||
timeout: 10
|
||||
|
||||
# many sources, conditional rules
|
||||
many_rules:
|
||||
index_group:
|
||||
local:
|
||||
path: ./local/indexes
|
||||
archive_paths: ./local/data
|
||||
type: file
|
||||
|
||||
liveweb: live
|
||||
|
||||
rules:
|
||||
- match: 'https?:\/\/example\.com\/.*'
|
||||
name: local
|
||||
|
||||
- match: '.*'
|
||||
name: liveweb
|
||||
|
||||
# Local Dir CDX
|
||||
local:
|
||||
index: ./local/indexes
|
||||
|
@ -3,13 +3,14 @@ from pywb.utils.loaders import load_yaml_config, load_overlay_config
|
||||
from pywb.warcserver.basewarcserver import BaseWarcServer
|
||||
|
||||
from pywb.warcserver.index.aggregator import CacheDirectoryIndexSource, RedisMultiKeyIndexSource
|
||||
from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator
|
||||
from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator, RulesAggregator
|
||||
|
||||
from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq
|
||||
|
||||
from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource
|
||||
from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource
|
||||
from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource
|
||||
from pywb.warcserver.index.indexsource import MementoIndexSource, WBMementoIndexSource
|
||||
from pywb.warcserver.index.indexsource import LiveIndexSource, LiveRewriteIndexSource
|
||||
from pywb.warcserver.index.indexsource import RedisIndexSource
|
||||
from pywb.warcserver.index.zipnum import ZipNumIndexSource
|
||||
|
||||
from pywb import DEFAULT_CONFIG
|
||||
@ -27,6 +28,7 @@ SOURCE_LIST = [LiveIndexSource,
|
||||
FileIndexSource,
|
||||
RemoteIndexSource,
|
||||
ZipNumIndexSource,
|
||||
LiveRewriteIndexSource,
|
||||
]
|
||||
|
||||
|
||||
@ -137,6 +139,7 @@ class WarcServer(BaseWarcServer):
|
||||
handler = self.load_coll(name, coll_config)
|
||||
except:
|
||||
print('Invalid Collection: ' + name)
|
||||
raise
|
||||
if self.debug:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
@ -178,7 +181,8 @@ class WarcServer(BaseWarcServer):
|
||||
raise Exception('no index, index_group or sequence found')
|
||||
|
||||
timeout = int(coll_config.get('timeout', 0))
|
||||
agg = init_index_agg(index_group, True, timeout)
|
||||
rules = coll_config.get('rules')
|
||||
agg = init_index_agg(index_group, True, timeout, rules=rules)
|
||||
|
||||
if not archive_paths:
|
||||
archive_paths = self.config.get('archive_paths')
|
||||
@ -232,11 +236,15 @@ def register_source(source_cls, end=False):
|
||||
|
||||
|
||||
# ============================================================================
|
||||
def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None):
|
||||
def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None,
|
||||
rules=None):
|
||||
sources = {}
|
||||
for n, v in iteritems(source_configs):
|
||||
sources[n] = init_index_source(v, source_list=source_list)
|
||||
|
||||
if rules:
|
||||
return RulesAggregator(sources, rules=rules)
|
||||
|
||||
if use_gevent:
|
||||
return GeventTimeoutAggregator(sources, timeout=timeout)
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user