1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cdx: cleanup and more consistency for RemoteCDXServer

RemoteCDXServer delegates filtering/processing to the remote server and simply proxies the response from the remote.
RemoteCDXSource (and its default usage with CDXServer) only fetches the unfiltered/unprocessed
stream and performs cdx ops locally.
This commit is contained in:
Ilya Kreymer 2014-03-01 16:35:27 -08:00
parent 739d0a6f93
commit 15d2cdd1b3
8 changed files with 29 additions and 26 deletions

View File

@ -25,7 +25,7 @@ def cdx_load(sources, query, perms_checker=None, process=True):
cdx_iter = load_cdx_streams(sources, query) cdx_iter = load_cdx_streams(sources, query)
cdx_iter = make_obj_iter(cdx_iter, query) cdx_iter = make_obj_iter(cdx_iter, query)
if process and query.process: if process and not query.secondary_index_only:
cdx_iter = process_cdx(cdx_iter, query) cdx_iter = process_cdx(cdx_iter, query)
if perms_checker: if perms_checker:

View File

@ -175,14 +175,13 @@ class RemoteCDXServer(BaseCDXServer):
if isinstance(source, RemoteCDXSource): if isinstance(source, RemoteCDXSource):
self.source = source self.source = source
elif (isinstance(source, str) and elif (isinstance(source, str) and is_http(source)):
any(source.startswith(x) for x in ['http://', 'https://'])): self.source = RemoteCDXSource(source, remote_processing=True)
self.source = RemoteCDXSource(source)
else: else:
raise Exception('Invalid remote cdx source: ' + str(source)) raise Exception('Invalid remote cdx source: ' + str(source))
def load_cdx_query(self, query): def load_cdx_query(self, query):
remote_iter = cdx_load(self.sources, query, process=False) remote_iter = cdx_load([self.source], query, process=False)
return self._check_cdx_iter(remote_iter, query) return self._check_cdx_iter(remote_iter, query)
def __str__(self): def __str__(self):

View File

@ -41,19 +41,18 @@ class RemoteCDXSource(CDXSource):
Only url and match type params are proxied at this time, Only url and match type params are proxied at this time,
the stream is passed through all other filters locally. the stream is passed through all other filters locally.
""" """
def __init__(self, filename, cookie=None, proxy_all=True): def __init__(self, filename, cookie=None, remote_processing=False):
self.remote_url = filename self.remote_url = filename
self.cookie = cookie self.cookie = cookie
self.proxy_all = proxy_all self.remote_processing = remote_processing
def load_cdx(self, query): def load_cdx(self, query):
if self.proxy_all: if self.remote_processing:
query.set_process(False)
remote_query = query remote_query = query
else: else:
# Only send url and matchType params to remote # Only send url and matchType params to remote
remote_query = CDXQuery(url=query.url, remote_query = CDXQuery(url=query.url,
match_type=query.matchType) match_type=query.match_type)
urlparams = remote_query.urlencode() urlparams = remote_query.urlencode()

View File

@ -79,13 +79,6 @@ class CDXQuery(object):
def secondary_index_only(self): def secondary_index_only(self):
return self._get_bool('showPagedIndex') return self._get_bool('showPagedIndex')
@property
def process(self):
return self._get_bool('processOps', True)
def set_process(self, process):
self.params['processOps'] = process
def _get_bool(self, name, def_val=False): def _get_bool(self, name, def_val=False):
v = self.params.get(name) v = self.params.get(name)
if v: if v:
@ -103,6 +96,10 @@ class CDXQuery(object):
@staticmethod @staticmethod
def from_wsgi_env(env): def from_wsgi_env(env):
return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env))
@staticmethod
def extract_params_from_wsgi_env(env):
""" utility function to extract params and create a CDXQuery """ utility function to extract params and create a CDXQuery
from a WSGI environment dictionary from a WSGI environment dictionary
""" """
@ -119,4 +116,4 @@ class CDXQuery(object):
if name != 'filter': if name != 'filter':
params[name] = val[0] params[name] = val[0]
return CDXQuery(**params) return params

View File

@ -142,6 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('filename', 'dupes.warc.gz')] ('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test TODO # NOTE: external dependency -- need self-contained test TODO
# Load remote query but filter locally
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items()) >>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'), [('urlkey', 'com,example)/'),
@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')] ('length', '1792')]
# No local filtering/processing of cdx, simply return result from remote server
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') >>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
Traceback (most recent call last): Traceback (most recent call last):
AccessException: Blocked By Robots AccessException: Blocked By Robots
""" """
#================================================================= #=================================================================
from pywb.cdx.cdxserver import CDXServer from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
import os import os
import sys import sys
import pprint import pprint

View File

@ -121,7 +121,6 @@ class ZipNumCluster(CDXSource):
prev_size=1) prev_size=1)
if query.secondary_index_only: if query.secondary_index_only:
query.set_process(False)
return idx_iter return idx_iter
else: else:
blocks = self.idx_to_cdx(idx_iter, query) blocks = self.idx_to_cdx(idx_iter, query)

View File

@ -79,8 +79,8 @@ class CDXHandler(BaseHandler):
self.view = view if view else TextCapturesView() self.view = view if view else TextCapturesView()
def __call__(self, wbrequest): def __call__(self, wbrequest):
query = CDXQuery.from_wsgi_env(wbrequest.env) params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
cdx_lines = self.index_reader.load_cdx_query(query) cdx_lines = self.index_reader.load_cdx(**params)
return self.view.render_response(wbrequest, cdx_lines) return self.view.render_response(wbrequest, cdx_lines)

View File

@ -34,9 +34,6 @@ class IndexReader(object):
return cdxlines return cdxlines
def load_cdx_query(self, query):
return self.cdx_server.load_cdx_query(query)
def load_cdx(self, **params): def load_cdx(self, **params):
return self.cdx_server.load_cdx(**params) return self.cdx_server.load_cdx(**params)