From 15d2cdd1b3c8e2e8e8ac7b83c94f02ee99f31b07 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 1 Mar 2014 16:35:27 -0800 Subject: [PATCH] cdx: cleanup and more consistent handling of remote cdx in RemoteCDXServer RemoteCDXServer delegates all filtering/processing to the remote server and simply proxies the remote response. RemoteCDXSource (the default when a remote url is used with CDXServer) only fetches the unfiltered/unprocessed stream from the remote and performs the cdx ops locally. --- pywb/cdx/cdxops.py | 2 +- pywb/cdx/cdxserver.py | 7 +++---- pywb/cdx/cdxsource.py | 9 ++++----- pywb/cdx/query.py | 13 +++++-------- pywb/cdx/test/cdxserver_test.py | 16 ++++++++++++++-- pywb/cdx/zipnum.py | 1 - pywb/handlers.py | 4 ++-- pywb/indexreader.py | 3 --- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 92809b07..10be9bb1 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -25,7 +25,7 @@ def cdx_load(sources, query, perms_checker=None, process=True): cdx_iter = load_cdx_streams(sources, query) cdx_iter = make_obj_iter(cdx_iter, query) - if process and query.process: + if process and not query.secondary_index_only: cdx_iter = process_cdx(cdx_iter, query) if perms_checker: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 8753545d..54d46f4b 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -175,14 +175,13 @@ class RemoteCDXServer(BaseCDXServer): if isinstance(source, RemoteCDXSource): self.source = source - elif (isinstance(source, str) and - any(source.startswith(x) for x in ['http://', 'https://'])): - self.source = RemoteCDXSource(source) + elif (isinstance(source, str) and is_http(source)): + self.source = RemoteCDXSource(source, remote_processing=True) else: raise Exception('Invalid remote cdx source: ' + str(source)) def load_cdx_query(self, query): - remote_iter = cdx_load(self.sources, query, process=False) + remote_iter = cdx_load([self.source], query, process=False) return self._check_cdx_iter(remote_iter, query) def __str__(self): 
diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 119f2006..0923fba9 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -41,19 +41,18 @@ class RemoteCDXSource(CDXSource): Only url and match type params are proxied at this time, the stream is passed through all other filters locally. """ - def __init__(self, filename, cookie=None, proxy_all=True): + def __init__(self, filename, cookie=None, remote_processing=False): self.remote_url = filename self.cookie = cookie - self.proxy_all = proxy_all + self.remote_processing = remote_processing def load_cdx(self, query): - if self.proxy_all: - query.set_process(False) + if self.remote_processing: remote_query = query else: # Only send url and matchType params to remote remote_query = CDXQuery(url=query.url, - match_type=query.matchType) + match_type=query.match_type) urlparams = remote_query.urlencode() diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py index 3ce2fc3d..dc480836 100644 --- a/pywb/cdx/query.py +++ b/pywb/cdx/query.py @@ -79,13 +79,6 @@ class CDXQuery(object): def secondary_index_only(self): return self._get_bool('showPagedIndex') - @property - def process(self): - return self._get_bool('processOps', True) - - def set_process(self, process): - self.params['processOps'] = process - def _get_bool(self, name, def_val=False): v = self.params.get(name) if v: @@ -103,6 +96,10 @@ class CDXQuery(object): @staticmethod def from_wsgi_env(env): + return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env)) + + @staticmethod + def extract_params_from_wsgi_env(env): """ utility function to extract params and create a CDXQuery from a WSGI environment dictionary """ @@ -119,4 +116,4 @@ class CDXQuery(object): if name != 'filter': params[name] = val[0] - return CDXQuery(**params) + return params diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index f09af0fc..e261ead4 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -142,6 +142,8 
@@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('filename', 'dupes.warc.gz')] # NOTE: external dependency -- need self-contained test TODO + +# Load remote query but filter locally >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') >>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), @@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('length', '1792')] +# No local filtering/processing of cdx, simply return result from remote server +>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +>>> pprint.pprint(x.next().items()) +[('urlkey', 'com,example)/'), + ('timestamp', '20020120142510'), + ('original', 'http://example.com:80/'), + ('mimetype', 'text/html'), + ('statuscode', '200'), + ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), + ('length', '1792')] ->>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') +>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') Traceback (most recent call last): AccessException: Blocked By Robots """ #================================================================= -from pywb.cdx.cdxserver import CDXServer +from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer import os import sys import pprint diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index fbb1503f..1d0cb24f 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -121,7 +121,6 @@ class ZipNumCluster(CDXSource): prev_size=1) if query.secondary_index_only: - query.set_process(False) return idx_iter else: blocks = self.idx_to_cdx(idx_iter, query) diff --git a/pywb/handlers.py b/pywb/handlers.py index 43cb94f9..0d9500f4 100644 --- a/pywb/handlers.py 
+++ b/pywb/handlers.py @@ -79,8 +79,8 @@ class CDXHandler(BaseHandler): self.view = view if view else TextCapturesView() def __call__(self, wbrequest): - query = CDXQuery.from_wsgi_env(wbrequest.env) - cdx_lines = self.index_reader.load_cdx_query(query) + params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) + cdx_lines = self.index_reader.load_cdx(**params) return self.view.render_response(wbrequest, cdx_lines) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index ff17dfde..a422d0b4 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -34,9 +34,6 @@ class IndexReader(object): return cdxlines - def load_cdx_query(self, query): - return self.cdx_server.load_cdx_query(query) - def load_cdx(self, **params): return self.cdx_server.load_cdx(**params)