mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cdx: cleanup of, and more consistency for, RemoteCDXServer
RemoteCDXServer delegates filtering/processing to the remote server and simply proxies the response from the remote. RemoteCDXSource (the default usage with CDXServer) only fetches the unfiltered/unprocessed stream and performs cdx ops locally.
This commit is contained in:
parent
739d0a6f93
commit
15d2cdd1b3
@ -25,7 +25,7 @@ def cdx_load(sources, query, perms_checker=None, process=True):
|
|||||||
cdx_iter = load_cdx_streams(sources, query)
|
cdx_iter = load_cdx_streams(sources, query)
|
||||||
cdx_iter = make_obj_iter(cdx_iter, query)
|
cdx_iter = make_obj_iter(cdx_iter, query)
|
||||||
|
|
||||||
if process and query.process:
|
if process and not query.secondary_index_only:
|
||||||
cdx_iter = process_cdx(cdx_iter, query)
|
cdx_iter = process_cdx(cdx_iter, query)
|
||||||
|
|
||||||
if perms_checker:
|
if perms_checker:
|
||||||
|
@ -175,14 +175,13 @@ class RemoteCDXServer(BaseCDXServer):
|
|||||||
|
|
||||||
if isinstance(source, RemoteCDXSource):
|
if isinstance(source, RemoteCDXSource):
|
||||||
self.source = source
|
self.source = source
|
||||||
elif (isinstance(source, str) and
|
elif (isinstance(source, str) and is_http(source)):
|
||||||
any(source.startswith(x) for x in ['http://', 'https://'])):
|
self.source = RemoteCDXSource(source, remote_processing=True)
|
||||||
self.source = RemoteCDXSource(source)
|
|
||||||
else:
|
else:
|
||||||
raise Exception('Invalid remote cdx source: ' + str(source))
|
raise Exception('Invalid remote cdx source: ' + str(source))
|
||||||
|
|
||||||
def load_cdx_query(self, query):
|
def load_cdx_query(self, query):
|
||||||
remote_iter = cdx_load(self.sources, query, process=False)
|
remote_iter = cdx_load([self.source], query, process=False)
|
||||||
return self._check_cdx_iter(remote_iter, query)
|
return self._check_cdx_iter(remote_iter, query)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
@ -41,19 +41,18 @@ class RemoteCDXSource(CDXSource):
|
|||||||
Only url and match type params are proxied at this time,
|
Only url and match type params are proxied at this time,
|
||||||
the stream is passed through all other filters locally.
|
the stream is passed through all other filters locally.
|
||||||
"""
|
"""
|
||||||
def __init__(self, filename, cookie=None, proxy_all=True):
|
def __init__(self, filename, cookie=None, remote_processing=False):
|
||||||
self.remote_url = filename
|
self.remote_url = filename
|
||||||
self.cookie = cookie
|
self.cookie = cookie
|
||||||
self.proxy_all = proxy_all
|
self.remote_processing = remote_processing
|
||||||
|
|
||||||
def load_cdx(self, query):
|
def load_cdx(self, query):
|
||||||
if self.proxy_all:
|
if self.remote_processing:
|
||||||
query.set_process(False)
|
|
||||||
remote_query = query
|
remote_query = query
|
||||||
else:
|
else:
|
||||||
# Only send url and matchType params to remote
|
# Only send url and matchType params to remote
|
||||||
remote_query = CDXQuery(url=query.url,
|
remote_query = CDXQuery(url=query.url,
|
||||||
match_type=query.matchType)
|
match_type=query.match_type)
|
||||||
|
|
||||||
urlparams = remote_query.urlencode()
|
urlparams = remote_query.urlencode()
|
||||||
|
|
||||||
|
@ -79,13 +79,6 @@ class CDXQuery(object):
|
|||||||
def secondary_index_only(self):
|
def secondary_index_only(self):
|
||||||
return self._get_bool('showPagedIndex')
|
return self._get_bool('showPagedIndex')
|
||||||
|
|
||||||
@property
|
|
||||||
def process(self):
|
|
||||||
return self._get_bool('processOps', True)
|
|
||||||
|
|
||||||
def set_process(self, process):
|
|
||||||
self.params['processOps'] = process
|
|
||||||
|
|
||||||
def _get_bool(self, name, def_val=False):
|
def _get_bool(self, name, def_val=False):
|
||||||
v = self.params.get(name)
|
v = self.params.get(name)
|
||||||
if v:
|
if v:
|
||||||
@ -103,6 +96,10 @@ class CDXQuery(object):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_wsgi_env(env):
|
def from_wsgi_env(env):
|
||||||
|
return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def extract_params_from_wsgi_env(env):
|
||||||
""" utility function to extract params and create a CDXQuery
|
""" utility function to extract params and create a CDXQuery
|
||||||
from a WSGI environment dictionary
|
from a WSGI environment dictionary
|
||||||
"""
|
"""
|
||||||
@ -119,4 +116,4 @@ class CDXQuery(object):
|
|||||||
if name != 'filter':
|
if name != 'filter':
|
||||||
params[name] = val[0]
|
params[name] = val[0]
|
||||||
|
|
||||||
return CDXQuery(**params)
|
return params
|
||||||
|
@ -142,6 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
|||||||
('filename', 'dupes.warc.gz')]
|
('filename', 'dupes.warc.gz')]
|
||||||
|
|
||||||
# NOTE: external dependency -- need self-contained test TODO
|
# NOTE: external dependency -- need self-contained test TODO
|
||||||
|
|
||||||
|
# Load remote query but filter locally
|
||||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||||
>>> pprint.pprint(x.next().items())
|
>>> pprint.pprint(x.next().items())
|
||||||
[('urlkey', 'com,example)/'),
|
[('urlkey', 'com,example)/'),
|
||||||
@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
|||||||
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||||
('length', '1792')]
|
('length', '1792')]
|
||||||
|
|
||||||
|
# No local filtering/processing of cdx, simply return result from remote server
|
||||||
|
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||||
|
>>> pprint.pprint(x.next().items())
|
||||||
|
[('urlkey', 'com,example)/'),
|
||||||
|
('timestamp', '20020120142510'),
|
||||||
|
('original', 'http://example.com:80/'),
|
||||||
|
('mimetype', 'text/html'),
|
||||||
|
('statuscode', '200'),
|
||||||
|
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||||
|
('length', '1792')]
|
||||||
|
|
||||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
|
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
AccessException: Blocked By Robots
|
AccessException: Blocked By Robots
|
||||||
"""
|
"""
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
from pywb.cdx.cdxserver import CDXServer
|
from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import pprint
|
import pprint
|
||||||
|
@ -121,7 +121,6 @@ class ZipNumCluster(CDXSource):
|
|||||||
prev_size=1)
|
prev_size=1)
|
||||||
|
|
||||||
if query.secondary_index_only:
|
if query.secondary_index_only:
|
||||||
query.set_process(False)
|
|
||||||
return idx_iter
|
return idx_iter
|
||||||
else:
|
else:
|
||||||
blocks = self.idx_to_cdx(idx_iter, query)
|
blocks = self.idx_to_cdx(idx_iter, query)
|
||||||
|
@ -79,8 +79,8 @@ class CDXHandler(BaseHandler):
|
|||||||
self.view = view if view else TextCapturesView()
|
self.view = view if view else TextCapturesView()
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
def __call__(self, wbrequest):
|
||||||
query = CDXQuery.from_wsgi_env(wbrequest.env)
|
params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
|
||||||
cdx_lines = self.index_reader.load_cdx_query(query)
|
cdx_lines = self.index_reader.load_cdx(**params)
|
||||||
|
|
||||||
return self.view.render_response(wbrequest, cdx_lines)
|
return self.view.render_response(wbrequest, cdx_lines)
|
||||||
|
|
||||||
|
@ -34,9 +34,6 @@ class IndexReader(object):
|
|||||||
|
|
||||||
return cdxlines
|
return cdxlines
|
||||||
|
|
||||||
def load_cdx_query(self, query):
|
|
||||||
return self.cdx_server.load_cdx_query(query)
|
|
||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
return self.cdx_server.load_cdx(**params)
|
return self.cdx_server.load_cdx(**params)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user