mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdx: cleanup and more consistent behavior for RemoteCDXServer
RemoteCDXServer delegates filtering/processing to the remote server and simply proxies its response.
RemoteCDXSource (the default when a remote URL is used with CDXServer) only fetches the unfiltered/unprocessed stream and performs cdx ops locally.
This commit is contained in:
parent
739d0a6f93
commit
15d2cdd1b3
@ -25,7 +25,7 @@ def cdx_load(sources, query, perms_checker=None, process=True):
|
||||
cdx_iter = load_cdx_streams(sources, query)
|
||||
cdx_iter = make_obj_iter(cdx_iter, query)
|
||||
|
||||
if process and query.process:
|
||||
if process and not query.secondary_index_only:
|
||||
cdx_iter = process_cdx(cdx_iter, query)
|
||||
|
||||
if perms_checker:
|
||||
|
@ -175,14 +175,13 @@ class RemoteCDXServer(BaseCDXServer):
|
||||
|
||||
if isinstance(source, RemoteCDXSource):
|
||||
self.source = source
|
||||
elif (isinstance(source, str) and
|
||||
any(source.startswith(x) for x in ['http://', 'https://'])):
|
||||
self.source = RemoteCDXSource(source)
|
||||
elif (isinstance(source, str) and is_http(source)):
|
||||
self.source = RemoteCDXSource(source, remote_processing=True)
|
||||
else:
|
||||
raise Exception('Invalid remote cdx source: ' + str(source))
|
||||
|
||||
def load_cdx_query(self, query):
|
||||
remote_iter = cdx_load(self.sources, query, process=False)
|
||||
remote_iter = cdx_load([self.source], query, process=False)
|
||||
return self._check_cdx_iter(remote_iter, query)
|
||||
|
||||
def __str__(self):
|
||||
|
@ -41,19 +41,18 @@ class RemoteCDXSource(CDXSource):
|
||||
Only url and match type params are proxied at this time,
|
||||
the stream is passed through all other filters locally.
|
||||
"""
|
||||
def __init__(self, filename, cookie=None, proxy_all=True):
|
||||
def __init__(self, filename, cookie=None, remote_processing=False):
|
||||
self.remote_url = filename
|
||||
self.cookie = cookie
|
||||
self.proxy_all = proxy_all
|
||||
self.remote_processing = remote_processing
|
||||
|
||||
def load_cdx(self, query):
|
||||
if self.proxy_all:
|
||||
query.set_process(False)
|
||||
if self.remote_processing:
|
||||
remote_query = query
|
||||
else:
|
||||
# Only send url and matchType params to remote
|
||||
remote_query = CDXQuery(url=query.url,
|
||||
match_type=query.matchType)
|
||||
match_type=query.match_type)
|
||||
|
||||
urlparams = remote_query.urlencode()
|
||||
|
||||
|
@ -79,13 +79,6 @@ class CDXQuery(object):
|
||||
def secondary_index_only(self):
|
||||
return self._get_bool('showPagedIndex')
|
||||
|
||||
@property
|
||||
def process(self):
|
||||
return self._get_bool('processOps', True)
|
||||
|
||||
def set_process(self, process):
|
||||
self.params['processOps'] = process
|
||||
|
||||
def _get_bool(self, name, def_val=False):
|
||||
v = self.params.get(name)
|
||||
if v:
|
||||
@ -103,6 +96,10 @@ class CDXQuery(object):
|
||||
|
||||
@staticmethod
|
||||
def from_wsgi_env(env):
|
||||
return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env))
|
||||
|
||||
@staticmethod
|
||||
def extract_params_from_wsgi_env(env):
|
||||
""" utility function to extract params and create a CDXQuery
|
||||
from a WSGI environment dictionary
|
||||
"""
|
||||
@ -119,4 +116,4 @@ class CDXQuery(object):
|
||||
if name != 'filter':
|
||||
params[name] = val[0]
|
||||
|
||||
return CDXQuery(**params)
|
||||
return params
|
||||
|
@ -142,6 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
('filename', 'dupes.warc.gz')]
|
||||
|
||||
# NOTE: external dependency -- need self-contained test TODO
|
||||
|
||||
# Load remote query but filter locally
|
||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||
('length', '1792')]
|
||||
|
||||
# No local filtering/processing of cdx, simply return result from remote server
|
||||
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20020120142510'),
|
||||
('original', 'http://example.com:80/'),
|
||||
('mimetype', 'text/html'),
|
||||
('statuscode', '200'),
|
||||
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||
('length', '1792')]
|
||||
|
||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
|
||||
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
|
||||
Traceback (most recent call last):
|
||||
AccessException: Blocked By Robots
|
||||
"""
|
||||
|
||||
#=================================================================
|
||||
from pywb.cdx.cdxserver import CDXServer
|
||||
from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
|
||||
import os
|
||||
import sys
|
||||
import pprint
|
||||
|
@ -121,7 +121,6 @@ class ZipNumCluster(CDXSource):
|
||||
prev_size=1)
|
||||
|
||||
if query.secondary_index_only:
|
||||
query.set_process(False)
|
||||
return idx_iter
|
||||
else:
|
||||
blocks = self.idx_to_cdx(idx_iter, query)
|
||||
|
@ -79,8 +79,8 @@ class CDXHandler(BaseHandler):
|
||||
self.view = view if view else TextCapturesView()
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
query = CDXQuery.from_wsgi_env(wbrequest.env)
|
||||
cdx_lines = self.index_reader.load_cdx_query(query)
|
||||
params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
|
||||
cdx_lines = self.index_reader.load_cdx(**params)
|
||||
|
||||
return self.view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
|
@ -34,9 +34,6 @@ class IndexReader(object):
|
||||
|
||||
return cdxlines
|
||||
|
||||
def load_cdx_query(self, query):
|
||||
return self.cdx_server.load_cdx_query(query)
|
||||
|
||||
def load_cdx(self, **params):
|
||||
return self.cdx_server.load_cdx(**params)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user