mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
clean up docstrings: fix reST formatting issues.
cherry-picked f03e0a7092 + some more.
This commit is contained in:
parent
daf868fd61
commit
64f4699203
@ -25,6 +25,9 @@ class CaptureNotFoundException(CDXException):
|
||||
|
||||
#=================================================================
|
||||
class CDXObject(OrderedDict):
|
||||
"""
|
||||
dictionary object representing parsed CDX line.
|
||||
"""
|
||||
CDX_FORMATS = [
|
||||
# Public CDX Format
|
||||
["urlkey", "timestamp", "original", "mimetype", "statuscode",
|
||||
@ -75,12 +78,16 @@ class CDXObject(OrderedDict):
|
||||
self.cdxline = None
|
||||
|
||||
def is_revisit(self):
|
||||
"""return ``True`` if this record is a revisit record."""
|
||||
return (self['mimetype'] == 'warc/revisit' or
|
||||
self['filename'] == '-')
|
||||
|
||||
def to_text(self, fields=None):
|
||||
"""
|
||||
return plaintext CDX record (includes newline).
|
||||
if ``fields`` is ``None``, output will have all fields
|
||||
in the order they are stored.
|
||||
|
||||
:param fields: list of field names to output.
|
||||
"""
|
||||
if fields is None:
|
||||
@ -132,6 +139,7 @@ class IDXObject(OrderedDict):
|
||||
def to_text(self, fields=None):
|
||||
"""
|
||||
return plaintext IDX record (including newline).
|
||||
|
||||
:param fields: list of field names to output (currently ignored)
|
||||
"""
|
||||
return str(self) + '\n'
|
||||
|
@ -88,9 +88,10 @@ def create_merged_cdx_gen(sources, query):
|
||||
|
||||
|
||||
#=================================================================
|
||||
# convert text cdx stream to CDXObject/IDXObject
|
||||
def make_obj_iter(text_iter, query):
|
||||
# already converted
|
||||
"""
|
||||
convert text cdx stream to CDXObject/IDXObject.
|
||||
"""
|
||||
if query.secondary_index_only:
|
||||
cls = IDXObject
|
||||
else:
|
||||
@ -100,16 +101,20 @@ def make_obj_iter(text_iter, query):
|
||||
|
||||
|
||||
#=================================================================
|
||||
# limit cdx to at most limit
|
||||
def cdx_limit(cdx_iter, limit):
|
||||
"""
|
||||
limit cdx to at most ``limit`` records.
|
||||
"""
|
||||
# for cdx, _ in itertools.izip(cdx_iter, xrange(limit)):
|
||||
# yield cdx
|
||||
return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit)))
|
||||
|
||||
|
||||
#=================================================================
|
||||
# reverse cdx
|
||||
def cdx_reverse(cdx_iter, limit):
|
||||
"""
|
||||
return cdx records in reverse order.
|
||||
"""
|
||||
# optimize for single last
|
||||
if limit == 1:
|
||||
last = None
|
||||
@ -129,9 +134,11 @@ def cdx_reverse(cdx_iter, limit):
|
||||
|
||||
|
||||
#=================================================================
|
||||
# filter cdx by regex if each filter is field:regex form,
|
||||
# apply filter to cdx[field]
|
||||
def cdx_filter(cdx_iter, filter_strings):
|
||||
"""
|
||||
filter CDX by regex if each filter is :samp:`{field}:{regex}` form,
|
||||
apply filter to :samp:`cdx[{field}]`.
|
||||
"""
|
||||
# Support single strings as well
|
||||
if isinstance(filter_strings, str):
|
||||
filter_strings = [filter_strings]
|
||||
@ -195,8 +202,10 @@ def cdx_filter(cdx_iter, filter_strings):
|
||||
|
||||
|
||||
#=================================================================
|
||||
# collapse by timestamp and status code
|
||||
def cdx_collapse_time_status(cdx_iter, timelen=10):
|
||||
"""
|
||||
collapse by timestamp and status code.
|
||||
"""
|
||||
timelen = int(timelen)
|
||||
|
||||
last_token = None
|
||||
@ -211,8 +220,10 @@ def cdx_collapse_time_status(cdx_iter, timelen=10):
|
||||
|
||||
|
||||
#=================================================================
|
||||
# sort CDXCaptureResult by closest to timestamp
|
||||
def cdx_sort_closest(closest, cdx_iter, limit=10):
|
||||
"""
|
||||
sort CDXCaptureResult by closest to timestamp.
|
||||
"""
|
||||
closest_cdx = []
|
||||
|
||||
closest_sec = timestamp_to_sec(closest)
|
||||
@ -242,8 +253,15 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
|
||||
# Fields to append from cdx original to revisit
|
||||
ORIG_TUPLE = ['length', 'offset', 'filename']
|
||||
|
||||
|
||||
def cdx_resolve_revisits(cdx_iter):
|
||||
"""
|
||||
resolve revisits.
|
||||
|
||||
this filter adds three fields to CDX: ``orig.length``, ``orig.offset``,
|
||||
and ``orig.filename``. for revisit records, these fields have corresponding
|
||||
field values in previous non-revisit (original) CDX record.
|
||||
They are all ``"-"`` for non-revisit records.
|
||||
"""
|
||||
originals = {}
|
||||
|
||||
for cdx in cdx_iter:
|
||||
|
@ -39,9 +39,9 @@ class BaseCDXServer(object):
|
||||
|
||||
def _check_cdx_iter(self, cdx_iter, query):
|
||||
""" Check cdx iter semantics
|
||||
If iter is empty (no matches), check if fuzzy matching
|
||||
If ``cdx_iter`` is empty (no matches), check if fuzzy matching
|
||||
is allowed, and try it -- otherwise,
|
||||
throw CaptureNotFoundException
|
||||
throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
|
||||
"""
|
||||
|
||||
cdx_iter = self.peek_iter(cdx_iter)
|
||||
@ -93,6 +93,19 @@ class CDXServer(BaseCDXServer):
|
||||
self._create_cdx_sources(paths, kwargs.get('config'))
|
||||
|
||||
def load_cdx_query(self, query):
|
||||
"""
|
||||
load CDX for query parameters ``query``.
|
||||
``key`` (or ``url``) parameter specifies URL to query,
|
||||
``matchType`` parameter specifies matching method for ``key``
|
||||
(default ``exact``).
|
||||
other parameters are passed down to :func:`cdx_load`.
|
||||
raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
|
||||
if no captures are found.
|
||||
|
||||
:param query: query parameters
|
||||
:type query: :class:`~pywb.cdx.query.CDXQuery`
|
||||
:rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject`
|
||||
"""
|
||||
url = query.url
|
||||
key, end_key = calc_search_range(url=url,
|
||||
match_type=query.match_type,
|
||||
@ -107,7 +120,8 @@ class CDXServer(BaseCDXServer):
|
||||
|
||||
def _create_cdx_sources(self, paths, config):
|
||||
"""
|
||||
build CDXSource instances for each of path in :param paths:.
|
||||
build CDXSource instances for each path in ``paths``.
|
||||
|
||||
:param paths: list of sources or single source.
|
||||
each source may be either string or CDXSource instance. value
|
||||
of any other types will be silently ignored.
|
||||
@ -171,7 +185,8 @@ class CDXServer(BaseCDXServer):
|
||||
#=================================================================
|
||||
class RemoteCDXServer(BaseCDXServer):
|
||||
"""
|
||||
A special cdx server that uses a single RemoteCDXSource
|
||||
A special cdx server that uses a single
|
||||
:class:`~pywb.cdx.cdxsource.RemoteCDXSource`.
|
||||
It simply proxies the query params to the remote source
|
||||
and performs no local processing/filtering
|
||||
"""
|
||||
|
@ -39,7 +39,7 @@ class RemoteCDXSource(CDXSource):
|
||||
"""
|
||||
Represents a remote cdx server, to which requests will be proxied.
|
||||
|
||||
Only url and match type params are proxied at this time,
|
||||
Only ``url`` and ``match_type`` params are proxied at this time,
|
||||
the stream is passed through all other filters locally.
|
||||
"""
|
||||
def __init__(self, filename, cookie=None, remote_processing=False):
|
||||
|
Loading…
x
Reference in New Issue
Block a user