1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

clean up docstrings: fix reST formatting issues.

cherry-picked f03e0a7092 + some more.
This commit is contained in:
Kenji Nagahashi 2014-03-04 19:08:23 +00:00
parent daf868fd61
commit 64f4699203
4 changed files with 55 additions and 14 deletions

View File

@ -25,6 +25,9 @@ class CaptureNotFoundException(CDXException):
#=================================================================
class CDXObject(OrderedDict):
"""
dictionary object representing parsed CDX line.
"""
CDX_FORMATS = [
# Public CDX Format
["urlkey", "timestamp", "original", "mimetype", "statuscode",
@ -75,12 +78,16 @@ class CDXObject(OrderedDict):
self.cdxline = None
def is_revisit(self):
    """
    tell whether this record is a revisit record.

    a record counts as a revisit when its ``mimetype`` field equals
    ``warc/revisit`` or its ``filename`` field equals ``-``.

    :rtype: bool
    """
    # mimetype is checked first; filename is only consulted when the
    # mimetype does not already mark the record as a revisit.
    if self['mimetype'] == 'warc/revisit':
        return True
    return self['filename'] == '-'
def to_text(self, fields=None):
"""
return plaintext CDX record (includes newline).
if ``fields`` is ``None``, output will have all fields
in the order they are stored.
:param fields: list of field names to output.
"""
if fields is None:
@ -132,6 +139,7 @@ class IDXObject(OrderedDict):
def to_text(self, fields=None):
    """
    render this IDX record as plain text, terminated by a newline.

    :param fields: list of field names to output (currently ignored)
    :return: ``str(self)`` followed by a single newline character.
    """
    line = str(self)
    return line + '\n'

View File

@ -88,9 +88,10 @@ def create_merged_cdx_gen(sources, query):
#=================================================================
# convert text cdx stream to CDXObject/IDXObject
def make_obj_iter(text_iter, query):
# already converted
"""
convert text cdx stream to CDXObject/IDXObject.
"""
if query.secondary_index_only:
cls = IDXObject
else:
@ -100,16 +101,20 @@ def make_obj_iter(text_iter, query):
#=================================================================
# limit cdx to at most limit
def cdx_limit(cdx_iter, limit):
    """
    limit cdx to at most `limit`.

    :param cdx_iter: iterator of cdx records.
    :param limit: maximum number of records to pass through; zero or a
        negative value yields nothing.
    :return: lazy iterator over the first `limit` items of `cdx_iter`.
    """
    # islice stops after `limit` items without fetching one more record
    # from cdx_iter. zipping cdx_iter against a counter fetches the next
    # record before noticing the counter has run out, so that record is
    # silently dropped from the underlying iterator.
    # max() keeps negative limits yielding nothing instead of raising.
    return itertools.islice(cdx_iter, max(limit, 0))
#=================================================================
# reverse cdx
def cdx_reverse(cdx_iter, limit):
"""
return cdx records in reverse order.
"""
# optimize for single last
if limit == 1:
last = None
@ -129,9 +134,11 @@ def cdx_reverse(cdx_iter, limit):
#=================================================================
# filter cdx by regex if each filter is field:regex form,
# apply filter to cdx[field]
def cdx_filter(cdx_iter, filter_strings):
"""
filter CDX by regex if each filter is :samp:`{field}:{regex}` form,
apply filter to :samp:`cdx[{field}]`.
"""
# Support single strings as well
if isinstance(filter_strings, str):
filter_strings = [filter_strings]
@ -195,8 +202,10 @@ def cdx_filter(cdx_iter, filter_strings):
#=================================================================
# collapse by timestamp and status code
def cdx_collapse_time_status(cdx_iter, timelen=10):
"""
collapse by timestamp and status code.
"""
timelen = int(timelen)
last_token = None
@ -211,8 +220,10 @@ def cdx_collapse_time_status(cdx_iter, timelen=10):
#=================================================================
# sort CDXCaptureResult by closest to timestamp
def cdx_sort_closest(closest, cdx_iter, limit=10):
"""
sort CDXCaptureResult by closest to timestamp.
"""
closest_cdx = []
closest_sec = timestamp_to_sec(closest)
@ -242,8 +253,15 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
# Fields to append from cdx original to revisit
ORIG_TUPLE = ['length', 'offset', 'filename']
def cdx_resolve_revisits(cdx_iter):
"""
resolve revisits.
this filter adds three fields to CDX: ``orig.length``, ``orig.offset``,
and ``orig.filename``. for revisit records, these fields have corresponding
field values in previous non-revisit (original) CDX record.
They are all ``"-"`` for non-revisit records.
"""
originals = {}
for cdx in cdx_iter:

View File

@ -39,9 +39,9 @@ class BaseCDXServer(object):
def _check_cdx_iter(self, cdx_iter, query):
""" Check cdx iter semantics
If iter is empty (no matches), check if fuzzy matching
If `cdx_iter` is empty (no matches), check if fuzzy matching
is allowed, and try it -- otherwise,
throw CaptureNotFoundException
throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
"""
cdx_iter = self.peek_iter(cdx_iter)
@ -93,6 +93,19 @@ class CDXServer(BaseCDXServer):
self._create_cdx_sources(paths, kwargs.get('config'))
def load_cdx_query(self, query):
"""
load CDX for query parameters ``query``.
``key`` (or ``url``) parameter specifies URL to query,
``matchType`` parameter specifies matching method for ``key``
(default ``exact``).
other parameters are passed down to :func:`cdx_load`.
raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
if no captures are found.
:param query: query parameters
:type query: :class:`~pywb.cdx.query.CDXQuery`
:rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject`
"""
url = query.url
key, end_key = calc_search_range(url=url,
match_type=query.match_type,
@ -107,7 +120,8 @@ class CDXServer(BaseCDXServer):
def _create_cdx_sources(self, paths, config):
"""
build CDXSource instances for each of path in :param paths:.
build CDXSource instances for each of path in ``paths``.
:param paths: list of sources or single source.
each source may be either string or CDXSource instance. value
of any other types will be silently ignored.
@ -171,7 +185,8 @@ class CDXServer(BaseCDXServer):
#=================================================================
class RemoteCDXServer(BaseCDXServer):
"""
A special cdx server that uses a single RemoteCDXSource
A special cdx server that uses a single
:class:`~pywb.cdx.cdxsource.RemoteCDXSource`.
It simply proxies the query params to the remote source
and performs no local processing/filtering
"""

View File

@ -39,7 +39,7 @@ class RemoteCDXSource(CDXSource):
"""
Represents a remote cdx server, to which requests will be proxied.
Only url and match type params are proxied at this time,
Only ``url`` and ``match_type`` params are proxied at this time,
the stream is passed through all other filters locally.
"""
def __init__(self, filename, cookie=None, remote_processing=False):