diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 6b7dfdfe..837059df 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -25,6 +25,9 @@ class CaptureNotFoundException(CDXException): #================================================================= class CDXObject(OrderedDict): + """ + dictionary object representing parsed CDX line. + """ CDX_FORMATS = [ # Public CDX Format ["urlkey", "timestamp", "original", "mimetype", "statuscode", @@ -75,12 +78,16 @@ class CDXObject(OrderedDict): self.cdxline = None def is_revisit(self): + """return ``True`` if this record is a revisit record.""" return (self['mimetype'] == 'warc/revisit' or self['filename'] == '-') def to_text(self, fields=None): """ return plaintext CDX record (includes newline). + if ``fields`` is ``None``, output will have all fields + in the order they are stored. + :param fields: list of field names to output. """ if fields is None: @@ -132,6 +139,7 @@ class IDXObject(OrderedDict): def to_text(self, fields=None): """ return plaintext IDX record (including newline). + :param fields: list of field names to output (currently ignored) """ return str(self) + '\n' diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index b127635c..3ef07b6a 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -88,9 +88,10 @@ def create_merged_cdx_gen(sources, query): #================================================================= -# convert text cdx stream to CDXObject/IDXObject def make_obj_iter(text_iter, query): - # already converted + """ + convert text cdx stream to CDXObject/IDXObject. + """ if query.secondary_index_only: cls = IDXObject else: @@ -100,16 +101,20 @@ def make_obj_iter(text_iter, query): #================================================================= -# limit cdx to at most limit def cdx_limit(cdx_iter, limit): + """ + limit cdx to at most `limit`. 
+ """ # for cdx, _ in itertools.izip(cdx_iter, xrange(limit)): # yield cdx return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit))) #================================================================= -# reverse cdx def cdx_reverse(cdx_iter, limit): + """ + return cdx records in reverse order. + """ # optimize for single last if limit == 1: last = None @@ -129,9 +134,11 @@ def cdx_reverse(cdx_iter, limit): #================================================================= -# filter cdx by regex if each filter is field:regex form, -# apply filter to cdx[field] def cdx_filter(cdx_iter, filter_strings): + """ + filter CDX by regex if each filter is :samp:`{field}:{regex}` form, + apply filter to :samp:`cdx[{field}]`. + """ # Support single strings as well if isinstance(filter_strings, str): filter_strings = [filter_strings] @@ -195,8 +202,10 @@ def cdx_filter(cdx_iter, filter_strings): #================================================================= -# collapse by timestamp and status code def cdx_collapse_time_status(cdx_iter, timelen=10): + """ + collapse by timestamp and status code. + """ timelen = int(timelen) last_token = None @@ -211,8 +220,10 @@ def cdx_collapse_time_status(cdx_iter, timelen=10): #================================================================= -# sort CDXCaptureResult by closest to timestamp def cdx_sort_closest(closest, cdx_iter, limit=10): + """ + sort CDXCaptureResult by closest to timestamp. + """ closest_cdx = [] closest_sec = timestamp_to_sec(closest) @@ -242,8 +253,15 @@ def cdx_sort_closest(closest, cdx_iter, limit=10): # Fields to append from cdx original to revisit ORIG_TUPLE = ['length', 'offset', 'filename'] - def cdx_resolve_revisits(cdx_iter): + """ + resolve revisits. + + this filter adds three fields to CDX: ``orig.length``, ``orig.offset``, + and ``orig.filename``. for revisit records, these fields have corresponding + field values in previous non-revisit (original) CDX record. 
+ They are all ``"-"`` for non-revisit records. + """ originals = {} for cdx in cdx_iter: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 08dc2b56..ee02edbe 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -39,9 +39,9 @@ class BaseCDXServer(object): def _check_cdx_iter(self, cdx_iter, query): """ Check cdx iter semantics - If iter is empty (no matches), check if fuzzy matching + If `cdx_iter` is empty (no matches), check if fuzzy matching is allowed, and try it -- otherwise, - throw CaptureNotFoundException + throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException` """ cdx_iter = self.peek_iter(cdx_iter) @@ -93,6 +93,19 @@ class CDXServer(BaseCDXServer): self._create_cdx_sources(paths, kwargs.get('config')) def load_cdx_query(self, query): + """ + load CDX for query parameters ``query``. + ``key`` (or ``url``) parameter specifies URL to query, + ``matchType`` parameter specifies matching method for ``key`` + (default ``exact``). + other parameters are passed down to :func:`cdx_load`. + raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException` + if no captures are found. + + :param query: query parameters + :type query: :class:`~pywb.cdx.query.CDXQuery` + :rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject` + """ url = query.url key, end_key = calc_search_range(url=url, match_type=query.match_type, @@ -107,7 +120,8 @@ class CDXServer(BaseCDXServer): def _create_cdx_sources(self, paths, config): """ - build CDXSource instances for each of path in :param paths:. + build CDXSource instances for each path in ``paths``. + :param paths: list of sources or single source. each source may be either string or CDXSource instance. value of any other types will be silently ignored. 
@@ -171,7 +185,8 @@ class CDXServer(BaseCDXServer): #================================================================= class RemoteCDXServer(BaseCDXServer): """ - A special cdx server that uses a single RemoteCDXSource + A special cdx server that uses a single + :class:`~pywb.cdx.cdxsource.RemoteCDXSource`. It simply proxies the query params to the remote source and performs no local processing/filtering """ diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index e0f0467e..ec63c1c8 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -39,7 +39,7 @@ class RemoteCDXSource(CDXSource): """ Represents a remote cdx server, to which requests will be proxied. - Only url and match type params are proxied at this time, + Only ``url`` and ``match_type`` params are proxied at this time, the stream is passed through all other filters locally. """ def __init__(self, filename, cookie=None, remote_processing=False):