1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge pull request #29 from kngenie/just-a-cleanup

clean up docstrings: fix reST formatting issues.
This commit is contained in:
ikreymer 2014-03-05 14:36:07 -08:00
commit 03ebca47c0
4 changed files with 55 additions and 14 deletions

View File

@ -25,6 +25,9 @@ class CaptureNotFoundException(CDXException):
#================================================================= #=================================================================
class CDXObject(OrderedDict): class CDXObject(OrderedDict):
"""
dictionary object representing parsed CDX line.
"""
CDX_FORMATS = [ CDX_FORMATS = [
# Public CDX Format # Public CDX Format
["urlkey", "timestamp", "original", "mimetype", "statuscode", ["urlkey", "timestamp", "original", "mimetype", "statuscode",
@ -75,12 +78,16 @@ class CDXObject(OrderedDict):
self.cdxline = None self.cdxline = None
def is_revisit(self): def is_revisit(self):
"""return ``True`` if this record is a revisit record."""
return (self['mimetype'] == 'warc/revisit' or return (self['mimetype'] == 'warc/revisit' or
self['filename'] == '-') self['filename'] == '-')
def to_text(self, fields=None): def to_text(self, fields=None):
""" """
return plaintext CDX record (includes newline). return plaintext CDX record (includes newline).
if ``fields`` is ``None``, output will have all fields
in the order they are stored.
:param fields: list of field names to output. :param fields: list of field names to output.
""" """
if fields is None: if fields is None:
@ -132,6 +139,7 @@ class IDXObject(OrderedDict):
def to_text(self, fields=None): def to_text(self, fields=None):
""" """
return plaintext IDX record (including newline). return plaintext IDX record (including newline).
:param fields: list of field names to output (currently ignored) :param fields: list of field names to output (currently ignored)
""" """
return str(self) + '\n' return str(self) + '\n'

View File

@ -88,9 +88,10 @@ def create_merged_cdx_gen(sources, query):
#================================================================= #=================================================================
# convert text cdx stream to CDXObject/IDXObject
def make_obj_iter(text_iter, query): def make_obj_iter(text_iter, query):
# already converted """
convert text cdx stream to CDXObject/IDXObject.
"""
if query.secondary_index_only: if query.secondary_index_only:
cls = IDXObject cls = IDXObject
else: else:
@ -100,16 +101,20 @@ def make_obj_iter(text_iter, query):
#================================================================= #=================================================================
# limit cdx to at most limit
def cdx_limit(cdx_iter, limit): def cdx_limit(cdx_iter, limit):
"""
limit cdx to at most `limit`.
"""
# for cdx, _ in itertools.izip(cdx_iter, xrange(limit)): # for cdx, _ in itertools.izip(cdx_iter, xrange(limit)):
# yield cdx # yield cdx
return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit))) return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit)))
#================================================================= #=================================================================
# reverse cdx
def cdx_reverse(cdx_iter, limit): def cdx_reverse(cdx_iter, limit):
"""
return cdx records in reverse order.
"""
# optimize for single last # optimize for single last
if limit == 1: if limit == 1:
last = None last = None
@ -129,9 +134,11 @@ def cdx_reverse(cdx_iter, limit):
#================================================================= #=================================================================
# filter cdx by regex if each filter is field:regex form,
# apply filter to cdx[field]
def cdx_filter(cdx_iter, filter_strings): def cdx_filter(cdx_iter, filter_strings):
"""
filter CDX by regex. if each filter is in :samp:`{field}:{regex}` form,
apply the filter to :samp:`cdx[{field}]`.
"""
# Support single strings as well # Support single strings as well
if isinstance(filter_strings, str): if isinstance(filter_strings, str):
filter_strings = [filter_strings] filter_strings = [filter_strings]
@ -195,8 +202,10 @@ def cdx_filter(cdx_iter, filter_strings):
#================================================================= #=================================================================
# collapse by timestamp and status code
def cdx_collapse_time_status(cdx_iter, timelen=10): def cdx_collapse_time_status(cdx_iter, timelen=10):
"""
collapse by timestamp and status code.
"""
timelen = int(timelen) timelen = int(timelen)
last_token = None last_token = None
@ -211,8 +220,10 @@ def cdx_collapse_time_status(cdx_iter, timelen=10):
#================================================================= #=================================================================
# sort CDXCaptureResult by closest to timestamp
def cdx_sort_closest(closest, cdx_iter, limit=10): def cdx_sort_closest(closest, cdx_iter, limit=10):
"""
sort CDXCaptureResult by closest to timestamp.
"""
closest_cdx = [] closest_cdx = []
closest_sec = timestamp_to_sec(closest) closest_sec = timestamp_to_sec(closest)
@ -242,8 +253,15 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
# Fields to append from cdx original to revisit # Fields to append from cdx original to revisit
ORIG_TUPLE = ['length', 'offset', 'filename'] ORIG_TUPLE = ['length', 'offset', 'filename']
def cdx_resolve_revisits(cdx_iter): def cdx_resolve_revisits(cdx_iter):
"""
resolve revisits.
this filter adds three fields to CDX: ``orig.length``, ``orig.offset``,
and ``orig.filename``. for revisit records, these fields hold the corresponding
field values from the previous non-revisit (original) CDX record.
They are all ``"-"`` for non-revisit records.
"""
originals = {} originals = {}
for cdx in cdx_iter: for cdx in cdx_iter:

View File

@ -39,9 +39,9 @@ class BaseCDXServer(object):
def _check_cdx_iter(self, cdx_iter, query): def _check_cdx_iter(self, cdx_iter, query):
""" Check cdx iter semantics """ Check cdx iter semantics
If iter is empty (no matches), check if fuzzy matching If `cdx_iter` is empty (no matches), check if fuzzy matching
is allowed, and try it -- otherwise, is allowed, and try it -- otherwise,
throw CaptureNotFoundException throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
""" """
cdx_iter = self.peek_iter(cdx_iter) cdx_iter = self.peek_iter(cdx_iter)
@ -93,6 +93,19 @@ class CDXServer(BaseCDXServer):
self._create_cdx_sources(paths, kwargs.get('config')) self._create_cdx_sources(paths, kwargs.get('config'))
def load_cdx_query(self, query): def load_cdx_query(self, query):
"""
load CDX for query parameters ``query``.
``key`` (or ``url``) parameter specifies URL to query,
``matchType`` parameter specifies matching method for ``key``
(default ``exact``).
other parameters are passed down to :func:`cdx_load`.
raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
if no captures are found.
:param query: query parameters
:type query: :class:`~pywb.cdx.query.CDXQuery`
:rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject`
"""
url = query.url url = query.url
key, end_key = calc_search_range(url=url, key, end_key = calc_search_range(url=url,
match_type=query.match_type, match_type=query.match_type,
@ -107,7 +120,8 @@ class CDXServer(BaseCDXServer):
def _create_cdx_sources(self, paths, config): def _create_cdx_sources(self, paths, config):
""" """
build CDXSource instances for each of path in :param paths:. build CDXSource instances for each of path in ``paths``.
:param paths: list of sources or single source. :param paths: list of sources or single source.
each source may be either string or CDXSource instance. value each source may be either string or CDXSource instance. value
of any other types will be silently ignored. of any other types will be silently ignored.
@ -171,7 +185,8 @@ class CDXServer(BaseCDXServer):
#================================================================= #=================================================================
class RemoteCDXServer(BaseCDXServer): class RemoteCDXServer(BaseCDXServer):
""" """
A special cdx server that uses a single RemoteCDXSource A special cdx server that uses a single
:class:`~pywb.cdx.cdxsource.RemoteCDXSource`.
It simply proxies the query params to the remote source It simply proxies the query params to the remote source
and performs no local processing/filtering and performs no local processing/filtering
""" """

View File

@ -39,7 +39,7 @@ class RemoteCDXSource(CDXSource):
""" """
Represents a remote cdx server, to which requests will be proxied. Represents a remote cdx server, to which requests will be proxied.
Only url and match type params are proxied at this time, Only ``url`` and ``match_type`` params are proxied at this time,
the stream is passed through all other filters locally. the stream is passed through all other filters locally.
""" """
def __init__(self, filename, cookie=None, remote_processing=False): def __init__(self, filename, cookie=None, remote_processing=False):