mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge pull request #29 from kngenie/just-a-cleanup
clean up docstrings: fix reST formatting issues.
This commit is contained in:
commit
03ebca47c0
@ -25,6 +25,9 @@ class CaptureNotFoundException(CDXException):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXObject(OrderedDict):
|
class CDXObject(OrderedDict):
|
||||||
|
"""
|
||||||
|
dictionary object representing parsed CDX line.
|
||||||
|
"""
|
||||||
CDX_FORMATS = [
|
CDX_FORMATS = [
|
||||||
# Public CDX Format
|
# Public CDX Format
|
||||||
["urlkey", "timestamp", "original", "mimetype", "statuscode",
|
["urlkey", "timestamp", "original", "mimetype", "statuscode",
|
||||||
@ -75,12 +78,16 @@ class CDXObject(OrderedDict):
|
|||||||
self.cdxline = None
|
self.cdxline = None
|
||||||
|
|
||||||
def is_revisit(self):
|
def is_revisit(self):
|
||||||
|
"""return ``True`` if this record is a revisit record."""
|
||||||
return (self['mimetype'] == 'warc/revisit' or
|
return (self['mimetype'] == 'warc/revisit' or
|
||||||
self['filename'] == '-')
|
self['filename'] == '-')
|
||||||
|
|
||||||
def to_text(self, fields=None):
|
def to_text(self, fields=None):
|
||||||
"""
|
"""
|
||||||
return plaintext CDX record (includes newline).
|
return plaintext CDX record (includes newline).
|
||||||
|
if ``fields`` is ``None``, output will have all fields
|
||||||
|
in the order they are stored.
|
||||||
|
|
||||||
:param fields: list of field names to output.
|
:param fields: list of field names to output.
|
||||||
"""
|
"""
|
||||||
if fields is None:
|
if fields is None:
|
||||||
@ -132,6 +139,7 @@ class IDXObject(OrderedDict):
|
|||||||
def to_text(self, fields=None):
|
def to_text(self, fields=None):
|
||||||
"""
|
"""
|
||||||
return plaintext IDX record (including newline).
|
return plaintext IDX record (including newline).
|
||||||
|
|
||||||
:param fields: list of field names to output (currently ignored)
|
:param fields: list of field names to output (currently ignored)
|
||||||
"""
|
"""
|
||||||
return str(self) + '\n'
|
return str(self) + '\n'
|
||||||
|
@ -88,9 +88,10 @@ def create_merged_cdx_gen(sources, query):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# convert text cdx stream to CDXObject/IDXObject
|
|
||||||
def make_obj_iter(text_iter, query):
|
def make_obj_iter(text_iter, query):
|
||||||
# already converted
|
"""
|
||||||
|
convert text cdx stream to CDXObject/IDXObject.
|
||||||
|
"""
|
||||||
if query.secondary_index_only:
|
if query.secondary_index_only:
|
||||||
cls = IDXObject
|
cls = IDXObject
|
||||||
else:
|
else:
|
||||||
@ -100,16 +101,20 @@ def make_obj_iter(text_iter, query):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# limit cdx to at most limit
|
|
||||||
def cdx_limit(cdx_iter, limit):
|
def cdx_limit(cdx_iter, limit):
|
||||||
|
"""
|
||||||
|
limit cdx to at most `limit` records.
|
||||||
|
"""
|
||||||
# for cdx, _ in itertools.izip(cdx_iter, xrange(limit)):
|
# for cdx, _ in itertools.izip(cdx_iter, xrange(limit)):
|
||||||
# yield cdx
|
# yield cdx
|
||||||
return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit)))
|
return (cdx for cdx, _ in itertools.izip(cdx_iter, xrange(limit)))
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# reverse cdx
|
|
||||||
def cdx_reverse(cdx_iter, limit):
|
def cdx_reverse(cdx_iter, limit):
|
||||||
|
"""
|
||||||
|
return cdx records in reverse order.
|
||||||
|
"""
|
||||||
# optimize for single last
|
# optimize for single last
|
||||||
if limit == 1:
|
if limit == 1:
|
||||||
last = None
|
last = None
|
||||||
@ -129,9 +134,11 @@ def cdx_reverse(cdx_iter, limit):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# filter cdx by regex if each filter is field:regex form,
|
|
||||||
# apply filter to cdx[field]
|
|
||||||
def cdx_filter(cdx_iter, filter_strings):
|
def cdx_filter(cdx_iter, filter_strings):
|
||||||
|
"""
|
||||||
|
filter CDX by regex. if a filter is in :samp:`{field}:{regex}` form,
|
||||||
|
apply filter to :samp:`cdx[{field}]`.
|
||||||
|
"""
|
||||||
# Support single strings as well
|
# Support single strings as well
|
||||||
if isinstance(filter_strings, str):
|
if isinstance(filter_strings, str):
|
||||||
filter_strings = [filter_strings]
|
filter_strings = [filter_strings]
|
||||||
@ -195,8 +202,10 @@ def cdx_filter(cdx_iter, filter_strings):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# collapse by timestamp and status code
|
|
||||||
def cdx_collapse_time_status(cdx_iter, timelen=10):
|
def cdx_collapse_time_status(cdx_iter, timelen=10):
|
||||||
|
"""
|
||||||
|
collapse by timestamp and status code.
|
||||||
|
"""
|
||||||
timelen = int(timelen)
|
timelen = int(timelen)
|
||||||
|
|
||||||
last_token = None
|
last_token = None
|
||||||
@ -211,8 +220,10 @@ def cdx_collapse_time_status(cdx_iter, timelen=10):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# sort CDXCaptureResult by closest to timestamp
|
|
||||||
def cdx_sort_closest(closest, cdx_iter, limit=10):
|
def cdx_sort_closest(closest, cdx_iter, limit=10):
|
||||||
|
"""
|
||||||
|
sort CDXCaptureResult by closest to timestamp.
|
||||||
|
"""
|
||||||
closest_cdx = []
|
closest_cdx = []
|
||||||
|
|
||||||
closest_sec = timestamp_to_sec(closest)
|
closest_sec = timestamp_to_sec(closest)
|
||||||
@ -242,8 +253,15 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
|
|||||||
# Fields to append from cdx original to revisit
|
# Fields to append from cdx original to revisit
|
||||||
ORIG_TUPLE = ['length', 'offset', 'filename']
|
ORIG_TUPLE = ['length', 'offset', 'filename']
|
||||||
|
|
||||||
|
|
||||||
def cdx_resolve_revisits(cdx_iter):
|
def cdx_resolve_revisits(cdx_iter):
|
||||||
|
"""
|
||||||
|
resolve revisits.
|
||||||
|
|
||||||
|
this filter adds three fields to CDX: ``orig.length``, ``orig.offset``,
|
||||||
|
and ``orig.filename``. for revisit records, these fields have corresponding
|
||||||
|
field values in previous non-revisit (original) CDX record.
|
||||||
|
They are all ``"-"`` for non-revisit records.
|
||||||
|
"""
|
||||||
originals = {}
|
originals = {}
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
|
@ -39,9 +39,9 @@ class BaseCDXServer(object):
|
|||||||
|
|
||||||
def _check_cdx_iter(self, cdx_iter, query):
|
def _check_cdx_iter(self, cdx_iter, query):
|
||||||
""" Check cdx iter semantics
|
""" Check cdx iter semantics
|
||||||
If iter is empty (no matches), check if fuzzy matching
|
If `cdx_iter` is empty (no matches), check if fuzzy matching
|
||||||
is allowed, and try it -- otherwise,
|
is allowed, and try it -- otherwise,
|
||||||
throw CaptureNotFoundException
|
throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cdx_iter = self.peek_iter(cdx_iter)
|
cdx_iter = self.peek_iter(cdx_iter)
|
||||||
@ -93,6 +93,19 @@ class CDXServer(BaseCDXServer):
|
|||||||
self._create_cdx_sources(paths, kwargs.get('config'))
|
self._create_cdx_sources(paths, kwargs.get('config'))
|
||||||
|
|
||||||
def load_cdx_query(self, query):
|
def load_cdx_query(self, query):
|
||||||
|
"""
|
||||||
|
load CDX for query parameters ``query``.
|
||||||
|
``key`` (or ``url``) parameter specifies URL to query,
|
||||||
|
``matchType`` parameter specifies matching method for ``key``
|
||||||
|
(default ``exact``).
|
||||||
|
other parameters are passed down to :func:`cdx_load`.
|
||||||
|
raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
|
||||||
|
if no captures are found.
|
||||||
|
|
||||||
|
:param query: query parameters
|
||||||
|
:type query: :class:`~pywb.cdx.query.CDXQuery`
|
||||||
|
:rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject`
|
||||||
|
"""
|
||||||
url = query.url
|
url = query.url
|
||||||
key, end_key = calc_search_range(url=url,
|
key, end_key = calc_search_range(url=url,
|
||||||
match_type=query.match_type,
|
match_type=query.match_type,
|
||||||
@ -107,7 +120,8 @@ class CDXServer(BaseCDXServer):
|
|||||||
|
|
||||||
def _create_cdx_sources(self, paths, config):
|
def _create_cdx_sources(self, paths, config):
|
||||||
"""
|
"""
|
||||||
build CDXSource instances for each of path in :param paths:.
|
build CDXSource instances for each of path in ``paths``.
|
||||||
|
|
||||||
:param paths: list of sources or single source.
|
:param paths: list of sources or single source.
|
||||||
each source may be either string or CDXSource instance. value
|
each source may be either string or CDXSource instance. value
|
||||||
of any other types will be silently ignored.
|
of any other types will be silently ignored.
|
||||||
@ -171,7 +185,8 @@ class CDXServer(BaseCDXServer):
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
class RemoteCDXServer(BaseCDXServer):
|
class RemoteCDXServer(BaseCDXServer):
|
||||||
"""
|
"""
|
||||||
A special cdx server that uses a single RemoteCDXSource
|
A special cdx server that uses a single
|
||||||
|
:class:`~pywb.cdx.cdxsource.RemoteCDXSource`.
|
||||||
It simply proxies the query params to the remote source
|
It simply proxies the query params to the remote source
|
||||||
and performs no local processing/filtering
|
and performs no local processing/filtering
|
||||||
"""
|
"""
|
||||||
|
@ -39,7 +39,7 @@ class RemoteCDXSource(CDXSource):
|
|||||||
"""
|
"""
|
||||||
Represents a remote cdx server, to which requests will be proxied.
|
Represents a remote cdx server, to which requests will be proxied.
|
||||||
|
|
||||||
Only url and match type params are proxied at this time,
|
Only ``url`` and ``match_type`` params are proxied at this time,
|
||||||
the stream is passed through all other filters locally.
|
the stream is passed through all other filters locally.
|
||||||
"""
|
"""
|
||||||
def __init__(self, filename, cookie=None, remote_processing=False):
|
def __init__(self, filename, cookie=None, remote_processing=False):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user