mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
slightly less incomplete work on new postfetch processor chain
This commit is contained in:
parent
c715eaba4e
commit
bd25991a0d
@ -19,8 +19,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
|||||||
USA.
|
USA.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
import datetime
|
import datetime
|
||||||
import threading
|
import threading
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
from argparse import Namespace as _Namespace
|
from argparse import Namespace as _Namespace
|
||||||
from pkg_resources import get_distribution as _get_distribution
|
from pkg_resources import get_distribution as _get_distribution
|
||||||
__version__ = _get_distribution('warcprox').version
|
__version__ = _get_distribution('warcprox').version
|
||||||
@ -28,6 +31,7 @@ try:
|
|||||||
import queue
|
import queue
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import Queue as queue
|
import Queue as queue
|
||||||
|
|
||||||
def digest_str(hash_obj, base32=False):
|
def digest_str(hash_obj, base32=False):
|
||||||
import base64
|
import base64
|
||||||
return hash_obj.name.encode('utf-8') + b':' + (
|
return hash_obj.name.encode('utf-8') + b':' + (
|
||||||
@ -93,15 +97,17 @@ class RequestBlockedByRule(Exception):
|
|||||||
return "%s: %s" % (self.__class__.__name__, self.msg)
|
return "%s: %s" % (self.__class__.__name__, self.msg)
|
||||||
|
|
||||||
class BasePostfetchProcessor(threading.Thread):
|
class BasePostfetchProcessor(threading.Thread):
|
||||||
def __init__(self, inq, outq, profile=False):
|
logger = logging.getLogger("warcprox.BasePostfetchProcessor")
|
||||||
|
|
||||||
|
def __init__(self, inq, outq, options=Options()):
|
||||||
threading.Thread.__init__(self, name='???')
|
threading.Thread.__init__(self, name='???')
|
||||||
self.inq = inq
|
self.inq = inq
|
||||||
self.outq = outq
|
self.outq = outq
|
||||||
|
self.options = options
|
||||||
self.stop = threading.Event()
|
self.stop = threading.Event()
|
||||||
self.profile = profile
|
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
if self.profile:
|
if self.options.profile:
|
||||||
import cProfile
|
import cProfile
|
||||||
self.profiler = cProfile.Profile()
|
self.profiler = cProfile.Profile()
|
||||||
self.profiler.enable()
|
self.profiler.enable()
|
||||||
@ -186,9 +192,15 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
|
|||||||
def _process_url(self, recorded_url):
|
def _process_url(self, recorded_url):
|
||||||
return self.listener.notify(recorded_url, recorded_url.warc_records)
|
return self.listener.notify(recorded_url, recorded_url.warc_records)
|
||||||
|
|
||||||
|
# @classmethod
|
||||||
|
# def wrap(cls, listener, inq, outq, profile=False):
|
||||||
|
# if listener:
|
||||||
|
# return cls(listener, inq, outq, profile)
|
||||||
|
# else:
|
||||||
|
# return None
|
||||||
|
|
||||||
# monkey-patch log levels TRACE and NOTICE
|
# monkey-patch log levels TRACE and NOTICE
|
||||||
TRACE = 5
|
TRACE = 5
|
||||||
import logging
|
|
||||||
def _logger_trace(self, msg, *args, **kwargs):
|
def _logger_trace(self, msg, *args, **kwargs):
|
||||||
if self.isEnabledFor(TRACE):
|
if self.isEnabledFor(TRACE):
|
||||||
self._log(TRACE, msg, args, **kwargs)
|
self._log(TRACE, msg, args, **kwargs)
|
||||||
@ -197,7 +209,6 @@ logging.trace = logging.root.trace
|
|||||||
logging.addLevelName(TRACE, 'TRACE')
|
logging.addLevelName(TRACE, 'TRACE')
|
||||||
|
|
||||||
NOTICE = (logging.INFO + logging.WARN) // 2
|
NOTICE = (logging.INFO + logging.WARN) // 2
|
||||||
import logging
|
|
||||||
def _logger_notice(self, msg, *args, **kwargs):
|
def _logger_notice(self, msg, *args, **kwargs):
|
||||||
if self.isEnabledFor(NOTICE):
|
if self.isEnabledFor(NOTICE):
|
||||||
self._log(NOTICE, msg, args, **kwargs)
|
self._log(NOTICE, msg, args, **kwargs)
|
||||||
|
@ -32,6 +32,7 @@ import gc
|
|||||||
import datetime
|
import datetime
|
||||||
import warcprox
|
import warcprox
|
||||||
import certauth
|
import certauth
|
||||||
|
import functools
|
||||||
|
|
||||||
class Factory:
|
class Factory:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -60,7 +61,8 @@ class Factory:
|
|||||||
logging.info('statistics tracking disabled')
|
logging.info('statistics tracking disabled')
|
||||||
stats_db = None
|
stats_db = None
|
||||||
else:
|
else:
|
||||||
stats_db = warcprox.stats.StatsDb(options.stats_db_file, options=options)
|
stats_db = warcprox.stats.StatsDb(
|
||||||
|
options.stats_db_file, options=options)
|
||||||
return stats_db
|
return stats_db
|
||||||
|
|
||||||
# @staticmethod
|
# @staticmethod
|
||||||
@ -70,6 +72,10 @@ class Factory:
|
|||||||
# options.cacert, args.certs_dir, ca_name=ca_name)
|
# options.cacert, args.certs_dir, ca_name=ca_name)
|
||||||
# return ca
|
# return ca
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def warc_writer(inq, outq, options):
|
||||||
|
return warcprox.writerthread.WarcWriterThread(inq, outq, options)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def playback_proxy(options):
|
def playback_proxy(options):
|
||||||
if options.playback_port is not None:
|
if options.playback_port is not None:
|
||||||
@ -86,48 +92,32 @@ class Factory:
|
|||||||
def crawl_logger(options):
|
def crawl_logger(options):
|
||||||
if options.crawl_log_dir:
|
if options.crawl_log_dir:
|
||||||
return warcprox.crawl_log.CrawlLogger(
|
return warcprox.crawl_log.CrawlLogger(
|
||||||
options.crawl_log_dir, options=options))
|
options.crawl_log_dir, options=options)
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def plugin(qualname, inq, outq):
|
def plugin(qualname):
|
||||||
try:
|
try:
|
||||||
(module_name, class_name) = qualname.rsplit('.', 1)
|
(module_name, class_name) = qualname.rsplit('.', 1)
|
||||||
module_ = importlib.import_module(module_name)
|
module_ = importlib.import_module(module_name)
|
||||||
class_ = getattr(module_, class_name)
|
class_ = getattr(module_, class_name)
|
||||||
instance = class_()
|
listener = class_()
|
||||||
plugin.notify # make sure it has this method
|
plugin.notify # make sure it has this method
|
||||||
return instance
|
return plugin
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.fatal('problem with plugin class %r: %s', qualname, e)
|
logging.fatal('problem with plugin class %r: %s', qualname, e)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# @staticmethod
|
@staticmethod
|
||||||
# def plugins(options):
|
def service_registry(options):
|
||||||
# plugins = []
|
if options.rethinkdb_services_url:
|
||||||
# for qualname in options.plugins or []:
|
parsed = doublethink.parse_rethinkdb_url(
|
||||||
# try:
|
options.rethinkdb_services_url)
|
||||||
# (module_name, class_name) = qualname.rsplit('.', 1)
|
rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
|
||||||
# module_ = importlib.import_module(module_name)
|
return doublethink.ServiceRegistry(rr, table=parsed.table)
|
||||||
# class_ = getattr(module_, class_name)
|
else:
|
||||||
# plugin = class_()
|
return None
|
||||||
# plugin.notify # make sure it has this method
|
|
||||||
# plugins.append(plugin)
|
|
||||||
# except Exception as e:
|
|
||||||
# logging.fatal('problem with plugin class %r: %s', qualname, e)
|
|
||||||
# sys.exit(1)
|
|
||||||
# return plugins
|
|
||||||
|
|
||||||
# @staticmethod
|
|
||||||
# def service_registry(options):
|
|
||||||
# if options.rethinkdb_services_url:
|
|
||||||
# parsed = doublethink.parse_rethinkdb_url(
|
|
||||||
# options.rethinkdb_services_url)
|
|
||||||
# rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
|
|
||||||
# return doublethink.ServiceRegistry(rr, table=parsed.table)
|
|
||||||
# else:
|
|
||||||
# return None
|
|
||||||
|
|
||||||
class WarcproxController(object):
|
class WarcproxController(object):
|
||||||
logger = logging.getLogger("warcprox.controller.WarcproxController")
|
logger = logging.getLogger("warcprox.controller.WarcproxController")
|
||||||
@ -148,64 +138,54 @@ class WarcproxController(object):
|
|||||||
|
|
||||||
self.proxy = warcprox.warcproxy.WarcProxy(options=options)
|
self.proxy = warcprox.warcproxy.WarcProxy(options=options)
|
||||||
|
|
||||||
self.build_postfetch_chain(proxy.recorded_url_q)
|
self.build_postfetch_chain(self.proxy.recorded_url_q)
|
||||||
|
|
||||||
# if warc_writer_threads is not None:
|
self.service_registry = Factory.service_registry(options)
|
||||||
# self.warc_writer_threads = warc_writer_threads
|
|
||||||
# else:
|
|
||||||
# self.warc_writer_threads = [
|
|
||||||
# warcprox.writerthread.WarcWriterThread(
|
|
||||||
# name='WarcWriterThread%03d' % i,
|
|
||||||
# recorded_url_q=self.proxy.recorded_url_q,
|
|
||||||
# listeners=[self.proxy.running_stats], options=options)
|
|
||||||
# for i in range(int(self.proxy.max_threads ** 0.5))]
|
|
||||||
# self.playback_proxy = playback_proxy
|
|
||||||
# self.service_registry = service_registry
|
|
||||||
|
|
||||||
def build_postfetch_chain(self, inq):
|
def build_postfetch_chain(self, inq):
|
||||||
outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size)
|
constructors = []
|
||||||
|
|
||||||
def maybe_add_to_chain(processor_init):
|
self.dedup_db = Factory.dedup_db(self.options)
|
||||||
processor = processor_init(inq, outq, self.options)
|
|
||||||
if processor:
|
if self.dedup_db:
|
||||||
|
constructors.append(self.dedup_db.loader)
|
||||||
|
|
||||||
|
constructors.append(Factory.warc_writer)
|
||||||
|
|
||||||
|
if self.dedup_db:
|
||||||
|
constructors.append(self.dedup_db.storer)
|
||||||
|
|
||||||
|
stats_db = Factory.stats_db(self.options)
|
||||||
|
if stats_db:
|
||||||
|
constructors.append(functools.partial(
|
||||||
|
warcprox.ListenerPostfetchProcessor, stats_db))
|
||||||
|
|
||||||
|
self.playback_proxy = Factory.playback_proxy(self.options)
|
||||||
|
if self.playback_proxy:
|
||||||
|
constructors.append(functools.partial(
|
||||||
|
warcprox.ListenerPostfetchProcessor,
|
||||||
|
self.playback_proxy.playback_index_db))
|
||||||
|
|
||||||
|
crawl_logger = Factory.crawl_logger(self.options)
|
||||||
|
if crawl_logger:
|
||||||
|
constructors.append(functools.partial(
|
||||||
|
warcprox.ListenerPostfetchProcessor, crawl_logger))
|
||||||
|
|
||||||
|
for qualname in self.options.plugins or []:
|
||||||
|
plugin = Factory.plugin(qualname)
|
||||||
|
constructors.append(functools.partial(
|
||||||
|
warcprox.ListenerPostfetchProcessor, plugin))
|
||||||
|
|
||||||
|
self._postfetch_chain = []
|
||||||
|
for i, constructor in enumerate(constructors):
|
||||||
|
if i != len(constructors) - 1:
|
||||||
|
outq = warcprox.TimestampedQueue(
|
||||||
|
maxsize=self.options.queue_size)
|
||||||
|
else:
|
||||||
|
outq = None
|
||||||
|
processor = constructor(inq, outq, self.options)
|
||||||
self._postfetch_chain.append(processor)
|
self._postfetch_chain.append(processor)
|
||||||
inq = outq
|
inq = outq
|
||||||
outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size)
|
|
||||||
|
|
||||||
self.dedup_db = Factory.dedup_db(options)
|
|
||||||
|
|
||||||
# dedup loader
|
|
||||||
if self.dedup_db:
|
|
||||||
maybe_add_to_chain(self.dedup_db.loader)
|
|
||||||
|
|
||||||
# warc writer
|
|
||||||
maybe_add_to_chain(Factory.warc_writer)
|
|
||||||
|
|
||||||
# dedup storer
|
|
||||||
if self.dedup_db:
|
|
||||||
maybe_add_to_chain(self.dedup_db.storer)
|
|
||||||
|
|
||||||
# playback index storer
|
|
||||||
# XXX XXX XXX FIXME
|
|
||||||
# self.playback_proxy = Factory.playback_proxy(options)
|
|
||||||
# if self.playback_proxy:
|
|
||||||
# maybe_add_to_chain()
|
|
||||||
# outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size)
|
|
||||||
# processor = self.playback_proxy.playback_index_db(inq, outq)
|
|
||||||
# self._postfetch_chain.append(processor)
|
|
||||||
# inq = outq
|
|
||||||
|
|
||||||
# stats db
|
|
||||||
maybe_add_to_chain(Factory.stats_db)
|
|
||||||
|
|
||||||
# crawl logger
|
|
||||||
maybe_add_to_chain(Factory.crawl_logger)
|
|
||||||
|
|
||||||
for qualname in self.options.plugins:
|
|
||||||
maybe_add_to_chain(
|
|
||||||
lambda inq, outq, options: Factory.plugin(qualname, inq, outq))
|
|
||||||
# self.plugins = Factory.plugins(options)
|
|
||||||
|
|
||||||
|
|
||||||
def debug_mem(self):
|
def debug_mem(self):
|
||||||
self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize())
|
self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize())
|
||||||
@ -293,26 +273,14 @@ class WarcproxController(object):
|
|||||||
self.logger.info('warcprox is already running')
|
self.logger.info('warcprox is already running')
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.proxy.stats_db:
|
|
||||||
self.proxy.stats_db.start()
|
|
||||||
self.proxy_thread = threading.Thread(
|
self.proxy_thread = threading.Thread(
|
||||||
target=self.proxy.serve_forever, name='ProxyThread')
|
target=self.proxy.serve_forever, name='ProxyThread')
|
||||||
self.proxy_thread.start()
|
self.proxy_thread.start()
|
||||||
|
|
||||||
assert(all(
|
for processor in self._postfetch_chain:
|
||||||
wwt.dedup_db is self.warc_writer_threads[0].dedup_db
|
# logging.info('starting postfetch processor %r', processor)
|
||||||
for wwt in self.warc_writer_threads))
|
processor.start()
|
||||||
if any((t.dedup_db for t in self.warc_writer_threads)):
|
logging.info('started postfetch processor %r', processor)
|
||||||
self.warc_writer_threads[0].dedup_db.start()
|
|
||||||
|
|
||||||
for wwt in self.warc_writer_threads:
|
|
||||||
wwt.start()
|
|
||||||
|
|
||||||
if self.playback_proxy is not None:
|
|
||||||
self.playback_proxy_thread = threading.Thread(
|
|
||||||
target=self.playback_proxy.serve_forever,
|
|
||||||
name='PlaybackProxyThread')
|
|
||||||
self.playback_proxy_thread.start()
|
|
||||||
|
|
||||||
def shutdown(self):
|
def shutdown(self):
|
||||||
with self._start_stop_lock:
|
with self._start_stop_lock:
|
||||||
@ -320,30 +288,34 @@ class WarcproxController(object):
|
|||||||
self.logger.info('warcprox is not running')
|
self.logger.info('warcprox is not running')
|
||||||
return
|
return
|
||||||
|
|
||||||
for wwt in self.warc_writer_threads:
|
# for wwt in self.warc_writer_threads:
|
||||||
wwt.stop.set()
|
# wwt.stop.set()
|
||||||
|
for processor in self._postfetch_chain:
|
||||||
|
processor.stop.set()
|
||||||
self.proxy.shutdown()
|
self.proxy.shutdown()
|
||||||
self.proxy.server_close()
|
self.proxy.server_close()
|
||||||
|
|
||||||
if self.playback_proxy is not None:
|
for processor in self._postfetch_chain:
|
||||||
self.playback_proxy.shutdown()
|
processor.join()
|
||||||
self.playback_proxy.server_close()
|
# if self.playback_proxy is not None:
|
||||||
if self.playback_proxy.playback_index_db is not None:
|
# self.playback_proxy.shutdown()
|
||||||
self.playback_proxy.playback_index_db.close()
|
# self.playback_proxy.server_close()
|
||||||
|
# if self.playback_proxy.playback_index_db is not None:
|
||||||
|
# self.playback_proxy.playback_index_db.close()
|
||||||
|
|
||||||
# wait for threads to finish
|
# # wait for threads to finish
|
||||||
for wwt in self.warc_writer_threads:
|
# for wwt in self.warc_writer_threads:
|
||||||
wwt.join()
|
# wwt.join()
|
||||||
|
|
||||||
if self.proxy.stats_db:
|
# if self.proxy.stats_db:
|
||||||
self.proxy.stats_db.stop()
|
# self.proxy.stats_db.stop()
|
||||||
|
|
||||||
self.proxy_thread.join()
|
# self.proxy_thread.join()
|
||||||
if self.playback_proxy is not None:
|
# if self.playback_proxy is not None:
|
||||||
self.playback_proxy_thread.join()
|
# self.playback_proxy_thread.join()
|
||||||
|
|
||||||
if self.service_registry and hasattr(self, "status_info"):
|
# if self.service_registry and hasattr(self, "status_info"):
|
||||||
self.service_registry.unregister(self.status_info["id"])
|
# self.service_registry.unregister(self.status_info["id"])
|
||||||
|
|
||||||
def run_until_shutdown(self):
|
def run_until_shutdown(self):
|
||||||
"""
|
"""
|
||||||
|
@ -35,6 +35,14 @@ from urllib3.exceptions import HTTPError
|
|||||||
|
|
||||||
urllib3.disable_warnings()
|
urllib3.disable_warnings()
|
||||||
|
|
||||||
|
class DedupLoader(warcprox.BaseStandardPostfetchProcessor):
|
||||||
|
def __init__(self, dedup_db, inq, outq, base32=False, profile=False):
|
||||||
|
warcprox.BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile)
|
||||||
|
self.dedup_db = dedup_db
|
||||||
|
self.base32 = base32
|
||||||
|
def _process_url(self, recorded_url):
|
||||||
|
decorate_with_dedup_info(self.dedup_db, recorded_url, self.base32)
|
||||||
|
|
||||||
class DedupDb(object):
|
class DedupDb(object):
|
||||||
logger = logging.getLogger("warcprox.dedup.DedupDb")
|
logger = logging.getLogger("warcprox.dedup.DedupDb")
|
||||||
|
|
||||||
@ -61,6 +69,12 @@ class DedupDb(object):
|
|||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
def loader(self, inq, outq, profile=False):
|
||||||
|
return DedupLoader(self, inq, outq, self.options.base32, profile)
|
||||||
|
|
||||||
|
def storer(self, inq, outq, profile=False):
|
||||||
|
return warcprox.ListenerPostfetchProcessor(self, inq, outq, profile)
|
||||||
|
|
||||||
def save(self, digest_key, response_record, bucket=""):
|
def save(self, digest_key, response_record, bucket=""):
|
||||||
record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
|
record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
|
||||||
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
|
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
|
||||||
@ -106,20 +120,20 @@ class DedupDb(object):
|
|||||||
else:
|
else:
|
||||||
self.save(digest_key, records[0])
|
self.save(digest_key, records[0])
|
||||||
|
|
||||||
|
|
||||||
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
||||||
if (recorded_url.response_recorder
|
if (recorded_url.response_recorder
|
||||||
and recorded_url.payload_digest
|
and recorded_url.payload_digest
|
||||||
and recorded_url.response_recorder.payload_size() > 0):
|
and recorded_url.response_recorder.payload_size() > 0):
|
||||||
digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
|
digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
|
||||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
recorded_url.dedup_info = dedup_db.lookup(
|
||||||
|
digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
||||||
recorded_url.url)
|
recorded_url.url)
|
||||||
else:
|
else:
|
||||||
recorded_url.dedup_info = dedup_db.lookup(digest_key,
|
recorded_url.dedup_info = dedup_db.lookup(
|
||||||
url=recorded_url.url)
|
digest_key, url=recorded_url.url)
|
||||||
|
|
||||||
class RethinkDedupDb:
|
class RethinkDedupDb(DedupDb):
|
||||||
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
||||||
|
|
||||||
def __init__(self, options=warcprox.Options()):
|
def __init__(self, options=warcprox.Options()):
|
||||||
@ -181,7 +195,7 @@ class RethinkDedupDb:
|
|||||||
else:
|
else:
|
||||||
self.save(digest_key, records[0])
|
self.save(digest_key, records[0])
|
||||||
|
|
||||||
class CdxServerDedup(object):
|
class CdxServerDedup(DedupDb):
|
||||||
"""Query a CDX server to perform deduplication.
|
"""Query a CDX server to perform deduplication.
|
||||||
"""
|
"""
|
||||||
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
|
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
|
||||||
@ -244,7 +258,7 @@ class CdxServerDedup(object):
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class TroughDedupDb(object):
|
class TroughDedupDb(DedupDb):
|
||||||
'''
|
'''
|
||||||
https://github.com/internetarchive/trough
|
https://github.com/internetarchive/trough
|
||||||
'''
|
'''
|
||||||
|
@ -198,12 +198,12 @@ def dump_state(signum=None, frame=None):
|
|||||||
'dumping state (caught signal %s)\n%s',
|
'dumping state (caught signal %s)\n%s',
|
||||||
signum, '\n'.join(state_strs))
|
signum, '\n'.join(state_strs))
|
||||||
|
|
||||||
def init_controller(args):
|
def parse_args(argv):
|
||||||
'''
|
'''
|
||||||
Creates a warcprox.controller.WarcproxController configured according to
|
Parses command line arguments with argparse.
|
||||||
the supplied arguments (normally the result of parse_args(sys.argv)).
|
|
||||||
'''
|
'''
|
||||||
options = warcprox.Options(**vars(args))
|
arg_parser = _build_arg_parser(prog=os.path.basename(argv[0]))
|
||||||
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
hashlib.new(args.digest_algorithm)
|
hashlib.new(args.digest_algorithm)
|
||||||
@ -211,19 +211,6 @@ def init_controller(args):
|
|||||||
logging.fatal(e)
|
logging.fatal(e)
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
controller = warcprox.controller.WarcproxController(
|
|
||||||
proxy, warc_writer_threads, playback_proxy,
|
|
||||||
service_registry=svcreg, options=options)
|
|
||||||
|
|
||||||
return controller
|
|
||||||
|
|
||||||
def parse_args(argv):
|
|
||||||
'''
|
|
||||||
Parses command line arguments with argparse.
|
|
||||||
'''
|
|
||||||
arg_parser = _build_arg_parser(prog=os.path.basename(argv[0]))
|
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
|
||||||
return args
|
return args
|
||||||
|
|
||||||
def main(argv=None):
|
def main(argv=None):
|
||||||
@ -249,7 +236,8 @@ def main(argv=None):
|
|||||||
# see https://github.com/pyca/cryptography/issues/2911
|
# see https://github.com/pyca/cryptography/issues/2911
|
||||||
cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
|
cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
|
||||||
|
|
||||||
controller = init_controller(args)
|
options = warcprox.Options(**vars(args))
|
||||||
|
controller = warcprox.controller.WarcproxController(options)
|
||||||
|
|
||||||
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
|
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
|
||||||
signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
|
signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
warcprox/writerthread.py - warc writer thread, reads from the recorded url
|
warcprox/writerthread.py - warc writer thread, reads from the recorded url
|
||||||
queue, writes warc records, runs final tasks after warc records are written
|
queue, writes warc records, runs final tasks after warc records are written
|
||||||
|
|
||||||
Copyright (C) 2013-2017 Internet Archive
|
Copyright (C) 2013-2018 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -28,44 +28,34 @@ except ImportError:
|
|||||||
import Queue as queue
|
import Queue as queue
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import threading
|
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
|
||||||
from hanzo import warctools
|
|
||||||
import warcprox
|
import warcprox
|
||||||
import sys
|
|
||||||
|
|
||||||
class WarcWriterThread(threading.Thread):
|
class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
|
||||||
logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")
|
logger = logging.getLogger("warcprox.writerthread.WarcWriterThread")
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, inq, outq, options=warcprox.Options()):
|
||||||
self, recorded_url_q, name='WarcWriterThread', writer_pool=None,
|
warcprox.BaseStandardPostfetchProcessor.__init__(
|
||||||
dedup_db=None, listeners=[], options=warcprox.Options()):
|
self, inq, outq, options=options)
|
||||||
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
|
|
||||||
threading.Thread.__init__(self, name=name)
|
|
||||||
self.recorded_url_q = recorded_url_q
|
|
||||||
self.stop = threading.Event()
|
|
||||||
if writer_pool:
|
|
||||||
self.writer_pool = writer_pool
|
|
||||||
else:
|
|
||||||
self.writer_pool = warcprox.writer.WarcWriterPool()
|
|
||||||
self.dedup_db = dedup_db
|
|
||||||
self.listeners = listeners
|
|
||||||
self.options = options
|
self.options = options
|
||||||
self.idle = None
|
self.writer_pool = warcprox.writer.WarcWriterPool(options)
|
||||||
self.method_filter = set(method.upper() for method in self.options.method_filter or [])
|
self.method_filter = set(method.upper() for method in self.options.method_filter or [])
|
||||||
|
|
||||||
def run(self):
|
def _get_process_put(self):
|
||||||
if self.options.profile:
|
try:
|
||||||
import cProfile
|
warcprox.BaseStandardPostfetchProcessor._get_process_put(self)
|
||||||
self.profiler = cProfile.Profile()
|
finally:
|
||||||
self.profiler.enable()
|
self.writer_pool.maybe_idle_rollover()
|
||||||
self._run()
|
|
||||||
self.profiler.disable()
|
def _process_url(self, recorded_url):
|
||||||
else:
|
if self._should_archive(recorded_url):
|
||||||
self._run()
|
records = self.writer_pool.write_records(recorded_url)
|
||||||
|
recorded_url.warc_records = records
|
||||||
|
self._log(recorded_url, records)
|
||||||
|
# try to release resources in a timely fashion
|
||||||
|
if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
|
||||||
|
recorded_url.response_recorder.tempfile.close()
|
||||||
|
|
||||||
_ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'}
|
|
||||||
def _filter_accepts(self, recorded_url):
|
def _filter_accepts(self, recorded_url):
|
||||||
if not self.method_filter:
|
if not self.method_filter:
|
||||||
return True
|
return True
|
||||||
@ -81,68 +71,9 @@ class WarcWriterThread(threading.Thread):
|
|||||||
# special warc name prefix '-' means "don't archive"
|
# special warc name prefix '-' means "don't archive"
|
||||||
return prefix != '-' and self._filter_accepts(recorded_url)
|
return prefix != '-' and self._filter_accepts(recorded_url)
|
||||||
|
|
||||||
def _run(self):
|
|
||||||
self.name = '%s(tid=%s)'% (self.name, warcprox.gettid())
|
|
||||||
while not self.stop.is_set():
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
if self.stop.is_set():
|
|
||||||
qsize = self.recorded_url_q.qsize()
|
|
||||||
if qsize % 50 == 0:
|
|
||||||
self.logger.info("%s urls left to write", qsize)
|
|
||||||
|
|
||||||
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
|
|
||||||
records = []
|
|
||||||
self.idle = None
|
|
||||||
if self._should_archive(recorded_url):
|
|
||||||
if self.dedup_db:
|
|
||||||
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
|
|
||||||
recorded_url, base32=self.options.base32)
|
|
||||||
records = self.writer_pool.write_records(recorded_url)
|
|
||||||
|
|
||||||
self._final_tasks(recorded_url, records)
|
|
||||||
|
|
||||||
# try to release resources in a timely fashion
|
|
||||||
if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
|
|
||||||
recorded_url.response_recorder.tempfile.close()
|
|
||||||
except queue.Empty:
|
|
||||||
if self.stop.is_set():
|
|
||||||
break
|
|
||||||
self.idle = time.time()
|
|
||||||
finally:
|
|
||||||
self.writer_pool.maybe_idle_rollover()
|
|
||||||
|
|
||||||
self.logger.info('WarcWriterThread shutting down')
|
|
||||||
self._shutdown()
|
|
||||||
except Exception as e:
|
|
||||||
if isinstance(e, OSError) and e.errno == 28:
|
|
||||||
# OSError: [Errno 28] No space left on device
|
|
||||||
self.logger.critical(
|
|
||||||
'shutting down due to fatal problem: %s: %s',
|
|
||||||
e.__class__.__name__, e)
|
|
||||||
self._shutdown()
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
self.logger.critical(
|
|
||||||
'WarcWriterThread will try to continue after unexpected '
|
|
||||||
'error', exc_info=True)
|
|
||||||
time.sleep(0.5)
|
|
||||||
|
|
||||||
def _shutdown(self):
|
|
||||||
self.writer_pool.close_writers()
|
|
||||||
for listener in self.listeners:
|
|
||||||
if hasattr(listener, 'stop'):
|
|
||||||
try:
|
|
||||||
listener.stop()
|
|
||||||
except:
|
|
||||||
self.logger.error(
|
|
||||||
'%s raised exception', listener.stop, exc_info=True)
|
|
||||||
|
|
||||||
# closest thing we have to heritrix crawl log at the moment
|
|
||||||
def _log(self, recorded_url, records):
|
def _log(self, recorded_url, records):
|
||||||
try:
|
try:
|
||||||
payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
|
payload_digest = records[0].get_header('WARC-Payload-Digest').decode("utf-8")
|
||||||
except:
|
except:
|
||||||
payload_digest = "-"
|
payload_digest = "-"
|
||||||
|
|
||||||
@ -156,13 +87,3 @@ class WarcWriterThread(threading.Thread):
|
|||||||
recorded_url.method, recorded_url.url.decode("utf-8"),
|
recorded_url.method, recorded_url.url.decode("utf-8"),
|
||||||
recorded_url.mimetype, recorded_url.size, payload_digest,
|
recorded_url.mimetype, recorded_url.size, payload_digest,
|
||||||
type_, filename, offset)
|
type_, filename, offset)
|
||||||
|
|
||||||
def _final_tasks(self, recorded_url, records):
|
|
||||||
if self.listeners:
|
|
||||||
for listener in self.listeners:
|
|
||||||
try:
|
|
||||||
listener.notify(recorded_url, records)
|
|
||||||
except:
|
|
||||||
self.logger.error('%s raised exception',
|
|
||||||
listener.notify, exc_info=True)
|
|
||||||
self._log(recorded_url, records)
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user