From c715eaba4ef4b4ad122207837fd5614cd3690d5a Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Fri, 12 Jan 2018 11:39:53 -0800 Subject: [PATCH 01/19] very incomplete work on new postfetch processor chain --- warcprox/__init__.py | 100 ++++++++++++++++++++- warcprox/controller.py | 195 ++++++++++++++++++++++++++++++++++------- warcprox/main.py | 86 ------------------ warcprox/warcproxy.py | 2 +- 4 files changed, 261 insertions(+), 122 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 256ede9..c592eb6 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,7 +1,7 @@ """ warcprox/__init__.py - warcprox package main file, contains some utility code -Copyright (C) 2013-2017 Internet Archive +Copyright (C) 2013-2018 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -19,6 +19,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ +import datetime +import threading from argparse import Namespace as _Namespace from pkg_resources import get_distribution as _get_distribution __version__ = _get_distribution('warcprox').version @@ -26,8 +28,6 @@ try: import queue except ImportError: import Queue as queue -import datetime - def digest_str(hash_obj, base32=False): import base64 return hash_obj.name.encode('utf-8') + b':' + ( @@ -92,6 +92,100 @@ class RequestBlockedByRule(Exception): def __str__(self): return "%s: %s" % (self.__class__.__name__, self.msg) +class BasePostfetchProcessor(threading.Thread): + def __init__(self, inq, outq, profile=False): + threading.Thread.__init__(self, name='???') + self.inq = inq + self.outq = outq + self.stop = threading.Event() + self.profile = profile + + def run(self): + if self.profile: + import cProfile + self.profiler = cProfile.Profile() + self.profiler.enable() + self._run() + self.profiler.disable() + else: + self._run() + + def _get_process_put(self): + ''' + Get url(s) from `self.inq`, process url(s), queue to `self.outq`. + + Subclasses must implement this. + + May raise queue.Empty. 
+ ''' + raise Exception('not implemented') + + def _run(self): + while not self.stop.is_set(): + try: + while True: + try: + self._get_process_put() + except queue.Empty: + if self.stop.is_set(): + break + self._shutdown() + except Exception as e: + if isinstance(e, OSError) and e.errno == 28: + # OSError: [Errno 28] No space left on device + self.logger.critical( + 'shutting down due to fatal problem: %s: %s', + e.__class__.__name__, e) + self._shutdown() + sys.exit(1) + + self.logger.critical( + '%s will try to continue after unexpected error', + self.name, exc_info=True) + time.sleep(0.5) + + def _shutdown(self): + pass + +class BaseStandardPostfetchProcessor(BasePostfetchProcessor): + def _get_process_put(self): + recorded_url = self.inq.get(block=True, timeout=0.5) + self._process_url(recorded_url) + if self.outq: + self.outq.put(recorded_url) + + def _process_url(self, recorded_url): + raise Exception('not implemented') + +class BaseBatchPostfetchProcessor(BasePostfetchProcessor): + MAX_BATCH_SIZE = 500 + + def _get_process_put(self): + batch = [] + batch.append(self.inq.get(block=True, timeout=0.5)) + try: + while len(batch) < self.MAX_BATCH_SIZE: + batch.append(self.inq.get(block=False)) + except queue.Empty: + pass + + self._process_batch(batch) + + if self.outq: + for recorded_url in batch: + self.outq.put(recorded_url) + + def _process_batch(self, batch): + raise Exception('not implemented') + +class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): + def __init__(self, listener, inq, outq, profile=False): + BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile) + self.listener = listener + + def _process_url(self, recorded_url): + return self.listener.notify(recorded_url, recorded_url.warc_records) + # monkey-patch log levels TRACE and NOTICE TRACE = 5 import logging diff --git a/warcprox/controller.py b/warcprox/controller.py index 0bf8a4f..0f0c20a 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -4,7 +4,7 @@ starting up and shutting down the various components of warcprox, and for sending heartbeats to the service registry if configured to do so; also has some memory profiling capabilities -Copyright (C) 2013-2017 Internet Archive +Copyright (C) 2013-2018 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -27,55 +27,186 @@ from __future__ import absolute_import import logging import threading import time -import warcprox import sys import gc import datetime +import warcprox +import certauth + +class Factory: + @staticmethod + def dedup_db(options): + if options.rethinkdb_dedup_url: + dedup_db = warcprox.dedup.RethinkDedupDb(options=options) + elif options.rethinkdb_big_table_url: + dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options) + elif options.rethinkdb_trough_db_url: + dedup_db = warcprox.dedup.TroughDedupDb(options) + elif options.cdxserver_dedup: + dedup_db = warcprox.dedup.CdxServerDedup( + cdx_url=options.cdxserver_dedup) + elif options.dedup_db_file in (None, '', '/dev/null'): + logging.info('deduplication disabled') + dedup_db = None + else: + dedup_db = warcprox.dedup.DedupDb(options.dedup_db_file, options=options) + return dedup_db + + @staticmethod + def stats_db(options): + if options.rethinkdb_stats_url: + stats_db = warcprox.stats.RethinkStatsDb(options=options) + elif options.stats_db_file in (None, '', '/dev/null'): + logging.info('statistics tracking disabled') + stats_db = None + else: + stats_db = 
warcprox.stats.StatsDb(options.stats_db_file, options=options) + return stats_db + + # @staticmethod + # def certauth(options): + # ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] + # ca = certauth.certauth.CertificateAuthority( + # options.cacert, args.certs_dir, ca_name=ca_name) + # return ca + + @staticmethod + def playback_proxy(options): + if options.playback_port is not None: + playback_index_db = warcprox.playback.PlaybackIndexDb( + options.playback_index_db_file, options=options) + playback_proxy = warcprox.playback.PlaybackProxy( + ca=ca, playback_index_db=playback_index_db, options=options) + else: + playback_index_db = None + playback_proxy = None + return playback_proxy + + @staticmethod + def crawl_logger(options): + if options.crawl_log_dir: + return warcprox.crawl_log.CrawlLogger( + options.crawl_log_dir, options=options)) + else: + return None + + @staticmethod + def plugin(qualname, inq, outq): + try: + (module_name, class_name) = qualname.rsplit('.', 1) + module_ = importlib.import_module(module_name) + class_ = getattr(module_, class_name) + instance = class_() + plugin.notify # make sure it has this method + return instance + except Exception as e: + logging.fatal('problem with plugin class %r: %s', qualname, e) + sys.exit(1) + + # @staticmethod + # def plugins(options): + # plugins = [] + # for qualname in options.plugins or []: + # try: + # (module_name, class_name) = qualname.rsplit('.', 1) + # module_ = importlib.import_module(module_name) + # class_ = getattr(module_, class_name) + # plugin = class_() + # plugin.notify # make sure it has this method + # plugins.append(plugin) + # except Exception as e: + # logging.fatal('problem with plugin class %r: %s', qualname, e) + # sys.exit(1) + # return plugins + + # @staticmethod + # def service_registry(options): + # if options.rethinkdb_services_url: + # parsed = doublethink.parse_rethinkdb_url( + # options.rethinkdb_services_url) + # rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database) + # return doublethink.ServiceRegistry(rr, table=parsed.table) + # else: + # return None class WarcproxController(object): logger = logging.getLogger("warcprox.controller.WarcproxController") HEARTBEAT_INTERVAL = 20.0 - def __init__( - self, proxy=None, warc_writer_threads=None, playback_proxy=None, - service_registry=None, options=warcprox.Options()): + def __init__(self, options=warcprox.Options()): """ - Create warcprox controller. - - If supplied, `proxy` should be an instance of WarcProxy, and - `warc_writer_threads` should be a list of WarcWriterThread instances. - If not supplied, they are created with default values. - - If supplied, playback_proxy should be an instance of PlaybackProxy. If - not supplied, no playback proxy will run. + Create warcprox controller based on `options`. 
""" - if proxy is not None: - self.proxy = proxy - else: - self.proxy = warcprox.warcproxy.WarcProxy(options=options) - - if warc_writer_threads is not None: - self.warc_writer_threads = warc_writer_threads - else: - self.warc_writer_threads = [ - warcprox.writerthread.WarcWriterThread( - name='WarcWriterThread%03d' % i, - recorded_url_q=self.proxy.recorded_url_q, - listeners=[self.proxy.running_stats], options=options) - for i in range(int(self.proxy.max_threads ** 0.5))] + self.options = options self.proxy_thread = None self.playback_proxy_thread = None - self.playback_proxy = playback_proxy - self.service_registry = service_registry - self.options = options - self._last_rss = None - self.stop = threading.Event() self._start_stop_lock = threading.Lock() + self.proxy = warcprox.warcproxy.WarcProxy(options=options) + + self.build_postfetch_chain(proxy.recorded_url_q) + + # if warc_writer_threads is not None: + # self.warc_writer_threads = warc_writer_threads + # else: + # self.warc_writer_threads = [ + # warcprox.writerthread.WarcWriterThread( + # name='WarcWriterThread%03d' % i, + # recorded_url_q=self.proxy.recorded_url_q, + # listeners=[self.proxy.running_stats], options=options) + # for i in range(int(self.proxy.max_threads ** 0.5))] + # self.playback_proxy = playback_proxy + # self.service_registry = service_registry + + def build_postfetch_chain(self, inq): + outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size) + + def maybe_add_to_chain(processor_init): + processor = processor_init(inq, outq, self.options) + if processor: + self._postfetch_chain.append(processor) + inq = outq + outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size) + + self.dedup_db = Factory.dedup_db(options) + + # dedup loader + if self.dedup_db: + maybe_add_to_chain(self.dedup_db.loader) + + # warc writer + maybe_add_to_chain(Factory.warc_writer) + + # dedup storer + if self.dedup_db: + maybe_add_to_chain(self.dedup_db.storer) + + # playback index storer + # XXX XXX XXX FIXME + # self.playback_proxy = Factory.playback_proxy(options) + # if self.playback_proxy: + # maybe_add_to_chain() + # outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size) + # processor = self.playback_proxy.playback_index_db(inq, outq) + # self._postfetch_chain.append(processor) + # inq = outq + + # stats db + maybe_add_to_chain(Factory.stats_db) + + # crawl logger + maybe_add_to_chain(Factory.crawl_logger) + + for qualname in self.options.plugins: + maybe_add_to_chain( + lambda inq, outq, options: Factory.plugin(qualname, inq, outq)) + # self.plugins = Factory.plugins(options) + + def debug_mem(self): self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize()) with open("/proc/self/status") as f: diff --git a/warcprox/main.py b/warcprox/main.py index 348dfbf..d17e2be 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -211,92 +211,6 @@ def init_controller(args): logging.fatal(e) exit(1) - listeners = [] - running_stats = warcprox.stats.RunningStats() - listeners.append(running_stats) - - if args.rethinkdb_dedup_url: - dedup_db = warcprox.dedup.RethinkDedupDb(options=options) - elif args.rethinkdb_big_table_url: - dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options) - elif args.rethinkdb_trough_db_url: - dedup_db = warcprox.dedup.TroughDedupDb(options) - elif args.cdxserver_dedup: - dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup) - elif args.dedup_db_file in (None, '', '/dev/null'): - logging.info('deduplication disabled') - dedup_db = 
None - else: - dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options) - if dedup_db: - listeners.append(dedup_db) - - if args.rethinkdb_stats_url: - stats_db = warcprox.stats.RethinkStatsDb(options=options) - listeners.append(stats_db) - elif args.stats_db_file in (None, '', '/dev/null'): - logging.info('statistics tracking disabled') - stats_db = None - else: - stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options) - listeners.append(stats_db) - - recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size) - - ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] - ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir, - ca_name=ca_name) - - proxy = warcprox.warcproxy.WarcProxy( - ca=ca, recorded_url_q=recorded_url_q, stats_db=stats_db, - running_stats=running_stats, options=options) - - if args.playback_port is not None: - playback_index_db = warcprox.playback.PlaybackIndexDb( - args.playback_index_db_file, options=options) - playback_proxy = warcprox.playback.PlaybackProxy( - ca=ca, playback_index_db=playback_index_db, options=options) - listeners.append(playback_index_db) - else: - playback_index_db = None - playback_proxy = None - - if args.crawl_log_dir: - listeners.append(warcprox.crawl_log.CrawlLogger( - args.crawl_log_dir, options=options)) - - for qualname in args.plugins or []: - try: - (module_name, class_name) = qualname.rsplit('.', 1) - module_ = importlib.import_module(module_name) - class_ = getattr(module_, class_name) - listener = class_() - listener.notify # make sure it has this method - listeners.append(listener) - except Exception as e: - logging.fatal('problem with plugin class %r: %s', qualname, e) - sys.exit(1) - - writer_pool = warcprox.writer.WarcWriterPool(options=options) - # number of warc writer threads = sqrt(proxy.max_threads) - # I came up with this out of thin air because it strikes me as reasonable - # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45 - num_writer_threads = args.writer_threads or int(proxy.max_threads ** 0.5) - logging.debug('initializing %d warc writer threads', num_writer_threads) - warc_writer_threads = [ - warcprox.writerthread.WarcWriterThread( - name='WarcWriterThread%03d' % i, recorded_url_q=recorded_url_q, - writer_pool=writer_pool, dedup_db=dedup_db, - listeners=listeners, options=options) - for i in range(num_writer_threads)] - - if args.rethinkdb_services_url: - parsed = doublethink.parse_rethinkdb_url( - options.rethinkdb_services_url) - rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database) - svcreg = doublethink.ServiceRegistry(rr, table=parsed.table) - else: - svcreg = None controller = warcprox.controller.WarcproxController( proxy, warc_writer_threads, playback_proxy, diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 6f7dd34..60f79d3 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -2,7 +2,7 @@ warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic, enqueue info on the recorded url queue -Copyright (C) 2013-2016 Internet Archive +Copyright (C) 2013-2018 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License From bd25991a0d8c9cc90906acba2bb7863d3bda3a80 Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Fri, 12 Jan 2018 14:58:26 -0800 Subject: [PATCH 02/19] slightly less incomplete work on new postfetch processor chain --- warcprox/__init__.py | 21 +++-- 
warcprox/controller.py | 188 +++++++++++++++++---------------------- warcprox/dedup.py | 30 +++++-- warcprox/main.py | 24 ++--- warcprox/writerthread.py | 123 +++++-------------------- 5 files changed, 146 insertions(+), 240 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index c592eb6..7c628ad 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -19,8 +19,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ +import sys import datetime import threading +import time +import logging from argparse import Namespace as _Namespace from pkg_resources import get_distribution as _get_distribution __version__ = _get_distribution('warcprox').version @@ -28,6 +31,7 @@ try: import queue except ImportError: import Queue as queue + def digest_str(hash_obj, base32=False): import base64 return hash_obj.name.encode('utf-8') + b':' + ( @@ -93,15 +97,17 @@ class RequestBlockedByRule(Exception): return "%s: %s" % (self.__class__.__name__, self.msg) class BasePostfetchProcessor(threading.Thread): - def __init__(self, inq, outq, profile=False): + logger = logging.getLogger("warcprox.BasePostfetchProcessor") + + def __init__(self, inq, outq, options=Options()): threading.Thread.__init__(self, name='???') self.inq = inq self.outq = outq + self.options = options self.stop = threading.Event() - self.profile = profile def run(self): - if self.profile: + if self.options.profile: import cProfile self.profiler = cProfile.Profile() self.profiler.enable() @@ -186,9 +192,15 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): def _process_url(self, recorded_url): return self.listener.notify(recorded_url, recorded_url.warc_records) + # @classmethod + # def wrap(cls, listener, inq, outq, profile=False): + # if listener: + # return cls(listener, inq, outq, profile) + # else: + # return None + # monkey-patch log levels TRACE and NOTICE TRACE = 5 -import logging def _logger_trace(self, msg, *args, **kwargs): if self.isEnabledFor(TRACE): self._log(TRACE, msg, args, **kwargs) @@ -197,7 +209,6 @@ logging.trace = logging.root.trace logging.addLevelName(TRACE, 'TRACE') NOTICE = (logging.INFO + logging.WARN) // 2 -import logging def _logger_notice(self, msg, *args, **kwargs): if self.isEnabledFor(NOTICE): self._log(NOTICE, msg, args, **kwargs) diff --git a/warcprox/controller.py b/warcprox/controller.py index 0f0c20a..80b9c50 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -32,6 +32,7 @@ import gc import datetime import warcprox import certauth +import functools class Factory: @staticmethod @@ -60,7 +61,8 @@ class Factory: logging.info('statistics tracking disabled') stats_db = None else: - stats_db = warcprox.stats.StatsDb(options.stats_db_file, options=options) + stats_db = warcprox.stats.StatsDb( + options.stats_db_file, options=options) return stats_db # @staticmethod @@ -70,6 +72,10 @@ class Factory: # options.cacert, args.certs_dir, ca_name=ca_name) # return ca + @staticmethod + def warc_writer(inq, outq, options): + return warcprox.writerthread.WarcWriterThread(inq, outq, options) + @staticmethod def playback_proxy(options): if options.playback_port is not None: @@ -86,48 +92,32 @@ class Factory: def crawl_logger(options): if options.crawl_log_dir: return warcprox.crawl_log.CrawlLogger( - options.crawl_log_dir, options=options)) + options.crawl_log_dir, options=options) else: return None @staticmethod - def plugin(qualname, inq, outq): + def plugin(qualname): try: (module_name, class_name) = qualname.rsplit('.', 1) module_ 
= importlib.import_module(module_name) class_ = getattr(module_, class_name) - instance = class_() + listener = class_() plugin.notify # make sure it has this method - return instance + return plugin except Exception as e: logging.fatal('problem with plugin class %r: %s', qualname, e) sys.exit(1) - # @staticmethod - # def plugins(options): - # plugins = [] - # for qualname in options.plugins or []: - # try: - # (module_name, class_name) = qualname.rsplit('.', 1) - # module_ = importlib.import_module(module_name) - # class_ = getattr(module_, class_name) - # plugin = class_() - # plugin.notify # make sure it has this method - # plugins.append(plugin) - # except Exception as e: - # logging.fatal('problem with plugin class %r: %s', qualname, e) - # sys.exit(1) - # return plugins - - # @staticmethod - # def service_registry(options): - # if options.rethinkdb_services_url: - # parsed = doublethink.parse_rethinkdb_url( - # options.rethinkdb_services_url) - # rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database) - # return doublethink.ServiceRegistry(rr, table=parsed.table) - # else: - # return None + @staticmethod + def service_registry(options): + if options.rethinkdb_services_url: + parsed = doublethink.parse_rethinkdb_url( + options.rethinkdb_services_url) + rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database) + return doublethink.ServiceRegistry(rr, table=parsed.table) + else: + return None class WarcproxController(object): logger = logging.getLogger("warcprox.controller.WarcproxController") @@ -148,65 +138,55 @@ class WarcproxController(object): self.proxy = warcprox.warcproxy.WarcProxy(options=options) - self.build_postfetch_chain(proxy.recorded_url_q) + self.build_postfetch_chain(self.proxy.recorded_url_q) - # if warc_writer_threads is not None: - # self.warc_writer_threads = warc_writer_threads - # else: - # self.warc_writer_threads = [ - # warcprox.writerthread.WarcWriterThread( - # name='WarcWriterThread%03d' % i, - # recorded_url_q=self.proxy.recorded_url_q, - # listeners=[self.proxy.running_stats], options=options) - # for i in range(int(self.proxy.max_threads ** 0.5))] - # self.playback_proxy = playback_proxy - # self.service_registry = service_registry + self.service_registry = Factory.service_registry(options) def build_postfetch_chain(self, inq): - outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size) + constructors = [] - def maybe_add_to_chain(processor_init): - processor = processor_init(inq, outq, self.options) - if processor: - self._postfetch_chain.append(processor) - inq = outq - outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size) + self.dedup_db = Factory.dedup_db(self.options) - self.dedup_db = Factory.dedup_db(options) - - # dedup loader if self.dedup_db: - maybe_add_to_chain(self.dedup_db.loader) + constructors.append(self.dedup_db.loader) - # warc writer - maybe_add_to_chain(Factory.warc_writer) + constructors.append(Factory.warc_writer) - # dedup storer if self.dedup_db: - maybe_add_to_chain(self.dedup_db.storer) + constructors.append(self.dedup_db.storer) - # playback index storer - # XXX XXX XXX FIXME - # self.playback_proxy = Factory.playback_proxy(options) - # if self.playback_proxy: - # maybe_add_to_chain() - # outq = warcprox.TimestampedQueue(maxsize=self.options.queue_size) - # processor = self.playback_proxy.playback_index_db(inq, outq) - # self._postfetch_chain.append(processor) - # inq = outq + stats_db = Factory.stats_db(self.options) + if stats_db: + constructors.append(functools.partial( + 
warcprox.ListenerPostfetchProcessor, stats_db)) - # stats db - maybe_add_to_chain(Factory.stats_db) + self.playback_proxy = Factory.playback_proxy(self.options) + if self.playback_proxy: + constructors.append(functools.partial( + warcprox.ListenerPostfetchProcessor, + self.playback_proxy.playback_index_db)) - # crawl logger - maybe_add_to_chain(Factory.crawl_logger) - - for qualname in self.options.plugins: - maybe_add_to_chain( - lambda inq, outq, options: Factory.plugin(qualname, inq, outq)) - # self.plugins = Factory.plugins(options) + crawl_logger = Factory.crawl_logger(self.options) + if crawl_logger: + constructors.append(functools.partial( + warcprox.ListenerPostfetchProcessor, crawl_logger)) + for qualname in self.options.plugins or []: + plugin = Factory.plugin(qualname) + constructors.append(functools.partial( + warcprox.ListenerPostfetchProcessor, plugin)) + self._postfetch_chain = [] + for i, constructor in enumerate(constructors): + if i != len(constructors) - 1: + outq = warcprox.TimestampedQueue( + maxsize=self.options.queue_size) + else: + outq = None + processor = constructor(inq, outq, self.options) + self._postfetch_chain.append(processor) + inq = outq + def debug_mem(self): self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize()) with open("/proc/self/status") as f: @@ -293,26 +273,14 @@ class WarcproxController(object): self.logger.info('warcprox is already running') return - if self.proxy.stats_db: - self.proxy.stats_db.start() self.proxy_thread = threading.Thread( target=self.proxy.serve_forever, name='ProxyThread') self.proxy_thread.start() - assert(all( - wwt.dedup_db is self.warc_writer_threads[0].dedup_db - for wwt in self.warc_writer_threads)) - if any((t.dedup_db for t in self.warc_writer_threads)): - self.warc_writer_threads[0].dedup_db.start() - - for wwt in self.warc_writer_threads: - wwt.start() - - if self.playback_proxy is not None: - self.playback_proxy_thread = threading.Thread( - target=self.playback_proxy.serve_forever, - name='PlaybackProxyThread') - self.playback_proxy_thread.start() + for processor in self._postfetch_chain: + # logging.info('starting postfetch processor %r', processor) + processor.start() + logging.info('started postfetch processor %r', processor) def shutdown(self): with self._start_stop_lock: @@ -320,30 +288,34 @@ class WarcproxController(object): self.logger.info('warcprox is not running') return - for wwt in self.warc_writer_threads: - wwt.stop.set() + # for wwt in self.warc_writer_threads: + # wwt.stop.set() + for processor in self._postfetch_chain: + processor.stop.set() self.proxy.shutdown() self.proxy.server_close() - if self.playback_proxy is not None: - self.playback_proxy.shutdown() - self.playback_proxy.server_close() - if self.playback_proxy.playback_index_db is not None: - self.playback_proxy.playback_index_db.close() + for processor in self._postfetch_chain: + processor.join() + # if self.playback_proxy is not None: + # self.playback_proxy.shutdown() + # self.playback_proxy.server_close() + # if self.playback_proxy.playback_index_db is not None: + # self.playback_proxy.playback_index_db.close() - # wait for threads to finish - for wwt in self.warc_writer_threads: - wwt.join() + # # wait for threads to finish + # for wwt in self.warc_writer_threads: + # wwt.join() - if self.proxy.stats_db: - self.proxy.stats_db.stop() + # if self.proxy.stats_db: + # self.proxy.stats_db.stop() - self.proxy_thread.join() - if self.playback_proxy is not None: - self.playback_proxy_thread.join() + # 
self.proxy_thread.join() + # if self.playback_proxy is not None: + # self.playback_proxy_thread.join() - if self.service_registry and hasattr(self, "status_info"): - self.service_registry.unregister(self.status_info["id"]) + # if self.service_registry and hasattr(self, "status_info"): + # self.service_registry.unregister(self.status_info["id"]) def run_until_shutdown(self): """ diff --git a/warcprox/dedup.py b/warcprox/dedup.py index f21e1df..f62eccb 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -35,6 +35,14 @@ from urllib3.exceptions import HTTPError urllib3.disable_warnings() +class DedupLoader(warcprox.BaseStandardPostfetchProcessor): + def __init__(self, dedup_db, inq, outq, base32=False, profile=False): + warcprox.BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile) + self.dedup_db = dedup_db + self.base32 = base32 + def _process_url(self, recorded_url): + decorate_with_dedup_info(self.dedup_db, recorded_url, self.base32) + class DedupDb(object): logger = logging.getLogger("warcprox.dedup.DedupDb") @@ -61,6 +69,12 @@ class DedupDb(object): conn.commit() conn.close() + def loader(self, inq, outq, profile=False): + return DedupLoader(self, inq, outq, self.options.base32, profile) + + def storer(self, inq, outq, profile=False): + return warcprox.ListenerPostfetchProcessor(self, inq, outq, profile) + def save(self, digest_key, response_record, bucket=""): record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') @@ -106,20 +120,20 @@ class DedupDb(object): else: self.save(digest_key, records[0]) - def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): if (recorded_url.response_recorder and recorded_url.payload_digest and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str(recorded_url.payload_digest, base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: - recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"], - recorded_url.url) + recorded_url.dedup_info = dedup_db.lookup( + digest_key, recorded_url.warcprox_meta["captures-bucket"], + recorded_url.url) else: - recorded_url.dedup_info = dedup_db.lookup(digest_key, - url=recorded_url.url) + recorded_url.dedup_info = dedup_db.lookup( + digest_key, url=recorded_url.url) -class RethinkDedupDb: +class RethinkDedupDb(DedupDb): logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") def __init__(self, options=warcprox.Options()): @@ -181,7 +195,7 @@ class RethinkDedupDb: else: self.save(digest_key, records[0]) -class CdxServerDedup(object): +class CdxServerDedup(DedupDb): """Query a CDX server to perform deduplication. """ logger = logging.getLogger("warcprox.dedup.CdxServerDedup") @@ -244,7 +258,7 @@ class CdxServerDedup(object): """ pass -class TroughDedupDb(object): +class TroughDedupDb(DedupDb): ''' https://github.com/internetarchive/trough ''' diff --git a/warcprox/main.py b/warcprox/main.py index d17e2be..785c029 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -198,12 +198,12 @@ def dump_state(signum=None, frame=None): 'dumping state (caught signal %s)\n%s', signum, '\n'.join(state_strs)) -def init_controller(args): +def parse_args(argv): ''' - Creates a warcprox.controller.WarcproxController configured according to - the supplied arguments (normally the result of parse_args(sys.argv)). + Parses command line arguments with argparse. 
''' - options = warcprox.Options(**vars(args)) + arg_parser = _build_arg_parser(prog=os.path.basename(argv[0])) + args = arg_parser.parse_args(args=argv[1:]) try: hashlib.new(args.digest_algorithm) @@ -211,19 +211,6 @@ def init_controller(args): logging.fatal(e) exit(1) - - controller = warcprox.controller.WarcproxController( - proxy, warc_writer_threads, playback_proxy, - service_registry=svcreg, options=options) - - return controller - -def parse_args(argv): - ''' - Parses command line arguments with argparse. - ''' - arg_parser = _build_arg_parser(prog=os.path.basename(argv[0])) - args = arg_parser.parse_args(args=argv[1:]) return args def main(argv=None): @@ -249,7 +236,8 @@ def main(argv=None): # see https://github.com/pyca/cryptography/issues/2911 cryptography.hazmat.backends.openssl.backend.activate_builtin_random() - controller = init_controller(args) + options = warcprox.Options(**vars(args)) + controller = warcprox.controller.WarcproxController(options) signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 7ee9159..ae64484 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -2,7 +2,7 @@ warcprox/writerthread.py - warc writer thread, reads from the recorded url queue, writes warc records, runs final tasks after warc records are written -Copyright (C) 2013-2017 Internet Archive +Copyright (C) 2013-2018 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -28,44 +28,34 @@ except ImportError: import Queue as queue import logging -import threading import time -from datetime import datetime -from hanzo import warctools import warcprox -import sys -class WarcWriterThread(threading.Thread): - logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread") +class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor): + logger = logging.getLogger("warcprox.writerthread.WarcWriterThread") - def __init__( - self, recorded_url_q, name='WarcWriterThread', writer_pool=None, - dedup_db=None, listeners=[], options=warcprox.Options()): - """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl.""" - threading.Thread.__init__(self, name=name) - self.recorded_url_q = recorded_url_q - self.stop = threading.Event() - if writer_pool: - self.writer_pool = writer_pool - else: - self.writer_pool = warcprox.writer.WarcWriterPool() - self.dedup_db = dedup_db - self.listeners = listeners + def __init__(self, inq, outq, options=warcprox.Options()): + warcprox.BaseStandardPostfetchProcessor.__init__( + self, inq, outq, options=options) self.options = options - self.idle = None + self.writer_pool = warcprox.writer.WarcWriterPool(options) self.method_filter = set(method.upper() for method in self.options.method_filter or []) - def run(self): - if self.options.profile: - import cProfile - self.profiler = cProfile.Profile() - self.profiler.enable() - self._run() - self.profiler.disable() - else: - self._run() + def _get_process_put(self): + try: + warcprox.BaseStandardPostfetchProcessor._get_process_put(self) + finally: + self.writer_pool.maybe_idle_rollover() + + def _process_url(self, recorded_url): + if self._should_archive(recorded_url): + records = self.writer_pool.write_records(recorded_url) + recorded_url.warc_records = records + self._log(recorded_url, records) + # try to release resources in a timely fashion + if 
recorded_url.response_recorder and recorded_url.response_recorder.tempfile: + recorded_url.response_recorder.tempfile.close() - _ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'} def _filter_accepts(self, recorded_url): if not self.method_filter: return True @@ -81,68 +71,9 @@ class WarcWriterThread(threading.Thread): # special warc name prefix '-' means "don't archive" return prefix != '-' and self._filter_accepts(recorded_url) - def _run(self): - self.name = '%s(tid=%s)'% (self.name, warcprox.gettid()) - while not self.stop.is_set(): - try: - while True: - try: - if self.stop.is_set(): - qsize = self.recorded_url_q.qsize() - if qsize % 50 == 0: - self.logger.info("%s urls left to write", qsize) - - recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) - records = [] - self.idle = None - if self._should_archive(recorded_url): - if self.dedup_db: - warcprox.dedup.decorate_with_dedup_info(self.dedup_db, - recorded_url, base32=self.options.base32) - records = self.writer_pool.write_records(recorded_url) - - self._final_tasks(recorded_url, records) - - # try to release resources in a timely fashion - if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: - recorded_url.response_recorder.tempfile.close() - except queue.Empty: - if self.stop.is_set(): - break - self.idle = time.time() - finally: - self.writer_pool.maybe_idle_rollover() - - self.logger.info('WarcWriterThread shutting down') - self._shutdown() - except Exception as e: - if isinstance(e, OSError) and e.errno == 28: - # OSError: [Errno 28] No space left on device - self.logger.critical( - 'shutting down due to fatal problem: %s: %s', - e.__class__.__name__, e) - self._shutdown() - sys.exit(1) - - self.logger.critical( - 'WarcWriterThread will try to continue after unexpected ' - 'error', exc_info=True) - time.sleep(0.5) - - def _shutdown(self): - self.writer_pool.close_writers() - for listener in self.listeners: - if hasattr(listener, 'stop'): - try: - listener.stop() - except: - self.logger.error( - '%s raised exception', listener.stop, exc_info=True) - - # closest thing we have to heritrix crawl log at the moment def _log(self, recorded_url, records): try: - payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8") + payload_digest = records[0].get_header('WARC-Payload-Digest').decode("utf-8") except: payload_digest = "-" @@ -156,13 +87,3 @@ class WarcWriterThread(threading.Thread): recorded_url.method, recorded_url.url.decode("utf-8"), recorded_url.mimetype, recorded_url.size, payload_digest, type_, filename, offset) - - def _final_tasks(self, recorded_url, records): - if self.listeners: - for listener in self.listeners: - try: - listener.notify(recorded_url, records) - except: - self.logger.error('%s raised exception', - listener.notify, exc_info=True) - self._log(recorded_url, records) From c9a39958db4e7d5628fbb302230ecbda2104f3e4 Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Mon, 15 Jan 2018 14:37:27 -0800 Subject: [PATCH 03/19] tests are passing --- tests/test_warcprox.py | 120 ++++++++++++++++++++------------------- tests/test_writer.py | 44 +++++++------- warcprox/__init__.py | 17 +++--- warcprox/bigtable.py | 2 +- warcprox/controller.py | 71 ++++++++++++----------- warcprox/dedup.py | 3 +- warcprox/main.py | 6 +- warcprox/mitmproxy.py | 7 ++- warcprox/playback.py | 18 +++--- warcprox/warcproxy.py | 44 ++++++-------- warcprox/writerthread.py | 3 + 11 files changed, 169 insertions(+), 166 deletions(-) diff --git a/tests/test_warcprox.py 
b/tests/test_warcprox.py index f5175b9..36b7cea 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -357,7 +357,9 @@ def warcprox_(request): argv.append('--rethinkdb-trough-db-url=%s' % request.config.getoption('--rethinkdb-trough-db-url')) args = warcprox.main.parse_args(argv) - warcprox_ = warcprox.main.init_controller(args) + + options = warcprox.Options(**vars(args)) + warcprox_ = warcprox.controller.WarcproxController(options) logging.info('starting warcprox') warcprox_thread = threading.Thread( @@ -490,8 +492,8 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) assert response.content == b'404 Not in Archive\n' # check not in dedup db - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( - b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + dedup_lookup = warcprox_.dedup_db.lookup( + b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') assert dedup_lookup is None # archive @@ -508,13 +510,13 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # check in dedup db # {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') assert dedup_lookup assert dedup_lookup['url'] == url.encode('ascii') @@ -535,12 +537,12 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # check in dedup db (no change from prev) - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') assert dedup_lookup['url'] == url.encode('ascii') assert dedup_lookup['id'] == record_id @@ -564,7 +566,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.content == b'404 Not in Archive\n' # check not in dedup db - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') assert dedup_lookup is None @@ -582,13 +584,13 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # check in dedup db # {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') assert dedup_lookup assert dedup_lookup['url'] == url.encode('ascii') @@ -609,12 +611,12 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) 
time.sleep(0.5) # check in dedup db (no change from prev) - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') assert dedup_lookup['url'] == url.encode('ascii') assert dedup_lookup['id'] == record_id @@ -640,7 +642,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) @@ -652,7 +654,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(2.5) @@ -693,12 +695,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # check url1 in dedup db bucket_a - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_a") assert dedup_lookup['url'] == url1.encode('ascii') assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id']) @@ -707,7 +709,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, dedup_date = dedup_lookup['date'] # check url1 not in dedup db bucket_b - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") assert dedup_lookup is None @@ -720,12 +722,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # check url2 in dedup db bucket_b - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") assert dedup_lookup['url'] == url2.encode('ascii') assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id']) @@ -742,7 +744,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) @@ -755,15 +757,15 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # close the warc - assert warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_dedup_buckets"] - writer = warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_dedup_buckets"] + assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"] + writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"] 
warc_path = os.path.join(writer.directory, writer._f_finalname) - warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_dedup_buckets"].close_writer() + warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer() assert os.path.exists(warc_path) # read the warc @@ -948,7 +950,7 @@ def test_domain_doc_soft_limit( # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) @@ -963,7 +965,7 @@ def test_domain_doc_soft_limit( # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) @@ -990,7 +992,7 @@ def test_domain_doc_soft_limit( # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) @@ -1005,7 +1007,7 @@ def test_domain_doc_soft_limit( # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) @@ -1073,7 +1075,7 @@ def test_domain_data_soft_limit( # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) @@ -1089,7 +1091,7 @@ def test_domain_data_soft_limit( # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) @@ -1105,7 +1107,7 @@ def test_domain_data_soft_limit( # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) @@ -1238,7 +1240,7 @@ def test_dedup_ok_flag( url = 'http://localhost:{}/z/b'.format(http_daemon.server_port) # check not in dedup db - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', bucket='test_dedup_ok_flag') assert dedup_lookup is None @@ -1253,12 +1255,12 @@ def test_dedup_ok_flag( assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # check that dedup db doesn't give us anything for this - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', bucket='test_dedup_ok_flag') assert dedup_lookup is None @@ -1274,18 +1276,18 @@ def test_dedup_ok_flag( assert response.content == b'I am the warcprox test payload! 
bbbbbbbbbb!\n' time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # check that dedup db gives us something for this - dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( + dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', bucket='test_dedup_ok_flag') assert dedup_lookup # inspect what's in rethinkdb more closely - rethink_captures = warcprox_.warc_writer_threads[0].dedup_db.captures_db + rethink_captures = warcprox_.dedup_db.captures_db results_iter = rethink_captures.rr.table(rethink_captures.table).get_all( ['FV7RGGA3SCRFNTS6L275N2OJQJXM5EDZ', 'response', 'test_dedup_ok_flag'], index='sha1_warc_type').order_by( @@ -1366,26 +1368,28 @@ def test_controller_with_defaults(): assert controller.proxy.server_address == ('127.0.0.1', 8000) assert controller.proxy.server_port == 8000 assert controller.proxy.running_stats - for wwt in controller.warc_writer_threads: - assert wwt - assert wwt.recorded_url_q - assert wwt.recorded_url_q is controller.proxy.recorded_url_q - assert wwt.writer_pool - assert wwt.writer_pool.default_warc_writer - assert wwt.writer_pool.default_warc_writer.directory == './warcs' - assert wwt.writer_pool.default_warc_writer.rollover_idle_time is None - assert wwt.writer_pool.default_warc_writer.rollover_size == 1000000000 - assert wwt.writer_pool.default_warc_writer.prefix == 'warcprox' - assert wwt.writer_pool.default_warc_writer.gzip is False - assert wwt.writer_pool.default_warc_writer.record_builder - assert not wwt.writer_pool.default_warc_writer.record_builder.base32 - assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' + assert not controller.proxy.stats_db + wwt = controller.warc_writer_thread + assert wwt + assert wwt.inq + assert not wwt.outq + assert wwt.writer_pool + assert wwt.writer_pool.default_warc_writer + assert wwt.writer_pool.default_warc_writer.directory == './warcs' + assert wwt.writer_pool.default_warc_writer.rollover_idle_time is None + assert wwt.writer_pool.default_warc_writer.rollover_size == 1000000000 + assert wwt.writer_pool.default_warc_writer.prefix == 'warcprox' + assert wwt.writer_pool.default_warc_writer.gzip is False + assert wwt.writer_pool.default_warc_writer.record_builder + assert not wwt.writer_pool.default_warc_writer.record_builder.base32 + assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() options.port = 0 - controller = warcprox.controller.WarcproxController( - service_registry=warcprox_.service_registry, options=options) + if warcprox_.service_registry: + options.rethinkdb_services_url = 'rethinkdb://localhost/test0/services' + controller = warcprox.controller.WarcproxController(options) assert controller.proxy.server_port != 0 assert controller.proxy.server_port != 8000 assert controller.proxy.server_address == ( @@ -1426,7 +1430,7 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback assert response.status_code == 200 assert not 'via' in playback_response - warc = warcprox_.warc_writer_threads[0].writer_pool.default_warc_writer._fpath + warc = warcprox_.warc_writer_thread.writer_pool.default_warc_writer._fpath with open(warc, 'rb') as f: for record in warcio.archiveiterator.ArchiveIterator(f): if record.rec_headers.get_header('warc-target-uri') == url: @@ -1644,15 +1648,15 @@ def 
test_long_warcprox_meta( # wait for writer thread to process time.sleep(0.5) - while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): + while warcprox_.postfetch_chain_busy(): time.sleep(0.5) time.sleep(0.5) # check that warcprox-meta was parsed and honored ("warc-prefix" param) - assert warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"] - writer = warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"] + assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"] + writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"] warc_path = os.path.join(writer.directory, writer._f_finalname) - warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"].close_writer() + warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer() assert os.path.exists(warc_path) # read the warc diff --git a/tests/test_writer.py b/tests/test_writer.py index 4474f82..2d9505f 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -88,27 +88,20 @@ def wait(callback, timeout): raise Exception('timed out waiting for %s to return truthy' % callback) def test_special_dont_write_prefix(): - class NotifyMe: - def __init__(self): - self.the_list = [] - def notify(self, recorded_url, records): - self.the_list.append((recorded_url, records)) - with tempfile.TemporaryDirectory() as tmpdir: logging.debug('cd %s', tmpdir) os.chdir(tmpdir) - q = warcprox.TimestampedQueue(maxsize=1) - listener = NotifyMe() + inq = warcprox.TimestampedQueue(maxsize=1) + outq = warcprox.TimestampedQueue(maxsize=1) wwt = warcprox.writerthread.WarcWriterThread( - recorded_url_q=q, options=Options(prefix='-'), - listeners=[listener]) + inq, outq, Options(prefix='-')) try: wwt.start() # not to be written due to default prefix recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder.read() - q.put(RecordedUrl( + inq.put(RecordedUrl( url='http://example.com/no', content_type='text/plain', status=200, client_ip='127.0.0.2', request_data=b'abc', response_recorder=recorder, remote_ip='127.0.0.3', @@ -117,30 +110,31 @@ def test_special_dont_write_prefix(): # to be written due to warcprox-meta prefix recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder.read() - q.put(RecordedUrl( + inq.put(RecordedUrl( url='http://example.com/yes', content_type='text/plain', status=200, client_ip='127.0.0.2', request_data=b'abc', response_recorder=recorder, remote_ip='127.0.0.3', timestamp=datetime.utcnow(), payload_digest=recorder.block_digest, warcprox_meta={'warc-prefix': 'normal-warc-prefix'})) - wait(lambda: len(listener.the_list) == 2, 10.0) - assert not listener.the_list[0][1] - assert listener.the_list[1][1] + recorded_url = outq.get(timeout=10) + assert not recorded_url.warc_records + recorded_url = outq.get(timeout=10) + assert recorded_url.warc_records + assert outq.empty() finally: wwt.stop.set() wwt.join() - q = warcprox.TimestampedQueue(maxsize=1) - listener = NotifyMe() - wwt = warcprox.writerthread.WarcWriterThread( - recorded_url_q=q, listeners=[listener]) + inq = warcprox.TimestampedQueue(maxsize=1) + outq = warcprox.TimestampedQueue(maxsize=1) + wwt = warcprox.writerthread.WarcWriterThread(inq, outq) try: wwt.start() # to be written due to default prefix recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder.read() - q.put(RecordedUrl( + inq.put(RecordedUrl( url='http://example.com/yes', 
content_type='text/plain', status=200, client_ip='127.0.0.2', request_data=b'abc', response_recorder=recorder, remote_ip='127.0.0.3', @@ -149,16 +143,18 @@ def test_special_dont_write_prefix(): # not to be written due to warcprox-meta prefix recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder.read() - q.put(RecordedUrl( + inq.put(RecordedUrl( url='http://example.com/no', content_type='text/plain', status=200, client_ip='127.0.0.2', request_data=b'abc', response_recorder=recorder, remote_ip='127.0.0.3', timestamp=datetime.utcnow(), payload_digest=recorder.block_digest, warcprox_meta={'warc-prefix': '-'})) - wait(lambda: len(listener.the_list) == 2, 10.0) - assert listener.the_list[0][1] - assert not listener.the_list[1][1] + recorded_url = outq.get(timeout=10) + assert recorded_url.warc_records + recorded_url = outq.get(timeout=10) + assert not recorded_url.warc_records + assert outq.empty() finally: wwt.stop.set() wwt.join() diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 7c628ad..757f1c1 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -100,7 +100,7 @@ class BasePostfetchProcessor(threading.Thread): logger = logging.getLogger("warcprox.BasePostfetchProcessor") def __init__(self, inq, outq, options=Options()): - threading.Thread.__init__(self, name='???') + threading.Thread.__init__(self, name=self.__class__.__name__) self.inq = inq self.outq = outq self.options = options @@ -120,7 +120,8 @@ class BasePostfetchProcessor(threading.Thread): ''' Get url(s) from `self.inq`, process url(s), queue to `self.outq`. - Subclasses must implement this. + Subclasses must implement this. Implementations may operate on + individual urls, or on batches. May raise queue.Empty. ''' @@ -188,16 +189,16 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): def __init__(self, listener, inq, outq, profile=False): BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile) self.listener = listener + self.name = listener.__class__.__name__ + logging.info('self.name=%s', self.name) def _process_url(self, recorded_url): return self.listener.notify(recorded_url, recorded_url.warc_records) - # @classmethod - # def wrap(cls, listener, inq, outq, profile=False): - # if listener: - # return cls(listener, inq, outq, profile) - # else: - # return None + def start(self): + if hasattr(self.listener, 'start'): + self.listener.start() + BaseStandardPostfetchProcessor.start(self) # monkey-patch log levels TRACE and NOTICE TRACE = 5 diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 79d6240..e6674a6 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -215,7 +215,7 @@ class RethinkCaptures: if self._timer: self._timer.join() -class RethinkCapturesDedup: +class RethinkCapturesDedup(warcprox.dedup.DedupDb): logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") def __init__(self, options=warcprox.Options()): diff --git a/warcprox/controller.py b/warcprox/controller.py index 80b9c50..f6a720f 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -33,6 +33,7 @@ import datetime import warcprox import certauth import functools +import doublethink class Factory: @staticmethod @@ -65,22 +66,15 @@ class Factory: options.stats_db_file, options=options) return stats_db - # @staticmethod - # def certauth(options): - # ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] - # ca = certauth.certauth.CertificateAuthority( - # options.cacert, args.certs_dir, ca_name=ca_name) - # return ca - @staticmethod def 
warc_writer(inq, outq, options): return warcprox.writerthread.WarcWriterThread(inq, outq, options) @staticmethod - def playback_proxy(options): + def playback_proxy(ca, options): if options.playback_port is not None: playback_index_db = warcprox.playback.PlaybackIndexDb( - options.playback_index_db_file, options=options) + options=options) playback_proxy = warcprox.playback.PlaybackProxy( ca=ca, playback_index_db=playback_index_db, options=options) else: @@ -136,12 +130,22 @@ class WarcproxController(object): self.stop = threading.Event() self._start_stop_lock = threading.Lock() - self.proxy = warcprox.warcproxy.WarcProxy(options=options) + self.stats_db = Factory.stats_db(self.options) + + self.proxy = warcprox.warcproxy.WarcProxy(self.stats_db, options) + self.playback_proxy = Factory.playback_proxy( + self.proxy.ca, self.options) self.build_postfetch_chain(self.proxy.recorded_url_q) self.service_registry = Factory.service_registry(options) + def postfetch_chain_busy(self): + for processor in self._postfetch_chain: + if processor.inq.qsize() > 0: + return True + return False + def build_postfetch_chain(self, inq): constructors = [] @@ -155,12 +159,10 @@ class WarcproxController(object): if self.dedup_db: constructors.append(self.dedup_db.storer) - stats_db = Factory.stats_db(self.options) - if stats_db: + if self.stats_db: constructors.append(functools.partial( - warcprox.ListenerPostfetchProcessor, stats_db)) + warcprox.ListenerPostfetchProcessor, self.stats_db)) - self.playback_proxy = Factory.playback_proxy(self.options) if self.playback_proxy: constructors.append(functools.partial( warcprox.ListenerPostfetchProcessor, @@ -175,7 +177,7 @@ class WarcproxController(object): plugin = Factory.plugin(qualname) constructors.append(functools.partial( warcprox.ListenerPostfetchProcessor, plugin)) - + self._postfetch_chain = [] for i, constructor in enumerate(constructors): if i != len(constructors) - 1: @@ -184,6 +186,8 @@ class WarcproxController(object): else: outq = None processor = constructor(inq, outq, self.options) + if isinstance(processor, warcprox.writerthread.WarcWriterThread): + self.warc_writer_thread = processor # ugly self._postfetch_chain.append(processor) inq = outq @@ -277,6 +281,12 @@ class WarcproxController(object): target=self.proxy.serve_forever, name='ProxyThread') self.proxy_thread.start() + if self.playback_proxy: + self.playback_proxy_thread = threading.Thread( + target=self.playback_proxy.serve_forever, + name='PlaybackProxyThread') + self.playback_proxy_thread.start() + for processor in self._postfetch_chain: # logging.info('starting postfetch processor %r', processor) processor.start() @@ -288,34 +298,29 @@ class WarcproxController(object): self.logger.info('warcprox is not running') return - # for wwt in self.warc_writer_threads: - # wwt.stop.set() for processor in self._postfetch_chain: processor.stop.set() self.proxy.shutdown() self.proxy.server_close() + if self.playback_proxy is not None: + self.playback_proxy.shutdown() + self.playback_proxy.server_close() + if self.playback_proxy.playback_index_db is not None: + self.playback_proxy.playback_index_db.close() + for processor in self._postfetch_chain: processor.join() - # if self.playback_proxy is not None: - # self.playback_proxy.shutdown() - # self.playback_proxy.server_close() - # if self.playback_proxy.playback_index_db is not None: - # self.playback_proxy.playback_index_db.close() - # # wait for threads to finish - # for wwt in self.warc_writer_threads: - # wwt.join() + if self.stats_db: + 
self.stats_db.stop() - # if self.proxy.stats_db: - # self.proxy.stats_db.stop() + self.proxy_thread.join() + if self.playback_proxy is not None: + self.playback_proxy_thread.join() - # self.proxy_thread.join() - # if self.playback_proxy is not None: - # self.playback_proxy_thread.join() - - # if self.service_registry and hasattr(self, "status_info"): - # self.service_registry.unregister(self.status_info["id"]) + if self.service_registry and hasattr(self, "status_info"): + self.service_registry.unregister(self.status_info["id"]) def run_until_shutdown(self): """ diff --git a/warcprox/dedup.py b/warcprox/dedup.py index f62eccb..2aab0aa 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -37,7 +37,8 @@ urllib3.disable_warnings() class DedupLoader(warcprox.BaseStandardPostfetchProcessor): def __init__(self, dedup_db, inq, outq, base32=False, profile=False): - warcprox.BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile) + warcprox.BaseStandardPostfetchProcessor.__init__( + self, inq, outq, profile) self.dedup_db = dedup_db self.base32 = base32 def _process_url(self, recorded_url): diff --git a/warcprox/main.py b/warcprox/main.py index 785c029..3c93504 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -116,9 +116,9 @@ def _build_arg_parser(prog): arg_parser.add_argument('-P', '--playback-port', dest='playback_port', type=int, default=None, help='port to listen on for instant playback') - arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', - default='./warcprox-playback-index.db', - help='playback index database file (only used if --playback-port is specified)') + # arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', + # default='./warcprox-playback-index.db', + # help='playback index database file (only used if --playback-port is specified)') group = arg_parser.add_mutually_exclusive_group() group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication') diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 130196a..c42fb68 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -562,7 +562,12 @@ class PooledMitmProxy(PooledMixIn, MitmProxy): request_queue_size = 4096 def __init__(self, max_threads, options=warcprox.Options()): - PooledMixIn.__init__(self, max_threads) + if options.max_threads: + self.logger.info( + "max_threads=%s set by command line option", + options.max_threads) + + PooledMixIn.__init__(self, options.max_threads) self.profilers = {} if options.profile: diff --git a/warcprox/playback.py b/warcprox/playback.py index 1a698c0..91f86aa 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -121,9 +121,6 @@ class PlaybackProxyHandler(MitmProxyHandler): def _send_headers_and_refd_payload( self, headers, refers_to_target_uri, refers_to_date, payload_digest): - """Parameters: - - """ location = self.server.playback_index_db.lookup_exact( refers_to_target_uri, refers_to_date, payload_digest) self.logger.debug('loading http payload from {}'.format(location)) @@ -133,11 +130,13 @@ class PlaybackProxyHandler(MitmProxyHandler): for (offset, record, errors) in fh.read_records(limit=1, offsets=True): pass + if not record: + raise Exception('failed to read record at offset %s from %s' % (offset, warcfilename)) + if errors: raise Exception('warc errors at {}:{} -- {}'.format(location['f'], offset, errors)) - warc_type = 
record.get_header(warctools.WarcRecord.TYPE) - if warc_type != warctools.WarcRecord.RESPONSE: + if record.type != warctools.WarcRecord.RESPONSE: raise Exception('invalid attempt to retrieve http payload of "{}" record'.format(warc_type)) # find end of headers @@ -158,12 +157,13 @@ class PlaybackProxyHandler(MitmProxyHandler): for (offset, record, errors) in fh.read_records(limit=1, offsets=True): pass + if not record: + raise Exception('failed to read record at offset %s from %s' % (offset, warcfilename)) + if errors: raise Exception('warc errors at {}:{} -- {}'.format(warcfilename, offset, errors)) - warc_type = record.get_header(warctools.WarcRecord.TYPE) - - if warc_type == warctools.WarcRecord.RESPONSE: + if record.type == warctools.WarcRecord.RESPONSE: headers_buf = bytearray() while True: line = record.content_file.readline() @@ -173,7 +173,7 @@ class PlaybackProxyHandler(MitmProxyHandler): return self._send_response(headers_buf, record.content_file) - elif warc_type == warctools.WarcRecord.REVISIT: + elif record.type == warctools.WarcRecord.REVISIT: # response consists of http headers from revisit record and # payload from the referenced record warc_profile = record.get_header(warctools.WarcRecord.PROFILE) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 60f79d3..b2ac69e 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -92,6 +92,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.url, rule)) def _enforce_limit(self, limit_key, limit_value, soft=False): + if not self.server.stats_db: + return bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) _limit_key = limit_key @@ -328,7 +330,7 @@ class RecordedUrl: warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, timestamp=None, host=None, duration=None, referer=None, - payload_digest=None): + payload_digest=None, warc_records=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) 
if type(url) is not bytes: @@ -367,6 +369,7 @@ class RecordedUrl: self.duration = duration self.referer = referer self.payload_digest = payload_digest + self.warc_records = warc_records # inherit from object so that multiple inheritance from this class works # properly in python 2 @@ -374,9 +377,9 @@ class RecordedUrl: class SingleThreadedWarcProxy(http_server.HTTPServer, object): logger = logging.getLogger("warcprox.warcproxy.WarcProxy") - def __init__( - self, ca=None, recorded_url_q=None, stats_db=None, - options=warcprox.Options()): + def __init__(self, stats_db=None, options=warcprox.Options()): + self.options = options + server_address = ( options.address or 'localhost', options.port if options.port is not None else 8000) @@ -395,22 +398,15 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): self.digest_algorithm = options.digest_algorithm or 'sha1' - if ca is not None: - self.ca = ca - else: - ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] - self.ca = CertificateAuthority(ca_file='warcprox-ca.pem', - certs_dir='./warcprox-ca', - ca_name=ca_name) + ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64] + self.ca = CertificateAuthority( + ca_file='warcprox-ca.pem', certs_dir='./warcprox-ca', + ca_name=ca_name) - if recorded_url_q is not None: - self.recorded_url_q = recorded_url_q - else: - self.recorded_url_q = warcprox.TimestampedQueue( - maxsize=options.queue_size or 1000) + self.recorded_url_q = warcprox.TimestampedQueue( + maxsize=options.queue_size or 1000) self.stats_db = stats_db - self.options = options self.running_stats = warcprox.stats.RunningStats() @@ -449,17 +445,9 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy): logger = logging.getLogger("warcprox.warcproxy.WarcProxy") - def __init__( - self, ca=None, recorded_url_q=None, stats_db=None, - running_stats=None, options=warcprox.Options()): - if options.max_threads: - self.logger.info( - "max_threads=%s set by command line option", - options.max_threads) - warcprox.mitmproxy.PooledMitmProxy.__init__( - self, options.max_threads, options) - SingleThreadedWarcProxy.__init__( - self, ca, recorded_url_q, stats_db, options) + def __init__(self, stats_db=None, options=warcprox.Options()): + warcprox.mitmproxy.PooledMitmProxy.__init__(self, options) + SingleThreadedWarcProxy.__init__(self, stats_db, options) def server_activate(self): http_server.HTTPServer.server_activate(self) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index ae64484..5747428 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -34,6 +34,8 @@ import warcprox class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor): logger = logging.getLogger("warcprox.writerthread.WarcWriterThread") + _ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'} + def __init__(self, inq, outq, options=warcprox.Options()): warcprox.BaseStandardPostfetchProcessor.__init__( self, inq, outq, options=options) @@ -48,6 +50,7 @@ class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor): self.writer_pool.maybe_idle_rollover() def _process_url(self, recorded_url): + records = [] if self._should_archive(recorded_url): records = self.writer_pool.write_records(recorded_url) recorded_url.warc_records = records From b7d176be28d83bb0724f874f6cc626d50fe1f86f Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Mon, 15 Jan 2018 15:37:26 -0800 Subject: [PATCH 04/19] shut down postfetch processors --- 
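# (sketch, not part of the patch) Stepping back to build_postfetch_chain in
# the controller change above: processors are wired into a linear chain by
# giving each one an output queue that doubles as the next processor's input
# queue. The function name build_chain and the queue size below are
# illustrative only.
import warcprox

def build_chain(constructors, inq, options):
    # constructors are callables such as
    # functools.partial(warcprox.ListenerPostfetchProcessor, some_listener)
    chain = []
    for i, constructor in enumerate(constructors):
        if i != len(constructors) - 1:
            # not the last processor: its outq feeds the next processor
            outq = warcprox.TimestampedQueue(maxsize=1000)
        else:
            # last processor in the chain has nowhere to forward urls to
            outq = None
        processor = constructor(inq, outq, options)
        chain.append(processor)
        inq = outq
    return chain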
warcprox/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 757f1c1..ed934b4 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -136,6 +136,7 @@ class BasePostfetchProcessor(threading.Thread): except queue.Empty: if self.stop.is_set(): break + logging.info('%s shutting down', self) self._shutdown() except Exception as e: if isinstance(e, OSError) and e.errno == 28: @@ -190,7 +191,6 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile) self.listener = listener self.name = listener.__class__.__name__ - logging.info('self.name=%s', self.name) def _process_url(self, recorded_url): return self.listener.notify(recorded_url, recorded_url.warc_records) @@ -200,6 +200,14 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): self.listener.start() BaseStandardPostfetchProcessor.start(self) + def _shutdown(self): + if hasattr(self.listener, 'stop'): + try: + self.listener.stop() + except: + self.logger.error( + '%s raised exception', listener.stop, exc_info=True) + # monkey-patch log levels TRACE and NOTICE TRACE = 5 def _logger_trace(self, msg, *args, **kwargs): From e44d6a88fbdf3c8cb320c0876ca62929c4628bfb Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Mon, 15 Jan 2018 17:15:19 -0800 Subject: [PATCH 05/19] keep running stats --- warcprox/controller.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/warcprox/controller.py b/warcprox/controller.py index f6a720f..d27c0cd 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -173,6 +173,9 @@ class WarcproxController(object): constructors.append(functools.partial( warcprox.ListenerPostfetchProcessor, crawl_logger)) + constructors.append(functools.partial( + warcprox.ListenerPostfetchProcessor, self.proxy.running_stats)) + for qualname in self.options.plugins or []: plugin = Factory.plugin(qualname) constructors.append(functools.partial( From 6ab73764eac0cb6261be2f9820950d299cd93185 Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Mon, 15 Jan 2018 17:15:36 -0800 Subject: [PATCH 06/19] make run-benchmarks.py work (with no args) --- benchmarks/run-benchmarks.py | 10 ++++++---- warcprox/main.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/benchmarks/run-benchmarks.py b/benchmarks/run-benchmarks.py index 8267806..4491a8b 100755 --- a/benchmarks/run-benchmarks.py +++ b/benchmarks/run-benchmarks.py @@ -215,9 +215,9 @@ if __name__ == '__main__': args.cacert = os.path.join(tmpdir, 'benchmark-warcprox-ca.pem') args.certs_dir = os.path.join(tmpdir, 'benchmark-warcprox-ca') args.directory = os.path.join(tmpdir, 'warcs') - if args.rethinkdb_servers: - args.rethinkdb_db = 'benchmarks_{:%Y%m%d%H%M%S}' % ( - datetime.datetime.utcnow()) + # if args.rethinkdb_servers: + # args.rethinkdb_db = 'benchmarks_{:%Y%m%d%H%M%S}' % ( + # datetime.datetime.utcnow()) start_servers() logging.info( @@ -247,7 +247,9 @@ if __name__ == '__main__': logging.info('SKIPPED') logging.info('===== baseline benchmark finished =====') - warcprox_controller = warcprox.main.init_controller(args) + options = warcprox.Options(**vars(args)) + warcprox_controller = warcprox.controller.WarcproxController(options) + warcprox_controller_thread = threading.Thread( target=warcprox_controller.run_until_shutdown) warcprox_controller_thread.start() diff --git a/warcprox/main.py b/warcprox/main.py index 3c93504..a55e682 100644 --- 
a/warcprox/main.py +++ b/warcprox/main.py @@ -61,7 +61,7 @@ class BetterArgumentDefaultsHelpFormatter( else: return argparse.ArgumentDefaultsHelpFormatter._get_help_string(self, action) -def _build_arg_parser(prog): +def _build_arg_parser(prog='warcprox'): arg_parser = argparse.ArgumentParser(prog=prog, description='warcprox - WARC writing MITM HTTP/S proxy', formatter_class=BetterArgumentDefaultsHelpFormatter) From d4bbaf10b73d16468d64b9451bad752f9131cc8b Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Tue, 16 Jan 2018 11:37:56 -0800 Subject: [PATCH 07/19] batch trough dedup loader --- warcprox/controller.py | 1 - warcprox/dedup.py | 71 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index d27c0cd..efa7885 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -291,7 +291,6 @@ class WarcproxController(object): self.playback_proxy_thread.start() for processor in self._postfetch_chain: - # logging.info('starting postfetch processor %r', processor) processor.start() logging.info('started postfetch processor %r', processor) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 2aab0aa..962ec3a 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -1,7 +1,7 @@ ''' warcprox/dedup.py - identical payload digest deduplication using sqlite db -Copyright (C) 2013-2017 Internet Archive +Copyright (C) 2013-2018 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -32,17 +32,18 @@ import doublethink import datetime import urllib3 from urllib3.exceptions import HTTPError +import collections urllib3.disable_warnings() class DedupLoader(warcprox.BaseStandardPostfetchProcessor): - def __init__(self, dedup_db, inq, outq, base32=False, profile=False): + def __init__(self, dedup_db, inq, outq, options=warcprox.Options()): warcprox.BaseStandardPostfetchProcessor.__init__( self, inq, outq, profile) self.dedup_db = dedup_db - self.base32 = base32 def _process_url(self, recorded_url): - decorate_with_dedup_info(self.dedup_db, recorded_url, self.base32) + decorate_with_dedup_info( + self.dedup_db, recorded_url, self.options.base32) class DedupDb(object): logger = logging.getLogger("warcprox.dedup.DedupDb") @@ -259,6 +260,48 @@ class CdxServerDedup(DedupDb): """ pass +class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): + def __init__(self, trough_dedup_db, inq, outq, options=warcprox.Options()): + warcprox.BaseBatchPostfetchProcessor.__init__(self, inq, outq, options) + self.trough_dedup_db = trough_dedup_db + + def _filter_and_bucketize(self, batch): + ''' + Returns `{bucket: [recorded_url, ...]}`, excluding urls that should not + be looked up. + ''' + buckets = collections.defaultdict(list) + for recorded_url in batch: + if (recorded_url.response_recorder + and recorded_url.payload_digest + and recorded_url.response_recorder.payload_size() > 0): + if recorded_url.warcprox_meta and 'captures-bucket' in recorded_url.warcprox_meta: + bucket = recorded_url.warcprox_meta['captures-bucket'] + else: + bucket = '__unspecified__' + buckets[bucket].append(recorded_url) + return buckets + + def _build_key_index(self, batch): + ''' + Returns `{digest_key: [recorded_url, ...]}`. 
+ ''' + key_index = collections.defaultdict(list) + for recorded_url in batch: + digest_key = warcprox.digest_str( + recorded_url.payload_digest, self.options.base32) + key_index[digest_key].append(recorded_url) + return key_index + + def _process_batch(self, batch): + buckets = self._filter_and_bucketize(batch) + for bucket in buckets: + key_index = self._build_key_index(buckets[bucket]) + results = self.trough_dedup_db.batch_lookup(key_index.keys(), bucket) + for result in results: + for recorded_url in key_index[result['digest_key']]: + recorded_url.dedup_info = result + class TroughDedupDb(DedupDb): ''' https://github.com/internetarchive/trough @@ -279,6 +322,9 @@ class TroughDedupDb(DedupDb): self._trough_cli = warcprox.trough.TroughClient( options.rethinkdb_trough_db_url, promotion_interval=60*60) + def loader(self, inq, outq, options=warcprox.Options()): + return BatchTroughLoader(self, inq, outq, options) + def start(self): self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL) @@ -306,6 +352,23 @@ class TroughDedupDb(DedupDb): else: return None + def batch_lookup(self, digest_keys, bucket='__unspecified__'): + sql_tmpl = 'select * from dedup where digest_key in (%s)' % ( + ','.join('%s' for i in range(len(digest_keys)))) + results = self._trough_cli.read(bucket, sql_tmpl, digest_keys) + if results is None: + return [] + self.logger.debug( + 'trough batch lookup of %s keys returned %s results', + len(digest_keys), len(results)) + assert len(results) >= 0 and len(results) <= len(digest_keys) + for result in results: + result['id'] = result['id'].encode('ascii') + result['url'] = result['url'].encode('ascii') + result['date'] = result['date'].encode('ascii') + result['digest_key'] = result['digest_key'].encode('ascii') + return results + def notify(self, recorded_url, records): if (records and records[0].type == b'response' and recorded_url.response_recorder.payload_size() > 0): From 6ff9030e6728a699030c613840e035a1163bbcdd Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Tue, 16 Jan 2018 15:18:53 -0800 Subject: [PATCH 08/19] improve batching, make tests pass --- tests/test_warcprox.py | 7 ++++++- warcprox/__init__.py | 27 +++++++++++++++++++-------- warcprox/dedup.py | 14 ++++++++------ 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 36b7cea..95532d4 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -339,6 +339,9 @@ def warcprox_(request): logging.info('changing to working directory %r', work_dir) os.chdir(work_dir) + # we can't wait around all day in the tests + warcprox.BaseBatchPostfetchProcessor.MAX_BATCH_SEC = 0.5 + argv = ['warcprox', '--method-filter=GET', '--method-filter=POST', @@ -362,6 +365,7 @@ def warcprox_(request): warcprox_ = warcprox.controller.WarcproxController(options) logging.info('starting warcprox') + warcprox_.start() warcprox_thread = threading.Thread( name='WarcproxThread', target=warcprox_.run_until_shutdown) warcprox_thread.start() @@ -1372,7 +1376,7 @@ def test_controller_with_defaults(): wwt = controller.warc_writer_thread assert wwt assert wwt.inq - assert not wwt.outq + assert wwt.outq assert wwt.writer_pool assert wwt.writer_pool.default_warc_writer assert wwt.writer_pool.default_warc_writer.directory == './warcs' @@ -1396,6 +1400,7 @@ def test_choose_a_port_for_me(warcprox_): '127.0.0.1', controller.proxy.server_port) th = threading.Thread(target=controller.run_until_shutdown) + controller.start() th.start() try: # check that the 
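# (worked example, not part of the patch) The parameterized IN clause that
# TroughDedupDb.batch_lookup builds above expands like this; the digest keys
# are made-up sample values.
digest_keys = [b'sha1:aaaa', b'sha1:bbbb', b'sha1:cccc']
sql_tmpl = 'select * from dedup where digest_key in (%s)' % (
        ','.join('%s' for i in range(len(digest_keys))))
# sql_tmpl is now 'select * from dedup where digest_key in (%s,%s,%s)';
# the keys themselves are passed separately, as in
# self._trough_cli.read(bucket, sql_tmpl, digest_keys) above.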
status api lists the correct port diff --git a/warcprox/__init__.py b/warcprox/__init__.py index ed934b4..60ca2ef 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -167,16 +167,27 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor): class BaseBatchPostfetchProcessor(BasePostfetchProcessor): MAX_BATCH_SIZE = 500 + MAX_BATCH_SEC = 10 def _get_process_put(self): batch = [] - batch.append(self.inq.get(block=True, timeout=0.5)) - try: - while len(batch) < self.MAX_BATCH_SIZE: - batch.append(self.inq.get(block=False)) - except queue.Empty: - pass + start = time.time() + while (len(batch) < self.MAX_BATCH_SIZE + and time.time() - start < self.MAX_BATCH_SEC): + try: + batch.append(self.inq.get(block=True, timeout=0.5)) + except queue.Empty: + if self.stop.is_set(): + break + # else keep adding to the batch + + if not batch: + raise queue.Empty + + self.logger.info( + 'gathered batch of %s in %0.1f sec', + len(batch), time.time() - start) self._process_batch(batch) if self.outq: @@ -187,8 +198,8 @@ class BaseBatchPostfetchProcessor(BasePostfetchProcessor): raise Exception('not implemented') class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): - def __init__(self, listener, inq, outq, profile=False): - BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile) + def __init__(self, listener, inq, outq, options=Options()): + BaseStandardPostfetchProcessor.__init__(self, inq, outq, options) self.listener = listener self.name = listener.__class__.__name__ diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 962ec3a..b9e136e 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -39,7 +39,7 @@ urllib3.disable_warnings() class DedupLoader(warcprox.BaseStandardPostfetchProcessor): def __init__(self, dedup_db, inq, outq, options=warcprox.Options()): warcprox.BaseStandardPostfetchProcessor.__init__( - self, inq, outq, profile) + self, inq, outq, options) self.dedup_db = dedup_db def _process_url(self, recorded_url): decorate_with_dedup_info( @@ -71,11 +71,12 @@ class DedupDb(object): conn.commit() conn.close() - def loader(self, inq, outq, profile=False): - return DedupLoader(self, inq, outq, self.options.base32, profile) + def loader(self, inq, outq, *args, **kwargs): + return DedupLoader(self, inq, outq, self.options) - def storer(self, inq, outq, profile=False): - return warcprox.ListenerPostfetchProcessor(self, inq, outq, profile) + def storer(self, inq, outq, *args, **kwargs): + return warcprox.ListenerPostfetchProcessor( + self, inq, outq, self.options) def save(self, digest_key, response_record, bucket=""): record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') @@ -297,7 +298,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): buckets = self._filter_and_bucketize(batch) for bucket in buckets: key_index = self._build_key_index(buckets[bucket]) - results = self.trough_dedup_db.batch_lookup(key_index.keys(), bucket) + results = self.trough_dedup_db.batch_lookup( + key_index.keys(), bucket) for result in results: for recorded_url in key_index[result['digest_key']]: recorded_url.dedup_info = result From 75486d05736f8bd564593a6b4a48544e2109e90d Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Tue, 16 Jan 2018 15:58:29 -0800 Subject: [PATCH 09/19] make --profile work again --- warcprox/controller.py | 26 +++++++++++--------------- warcprox/mitmproxy.py | 11 +++++------ 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index 
efa7885..dabbf6a 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -398,18 +398,14 @@ class WarcproxController(object): 'aggregate performance profile of %s proxy threads:\n%s', len(files), buf.getvalue()) - # warc writer threads - files = [] - for wwt in self.warc_writer_threads: - file = os.path.join(tmpdir, '%s.dat' % wwt.ident) - wwt.profiler.dump_stats(file) - files.append(file) - - buf = io.StringIO() - stats = pstats.Stats(*files, stream=buf) - stats.sort_stats('cumulative') - stats.print_stats(0.1) - self.logger.notice( - 'aggregate performance profile of %s warc writer threads:\n%s', - len(self.warc_writer_threads), buf.getvalue()) - + # postfetch processors + for processor in self._postfetch_chain: + file = os.path.join(tmpdir, '%s.dat' % processor.ident) + processor.profiler.dump_stats(file) + buf = io.StringIO() + stats = pstats.Stats(file, stream=buf) + stats.sort_stats('cumulative') + stats.print_stats(0.1) + self.logger.notice( + 'performance profile of %s:\n%s', processor, + buf.getvalue()) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index c42fb68..608cc75 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -62,6 +62,8 @@ except ImportError: import concurrent.futures import urlcanon import time +import collections +import cProfile class ProxyingRecorder(object): """ @@ -561,14 +563,14 @@ class PooledMitmProxy(PooledMixIn, MitmProxy): # See also https://blog.dubbelboer.com/2012/04/09/syn-cookies.html request_queue_size = 4096 - def __init__(self, max_threads, options=warcprox.Options()): + def __init__(self, options=warcprox.Options()): if options.max_threads: self.logger.info( - "max_threads=%s set by command line option", + 'max_threads=%s set by command line option', options.max_threads) PooledMixIn.__init__(self, options.max_threads) - self.profilers = {} + self.profilers = collections.defaultdict(cProfile.Profile) if options.profile: self.process_request_thread = self._profile_process_request_thread @@ -576,9 +578,6 @@ class PooledMitmProxy(PooledMixIn, MitmProxy): self.process_request_thread = self._process_request_thread def _profile_process_request_thread(self, request, client_address): - if not threading.current_thread().ident in self.profilers: - import cProfile - self.profilers[threading.current_thread().ident] = cProfile.Profile() profiler = self.profilers[threading.current_thread().ident] profiler.enable() self._process_request_thread(request, client_address) From 9e1a7cb6f01076dbbb78e175364689bddc7f0a7c Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Wed, 17 Jan 2018 11:15:21 -0800 Subject: [PATCH 10/19] include RunningStats raw stats in status info --- warcprox/warcproxy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b2ac69e..7bccba7 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -421,6 +421,9 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): 'queued_urls': self.recorded_url_q.qsize(), 'queue_max_size': self.recorded_url_q.maxsize, 'seconds_behind': self.recorded_url_q.seconds_behind(), + 'urls_processed': self.running_stats.urls, + 'warc_bytes_written': self.running_stats.warc_bytes, + 'start_time': self.running_stats.first_snap_time, }) elapsed, urls_per_sec, warc_bytes_per_sec = self.running_stats.current_rates(1) result['rates_1min'] = { From 6a641074784a2ce144691023bb9de7f5eda6bc21 Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Wed, 17 Jan 2018 12:27:19 -0800 Subject: 
[PATCH 11/19] don't keep next processor waiting in batch postfetch processor, accumulate urls for the next batch for at most 0.5 sec, if the outq is empty (i.e. the next processor is waiting idly) --- warcprox/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 60ca2ef..bc1365c 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -168,13 +168,24 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor): class BaseBatchPostfetchProcessor(BasePostfetchProcessor): MAX_BATCH_SIZE = 500 MAX_BATCH_SEC = 10 + MIN_BATCH_SEC = 0.5 def _get_process_put(self): batch = [] start = time.time() - while (len(batch) < self.MAX_BATCH_SIZE - and time.time() - start < self.MAX_BATCH_SEC): + while True: + if len(batch) >= self.MAX_BATCH_SIZE: + break # full batch + + elapsed = time.time() - start + if elapsed >= self.MAX_BATCH_SEC: + break # been batching for a while + + if (elapsed >= self.MIN_BATCH_SEC and self.outq + and len(self.outq.queue) == 0): + break # next processor is waiting on us + try: batch.append(self.inq.get(block=True, timeout=0.5)) except queue.Empty: From 9c5a5eda99bacd77901e913a3aa90925858df894 Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Wed, 17 Jan 2018 14:58:52 -0800 Subject: [PATCH 12/19] use batch postfetch processor for stats --- tests/test_warcprox.py | 327 ++++++++++++++++++++------------------- warcprox/__init__.py | 31 ++-- warcprox/controller.py | 82 +++++----- warcprox/dedup.py | 23 ++- warcprox/stats.py | 287 +++++++++++++--------------------- warcprox/writerthread.py | 5 +- 6 files changed, 341 insertions(+), 414 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 95532d4..0c250e7 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -96,6 +96,14 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) +def wait(callback, timeout=10): + start = time.time() + while time.time() - start < timeout: + if callback(): + return + time.sleep(0.1) + raise Exception('timed out waiting for %s to return truthy' % callback) + # monkey patch dns lookup so we can test domain inheritance on localhost orig_getaddrinfo = socket.getaddrinfo orig_gethostbyname = socket.gethostbyname @@ -339,9 +347,6 @@ def warcprox_(request): logging.info('changing to working directory %r', work_dir) os.chdir(work_dir) - # we can't wait around all day in the tests - warcprox.BaseBatchPostfetchProcessor.MAX_BATCH_SEC = 0.5 - argv = ['warcprox', '--method-filter=GET', '--method-filter=POST', @@ -437,17 +442,9 @@ def test_httpds_no_proxy(http_daemon, https_daemon): assert response.headers['warcprox-test-header'] == 'c!' assert response.content == b'I am the warcprox test payload! 
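# (standalone sketch, not part of the patch) Read together, the batching
# patches above amount to this policy in BaseBatchPostfetchProcessor: stop
# collecting when the batch is full, when MAX_BATCH_SEC has elapsed, or after
# MIN_BATCH_SEC if the next processor's queue is empty (i.e. it is idle).
# gather_batch, stop_event and the default values below are illustrative.
import queue
import time

def gather_batch(inq, outq, stop_event, max_size=500, max_sec=10, min_sec=0.5):
    batch = []
    start = time.time()
    while True:
        if len(batch) >= max_size:
            break  # full batch
        elapsed = time.time() - start
        if elapsed >= max_sec:
            break  # been batching for a while
        if elapsed >= min_sec and outq is not None and len(outq.queue) == 0:
            break  # next processor is waiting on us
        try:
            batch.append(inq.get(block=True, timeout=0.5))
        except queue.Empty:
            if stop_event.is_set():
                break
    if not batch:
        raise queue.Empty
    return batch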
dddddddddd!\n' -def _poll_playback_until(playback_proxies, url, status, timeout_sec): - start = time.time() - # check playback (warc writing is asynchronous, give it up to 10 sec) - while time.time() - start < timeout_sec: - response = requests.get(url, proxies=playback_proxies, verify=False) - if response.status_code == status: - break - time.sleep(0.5) - return response +def test_archive_and_playback_http_url(http_daemon, archiving_proxies, playback_proxies, warcprox_): + urls_before = warcprox_.proxy.running_stats.urls -def test_archive_and_playback_http_url(http_daemon, archiving_proxies, playback_proxies): url = 'http://localhost:{}/a/b'.format(http_daemon.server_port) # ensure playback fails before archiving @@ -461,12 +458,17 @@ def test_archive_and_playback_http_url(http_daemon, archiving_proxies, playback_ assert response.headers['warcprox-test-header'] == 'a!' assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' - response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + + response = requests.get(url, proxies=playback_proxies, verify=False) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'a!' assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' -def test_archive_and_playback_https_url(https_daemon, archiving_proxies, playback_proxies): +def test_archive_and_playback_https_url(https_daemon, archiving_proxies, playback_proxies, warcprox_): + urls_before = warcprox_.proxy.running_stats.urls + url = 'https://localhost:{}/c/d'.format(https_daemon.server_port) # ensure playback fails before archiving @@ -480,14 +482,19 @@ def test_archive_and_playback_https_url(https_daemon, archiving_proxies, playbac assert response.headers['warcprox-test-header'] == 'c!' assert response.content == b'I am the warcprox test payload! dddddddddd!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # test playback - response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + response = requests.get(url, proxies=playback_proxies, verify=False) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'c!' assert response.content == b'I am the warcprox test payload! dddddddddd!\n' # test dedup of same http url with same payload def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies): + urls_before = warcprox_.proxy.running_stats.urls + url = 'http://localhost:{}/e/f'.format(http_daemon.server_port) # ensure playback fails before archiving @@ -506,18 +513,14 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) assert response.headers['warcprox-test-header'] == 'e!' assert response.content == b'I am the warcprox test payload! ffffffffff!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) # test playback - response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + response = requests.get(url, proxies=playback_proxies, verify=False) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'e!' assert response.content == b'I am the warcprox test payload! 
ffffffffff!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) - # check in dedup db # {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} dedup_lookup = warcprox_.dedup_db.lookup( @@ -531,7 +534,7 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) # need revisit to have a later timestamp than original, else playing # back the latest record might not hit the revisit - time.sleep(1.5) + time.sleep(1.1) # fetch & archive revisit response = requests.get(url, proxies=archiving_proxies, verify=False) @@ -539,11 +542,8 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) assert response.headers['warcprox-test-header'] == 'e!' assert response.content == b'I am the warcprox test payload! ffffffffff!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) # check in dedup db (no change from prev) dedup_lookup = warcprox_.dedup_db.lookup( @@ -554,7 +554,7 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) # test playback logging.debug('testing playback of revisit of {}'.format(url)) - response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + response = requests.get(url, proxies=playback_proxies, verify=False) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'e!' assert response.content == b'I am the warcprox test payload! ffffffffff!\n' @@ -562,6 +562,8 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) # test dedup of same https url with same payload def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxies): + urls_before = warcprox_.proxy.running_stats.urls + url = 'https://localhost:{}/g/h'.format(https_daemon.server_port) # ensure playback fails before archiving @@ -580,18 +582,15 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.headers['warcprox-test-header'] == 'g!' assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # test playback - response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + response = requests.get(url, proxies=playback_proxies, verify=False) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'g!' assert response.content == b'I am the warcprox test payload! 
hhhhhhhhhh!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) - # check in dedup db # {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} dedup_lookup = warcprox_.dedup_db.lookup( @@ -605,7 +604,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie # need revisit to have a later timestamp than original, else playing # back the latest record might not hit the revisit - time.sleep(1.5) + time.sleep(1.1) # fetch & archive revisit response = requests.get(url, proxies=archiving_proxies, verify=False) @@ -613,11 +612,8 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.headers['warcprox-test-header'] == 'g!' assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) # check in dedup db (no change from prev) dedup_lookup = warcprox_.dedup_db.lookup( @@ -628,13 +624,15 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie # test playback logging.debug('testing playback of revisit of {}'.format(url)) - response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + response = requests.get(url, proxies=playback_proxies, verify=False) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'g!' assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' # XXX how to check dedup was used? def test_limits(http_daemon, warcprox_, archiving_proxies): + urls_before = warcprox_.proxy.running_stats.urls + url = 'http://localhost:{}/i/j'.format(http_daemon.server_port) request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}} headers = {"Warcprox-Meta": json.dumps(request_meta)} @@ -644,11 +642,8 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.headers['warcprox-test-header'] == 'i!' assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) for i in range(9): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) @@ -656,11 +651,8 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.headers['warcprox-test-header'] == 'i!' assert response.content == b'I am the warcprox test payload! 
jjjjjjjjjj!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(2.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 10) response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 @@ -671,6 +663,8 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): + urls_before = warcprox_.proxy.running_stats.urls + url = 'http://localhost:{}/i/j'.format(http_daemon.server_port) request_meta = {"accept": ["capture-metadata"]} headers = {"Warcprox-Meta": json.dumps(request_meta)} @@ -686,7 +680,12 @@ def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): except ValueError: pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp']) + # wait for postfetch chain (or subsequent test could fail) + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies): + urls_before = warcprox_.proxy.running_stats.urls + url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port) url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port) @@ -697,15 +696,14 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, assert response.headers['warcprox-test-header'] == 'k!' assert response.content == b'I am the warcprox test payload! llllllllll!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) # check url1 in dedup db bucket_a + # logging.info('looking up sha1:bc3fac8847c9412f49d955e626fb58a76befbf81 in bucket_a') dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_a") + assert dedup_lookup assert dedup_lookup['url'] == url1.encode('ascii') assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id']) assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date']) @@ -724,11 +722,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, assert response.headers['warcprox-test-header'] == 'k!' assert response.content == b'I am the warcprox test payload! llllllllll!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) # check url2 in dedup db bucket_b dedup_lookup = warcprox_.dedup_db.lookup( @@ -746,11 +741,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, assert response.headers['warcprox-test-header'] == 'k!' assert response.content == b'I am the warcprox test payload! 
llllllllll!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3) # archive url1 bucket_b headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_b"})} @@ -759,11 +751,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, assert response.headers['warcprox-test-header'] == 'k!' assert response.content == b'I am the warcprox test payload! llllllllll!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4) # close the warc assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"] @@ -827,6 +816,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, fh.close() def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): + urls_before = warcprox_.proxy.running_stats.urls + rules = [ { "domain": "localhost", @@ -863,6 +854,9 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # blocked by SURT_MATCH url = 'http://localhost:{}/fuh/guh'.format(http_daemon.server_port) response = requests.get( @@ -878,6 +872,9 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): # 404 because server set up at the top of this file doesn't handle this url assert response.status_code == 404 + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) + # not blocked because surt scheme does not match (differs from heritrix # behavior where https urls are coerced to http surt form) url = 'https://localhost:{}/fuh/guh'.format(https_daemon.server_port) @@ -886,6 +883,9 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): verify=False) assert response.status_code == 200 + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3) + # blocked by blanket domain block url = 'http://bad.domain.com/' response = requests.get( @@ -938,6 +938,8 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): def test_domain_doc_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): + urls_before = warcprox_.proxy.running_stats.urls + request_meta = { "stats": {"buckets": [{"bucket":"test_domain_doc_limit_bucket","tally-domains":["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10}, @@ -952,11 +954,8 @@ def test_domain_doc_soft_limit( assert response.headers['warcprox-test-header'] == 'o!' assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) # make sure stats from different domain don't count url = 'http://bar.localhost:{}/o/p'.format(http_daemon.server_port) @@ -967,15 +966,10 @@ def test_domain_doc_soft_limit( assert response.headers['warcprox-test-header'] == 'o!' assert response.content == b'I am the warcprox test payload! pppppppppp!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) - time.sleep(2.0) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 11) # (2) same host but different scheme and port: domain limit applies - # url = 'https://foo.localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, @@ -994,12 +988,12 @@ def test_domain_doc_soft_limit( assert response.headers['warcprox-test-header'] == 'o!' assert response.content == b'I am the warcprox test payload! pppppppppp!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) - time.sleep(2.0) + # wait for postfetch chain + time.sleep(3) + logging.info( + 'warcprox_.proxy.running_stats.urls - urls_before = %s', + warcprox_.proxy.running_stats.urls - urls_before) + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 19) # (10) response = requests.get( @@ -1009,12 +1003,8 @@ def test_domain_doc_soft_limit( assert response.headers['warcprox-test-header'] == 'o!' assert response.content == b'I am the warcprox test payload! pppppppppp!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) - time.sleep(2.0) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 20) # (11) back to http, and this is the 11th request url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port) @@ -1036,6 +1026,9 @@ def test_domain_doc_soft_limit( assert response.headers['warcprox-test-header'] == 'o!' assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 21) + # https also blocked url = 'https://zuh.foo.localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( @@ -1062,6 +1055,8 @@ def test_domain_doc_soft_limit( def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): + urls_before = warcprox_.proxy.running_stats.urls + # using idn request_meta = { "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['ÞzZ.LOCALhost']}]}, @@ -1077,12 +1072,8 @@ def test_domain_data_soft_limit( assert response.headers['warcprox-test-header'] == 'y!' assert response.content == b'I am the warcprox test payload! 
zzzzzzzzzz!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) - time.sleep(2.0) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) # duplicate, does not count toward limit url = 'https://baz.Þzz.localhost:{}/y/z'.format(https_daemon.server_port) @@ -1093,12 +1084,8 @@ def test_domain_data_soft_limit( assert response.headers['warcprox-test-header'] == 'y!' assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) - time.sleep(2.0) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) # novel, pushes stats over the limit url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port) @@ -1109,12 +1096,8 @@ def test_domain_data_soft_limit( assert response.headers['warcprox-test-header'] == 'z!' assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n' - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) - time.sleep(2.0) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3) # make sure limit doesn't get applied to a different host url = 'http://baz.localhost:{}/z/~'.format(http_daemon.server_port) @@ -1124,6 +1107,9 @@ def test_domain_data_soft_limit( assert response.headers['warcprox-test-header'] == 'z!' assert response.content == b'I am the warcprox test payload! 
~~~~~~~~~~!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4) + # blocked because we're over the limit now url = 'http://lOl.wHut.ÞZZ.lOcALHOst:{}/y/z'.format(http_daemon.server_port) response = requests.get( @@ -1155,7 +1141,9 @@ def test_domain_data_soft_limit( # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way @pytest.mark.xfail -def test_tor_onion(archiving_proxies): +def test_tor_onion(archiving_proxies, warcprox_): + urls_before = warcprox_.proxy.running_stats.urls + response = requests.get('http://www.facebookcorewwwi.onion/', proxies=archiving_proxies, verify=False, allow_redirects=False) assert response.status_code == 302 @@ -1164,7 +1152,12 @@ def test_tor_onion(archiving_proxies): proxies=archiving_proxies, verify=False, allow_redirects=False) assert response.status_code == 200 -def test_missing_content_length(archiving_proxies, http_daemon, https_daemon): + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) + +def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, warcprox_): + urls_before = warcprox_.proxy.running_stats.urls + # double-check that our test http server is responding as expected url = 'http://localhost:%s/missing-content-length' % ( http_daemon.server_port) @@ -1201,8 +1194,14 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon): b'This response is missing a Content-Length http header.') assert not 'content-length' in response.headers + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) + def test_method_filter( - https_daemon, http_daemon, archiving_proxies, playback_proxies): + warcprox_, https_daemon, http_daemon, archiving_proxies, + playback_proxies): + urls_before = warcprox_.proxy.running_stats.urls + # we've configured warcprox with method_filters=['GET','POST'] so HEAD # requests should not be archived @@ -1213,7 +1212,10 @@ def test_method_filter( assert response.headers['warcprox-test-header'] == 'z!' assert response.content == b'' - response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + + response = requests.get(url, proxies=playback_proxies, verify=False) assert response.status_code == 404 assert response.content == b'404 Not in Archive\n' @@ -1230,13 +1232,17 @@ def test_method_filter( headers=headers, proxies=archiving_proxies) assert response.status_code == 204 - response = _poll_playback_until( - playback_proxies, url, status=200, timeout_sec=10) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) + + response = requests.get(url, proxies=playback_proxies, verify=False) assert response.status_code == 200 assert response.content == payload def test_dedup_ok_flag( https_daemon, http_daemon, warcprox_, archiving_proxies): + urls_before = warcprox_.proxy.running_stats.urls + if not warcprox_.options.rethinkdb_big_table: # this feature is n/a unless using rethinkdb big table return @@ -1258,10 +1264,8 @@ def test_dedup_ok_flag( assert response.headers['warcprox-test-header'] == 'z!' assert response.content == b'I am the warcprox test payload! 
bbbbbbbbbb!\n' - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) # check that dedup db doesn't give us anything for this dedup_lookup = warcprox_.dedup_db.lookup( @@ -1279,10 +1283,8 @@ def test_dedup_ok_flag( assert response.headers['warcprox-test-header'] == 'z!' assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) # check that dedup db gives us something for this dedup_lookup = warcprox_.dedup_db.lookup( @@ -1316,7 +1318,8 @@ def test_status_api(warcprox_): 'role', 'version', 'host', 'address', 'port', 'pid', 'load', 'queued_urls', 'queue_max_size', 'seconds_behind', 'threads', 'rates_5min', 'rates_1min', 'unaccepted_requests', 'rates_15min', - 'active_requests',} + 'active_requests','start_time','urls_processed', + 'warc_bytes_written'} assert status['role'] == 'warcprox' assert status['version'] == warcprox.__version__ assert status['port'] == warcprox_.proxy.server_port @@ -1337,7 +1340,8 @@ def test_svcreg_status(warcprox_): 'queued_urls', 'queue_max_size', 'seconds_behind', 'first_heartbeat', 'ttl', 'last_heartbeat', 'threads', 'rates_5min', 'rates_1min', 'unaccepted_requests', - 'rates_15min', 'active_requests',} + 'rates_15min', 'active_requests','start_time','urls_processed', + 'warc_bytes_written',} assert status['role'] == 'warcprox' assert status['version'] == warcprox.__version__ assert status['port'] == warcprox_.proxy.server_port @@ -1426,12 +1430,17 @@ def test_choose_a_port_for_me(warcprox_): th.join() def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback_proxies): + urls_before = warcprox_.proxy.running_stats.urls + url = 'http://localhost:%s/a/z' % http_daemon.server_port response = requests.get(url, proxies=archiving_proxies) assert response.headers['via'] == '1.1 warcprox' - playback_response = _poll_playback_until( - playback_proxies, url, status=200, timeout_sec=10) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + + playback_response = requests.get( + url, proxies=playback_proxies, verify=False) assert response.status_code == 200 assert not 'via' in playback_response @@ -1458,15 +1467,19 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies): assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix' def test_crawl_log(warcprox_, http_daemon, archiving_proxies): + urls_before = warcprox_.proxy.running_stats.urls + try: os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log')) except: pass + # should go to default crawl log url = 'http://localhost:%s/b/aa' % http_daemon.server_port response = requests.get(url, proxies=archiving_proxies) assert response.status_code == 200 + # should go to test_crawl_log_1.log url = 'http://localhost:%s/b/bb' % http_daemon.server_port headers = { "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}), @@ -1475,13 +1488,12 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers) assert response.status_code == 200 - start = time.time() + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 
2) + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log') - while time.time() - start < 10: - if os.path.exists(file) and os.stat(file).st_size > 0: - break - time.sleep(0.5) assert os.path.exists(file) + assert os.stat(file).st_size > 0 assert os.path.exists(os.path.join( warcprox_.options.crawl_log_dir, 'crawl.log')) @@ -1536,13 +1548,12 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers) assert response.status_code == 200 - start = time.time() + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3) + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log') - while time.time() - start < 10: - if os.path.exists(file) and os.stat(file).st_size > 0: - break - time.sleep(0.5) assert os.path.exists(file) + assert os.stat(file).st_size > 0 crawl_log_2 = open(file, 'rb').read() @@ -1566,17 +1577,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert extra_info['contentSize'] == 145 # a request that is not saved to a warc (because of --method-filter) - # currently not logged at all (XXX maybe it should be) url = 'http://localhost:%s/b/cc' % http_daemon.server_port headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_3'})} response = requests.head(url, proxies=archiving_proxies, headers=headers) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4) + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log') - start = time.time() - while time.time() - start < 10: - if os.path.exists(file) and os.stat(file).st_size > 0: - break - time.sleep(0.5) assert os.path.exists(file) crawl_log_3 = open(file, 'rb').read() @@ -1611,13 +1619,10 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): headers=headers, proxies=archiving_proxies) assert response.status_code == 204 - start = time.time() - file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log') - while time.time() - start < 10: - if os.path.exists(file) and os.stat(file).st_size > 0: - break - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5) + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log') assert os.path.exists(file) crawl_log_4 = open(file, 'rb').read() @@ -1642,6 +1647,8 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): def test_long_warcprox_meta( warcprox_, http_daemon, archiving_proxies, playback_proxies): + urls_before = warcprox_.proxy.running_stats.urls + url = 'http://localhost:%s/b/g' % http_daemon.server_port # create a very long warcprox-meta header @@ -1651,11 +1658,8 @@ def test_long_warcprox_meta( url, proxies=archiving_proxies, headers=headers, verify=False) assert response.status_code == 200 - # wait for writer thread to process - time.sleep(0.5) - while warcprox_.postfetch_chain_busy(): - time.sleep(0.5) - time.sleep(0.5) + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) # check that warcprox-meta was parsed and honored ("warc-prefix" param) assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"] @@ -1681,7 +1685,6 @@ def test_long_warcprox_meta( def test_empty_response( warcprox_, http_daemon, https_daemon, archiving_proxies, playback_proxies): - url = 'http://localhost:%s/empty-response' % http_daemon.server_port response = 
requests.get(url, proxies=archiving_proxies, verify=False) assert response.status_code == 502 diff --git a/warcprox/__init__.py b/warcprox/__init__.py index bc1365c..2cd62cd 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -99,12 +99,13 @@ class RequestBlockedByRule(Exception): class BasePostfetchProcessor(threading.Thread): logger = logging.getLogger("warcprox.BasePostfetchProcessor") - def __init__(self, inq, outq, options=Options()): + def __init__(self, options=Options()): threading.Thread.__init__(self, name=self.__class__.__name__) - self.inq = inq - self.outq = outq self.options = options self.stop = threading.Event() + # these should be set before thread is started + self.inq = None + self.outq = None def run(self): if self.options.profile: @@ -128,6 +129,7 @@ class BasePostfetchProcessor(threading.Thread): raise Exception('not implemented') def _run(self): + self._startup() while not self.stop.is_set(): try: while True: @@ -152,6 +154,9 @@ class BasePostfetchProcessor(threading.Thread): self.name, exc_info=True) time.sleep(0.5) + def _startup(self): + pass + def _shutdown(self): pass @@ -175,6 +180,13 @@ class BaseBatchPostfetchProcessor(BasePostfetchProcessor): start = time.time() while True: + try: + batch.append(self.inq.get(block=True, timeout=0.5)) + except queue.Empty: + if self.stop.is_set(): + break + # else maybe keep adding to the batch + if len(batch) >= self.MAX_BATCH_SIZE: break # full batch @@ -186,18 +198,11 @@ class BaseBatchPostfetchProcessor(BasePostfetchProcessor): and len(self.outq.queue) == 0): break # next processor is waiting on us - try: - batch.append(self.inq.get(block=True, timeout=0.5)) - except queue.Empty: - if self.stop.is_set(): - break - # else keep adding to the batch - if not batch: raise queue.Empty self.logger.info( - 'gathered batch of %s in %0.1f sec', + 'gathered batch of %s in %0.2f sec', len(batch), time.time() - start) self._process_batch(batch) @@ -209,8 +214,8 @@ class BaseBatchPostfetchProcessor(BasePostfetchProcessor): raise Exception('not implemented') class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): - def __init__(self, listener, inq, outq, options=Options()): - BaseStandardPostfetchProcessor.__init__(self, inq, outq, options) + def __init__(self, listener, options=Options()): + BaseStandardPostfetchProcessor.__init__(self, options) self.listener = listener self.name = listener.__class__.__name__ diff --git a/warcprox/controller.py b/warcprox/controller.py index dabbf6a..9902cb5 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -55,20 +55,20 @@ class Factory: return dedup_db @staticmethod - def stats_db(options): + def stats_processor(options): + # return warcprox.stats.StatsProcessor(options) if options.rethinkdb_stats_url: - stats_db = warcprox.stats.RethinkStatsDb(options=options) + stats_processor = warcprox.stats.RethinkStatsProcessor(options) elif options.stats_db_file in (None, '', '/dev/null'): logging.info('statistics tracking disabled') - stats_db = None + stats_processor = None else: - stats_db = warcprox.stats.StatsDb( - options.stats_db_file, options=options) - return stats_db + stats_processor = warcprox.stats.StatsProcessor(options) + return stats_processor @staticmethod - def warc_writer(inq, outq, options): - return warcprox.writerthread.WarcWriterThread(inq, outq, options) + def warc_writer(options): + return warcprox.writerthread.WarcWriterThread(options) @staticmethod def playback_proxy(ca, options): @@ -130,9 +130,9 @@ class WarcproxController(object): self.stop 
= threading.Event() self._start_stop_lock = threading.Lock() - self.stats_db = Factory.stats_db(self.options) + self.stats_processor = Factory.stats_processor(self.options) - self.proxy = warcprox.warcproxy.WarcProxy(self.stats_db, options) + self.proxy = warcprox.warcproxy.WarcProxy(self.stats_processor, options) self.playback_proxy = Factory.playback_proxy( self.proxy.ca, self.options) @@ -140,59 +140,52 @@ class WarcproxController(object): self.service_registry = Factory.service_registry(options) - def postfetch_chain_busy(self): - for processor in self._postfetch_chain: - if processor.inq.qsize() > 0: - return True - return False + def chain(self, processor0, processor1): + assert not processor0.outq + assert not processor1.inq + q = warcprox.TimestampedQueue(maxsize=self.options.queue_size) + processor0.outq = q + processor1.inq = q def build_postfetch_chain(self, inq): - constructors = [] + self._postfetch_chain = [] self.dedup_db = Factory.dedup_db(self.options) if self.dedup_db: - constructors.append(self.dedup_db.loader) + self._postfetch_chain.append(self.dedup_db.loader()) - constructors.append(Factory.warc_writer) + self.warc_writer_thread = Factory.warc_writer(self.options) + self._postfetch_chain.append(self.warc_writer_thread) if self.dedup_db: - constructors.append(self.dedup_db.storer) + self._postfetch_chain.append(self.dedup_db.storer()) - if self.stats_db: - constructors.append(functools.partial( - warcprox.ListenerPostfetchProcessor, self.stats_db)) + if self.stats_processor: + self._postfetch_chain.append(self.stats_processor) if self.playback_proxy: - constructors.append(functools.partial( - warcprox.ListenerPostfetchProcessor, - self.playback_proxy.playback_index_db)) + self._postfetch_chain.append( + warcprox.ListenerPostfetchProcessor( + self.playback_proxy.playback_index_db)) crawl_logger = Factory.crawl_logger(self.options) if crawl_logger: - constructors.append(functools.partial( - warcprox.ListenerPostfetchProcessor, crawl_logger)) + self._postfetch_chain.append( + warcprox.ListenerPostfetchProcessor(crawl_logger)) - constructors.append(functools.partial( - warcprox.ListenerPostfetchProcessor, self.proxy.running_stats)) + self._postfetch_chain.append( + warcprox.ListenerPostfetchProcessor(self.proxy.running_stats)) for qualname in self.options.plugins or []: plugin = Factory.plugin(qualname) - constructors.append(functools.partial( - warcprox.ListenerPostfetchProcessor, plugin)) + self._postfetch_chain.append( + warcprox.ListenerPostfetchProcessor(plugin)) - self._postfetch_chain = [] - for i, constructor in enumerate(constructors): - if i != len(constructors) - 1: - outq = warcprox.TimestampedQueue( - maxsize=self.options.queue_size) - else: - outq = None - processor = constructor(inq, outq, self.options) - if isinstance(processor, warcprox.writerthread.WarcWriterThread): - self.warc_writer_thread = processor # ugly - self._postfetch_chain.append(processor) - inq = outq + # chain them all up + self._postfetch_chain[0].inq = inq + for i in range(1, len(self._postfetch_chain)): + self.chain(self._postfetch_chain[i-1], self._postfetch_chain[i]) def debug_mem(self): self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize()) @@ -314,9 +307,6 @@ class WarcproxController(object): for processor in self._postfetch_chain: processor.join() - if self.stats_db: - self.stats_db.stop() - self.proxy_thread.join() if self.playback_proxy is not None: self.playback_proxy_thread.join() diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 
8931db2..0b52ffb 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -37,10 +37,10 @@ import collections urllib3.disable_warnings() class DedupLoader(warcprox.BaseStandardPostfetchProcessor): - def __init__(self, dedup_db, inq, outq, options=warcprox.Options()): - warcprox.BaseStandardPostfetchProcessor.__init__( - self, inq, outq, options) + def __init__(self, dedup_db, options=warcprox.Options()): + warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options) self.dedup_db = dedup_db + def _process_url(self, recorded_url): decorate_with_dedup_info( self.dedup_db, recorded_url, self.options.base32) @@ -71,12 +71,11 @@ class DedupDb(object): conn.commit() conn.close() - def loader(self, inq, outq, *args, **kwargs): - return DedupLoader(self, inq, outq, self.options) + def loader(self, *args, **kwargs): + return DedupLoader(self, self.options) - def storer(self, inq, outq, *args, **kwargs): - return warcprox.ListenerPostfetchProcessor( - self, inq, outq, self.options) + def storer(self, *args, **kwargs): + return warcprox.ListenerPostfetchProcessor(self, self.options) def save(self, digest_key, response_record, bucket=""): record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') @@ -262,8 +261,8 @@ class CdxServerDedup(DedupDb): pass class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): - def __init__(self, trough_dedup_db, inq, outq, options=warcprox.Options()): - warcprox.BaseBatchPostfetchProcessor.__init__(self, inq, outq, options) + def __init__(self, trough_dedup_db, options=warcprox.Options()): + warcprox.BaseBatchPostfetchProcessor.__init__(self, options) self.trough_dedup_db = trough_dedup_db def _filter_and_bucketize(self, batch): @@ -324,8 +323,8 @@ class TroughDedupDb(DedupDb): self._trough_cli = warcprox.trough.TroughClient( options.rethinkdb_trough_db_url, promotion_interval=60*60) - def loader(self, inq, outq, options=warcprox.Options()): - return BatchTroughLoader(self, inq, outq, options) + def loader(self, options=warcprox.Options()): + return BatchTroughLoader(self, options) def start(self): self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL) diff --git a/warcprox/stats.py b/warcprox/stats.py index 6047443..db2493c 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -53,45 +53,88 @@ def _empty_bucket(bucket): }, } -class StatsDb: - logger = logging.getLogger("warcprox.stats.StatsDb") +class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): + logger = logging.getLogger("warcprox.stats.StatsProcessor") - def __init__(self, file='./warcprox.sqlite', options=warcprox.Options()): - self.file = file - self.options = options - self._lock = threading.RLock() + def _startup(self): + if os.path.exists(self.options.stats_db_file): + self.logger.info( + 'opening existing stats database %s', + self.options.stats_db_file) + else: + self.logger.info( + 'creating new stats database %s', + self.options.stats_db_file) - def start(self): - with self._lock: - if os.path.exists(self.file): - self.logger.info( - 'opening existing stats database %s', self.file) - else: - self.logger.info( - 'creating new stats database %s', self.file) + conn = sqlite3.connect(self.options.stats_db_file) + conn.execute( + 'create table if not exists buckets_of_stats (' + ' bucket varchar(300) primary key,' + ' stats varchar(4000)' + ');') + conn.commit() + conn.close() - conn = sqlite3.connect(self.file) + self.logger.info( + 'created table buckets_of_stats in %s', + self.options.stats_db_file) + + def _process_batch(self, batch): + 
batch_buckets = self._tally_batch(batch) + self._update_db(batch_buckets) + logging.trace('updated stats from batch of %s', len(batch)) + + def _update_db(self, batch_buckets): + conn = sqlite3.connect(self.options.stats_db_file) + for bucket in batch_buckets: + bucket_stats = batch_buckets[bucket] + + cursor = conn.execute( + 'select stats from buckets_of_stats where bucket=?', + (bucket,)) + result_tuple = cursor.fetchone() + cursor.close() + + if result_tuple: + old_bucket_stats = json.loads(result_tuple[0]) + + bucket_stats['total']['urls'] += old_bucket_stats['total']['urls'] + bucket_stats['total']['wire_bytes'] += old_bucket_stats['total']['wire_bytes'] + bucket_stats['revisit']['urls'] += old_bucket_stats['revisit']['urls'] + bucket_stats['revisit']['wire_bytes'] += old_bucket_stats['revisit']['wire_bytes'] + bucket_stats['new']['urls'] += old_bucket_stats['new']['urls'] + bucket_stats['new']['wire_bytes'] += old_bucket_stats['new']['wire_bytes'] + + json_value = json.dumps(bucket_stats, separators=(',',':')) conn.execute( - 'create table if not exists buckets_of_stats (' - ' bucket varchar(300) primary key,' - ' stats varchar(4000)' - ');') + 'insert or replace into buckets_of_stats ' + '(bucket, stats) values (?, ?)', (bucket, json_value)) conn.commit() - conn.close() + conn.close() - self.logger.info('created table buckets_of_stats in %s', self.file) + def _tally_batch(self, batch): + batch_buckets = {} + for recorded_url in batch: + for bucket in self.buckets(recorded_url): + bucket_stats = batch_buckets.get(bucket) + if not bucket_stats: + bucket_stats = _empty_bucket(bucket) + batch_buckets[bucket] = bucket_stats - def stop(self): - pass + bucket_stats["total"]["urls"] += 1 + bucket_stats["total"]["wire_bytes"] += recorded_url.size - def close(self): - pass - - def sync(self): - pass + if recorded_url.warc_records: + if recorded_url.warc_records[0].type == b'revisit': + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += recorded_url.size + return batch_buckets def value(self, bucket0="__all__", bucket1=None, bucket2=None): - conn = sqlite3.connect(self.file) + conn = sqlite3.connect(self.options.stats_db_file) cursor = conn.execute( 'select stats from buckets_of_stats where bucket = ?', (bucket0,)) @@ -109,9 +152,6 @@ class StatsDb: else: return None - def notify(self, recorded_url, records): - self.tally(recorded_url, records) - def buckets(self, recorded_url): ''' Unravels bucket definitions in Warcprox-Meta header. 
Each bucket @@ -154,117 +194,20 @@ class StatsDb: return buckets - def tally(self, recorded_url, records): - with self._lock: - conn = sqlite3.connect(self.file) - - for bucket in self.buckets(recorded_url): - cursor = conn.execute( - 'select stats from buckets_of_stats where bucket=?', - (bucket,)) - - result_tuple = cursor.fetchone() - cursor.close() - if result_tuple: - bucket_stats = json.loads(result_tuple[0]) - else: - bucket_stats = _empty_bucket(bucket) - - bucket_stats["total"]["urls"] += 1 - bucket_stats["total"]["wire_bytes"] += recorded_url.size - - if records: - if records[0].type == b'revisit': - bucket_stats["revisit"]["urls"] += 1 - bucket_stats["revisit"]["wire_bytes"] += recorded_url.size - else: - bucket_stats["new"]["urls"] += 1 - bucket_stats["new"]["wire_bytes"] += recorded_url.size - - json_value = json.dumps(bucket_stats, separators=(',',':')) - conn.execute( - 'insert or replace into buckets_of_stats ' - '(bucket, stats) values (?, ?)', (bucket, json_value)) - conn.commit() - - conn.close() - -class RethinkStatsDb(StatsDb): - """Updates database in batch every 2.0 seconds""" - logger = logging.getLogger("warcprox.stats.RethinkStatsDb") +class RethinkStatsProcessor(StatsProcessor): + logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor") def __init__(self, options=warcprox.Options()): + StatsProcessor.__init__(self, options) + parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url) self.rr = doublethink.Rethinker( servers=parsed.hosts, db=parsed.database) self.table = parsed.table self.replicas = min(3, len(self.rr.servers)) + + def _startup(self): self._ensure_db_table() - self.options = options - - self._stop = threading.Event() - self._batch_lock = threading.RLock() - with self._batch_lock: - self._batch = {} - self._timer = None - - def start(self): - """Starts batch update repeating timer.""" - self._update_batch() # starts repeating timer - - def _bucket_batch_update_reql(self, bucket, batch): - return self.rr.table(self.table).get(bucket).replace( - lambda old: r.branch( - old.eq(None), batch[bucket], old.merge({ - "total": { - "urls": old["total"]["urls"].add( - batch[bucket]["total"]["urls"]), - "wire_bytes": old["total"]["wire_bytes"].add( - batch[bucket]["total"]["wire_bytes"]), - }, - "new": { - "urls": old["new"]["urls"].add( - batch[bucket]["new"]["urls"]), - "wire_bytes": old["new"]["wire_bytes"].add( - batch[bucket]["new"]["wire_bytes"]), - }, - "revisit": { - "urls": old["revisit"]["urls"].add( - batch[bucket]["revisit"]["urls"]), - "wire_bytes": old["revisit"]["wire_bytes"].add( - batch[bucket]["revisit"]["wire_bytes"]), - }, - }))) - - def _update_batch(self): - with self._batch_lock: - batch_copy = copy.deepcopy(self._batch) - self._batch = {} - try: - if len(batch_copy) > 0: - # XXX can all the buckets be done in one query? 
- for bucket in batch_copy: - result = self._bucket_batch_update_reql( - bucket, batch_copy).run() - if (not result["inserted"] and not result["replaced"] - or sorted(result.values()) != [0,0,0,0,0,1]): - raise Exception( - "unexpected result %s updating stats %s" % ( - result, batch_copy[bucket])) - except Exception as e: - self.logger.error("problem updating stats", exc_info=True) - # now we need to restore the stats that didn't get saved to the - # batch so that they are saved in the next call to _update_batch() - with self._batch_lock: - self._add_to_batch(batch_copy) - finally: - if not self._stop.is_set(): - self._timer = threading.Timer(2.0, self._update_batch) - self._timer.name = "RethinkStats-batch-update-timer-%s" % ( - datetime.datetime.utcnow().isoformat()) - self._timer.start() - else: - self.logger.info("finished") def _ensure_db_table(self): dbs = self.rr.db_list().run() @@ -282,17 +225,38 @@ class RethinkStatsDb(StatsDb): self.table, primary_key="bucket", shards=1, replicas=self.replicas).run() - def close(self): - self.stop() + def _update_db(self, batch_buckets): + # XXX can all the buckets be done in one query? + for bucket in batch_buckets: + result = self._bucket_batch_update_reql( + bucket, batch_buckets[bucket]).run() + if (not result['inserted'] and not result['replaced'] + or sorted(result.values()) != [0,0,0,0,0,1]): + self.logger.error( + 'unexpected result %s updating stats %s' % ( + result, batch_buckets[bucket])) - def stop(self): - self.logger.info("stopping rethinkdb stats table batch updates") - self._stop.set() - if self._timer: - self._timer.join() - - def sync(self): - pass + def _bucket_batch_update_reql(self, bucket, new): + return self.rr.table(self.table).get(bucket).replace( + lambda old: r.branch( + old.eq(None), new, old.merge({ + 'total': { + 'urls': old['total']['urls'].add(new['total']['urls']), + 'wire_bytes': old['total']['wire_bytes'].add( + new['total']['wire_bytes']), + }, + 'new': { + 'urls': old['new']['urls'].add(new['new']['urls']), + 'wire_bytes': old['new']['wire_bytes'].add( + new['new']['wire_bytes']), + }, + 'revisit': { + 'urls': old['revisit']['urls'].add( + new['revisit']['urls']), + 'wire_bytes': old['revisit']['wire_bytes'].add( + new['revisit']['wire_bytes']), + }, + }))) def value(self, bucket0="__all__", bucket1=None, bucket2=None): bucket0_stats = self.rr.table(self.table).get(bucket0).run() @@ -307,39 +271,6 @@ class RethinkStatsDb(StatsDb): return bucket0_stats[bucket1] return bucket0_stats - def tally(self, recorded_url, records): - buckets = self.buckets(recorded_url) - with self._batch_lock: - for bucket in buckets: - bucket_stats = self._batch.setdefault( - bucket, _empty_bucket(bucket)) - - bucket_stats["total"]["urls"] += 1 - bucket_stats["total"]["wire_bytes"] += recorded_url.size - - if records: - if records[0].type == b'revisit': - bucket_stats["revisit"]["urls"] += 1 - bucket_stats["revisit"]["wire_bytes"] += recorded_url.size - else: - bucket_stats["new"]["urls"] += 1 - bucket_stats["new"]["wire_bytes"] += recorded_url.size - - def _add_to_batch(self, add_me): - with self._batch_lock: - for bucket in add_me: - bucket_stats = self._batch.setdefault( - bucket, _empty_bucket(bucket)) - bucket_stats["total"]["urls"] += add_me[bucket]["total"]["urls"] - bucket_stats["total"]["wire_bytes"] += add_me[bucket]["total"]["wire_bytes"] - bucket_stats["revisit"]["urls"] += add_me[bucket]["revisit"]["urls"] - bucket_stats["revisit"]["wire_bytes"] += add_me[bucket]["revisit"]["wire_bytes"] - bucket_stats["new"]["urls"] += 
add_me[bucket]["new"]["urls"] - bucket_stats["new"]["wire_bytes"] += add_me[bucket]["new"]["wire_bytes"] - - def notify(self, recorded_url, records): - self.tally(recorded_url, records) - class RunningStats: ''' In-memory stats for measuring overall warcprox performance. diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 5747428..f823cc6 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -36,9 +36,8 @@ class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor): _ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'} - def __init__(self, inq, outq, options=warcprox.Options()): - warcprox.BaseStandardPostfetchProcessor.__init__( - self, inq, outq, options=options) + def __init__(self, options=warcprox.Options()): + warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options) self.options = options self.writer_pool = warcprox.writer.WarcWriterPool(options) self.method_filter = set(method.upper() for method in self.options.method_filter or []) From a974ec86fa6d37c0ae2db15be3273a3154555fcd Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Wed, 17 Jan 2018 15:33:41 -0800 Subject: [PATCH 13/19] fixes to make tests pass --- tests/test_writer.py | 33 ++++++++++++++++----------------- warcprox/main.py | 5 +++-- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 9c37424..0a18c33 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -93,16 +93,15 @@ def test_special_dont_write_prefix(): logging.debug('cd %s', tmpdir) os.chdir(tmpdir) - inq = warcprox.TimestampedQueue(maxsize=1) - outq = warcprox.TimestampedQueue(maxsize=1) - wwt = warcprox.writerthread.WarcWriterThread( - inq, outq, Options(prefix='-')) + wwt = warcprox.writerthread.WarcWriterThread(Options(prefix='-')) + wwt.inq = warcprox.TimestampedQueue(maxsize=1) + wwt.outq = warcprox.TimestampedQueue(maxsize=1) try: wwt.start() # not to be written due to default prefix recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder.read() - inq.put(RecordedUrl( + wwt.inq.put(RecordedUrl( url='http://example.com/no', content_type='text/plain', status=200, client_ip='127.0.0.2', request_data=b'abc', response_recorder=recorder, remote_ip='127.0.0.3', @@ -111,31 +110,31 @@ def test_special_dont_write_prefix(): # to be written due to warcprox-meta prefix recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder.read() - inq.put(RecordedUrl( + wwt.inq.put(RecordedUrl( url='http://example.com/yes', content_type='text/plain', status=200, client_ip='127.0.0.2', request_data=b'abc', response_recorder=recorder, remote_ip='127.0.0.3', timestamp=datetime.utcnow(), payload_digest=recorder.block_digest, warcprox_meta={'warc-prefix': 'normal-warc-prefix'})) - recorded_url = outq.get(timeout=10) + recorded_url = wwt.outq.get(timeout=10) assert not recorded_url.warc_records - recorded_url = outq.get(timeout=10) + recorded_url = wwt.outq.get(timeout=10) assert recorded_url.warc_records - assert outq.empty() + assert wwt.outq.empty() finally: wwt.stop.set() wwt.join() - inq = warcprox.TimestampedQueue(maxsize=1) - outq = warcprox.TimestampedQueue(maxsize=1) - wwt = warcprox.writerthread.WarcWriterThread(inq, outq) + wwt = warcprox.writerthread.WarcWriterThread() + wwt.inq = warcprox.TimestampedQueue(maxsize=1) + wwt.outq = warcprox.TimestampedQueue(maxsize=1) try: wwt.start() # to be written due to default prefix recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder.read() - 
inq.put(RecordedUrl( + wwt.inq.put(RecordedUrl( url='http://example.com/yes', content_type='text/plain', status=200, client_ip='127.0.0.2', request_data=b'abc', response_recorder=recorder, remote_ip='127.0.0.3', @@ -144,18 +143,18 @@ def test_special_dont_write_prefix(): # not to be written due to warcprox-meta prefix recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder.read() - inq.put(RecordedUrl( + wwt.inq.put(RecordedUrl( url='http://example.com/no', content_type='text/plain', status=200, client_ip='127.0.0.2', request_data=b'abc', response_recorder=recorder, remote_ip='127.0.0.3', timestamp=datetime.utcnow(), payload_digest=recorder.block_digest, warcprox_meta={'warc-prefix': '-'})) - recorded_url = outq.get(timeout=10) + recorded_url = wwt.outq.get(timeout=10) assert recorded_url.warc_records - recorded_url = outq.get(timeout=10) + recorded_url = wwt.outq.get(timeout=10) assert not recorded_url.warc_records - assert outq.empty() + assert wwt.outq.empty() finally: wwt.stop.set() wwt.join() diff --git a/warcprox/main.py b/warcprox/main.py index 860e816..beeed35 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -311,7 +311,8 @@ def ensure_rethinkdb_tables(argv=None): svcreg = doublethink.ServiceRegistry(rr, table=parsed.table) did_something = True if args.rethinkdb_stats_url: - stats_db = warcprox.stats.RethinkStatsDb(options=options) + stats_db = warcprox.stats.RethinkStatsProcessor(options=options) + stats_db._ensure_db_table() did_something = True if args.rethinkdb_dedup_url: dedup_db = warcprox.dedup.RethinkDedupDb(options=options) @@ -322,7 +323,7 @@ def ensure_rethinkdb_tables(argv=None): if args.rethinkdb_trough_db_url: dedup_db = warcprox.dedup.TroughDedupDb(options) logging.warn( - 'trough it responsible for creating most of the rethinkdb ' + 'trough is responsible for creating most of the rethinkdb ' 'tables that it uses') did_something = True From c933cb3119096fdbceb794976f2b394b8f489220 Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Wed, 17 Jan 2018 16:49:28 -0800 Subject: [PATCH 14/19] batch storing for trough dedup --- warcprox/__init__.py | 2 ++ warcprox/controller.py | 14 ++++++---- warcprox/dedup.py | 59 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 66 insertions(+), 9 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 2cd62cd..1934895 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -106,6 +106,7 @@ class BasePostfetchProcessor(threading.Thread): # these should be set before thread is started self.inq = None self.outq = None + self.profiler = None def run(self): if self.options.profile: @@ -129,6 +130,7 @@ class BasePostfetchProcessor(threading.Thread): raise Exception('not implemented') def _run(self): + logging.info('%s starting up', self) self._startup() while not self.stop.is_set(): try: diff --git a/warcprox/controller.py b/warcprox/controller.py index 9902cb5..4a0c09a 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -167,20 +167,22 @@ class WarcproxController(object): if self.playback_proxy: self._postfetch_chain.append( warcprox.ListenerPostfetchProcessor( - self.playback_proxy.playback_index_db)) + self.playback_proxy.playback_index_db, self.options)) crawl_logger = Factory.crawl_logger(self.options) if crawl_logger: self._postfetch_chain.append( - warcprox.ListenerPostfetchProcessor(crawl_logger)) + warcprox.ListenerPostfetchProcessor( + crawl_logger, self.options)) self._postfetch_chain.append( - 
warcprox.ListenerPostfetchProcessor(self.proxy.running_stats)) + warcprox.ListenerPostfetchProcessor( + self.proxy.running_stats, self.options)) for qualname in self.options.plugins or []: plugin = Factory.plugin(qualname) self._postfetch_chain.append( - warcprox.ListenerPostfetchProcessor(plugin)) + warcprox.ListenerPostfetchProcessor(plugin, self.options)) # chain them all up self._postfetch_chain[0].inq = inq @@ -285,7 +287,6 @@ class WarcproxController(object): for processor in self._postfetch_chain: processor.start() - logging.info('started postfetch processor %r', processor) def shutdown(self): with self._start_stop_lock: @@ -390,6 +391,9 @@ class WarcproxController(object): # postfetch processors for processor in self._postfetch_chain: + if not processor.profiler: + self.logger.notice('%s has no profiling data', processor) + continue file = os.path.join(tmpdir, '%s.dat' % processor.ident) processor.profiler.dump_stats(file) buf = io.StringIO() diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 0b52ffb..950c110 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -260,11 +260,42 @@ class CdxServerDedup(DedupDb): """ pass +class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor): + def __init__(self, trough_dedup_db, options=warcprox.Options()): + warcprox.BaseBatchPostfetchProcessor.__init__(self, options) + self.trough_dedup_db = trough_dedup_db + + def _filter_and_bucketize(self, batch): + ''' + Returns `{bucket: [recorded_url, ...]}`, excluding urls that should + have dedup info stored. + ''' + buckets = collections.defaultdict(list) + for recorded_url in batch: + if (recorded_url.warc_records + and recorded_url.warc_records[0].type == b'response' + and recorded_url.response_recorder.payload_size() > 0): + if (recorded_url.warcprox_meta + and 'captures-bucket' in recorded_url.warcprox_meta): + bucket = recorded_url.warcprox_meta['captures-bucket'] + else: + bucket = '__unspecified__' + buckets[bucket].append(recorded_url) + return buckets + + def _process_batch(self, batch): + buckets = self._filter_and_bucketize(batch) + for bucket in buckets: + self.trough_dedup_db.batch_save(buckets[bucket], bucket) + class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): def __init__(self, trough_dedup_db, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) self.trough_dedup_db = trough_dedup_db + def _startup(self): + self.trough_dedup_db.start() + def _filter_and_bucketize(self, batch): ''' Returns `{bucket: [recorded_url, ...]}`, excluding urls that should not @@ -275,7 +306,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): if (recorded_url.response_recorder and recorded_url.payload_digest and recorded_url.response_recorder.payload_size() > 0): - if recorded_url.warcprox_meta and 'captures-bucket' in recorded_url.warcprox_meta: + if (recorded_url.warcprox_meta + and 'captures-bucket' in recorded_url.warcprox_meta): bucket = recorded_url.warcprox_meta['captures-bucket'] else: bucket = '__unspecified__' @@ -315,7 +347,8 @@ class TroughDedupDb(DedupDb): ' url varchar(2100) not null,\n' ' date datetime not null,\n' ' id varchar(100));\n') # warc record id - WRITE_SQL_TMPL = ('insert into dedup (digest_key, url, date, id) ' + WRITE_SQL_TMPL = ('insert or ignore into dedup\n' + '(digest_key, url, date, id)\n' 'values (%s, %s, %s, %s);') def __init__(self, options=warcprox.Options()): @@ -323,8 +356,11 @@ class TroughDedupDb(DedupDb): self._trough_cli = warcprox.trough.TroughClient( options.rethinkdb_trough_db_url, 
promotion_interval=60*60)
 
-    def loader(self, options=warcprox.Options()):
-        return BatchTroughLoader(self, options)
+    def loader(self, *args, **kwargs):
+        return BatchTroughLoader(self, self.options)
+
+    def storer(self, *args, **kwargs):
+        return BatchTroughStorer(self, self.options)
 
     def start(self):
         self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL)
@@ -337,6 +373,21 @@ class TroughDedupDb(DedupDb):
                 bucket, self.WRITE_SQL_TMPL,
                 (digest_key, url, warc_date, record_id), self.SCHEMA_ID)
 
+    def batch_save(self, batch, bucket='__unspecified__'):
+        sql_tmpl = ('insert or ignore into dedup\n'
+                '(digest_key, url, date, id)\n'
+                'values %s;' % ','.join(
+                    '(%s,%s,%s,%s)' for i in range(len(batch))))
+        values = []
+        for recorded_url in batch:
+            values.extend([
+                warcprox.digest_str(
+                    recorded_url.payload_digest, self.options.base32),
+                recorded_url.url,
+                recorded_url.warc_records[0].date,
+                recorded_url.warc_records[0].id,])
+        self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
+
     def lookup(self, digest_key, bucket='__unspecified__', url=None):
         results = self._trough_cli.read(
                 bucket, 'select * from dedup where digest_key=%s;',

From 93e2baab8f085746cc2ca2ee5dcf669a55cad652 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Thu, 18 Jan 2018 11:08:10 -0800
Subject: [PATCH 15/19] batch for at least 2 seconds

---
 warcprox/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warcprox/__init__.py b/warcprox/__init__.py
index 1934895..39b31ea 100644
--- a/warcprox/__init__.py
+++ b/warcprox/__init__.py
@@ -175,7 +175,7 @@ class BaseBatchPostfetchProcessor(BasePostfetchProcessor):
     MAX_BATCH_SIZE = 500
     MAX_BATCH_SEC = 10
-    MIN_BATCH_SEC = 0.5
+    MIN_BATCH_SEC = 2.0

From bed04af440c159da1a8f8ee4cdbd406f38e6678a Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Thu, 18 Jan 2018 11:12:52 -0800
Subject: [PATCH 16/19] postfetch chain info for /status and service reg
 including number of queued urls for each processor

---
 tests/test_warcprox.py |  7 +++++--
 warcprox/controller.py | 23 ++++++++++++++++++++++-
 warcprox/warcproxy.py  | 18 +++++++++++++-----
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index 0c250e7..5e13f95 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -347,6 +347,9 @@ def warcprox_(request):
     logging.info('changing to working directory %r', work_dir)
     os.chdir(work_dir)
 
+    # we can't wait around all day in the tests
+    warcprox.BaseBatchPostfetchProcessor.MAX_BATCH_SEC = 0.5
+
     argv = ['warcprox',
             '--method-filter=GET',
             '--method-filter=POST',
@@ -1319,7 +1322,7 @@ def test_status_api(warcprox_):
             'queued_urls', 'queue_max_size', 'seconds_behind', 'threads',
             'rates_5min', 'rates_1min', 'unaccepted_requests', 'rates_15min',
             'active_requests','start_time','urls_processed',
-            'warc_bytes_written'}
+            'warc_bytes_written','postfetch_chain',}
     assert status['role'] == 'warcprox'
     assert status['version'] == warcprox.__version__
     assert status['port'] == warcprox_.proxy.server_port
@@ -1341,7 +1344,7 @@ def test_svcreg_status(warcprox_):
             'first_heartbeat', 'ttl', 'last_heartbeat', 'threads',
             'rates_5min', 'rates_1min', 'unaccepted_requests',
             'rates_15min', 'active_requests','start_time','urls_processed',
-            'warc_bytes_written',}
+            'warc_bytes_written','postfetch_chain',}
     assert status['role'] == 'warcprox'
     assert status['version'] == warcprox.__version__
     assert status['port'] == warcprox_.proxy.server_port
diff --git a/warcprox/controller.py b/warcprox/controller.py
index 4a0c09a..2d27958 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -132,7 +132,8 @@ class WarcproxController(object):
 
         self.stats_processor = Factory.stats_processor(self.options)
 
-        self.proxy = warcprox.warcproxy.WarcProxy(self.stats_processor, options)
+        self.proxy = warcprox.warcproxy.WarcProxy(
+                self.stats_processor, self.postfetch_status, options)
         self.playback_proxy = Factory.playback_proxy(
                 self.proxy.ca, self.options)
 
@@ -140,7 +141,27 @@ class WarcproxController(object):
 
         self.service_registry = Factory.service_registry(options)
 
+    def postfetch_status(self):
+        result = {'postfetch_chain': []}
+        for processor in self._postfetch_chain:
+            if processor.__class__ == warcprox.ListenerPostfetchProcessor:
+                name = processor.listener.__class__.__name__
+            else:
+                name = processor.__class__.__name__
+
+            queued = len(processor.inq.queue)
+            if hasattr(processor, 'batch'):
+                queued += len(processor.batch)
+
+            result['postfetch_chain'].append({
+                'processor': name,
+                'queued_urls': len(processor.inq.queue)})
+        return result
+
     def chain(self, processor0, processor1):
+        '''
+        Sets `processor0.outq` = `processor1.inq` = `queue.Queue()`
+        '''
         assert not processor0.outq
         assert not processor1.inq
         q = warcprox.TimestampedQueue(maxsize=self.options.queue_size)
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index 7bccba7..8850821 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -377,7 +377,11 @@ class RecordedUrl:
 class SingleThreadedWarcProxy(http_server.HTTPServer, object):
     logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
 
-    def __init__(self, stats_db=None, options=warcprox.Options()):
+    def __init__(
+            self, stats_db=None, status_callback=None,
+            options=warcprox.Options()):
+        self.status_callback = status_callback
+        self.stats_db = stats_db
         self.options = options
 
         server_address = (
@@ -406,8 +410,6 @@
         self.recorded_url_q = warcprox.TimestampedQueue(
                 maxsize=options.queue_size or 1000)
 
-        self.stats_db = stats_db
-
         self.running_stats = warcprox.stats.RunningStats()
 
     def status(self):
@@ -443,14 +445,20 @@
             'urls_per_sec': urls_per_sec,
             'warc_bytes_per_sec': warc_bytes_per_sec,
         }
+        # gets postfetch chain status from the controller
+        if self.status_callback:
+            result.update(self.status_callback())
         return result
 
 class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
     logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
 
-    def __init__(self, stats_db=None, options=warcprox.Options()):
+    def __init__(
+            self, stats_db=None, status_callback=None,
+            options=warcprox.Options()):
         warcprox.mitmproxy.PooledMitmProxy.__init__(self, options)
-        SingleThreadedWarcProxy.__init__(self, stats_db, options)
+        SingleThreadedWarcProxy.__init__(
+                self, stats_db, status_callback, options)
 
     def server_activate(self):
         http_server.HTTPServer.server_activate(self)

From 87cdd855d4e4939564cd27665ba1e4487ebcab68 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Thu, 18 Jan 2018 11:28:23 -0800
Subject: [PATCH 17/19] fix import to fix plugins

---
 warcprox/controller.py | 1 +
 warcprox/main.py       | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/warcprox/controller.py b/warcprox/controller.py
index 2d27958..0ccb1d7 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -34,6 +34,7 @@ import warcprox
 import certauth
 import functools
 import doublethink
+import importlib
 
 class Factory:
     @staticmethod
diff --git a/warcprox/main.py b/warcprox/main.py
index beeed35..59e4b4a 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -42,7 +42,6 @@ import certauth.certauth
 import warcprox
 import doublethink
 import cryptography.hazmat.backends.openssl
-import importlib
 
 class BetterArgumentDefaultsHelpFormatter(
                 argparse.ArgumentDefaultsHelpFormatter,

From 6cc6cf4f28bb8f0f550257158ccc6c83e86b66e4 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Thu, 18 Jan 2018 11:38:24 -0800
Subject: [PATCH 18/19] fix plugin loading and add a rudimentary test case

---
 tests/test_warcprox.py | 17 +++++++++++++++++
 warcprox/controller.py |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index 5e13f95..6ab90f6 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -1395,6 +1395,23 @@ def test_controller_with_defaults():
     assert not wwt.writer_pool.default_warc_writer.record_builder.base32
     assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
 
+def test_load_plugin():
+    options = warcprox.Options()
+    options.plugins = ['warcprox.stats.RunningStats']
+    controller = warcprox.controller.WarcproxController(options)
+    assert isinstance(
+            controller._postfetch_chain[-1],
+            warcprox.ListenerPostfetchProcessor)
+    assert isinstance(
+            controller._postfetch_chain[-1].listener,
+            warcprox.stats.RunningStats)
+    assert isinstance(
+            controller._postfetch_chain[-2],
+            warcprox.ListenerPostfetchProcessor)
+    assert isinstance(
+            controller._postfetch_chain[-2].listener,
+            warcprox.stats.RunningStats)
+
 def test_choose_a_port_for_me(warcprox_):
     options = warcprox.Options()
     options.port = 0
diff --git a/warcprox/controller.py b/warcprox/controller.py
index 0ccb1d7..dfd930b 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -97,7 +97,7 @@ class Factory:
             (module_name, class_name) = qualname.rsplit('.', 1)
             module_ = importlib.import_module(module_name)
             class_ = getattr(module_, class_name)
-            listener = class_()
+            plugin = class_()
             plugin.notify # make sure it has this method
             return plugin
         except Exception as e:

From d590dee59a227394b7f092dfdd24c7ad94e19625 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Thu, 18 Jan 2018 12:00:27 -0800
Subject: [PATCH 19/19] fix port conflict test failure on travis-ci

---
 tests/test_warcprox.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index 6ab90f6..d091542 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -1396,8 +1396,7 @@ def test_controller_with_defaults():
     assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
 
 def test_load_plugin():
-    options = warcprox.Options()
-    options.plugins = ['warcprox.stats.RunningStats']
+    options = warcprox.Options(port=0, plugins=['warcprox.stats.RunningStats'])
     controller = warcprox.controller.WarcproxController(options)
     assert isinstance(
             controller._postfetch_chain[-1],