diff --git a/warcprox/main.py b/warcprox/main.py index a127016..5045298 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -77,6 +77,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos))) arg_parser.add_argument('--base32', dest='base32', action='store_true', default=False, help='write digests in Base32 instead of hex') + arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD', + action='append', help='only record requests with the given http method(s) (can be used more than once)') arg_parser.add_argument('--stats-db-file', dest='stats_db_file', default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking') arg_parser.add_argument('-P', '--playback-port', dest='playback_port', diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 002b897..92bd416 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -56,6 +56,7 @@ class WarcWriterThread(threading.Thread): self.listeners = listeners self.options = options self.idle = None + self.method_filter = set(method.upper() for method in self.options.method_filter or []) def run(self): if self.options.profile: @@ -63,6 +64,9 @@ class WarcWriterThread(threading.Thread): else: self._run() + def _filter_accepts(self, recorded_url): + return not self.method_filter or recorded_url.method.upper() in self.method_filter + def _run(self): while not self.stop.is_set(): try: @@ -76,11 +80,12 @@ class WarcWriterThread(threading.Thread): recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) self.idle = None - if self.dedup_db: - warcprox.dedup.decorate_with_dedup_info(self.dedup_db, - recorded_url, base32=self.options.base32) - records = self.writer_pool.write_records(recorded_url) - self._final_tasks(recorded_url, records) + if self._filter_accepts(recorded_url): + if self.dedup_db: + warcprox.dedup.decorate_with_dedup_info(self.dedup_db, + recorded_url, base32=self.options.base32) + records = self.writer_pool.write_records(recorded_url) + self._final_tasks(recorded_url, records) # try to release resources in a timely fashion if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: