add --method-filter option

This commit is contained in:
Alex Osborne 2016-11-15 23:24:59 +11:00
parent 41bd6c72af
commit 90031a2058
2 changed files with 12 additions and 5 deletions

View File

@ -77,6 +77,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos)))
arg_parser.add_argument('--base32', dest='base32', action='store_true',
default=False, help='write digests in Base32 instead of hex')
arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD',
action='append', help='only record requests with the given http method(s) (can be used more than once)')
arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
arg_parser.add_argument('-P', '--playback-port', dest='playback_port',

View File

@ -56,6 +56,7 @@ class WarcWriterThread(threading.Thread):
self.listeners = listeners
self.options = options
self.idle = None
self.method_filter = set(method.upper() for method in self.options.method_filter or [])
def run(self):
if self.options.profile:
@ -63,6 +64,9 @@ class WarcWriterThread(threading.Thread):
else:
self._run()
def _filter_accepts(self, recorded_url):
return not self.method_filter or recorded_url.method.upper() in self.method_filter
def _run(self):
while not self.stop.is_set():
try:
@ -76,11 +80,12 @@ class WarcWriterThread(threading.Thread):
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
self.idle = None
if self.dedup_db:
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
recorded_url, base32=self.options.base32)
records = self.writer_pool.write_records(recorded_url)
self._final_tasks(recorded_url, records)
if self._filter_accepts(recorded_url):
if self.dedup_db:
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
recorded_url, base32=self.options.base32)
records = self.writer_pool.write_records(recorded_url)
self._final_tasks(recorded_url, records)
# try to release resources in a timely fashion
if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: