add --method-filter option

This commit is contained in:
Alex Osborne 2016-11-15 23:24:59 +11:00
parent 41bd6c72af
commit 90031a2058
2 changed files with 12 additions and 5 deletions

View File

@ -77,6 +77,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos))) default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos)))
arg_parser.add_argument('--base32', dest='base32', action='store_true', arg_parser.add_argument('--base32', dest='base32', action='store_true',
default=False, help='write digests in Base32 instead of hex') default=False, help='write digests in Base32 instead of hex')
arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD',
action='append', help='only record requests with the given http method(s) (can be used more than once)')
arg_parser.add_argument('--stats-db-file', dest='stats_db_file', arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking') default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
arg_parser.add_argument('-P', '--playback-port', dest='playback_port', arg_parser.add_argument('-P', '--playback-port', dest='playback_port',

View File

@ -56,6 +56,7 @@ class WarcWriterThread(threading.Thread):
self.listeners = listeners self.listeners = listeners
self.options = options self.options = options
self.idle = None self.idle = None
self.method_filter = set(method.upper() for method in self.options.method_filter or [])
def run(self): def run(self):
if self.options.profile: if self.options.profile:
@ -63,6 +64,9 @@ class WarcWriterThread(threading.Thread):
else: else:
self._run() self._run()
def _filter_accepts(self, recorded_url):
return not self.method_filter or recorded_url.method.upper() in self.method_filter
def _run(self): def _run(self):
while not self.stop.is_set(): while not self.stop.is_set():
try: try:
@ -76,11 +80,12 @@ class WarcWriterThread(threading.Thread):
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
self.idle = None self.idle = None
if self.dedup_db: if self._filter_accepts(recorded_url):
warcprox.dedup.decorate_with_dedup_info(self.dedup_db, if self.dedup_db:
recorded_url, base32=self.options.base32) warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
records = self.writer_pool.write_records(recorded_url) recorded_url, base32=self.options.base32)
self._final_tasks(recorded_url, records) records = self.writer_pool.write_records(recorded_url)
self._final_tasks(recorded_url, records)
# try to release resources in a timely fashion # try to release resources in a timely fashion
if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: