tests are passing

This commit is contained in:
Noah Levitt 2018-01-15 14:37:27 -08:00
parent bd25991a0d
commit c9a39958db
11 changed files with 169 additions and 166 deletions

View File

@ -357,7 +357,9 @@ def warcprox_(request):
argv.append('--rethinkdb-trough-db-url=%s' % request.config.getoption('--rethinkdb-trough-db-url')) argv.append('--rethinkdb-trough-db-url=%s' % request.config.getoption('--rethinkdb-trough-db-url'))
args = warcprox.main.parse_args(argv) args = warcprox.main.parse_args(argv)
warcprox_ = warcprox.main.init_controller(args)
options = warcprox.Options(**vars(args))
warcprox_ = warcprox.controller.WarcproxController(options)
logging.info('starting warcprox') logging.info('starting warcprox')
warcprox_thread = threading.Thread( warcprox_thread = threading.Thread(
@ -490,8 +492,8 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies)
assert response.content == b'404 Not in Archive\n' assert response.content == b'404 Not in Archive\n'
# check not in dedup db # check not in dedup db
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
assert dedup_lookup is None assert dedup_lookup is None
# archive # archive
@ -508,13 +510,13 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies)
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check in dedup db # check in dedup db
# {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} # {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'}
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
assert dedup_lookup assert dedup_lookup
assert dedup_lookup['url'] == url.encode('ascii') assert dedup_lookup['url'] == url.encode('ascii')
@ -535,12 +537,12 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies)
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check in dedup db (no change from prev) # check in dedup db (no change from prev)
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
assert dedup_lookup['url'] == url.encode('ascii') assert dedup_lookup['url'] == url.encode('ascii')
assert dedup_lookup['id'] == record_id assert dedup_lookup['id'] == record_id
@ -564,7 +566,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie
assert response.content == b'404 Not in Archive\n' assert response.content == b'404 Not in Archive\n'
# check not in dedup db # check not in dedup db
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
assert dedup_lookup is None assert dedup_lookup is None
@ -582,13 +584,13 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check in dedup db # check in dedup db
# {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} # {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'}
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
assert dedup_lookup assert dedup_lookup
assert dedup_lookup['url'] == url.encode('ascii') assert dedup_lookup['url'] == url.encode('ascii')
@ -609,12 +611,12 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check in dedup db (no change from prev) # check in dedup db (no change from prev)
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
assert dedup_lookup['url'] == url.encode('ascii') assert dedup_lookup['url'] == url.encode('ascii')
assert dedup_lookup['id'] == record_id assert dedup_lookup['id'] == record_id
@ -640,7 +642,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
@ -652,7 +654,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(2.5) time.sleep(2.5)
@ -693,12 +695,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check url1 in dedup db bucket_a # check url1 in dedup db bucket_a
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_a") b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_a")
assert dedup_lookup['url'] == url1.encode('ascii') assert dedup_lookup['url'] == url1.encode('ascii')
assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id']) assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id'])
@ -707,7 +709,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
dedup_date = dedup_lookup['date'] dedup_date = dedup_lookup['date']
# check url1 not in dedup db bucket_b # check url1 not in dedup db bucket_b
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b")
assert dedup_lookup is None assert dedup_lookup is None
@ -720,12 +722,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check url2 in dedup db bucket_b # check url2 in dedup db bucket_b
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b")
assert dedup_lookup['url'] == url2.encode('ascii') assert dedup_lookup['url'] == url2.encode('ascii')
assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id']) assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id'])
@ -742,7 +744,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
@ -755,15 +757,15 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# close the warc # close the warc
assert warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_dedup_buckets"] assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
writer = warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_dedup_buckets"] writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
warc_path = os.path.join(writer.directory, writer._f_finalname) warc_path = os.path.join(writer.directory, writer._f_finalname)
warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_dedup_buckets"].close_writer() warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
assert os.path.exists(warc_path) assert os.path.exists(warc_path)
# read the warc # read the warc
@ -948,7 +950,7 @@ def test_domain_doc_soft_limit(
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
@ -963,7 +965,7 @@ def test_domain_doc_soft_limit(
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0) time.sleep(2.0)
@ -990,7 +992,7 @@ def test_domain_doc_soft_limit(
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0) time.sleep(2.0)
@ -1005,7 +1007,7 @@ def test_domain_doc_soft_limit(
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0) time.sleep(2.0)
@ -1073,7 +1075,7 @@ def test_domain_data_soft_limit(
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0) time.sleep(2.0)
@ -1089,7 +1091,7 @@ def test_domain_data_soft_limit(
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0) time.sleep(2.0)
@ -1105,7 +1107,7 @@ def test_domain_data_soft_limit(
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway) # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0) time.sleep(2.0)
@ -1238,7 +1240,7 @@ def test_dedup_ok_flag(
url = 'http://localhost:{}/z/b'.format(http_daemon.server_port) url = 'http://localhost:{}/z/b'.format(http_daemon.server_port)
# check not in dedup db # check not in dedup db
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079',
bucket='test_dedup_ok_flag') bucket='test_dedup_ok_flag')
assert dedup_lookup is None assert dedup_lookup is None
@ -1253,12 +1255,12 @@ def test_dedup_ok_flag(
assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n'
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check that dedup db doesn't give us anything for this # check that dedup db doesn't give us anything for this
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079',
bucket='test_dedup_ok_flag') bucket='test_dedup_ok_flag')
assert dedup_lookup is None assert dedup_lookup is None
@ -1274,18 +1276,18 @@ def test_dedup_ok_flag(
assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n'
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check that dedup db gives us something for this # check that dedup db gives us something for this
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079', b'sha1:2d7f13181b90a256ce5e5ebfd6e9c9826ece9079',
bucket='test_dedup_ok_flag') bucket='test_dedup_ok_flag')
assert dedup_lookup assert dedup_lookup
# inspect what's in rethinkdb more closely # inspect what's in rethinkdb more closely
rethink_captures = warcprox_.warc_writer_threads[0].dedup_db.captures_db rethink_captures = warcprox_.dedup_db.captures_db
results_iter = rethink_captures.rr.table(rethink_captures.table).get_all( results_iter = rethink_captures.rr.table(rethink_captures.table).get_all(
['FV7RGGA3SCRFNTS6L275N2OJQJXM5EDZ', 'response', ['FV7RGGA3SCRFNTS6L275N2OJQJXM5EDZ', 'response',
'test_dedup_ok_flag'], index='sha1_warc_type').order_by( 'test_dedup_ok_flag'], index='sha1_warc_type').order_by(
@ -1366,26 +1368,28 @@ def test_controller_with_defaults():
assert controller.proxy.server_address == ('127.0.0.1', 8000) assert controller.proxy.server_address == ('127.0.0.1', 8000)
assert controller.proxy.server_port == 8000 assert controller.proxy.server_port == 8000
assert controller.proxy.running_stats assert controller.proxy.running_stats
for wwt in controller.warc_writer_threads: assert not controller.proxy.stats_db
assert wwt wwt = controller.warc_writer_thread
assert wwt.recorded_url_q assert wwt
assert wwt.recorded_url_q is controller.proxy.recorded_url_q assert wwt.inq
assert wwt.writer_pool assert not wwt.outq
assert wwt.writer_pool.default_warc_writer assert wwt.writer_pool
assert wwt.writer_pool.default_warc_writer.directory == './warcs' assert wwt.writer_pool.default_warc_writer
assert wwt.writer_pool.default_warc_writer.rollover_idle_time is None assert wwt.writer_pool.default_warc_writer.directory == './warcs'
assert wwt.writer_pool.default_warc_writer.rollover_size == 1000000000 assert wwt.writer_pool.default_warc_writer.rollover_idle_time is None
assert wwt.writer_pool.default_warc_writer.prefix == 'warcprox' assert wwt.writer_pool.default_warc_writer.rollover_size == 1000000000
assert wwt.writer_pool.default_warc_writer.gzip is False assert wwt.writer_pool.default_warc_writer.prefix == 'warcprox'
assert wwt.writer_pool.default_warc_writer.record_builder assert wwt.writer_pool.default_warc_writer.gzip is False
assert not wwt.writer_pool.default_warc_writer.record_builder.base32 assert wwt.writer_pool.default_warc_writer.record_builder
assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' assert not wwt.writer_pool.default_warc_writer.record_builder.base32
assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
def test_choose_a_port_for_me(warcprox_): def test_choose_a_port_for_me(warcprox_):
options = warcprox.Options() options = warcprox.Options()
options.port = 0 options.port = 0
controller = warcprox.controller.WarcproxController( if warcprox_.service_registry:
service_registry=warcprox_.service_registry, options=options) options.rethinkdb_services_url = 'rethinkdb://localhost/test0/services'
controller = warcprox.controller.WarcproxController(options)
assert controller.proxy.server_port != 0 assert controller.proxy.server_port != 0
assert controller.proxy.server_port != 8000 assert controller.proxy.server_port != 8000
assert controller.proxy.server_address == ( assert controller.proxy.server_address == (
@ -1426,7 +1430,7 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
assert response.status_code == 200 assert response.status_code == 200
assert not 'via' in playback_response assert not 'via' in playback_response
warc = warcprox_.warc_writer_threads[0].writer_pool.default_warc_writer._fpath warc = warcprox_.warc_writer_thread.writer_pool.default_warc_writer._fpath
with open(warc, 'rb') as f: with open(warc, 'rb') as f:
for record in warcio.archiveiterator.ArchiveIterator(f): for record in warcio.archiveiterator.ArchiveIterator(f):
if record.rec_headers.get_header('warc-target-uri') == url: if record.rec_headers.get_header('warc-target-uri') == url:
@ -1644,15 +1648,15 @@ def test_long_warcprox_meta(
# wait for writer thread to process # wait for writer thread to process
time.sleep(0.5) time.sleep(0.5)
while not all(wwt.idle for wwt in warcprox_.warc_writer_threads): while warcprox_.postfetch_chain_busy():
time.sleep(0.5) time.sleep(0.5)
time.sleep(0.5) time.sleep(0.5)
# check that warcprox-meta was parsed and honored ("warc-prefix" param) # check that warcprox-meta was parsed and honored ("warc-prefix" param)
assert warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"] assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"]
writer = warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"] writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"]
warc_path = os.path.join(writer.directory, writer._f_finalname) warc_path = os.path.join(writer.directory, writer._f_finalname)
warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"].close_writer() warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
assert os.path.exists(warc_path) assert os.path.exists(warc_path)
# read the warc # read the warc

View File

@ -88,27 +88,20 @@ def wait(callback, timeout):
raise Exception('timed out waiting for %s to return truthy' % callback) raise Exception('timed out waiting for %s to return truthy' % callback)
def test_special_dont_write_prefix(): def test_special_dont_write_prefix():
class NotifyMe:
def __init__(self):
self.the_list = []
def notify(self, recorded_url, records):
self.the_list.append((recorded_url, records))
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
logging.debug('cd %s', tmpdir) logging.debug('cd %s', tmpdir)
os.chdir(tmpdir) os.chdir(tmpdir)
q = warcprox.TimestampedQueue(maxsize=1) inq = warcprox.TimestampedQueue(maxsize=1)
listener = NotifyMe() outq = warcprox.TimestampedQueue(maxsize=1)
wwt = warcprox.writerthread.WarcWriterThread( wwt = warcprox.writerthread.WarcWriterThread(
recorded_url_q=q, options=Options(prefix='-'), inq, outq, Options(prefix='-'))
listeners=[listener])
try: try:
wwt.start() wwt.start()
# not to be written due to default prefix # not to be written due to default prefix
recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
recorder.read() recorder.read()
q.put(RecordedUrl( inq.put(RecordedUrl(
url='http://example.com/no', content_type='text/plain', url='http://example.com/no', content_type='text/plain',
status=200, client_ip='127.0.0.2', request_data=b'abc', status=200, client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3', response_recorder=recorder, remote_ip='127.0.0.3',
@ -117,30 +110,31 @@ def test_special_dont_write_prefix():
# to be written due to warcprox-meta prefix # to be written due to warcprox-meta prefix
recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
recorder.read() recorder.read()
q.put(RecordedUrl( inq.put(RecordedUrl(
url='http://example.com/yes', content_type='text/plain', url='http://example.com/yes', content_type='text/plain',
status=200, client_ip='127.0.0.2', request_data=b'abc', status=200, client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3', response_recorder=recorder, remote_ip='127.0.0.3',
timestamp=datetime.utcnow(), timestamp=datetime.utcnow(),
payload_digest=recorder.block_digest, payload_digest=recorder.block_digest,
warcprox_meta={'warc-prefix': 'normal-warc-prefix'})) warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))
wait(lambda: len(listener.the_list) == 2, 10.0) recorded_url = outq.get(timeout=10)
assert not listener.the_list[0][1] assert not recorded_url.warc_records
assert listener.the_list[1][1] recorded_url = outq.get(timeout=10)
assert recorded_url.warc_records
assert outq.empty()
finally: finally:
wwt.stop.set() wwt.stop.set()
wwt.join() wwt.join()
q = warcprox.TimestampedQueue(maxsize=1) inq = warcprox.TimestampedQueue(maxsize=1)
listener = NotifyMe() outq = warcprox.TimestampedQueue(maxsize=1)
wwt = warcprox.writerthread.WarcWriterThread( wwt = warcprox.writerthread.WarcWriterThread(inq, outq)
recorded_url_q=q, listeners=[listener])
try: try:
wwt.start() wwt.start()
# to be written due to default prefix # to be written due to default prefix
recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
recorder.read() recorder.read()
q.put(RecordedUrl( inq.put(RecordedUrl(
url='http://example.com/yes', content_type='text/plain', url='http://example.com/yes', content_type='text/plain',
status=200, client_ip='127.0.0.2', request_data=b'abc', status=200, client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3', response_recorder=recorder, remote_ip='127.0.0.3',
@ -149,16 +143,18 @@ def test_special_dont_write_prefix():
# not to be written due to warcprox-meta prefix # not to be written due to warcprox-meta prefix
recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
recorder.read() recorder.read()
q.put(RecordedUrl( inq.put(RecordedUrl(
url='http://example.com/no', content_type='text/plain', url='http://example.com/no', content_type='text/plain',
status=200, client_ip='127.0.0.2', request_data=b'abc', status=200, client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3', response_recorder=recorder, remote_ip='127.0.0.3',
timestamp=datetime.utcnow(), timestamp=datetime.utcnow(),
payload_digest=recorder.block_digest, payload_digest=recorder.block_digest,
warcprox_meta={'warc-prefix': '-'})) warcprox_meta={'warc-prefix': '-'}))
wait(lambda: len(listener.the_list) == 2, 10.0) recorded_url = outq.get(timeout=10)
assert listener.the_list[0][1] assert recorded_url.warc_records
assert not listener.the_list[1][1] recorded_url = outq.get(timeout=10)
assert not recorded_url.warc_records
assert outq.empty()
finally: finally:
wwt.stop.set() wwt.stop.set()
wwt.join() wwt.join()

View File

@ -100,7 +100,7 @@ class BasePostfetchProcessor(threading.Thread):
logger = logging.getLogger("warcprox.BasePostfetchProcessor") logger = logging.getLogger("warcprox.BasePostfetchProcessor")
def __init__(self, inq, outq, options=Options()): def __init__(self, inq, outq, options=Options()):
threading.Thread.__init__(self, name='???') threading.Thread.__init__(self, name=self.__class__.__name__)
self.inq = inq self.inq = inq
self.outq = outq self.outq = outq
self.options = options self.options = options
@ -120,7 +120,8 @@ class BasePostfetchProcessor(threading.Thread):
''' '''
Get url(s) from `self.inq`, process url(s), queue to `self.outq`. Get url(s) from `self.inq`, process url(s), queue to `self.outq`.
Subclasses must implement this. Subclasses must implement this. Implementations may operate on
individual urls, or on batches.
May raise queue.Empty. May raise queue.Empty.
''' '''
@ -188,16 +189,16 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
def __init__(self, listener, inq, outq, profile=False): def __init__(self, listener, inq, outq, profile=False):
BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile) BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile)
self.listener = listener self.listener = listener
self.name = listener.__class__.__name__
logging.info('self.name=%s', self.name)
def _process_url(self, recorded_url): def _process_url(self, recorded_url):
return self.listener.notify(recorded_url, recorded_url.warc_records) return self.listener.notify(recorded_url, recorded_url.warc_records)
# @classmethod def start(self):
# def wrap(cls, listener, inq, outq, profile=False): if hasattr(self.listener, 'start'):
# if listener: self.listener.start()
# return cls(listener, inq, outq, profile) BaseStandardPostfetchProcessor.start(self)
# else:
# return None
# monkey-patch log levels TRACE and NOTICE # monkey-patch log levels TRACE and NOTICE
TRACE = 5 TRACE = 5

View File

@ -215,7 +215,7 @@ class RethinkCaptures:
if self._timer: if self._timer:
self._timer.join() self._timer.join()
class RethinkCapturesDedup: class RethinkCapturesDedup(warcprox.dedup.DedupDb):
logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup")
def __init__(self, options=warcprox.Options()): def __init__(self, options=warcprox.Options()):

View File

@ -33,6 +33,7 @@ import datetime
import warcprox import warcprox
import certauth import certauth
import functools import functools
import doublethink
class Factory: class Factory:
@staticmethod @staticmethod
@ -65,22 +66,15 @@ class Factory:
options.stats_db_file, options=options) options.stats_db_file, options=options)
return stats_db return stats_db
# @staticmethod
# def certauth(options):
# ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
# ca = certauth.certauth.CertificateAuthority(
# options.cacert, args.certs_dir, ca_name=ca_name)
# return ca
@staticmethod @staticmethod
def warc_writer(inq, outq, options): def warc_writer(inq, outq, options):
return warcprox.writerthread.WarcWriterThread(inq, outq, options) return warcprox.writerthread.WarcWriterThread(inq, outq, options)
@staticmethod @staticmethod
def playback_proxy(options): def playback_proxy(ca, options):
if options.playback_port is not None: if options.playback_port is not None:
playback_index_db = warcprox.playback.PlaybackIndexDb( playback_index_db = warcprox.playback.PlaybackIndexDb(
options.playback_index_db_file, options=options) options=options)
playback_proxy = warcprox.playback.PlaybackProxy( playback_proxy = warcprox.playback.PlaybackProxy(
ca=ca, playback_index_db=playback_index_db, options=options) ca=ca, playback_index_db=playback_index_db, options=options)
else: else:
@ -136,12 +130,22 @@ class WarcproxController(object):
self.stop = threading.Event() self.stop = threading.Event()
self._start_stop_lock = threading.Lock() self._start_stop_lock = threading.Lock()
self.proxy = warcprox.warcproxy.WarcProxy(options=options) self.stats_db = Factory.stats_db(self.options)
self.proxy = warcprox.warcproxy.WarcProxy(self.stats_db, options)
self.playback_proxy = Factory.playback_proxy(
self.proxy.ca, self.options)
self.build_postfetch_chain(self.proxy.recorded_url_q) self.build_postfetch_chain(self.proxy.recorded_url_q)
self.service_registry = Factory.service_registry(options) self.service_registry = Factory.service_registry(options)
def postfetch_chain_busy(self):
for processor in self._postfetch_chain:
if processor.inq.qsize() > 0:
return True
return False
def build_postfetch_chain(self, inq): def build_postfetch_chain(self, inq):
constructors = [] constructors = []
@ -155,12 +159,10 @@ class WarcproxController(object):
if self.dedup_db: if self.dedup_db:
constructors.append(self.dedup_db.storer) constructors.append(self.dedup_db.storer)
stats_db = Factory.stats_db(self.options) if self.stats_db:
if stats_db:
constructors.append(functools.partial( constructors.append(functools.partial(
warcprox.ListenerPostfetchProcessor, stats_db)) warcprox.ListenerPostfetchProcessor, self.stats_db))
self.playback_proxy = Factory.playback_proxy(self.options)
if self.playback_proxy: if self.playback_proxy:
constructors.append(functools.partial( constructors.append(functools.partial(
warcprox.ListenerPostfetchProcessor, warcprox.ListenerPostfetchProcessor,
@ -175,7 +177,7 @@ class WarcproxController(object):
plugin = Factory.plugin(qualname) plugin = Factory.plugin(qualname)
constructors.append(functools.partial( constructors.append(functools.partial(
warcprox.ListenerPostfetchProcessor, plugin)) warcprox.ListenerPostfetchProcessor, plugin))
self._postfetch_chain = [] self._postfetch_chain = []
for i, constructor in enumerate(constructors): for i, constructor in enumerate(constructors):
if i != len(constructors) - 1: if i != len(constructors) - 1:
@ -184,6 +186,8 @@ class WarcproxController(object):
else: else:
outq = None outq = None
processor = constructor(inq, outq, self.options) processor = constructor(inq, outq, self.options)
if isinstance(processor, warcprox.writerthread.WarcWriterThread):
self.warc_writer_thread = processor # ugly
self._postfetch_chain.append(processor) self._postfetch_chain.append(processor)
inq = outq inq = outq
@ -277,6 +281,12 @@ class WarcproxController(object):
target=self.proxy.serve_forever, name='ProxyThread') target=self.proxy.serve_forever, name='ProxyThread')
self.proxy_thread.start() self.proxy_thread.start()
if self.playback_proxy:
self.playback_proxy_thread = threading.Thread(
target=self.playback_proxy.serve_forever,
name='PlaybackProxyThread')
self.playback_proxy_thread.start()
for processor in self._postfetch_chain: for processor in self._postfetch_chain:
# logging.info('starting postfetch processor %r', processor) # logging.info('starting postfetch processor %r', processor)
processor.start() processor.start()
@ -288,34 +298,29 @@ class WarcproxController(object):
self.logger.info('warcprox is not running') self.logger.info('warcprox is not running')
return return
# for wwt in self.warc_writer_threads:
# wwt.stop.set()
for processor in self._postfetch_chain: for processor in self._postfetch_chain:
processor.stop.set() processor.stop.set()
self.proxy.shutdown() self.proxy.shutdown()
self.proxy.server_close() self.proxy.server_close()
if self.playback_proxy is not None:
self.playback_proxy.shutdown()
self.playback_proxy.server_close()
if self.playback_proxy.playback_index_db is not None:
self.playback_proxy.playback_index_db.close()
for processor in self._postfetch_chain: for processor in self._postfetch_chain:
processor.join() processor.join()
# if self.playback_proxy is not None:
# self.playback_proxy.shutdown()
# self.playback_proxy.server_close()
# if self.playback_proxy.playback_index_db is not None:
# self.playback_proxy.playback_index_db.close()
# # wait for threads to finish if self.stats_db:
# for wwt in self.warc_writer_threads: self.stats_db.stop()
# wwt.join()
# if self.proxy.stats_db: self.proxy_thread.join()
# self.proxy.stats_db.stop() if self.playback_proxy is not None:
self.playback_proxy_thread.join()
# self.proxy_thread.join() if self.service_registry and hasattr(self, "status_info"):
# if self.playback_proxy is not None: self.service_registry.unregister(self.status_info["id"])
# self.playback_proxy_thread.join()
# if self.service_registry and hasattr(self, "status_info"):
# self.service_registry.unregister(self.status_info["id"])
def run_until_shutdown(self): def run_until_shutdown(self):
""" """

View File

@ -37,7 +37,8 @@ urllib3.disable_warnings()
class DedupLoader(warcprox.BaseStandardPostfetchProcessor): class DedupLoader(warcprox.BaseStandardPostfetchProcessor):
def __init__(self, dedup_db, inq, outq, base32=False, profile=False): def __init__(self, dedup_db, inq, outq, base32=False, profile=False):
warcprox.BaseStandardPostfetchProcessor.__init__(self, inq, outq, profile) warcprox.BaseStandardPostfetchProcessor.__init__(
self, inq, outq, profile)
self.dedup_db = dedup_db self.dedup_db = dedup_db
self.base32 = base32 self.base32 = base32
def _process_url(self, recorded_url): def _process_url(self, recorded_url):

View File

@ -116,9 +116,9 @@ def _build_arg_parser(prog):
arg_parser.add_argument('-P', '--playback-port', dest='playback_port', arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
type=int, default=None, help='port to listen on for instant playback') type=int, default=None, help='port to listen on for instant playback')
arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', # arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',
default='./warcprox-playback-index.db', # default='./warcprox-playback-index.db',
help='playback index database file (only used if --playback-port is specified)') # help='playback index database file (only used if --playback-port is specified)')
group = arg_parser.add_mutually_exclusive_group() group = arg_parser.add_mutually_exclusive_group()
group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication') default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')

View File

@ -562,7 +562,12 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
request_queue_size = 4096 request_queue_size = 4096
def __init__(self, max_threads, options=warcprox.Options()): def __init__(self, max_threads, options=warcprox.Options()):
PooledMixIn.__init__(self, max_threads) if options.max_threads:
self.logger.info(
"max_threads=%s set by command line option",
options.max_threads)
PooledMixIn.__init__(self, options.max_threads)
self.profilers = {} self.profilers = {}
if options.profile: if options.profile:

View File

@ -121,9 +121,6 @@ class PlaybackProxyHandler(MitmProxyHandler):
def _send_headers_and_refd_payload( def _send_headers_and_refd_payload(
self, headers, refers_to_target_uri, refers_to_date, payload_digest): self, headers, refers_to_target_uri, refers_to_date, payload_digest):
"""Parameters:
"""
location = self.server.playback_index_db.lookup_exact( location = self.server.playback_index_db.lookup_exact(
refers_to_target_uri, refers_to_date, payload_digest) refers_to_target_uri, refers_to_date, payload_digest)
self.logger.debug('loading http payload from {}'.format(location)) self.logger.debug('loading http payload from {}'.format(location))
@ -133,11 +130,13 @@ class PlaybackProxyHandler(MitmProxyHandler):
for (offset, record, errors) in fh.read_records(limit=1, offsets=True): for (offset, record, errors) in fh.read_records(limit=1, offsets=True):
pass pass
if not record:
raise Exception('failed to read record at offset %s from %s' % (offset, warcfilename))
if errors: if errors:
raise Exception('warc errors at {}:{} -- {}'.format(location['f'], offset, errors)) raise Exception('warc errors at {}:{} -- {}'.format(location['f'], offset, errors))
warc_type = record.get_header(warctools.WarcRecord.TYPE) if record.type != warctools.WarcRecord.RESPONSE:
if warc_type != warctools.WarcRecord.RESPONSE:
raise Exception('invalid attempt to retrieve http payload of "{}" record'.format(warc_type)) raise Exception('invalid attempt to retrieve http payload of "{}" record'.format(warc_type))
# find end of headers # find end of headers
@ -158,12 +157,13 @@ class PlaybackProxyHandler(MitmProxyHandler):
for (offset, record, errors) in fh.read_records(limit=1, offsets=True): for (offset, record, errors) in fh.read_records(limit=1, offsets=True):
pass pass
if not record:
raise Exception('failed to read record at offset %s from %s' % (offset, warcfilename))
if errors: if errors:
raise Exception('warc errors at {}:{} -- {}'.format(warcfilename, offset, errors)) raise Exception('warc errors at {}:{} -- {}'.format(warcfilename, offset, errors))
warc_type = record.get_header(warctools.WarcRecord.TYPE) if record.type == warctools.WarcRecord.RESPONSE:
if warc_type == warctools.WarcRecord.RESPONSE:
headers_buf = bytearray() headers_buf = bytearray()
while True: while True:
line = record.content_file.readline() line = record.content_file.readline()
@ -173,7 +173,7 @@ class PlaybackProxyHandler(MitmProxyHandler):
return self._send_response(headers_buf, record.content_file) return self._send_response(headers_buf, record.content_file)
elif warc_type == warctools.WarcRecord.REVISIT: elif record.type == warctools.WarcRecord.REVISIT:
# response consists of http headers from revisit record and # response consists of http headers from revisit record and
# payload from the referenced record # payload from the referenced record
warc_profile = record.get_header(warctools.WarcRecord.PROFILE) warc_profile = record.get_header(warctools.WarcRecord.PROFILE)

View File

@ -92,6 +92,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
self.url, rule)) self.url, rule))
def _enforce_limit(self, limit_key, limit_value, soft=False): def _enforce_limit(self, limit_key, limit_value, soft=False):
if not self.server.stats_db:
return
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
_limit_key = limit_key _limit_key = limit_key
@ -328,7 +330,7 @@ class RecordedUrl:
warcprox_meta=None, content_type=None, custom_type=None, warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None, status=None, size=None, client_ip=None, method=None,
timestamp=None, host=None, duration=None, referer=None, timestamp=None, host=None, duration=None, referer=None,
payload_digest=None): payload_digest=None, warc_records=None):
# XXX should test what happens with non-ascii url (when does # XXX should test what happens with non-ascii url (when does
# url-encoding happen?) # url-encoding happen?)
if type(url) is not bytes: if type(url) is not bytes:
@ -367,6 +369,7 @@ class RecordedUrl:
self.duration = duration self.duration = duration
self.referer = referer self.referer = referer
self.payload_digest = payload_digest self.payload_digest = payload_digest
self.warc_records = warc_records
# inherit from object so that multiple inheritance from this class works # inherit from object so that multiple inheritance from this class works
# properly in python 2 # properly in python 2
@ -374,9 +377,9 @@ class RecordedUrl:
class SingleThreadedWarcProxy(http_server.HTTPServer, object): class SingleThreadedWarcProxy(http_server.HTTPServer, object):
logger = logging.getLogger("warcprox.warcproxy.WarcProxy") logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
def __init__( def __init__(self, stats_db=None, options=warcprox.Options()):
self, ca=None, recorded_url_q=None, stats_db=None, self.options = options
options=warcprox.Options()):
server_address = ( server_address = (
options.address or 'localhost', options.address or 'localhost',
options.port if options.port is not None else 8000) options.port if options.port is not None else 8000)
@ -395,22 +398,15 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
self.digest_algorithm = options.digest_algorithm or 'sha1' self.digest_algorithm = options.digest_algorithm or 'sha1'
if ca is not None: ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
self.ca = ca self.ca = CertificateAuthority(
else: ca_file='warcprox-ca.pem', certs_dir='./warcprox-ca',
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] ca_name=ca_name)
self.ca = CertificateAuthority(ca_file='warcprox-ca.pem',
certs_dir='./warcprox-ca',
ca_name=ca_name)
if recorded_url_q is not None: self.recorded_url_q = warcprox.TimestampedQueue(
self.recorded_url_q = recorded_url_q maxsize=options.queue_size or 1000)
else:
self.recorded_url_q = warcprox.TimestampedQueue(
maxsize=options.queue_size or 1000)
self.stats_db = stats_db self.stats_db = stats_db
self.options = options
self.running_stats = warcprox.stats.RunningStats() self.running_stats = warcprox.stats.RunningStats()
@ -449,17 +445,9 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy): class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
logger = logging.getLogger("warcprox.warcproxy.WarcProxy") logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
def __init__( def __init__(self, stats_db=None, options=warcprox.Options()):
self, ca=None, recorded_url_q=None, stats_db=None, warcprox.mitmproxy.PooledMitmProxy.__init__(self, options)
running_stats=None, options=warcprox.Options()): SingleThreadedWarcProxy.__init__(self, stats_db, options)
if options.max_threads:
self.logger.info(
"max_threads=%s set by command line option",
options.max_threads)
warcprox.mitmproxy.PooledMitmProxy.__init__(
self, options.max_threads, options)
SingleThreadedWarcProxy.__init__(
self, ca, recorded_url_q, stats_db, options)
def server_activate(self): def server_activate(self):
http_server.HTTPServer.server_activate(self) http_server.HTTPServer.server_activate(self)

View File

@ -34,6 +34,8 @@ import warcprox
class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor): class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
logger = logging.getLogger("warcprox.writerthread.WarcWriterThread") logger = logging.getLogger("warcprox.writerthread.WarcWriterThread")
_ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'}
def __init__(self, inq, outq, options=warcprox.Options()): def __init__(self, inq, outq, options=warcprox.Options()):
warcprox.BaseStandardPostfetchProcessor.__init__( warcprox.BaseStandardPostfetchProcessor.__init__(
self, inq, outq, options=options) self, inq, outq, options=options)
@ -48,6 +50,7 @@ class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
self.writer_pool.maybe_idle_rollover() self.writer_pool.maybe_idle_rollover()
def _process_url(self, recorded_url): def _process_url(self, recorded_url):
records = []
if self._should_archive(recorded_url): if self._should_archive(recorded_url):
records = self.writer_pool.write_records(recorded_url) records = self.writer_pool.write_records(recorded_url)
recorded_url.warc_records = records recorded_url.warc_records = records