Mirror of https://github.com/internetarchive/warcprox.git, synced 2025-01-18 13:22:09 +01:00
Merge pull request #98 from nlevitt/trough-dedup-bugs
WIP: trough dedup bug fix
commit f32d5636a1
@@ -51,6 +51,7 @@ import gzip
 import mock
 import email.message
 import socketserver
+from concurrent import futures
 
 try:
     import http.server as http_server
@@ -886,6 +887,57 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     finally:
         fh.close()
 
+def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archiving_proxies):
+    urls_before = warcprox_.proxy.running_stats.urls
+    revisits_before = warcprox_.proxy.stats_db.value(
+            '__all__', 'revisit', 'urls') or 0
+
+    # fire off 20 initial requests simultaneously-ish
+    with futures.ThreadPoolExecutor(max_workers=20) as pool:
+        for i in range(20):
+            url = 'http://localhost:%s/test_dedup_bucket_concurrency/%s' % (
+                    http_daemon.server_port, i)
+            headers = {"Warcprox-Meta": json.dumps({
+                "warc-prefix":"test_dedup_buckets",
+                "dedup-bucket":"bucket_%s" % i})}
+            pool.submit(
+                    requests.get, url, proxies=archiving_proxies, verify=False,
+                    headers=headers)
+
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 20)
+    assert warcprox_.proxy.stats_db.value('__all__', 'revisit', 'urls') == revisits_before
+
+    # fire off 20 requests to the same urls but different buckets
+    # none should be deduped
+    with futures.ThreadPoolExecutor(max_workers=20) as pool:
+        for i in range(20):
+            url = 'http://localhost:%s/test_dedup_bucket_concurrency/%s' % (
+                    http_daemon.server_port, -i - 1)
+            headers = {"Warcprox-Meta": json.dumps({
+                "warc-prefix":"test_dedup_buckets",
+                "dedup-bucket":"bucket_%s" % i})}
+            pool.submit(
+                    requests.get, url, proxies=archiving_proxies, verify=False,
+                    headers=headers)
+
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 40)
+    assert warcprox_.proxy.stats_db.value('__all__', 'revisit', 'urls') == revisits_before
+
+    # fire off 20 requests same as the initial requests, all should be deduped
+    with futures.ThreadPoolExecutor(max_workers=20) as pool:
+        for i in range(20):
+            url = 'http://localhost:%s/test_dedup_bucket_concurrency/%s' % (
+                    http_daemon.server_port, i)
+            headers = {"Warcprox-Meta": json.dumps({
+                "warc-prefix":"test_dedup_buckets",
+                "dedup-bucket":"bucket_%s" % i})}
+            pool.submit(
+                    requests.get, url, proxies=archiving_proxies, verify=False,
+                    headers=headers)
+
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 60)
+    assert warcprox_.proxy.stats_db.value('__all__', 'revisit', 'urls') == revisits_before + 20
+
 def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
 
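The new test drives 60 requests through the archiving proxy in three bursts of 20 and checks the proxy-wide revisit counter after each burst. Assuming the repository's usual pytest-based setup (the tests/test_warcprox.py path here is an assumption), it could presumably be run on its own with something like:

    import pytest

    # hypothetical invocation; adjust the module path to wherever the test lives
    pytest.main(['-v', '-k', 'test_dedup_bucket_concurrency', 'tests/test_warcprox.py'])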
@@ -148,6 +148,8 @@ class BasePostfetchProcessor(threading.Thread):
         raise Exception('not implemented')
 
     def _run(self):
+        threading.current_thread().name = '%s(tid=%s)' % (
+                threading.current_thread().name, gettid())
         self.logger.info('%s starting up', self)
         self._startup()
         while not self.stop.is_set():
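The gettid() call used in the new thread name is a warcprox helper defined outside this diff. A minimal, Linux-only sketch of such a helper (hypothetical code; 186 is the SYS_gettid syscall number on x86-64 and differs on other architectures):

    import ctypes

    def gettid():
        # return the kernel thread id of the calling thread, or None if the
        # raw syscall is unavailable (e.g. non-Linux platforms)
        try:
            libc = ctypes.CDLL(None, use_errno=True)
            return libc.syscall(186)
        except Exception:
            return None

Tagging each postfetch thread with its kernel tid makes it easier to match log lines against tools like top -H or /proc/<pid>/task/.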
@@ -405,7 +405,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
                             recorded_url.payload_digest, self.options.base32)
                         if recorded_url.payload_digest else 'n/a')
         self.logger.debug(
-                'filtered out digests (not loading dedup): %r', discards)
+                'len(batch)=%s len(discards)=%s buckets=%s',
+                len(batch), len(discards),
+                {bucket: len(buckets[bucket]) for bucket in buckets})
         return buckets
 
     def _build_key_index(self, batch):
@@ -432,11 +434,12 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
         fs = {}
         with futures.ThreadPoolExecutor(max_workers=len(buckets)) as pool:
             # send off the trough requests in parallel
+            key_indexes = {}
             for bucket in buckets:
-                key_index = self._build_key_index(buckets[bucket])
+                key_indexes[bucket] = self._build_key_index(buckets[bucket])
                 future = pool.submit(
                         self.trough_dedup_db.batch_lookup,
-                        key_index.keys(), bucket)
+                        key_indexes[bucket].keys(), bucket)
                 fs[future] = bucket
 
             # process results as they come back
@@ -444,6 +447,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
                 for future in futures.as_completed(fs, timeout=20):
                     bucket = fs[future]
                     try:
+                        key_index = key_indexes[bucket]
                         for entry in future.result():
                             for recorded_url in key_index[entry['digest_key']]:
                                 recorded_url.dedup_info = entry
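These two hunks are the core of the dedup fix: the old code rebuilt a single key_index variable on every pass through the bucket loop, so once the parallel trough lookups completed, every bucket's results were matched against whichever index happened to have been built last. Keeping one index per bucket in the key_indexes dict restores the pairing. A self-contained sketch of the corrected pattern (hypothetical names, not warcprox's actual API):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def lookup_all(buckets, build_index, batch_lookup):
        # buckets: {bucket_name: [item, ...]}
        # build_index / batch_lookup stand in for _build_key_index and
        # TroughDedupDb.batch_lookup in the real code
        indexes = {}   # one key index per bucket, kept until its future resolves
        fs = {}
        results = {}
        with ThreadPoolExecutor(max_workers=len(buckets)) as pool:
            for bucket in buckets:
                indexes[bucket] = build_index(buckets[bucket])
                fs[pool.submit(batch_lookup, indexes[bucket].keys(), bucket)] = bucket
            for future in as_completed(fs, timeout=20):
                bucket = fs[future]
                # match returned entries against *this* bucket's index, not the
                # index left over from the last loop iteration
                results[bucket] = [(k, indexes[bucket][k]) for k in future.result()]
        return results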
@@ -459,8 +463,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
                         novel = sorted([
                             k for k in key_index.keys() if k not in dups])
                         self.logger.debug(
-                                'bucket %s: dups=%r novel=%r',
-                                bucket, dups, novel)
+                                'bucket %s: dups(%s)=%r novel(%s)=%r',
+                                bucket, len(dups), dups, len(novel), novel)
 
             except futures.TimeoutError as e:
                 # the remaining threads actually keep running in this case,
@@ -204,13 +204,14 @@ class WarcWriter:
             record.offset = offset
             record.length = warc.f.tell() - offset
             record.warc_filename = warc.finalname
-            self.logger.debug(
+            self.logger.trace(
                     'wrote warc record: warc_type=%s content_length=%s '
-                    'url=%s warc=%s offset=%d',
-                    record.get_header(warctools.WarcRecord.TYPE),
+                    'digest=%s offset=%d warc=%s url=%s',
+                    record.type,
                     record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
-                    record.get_header(warctools.WarcRecord.URL),
-                    warc.path, record.offset)
+                    record.get_header(b'WARC-Payload-Digest'),
+                    record.offset, warc.path,
+                    record.get_header(warctools.WarcRecord.URL))
 
         return records
 
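self.logger.trace() is not a standard library logging method; warcprox wires up its own TRACE level elsewhere in the codebase (not shown in this diff). A generic sketch of how such a level can be added, assuming a level number of 5, just below DEBUG:

    import logging

    TRACE = 5  # the level number warcprox actually uses may differ
    logging.addLevelName(TRACE, 'TRACE')

    def _trace(self, msg, *args, **kwargs):
        # same shape as Logger.debug()/Logger.info(), just at the custom level
        if self.isEnabledFor(TRACE):
            self._log(TRACE, msg, args, **kwargs)

    logging.Logger.trace = _trace

Downgrading this per-record message from debug to trace keeps it available for deep debugging without flooding normal debug-level output.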
@@ -117,19 +117,18 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
                 and self._filter_accepts(recorded_url))
 
     def _log(self, recorded_url, records):
-        try:
-            payload_digest = records[0].get_header('WARC-Payload-Digest').decode("utf-8")
-        except:
-            payload_digest = "-"
-
         # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
-        type_ = records[0].type.decode("utf-8") if records else '-'
+        try:
+            payload_digest = records[0].get_header(b'WARC-Payload-Digest').decode('utf-8')
+        except:
+            payload_digest = '-'
+        type_ = records[0].type.decode('utf-8') if records else '-'
         filename = records[0].warc_filename if records else '-'
         offset = records[0].offset if records else '-'
         self.logger.info(
-                "%s %s %s %s %s size=%s %s %s %s offset=%s",
+                '%s %s %s %s %s size=%s %s %s %s offset=%s',
                 recorded_url.client_ip, recorded_url.status,
-                recorded_url.method, recorded_url.url.decode("utf-8"),
+                recorded_url.method, recorded_url.url.decode('utf-8'),
                 recorded_url.mimetype, recorded_url.size, payload_digest,
                 type_, filename, offset)
 