mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
trough for deduplication initial proof-of-concept-ish code
This commit is contained in:
parent
9b8043d3a2
commit
4eda89f232
@ -27,6 +27,7 @@ import json
|
|||||||
from hanzo import warctools
|
from hanzo import warctools
|
||||||
import warcprox
|
import warcprox
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
import requests
|
||||||
|
|
||||||
class DedupDb(object):
|
class DedupDb(object):
|
||||||
logger = logging.getLogger("warcprox.dedup.DedupDb")
|
logger = logging.getLogger("warcprox.dedup.DedupDb")
|
||||||
@ -174,3 +175,107 @@ class RethinkDedupDb:
|
|||||||
else:
|
else:
|
||||||
self.save(digest_key, records[0])
|
self.save(digest_key, records[0])
|
||||||
|
|
||||||
|
class TroughDedupDb(object):
|
||||||
|
'''
|
||||||
|
https://github.com/jkafader/trough
|
||||||
|
'''
|
||||||
|
logger = logging.getLogger("warcprox.dedup.TroughDedupDb")
|
||||||
|
|
||||||
|
def __init__(self, options=warcprox.Options()):
|
||||||
|
self.options = options
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def stop(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _write_url(self, bucket):
|
||||||
|
import doublethink
|
||||||
|
segment_id = 'warcprox-trough-%s' % bucket
|
||||||
|
rr = doublethink.Rethinker(
|
||||||
|
servers=['localhost'], db='trough_configuration')
|
||||||
|
services = doublethink.ServiceRegistry(rr)
|
||||||
|
master_node = services.unique_service('trough-sync-master')
|
||||||
|
response = requests.post(master_node['url'], segment_id)
|
||||||
|
response.raise_for_status()
|
||||||
|
write_url = response.text.strip()
|
||||||
|
return write_url
|
||||||
|
|
||||||
|
def _read_url(self, bucket):
|
||||||
|
import doublethink
|
||||||
|
import rethinkdb as r
|
||||||
|
segment_id = 'warcprox-trough-%s' % bucket
|
||||||
|
rr = doublethink.Rethinker(
|
||||||
|
servers=['localhost'], db='trough_configuration')
|
||||||
|
reql = rr.table('services').get_all(segment_id, index='segment').filter(
|
||||||
|
{'role':'trough-read'}).filter(
|
||||||
|
lambda svc: r.now().sub(
|
||||||
|
svc['last_heartbeat']).lt(svc['ttl'])
|
||||||
|
).order_by('load')
|
||||||
|
logging.debug('querying rethinkdb: %r', reql)
|
||||||
|
results = reql.run()
|
||||||
|
if results:
|
||||||
|
return results[0]['url']
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def save(self, digest_key, response_record, bucket='__unspecified__'):
|
||||||
|
write_url = self._write_url(bucket)
|
||||||
|
record_id = response_record.get_header(warctools.WarcRecord.ID).decode('ascii')
|
||||||
|
url = response_record.get_header(warctools.WarcRecord.URL).decode('ascii')
|
||||||
|
warc_date = response_record.get_header(warctools.WarcRecord.DATE).decode('ascii')
|
||||||
|
|
||||||
|
# XXX create table statement here is a temporary hack,
|
||||||
|
# see https://webarchive.jira.com/browse/AITFIVE-1465
|
||||||
|
sql = ('create table if not exists dedup (\n'
|
||||||
|
' digest_key varchar(100) primary key,\n'
|
||||||
|
' url varchar(2100) not null,\n'
|
||||||
|
' date datetime not null,\n'
|
||||||
|
' id varchar(100));\n' # warc record id
|
||||||
|
'insert into dedup (digest_key, url, date, id) '
|
||||||
|
'values (%r, %r, %r, %r);') % (
|
||||||
|
digest_key.decode('ascii'), url, warc_date, record_id)
|
||||||
|
response = requests.post(write_url, sql)
|
||||||
|
if response.status_code != 200:
|
||||||
|
logging.warn(
|
||||||
|
'unexpected response %r %r %r to sql=%r',
|
||||||
|
response.status_code, response.reason, response.text, sql)
|
||||||
|
|
||||||
|
def lookup(self, digest_key, bucket='__unspecified__'):
|
||||||
|
read_url = self._read_url(bucket)
|
||||||
|
if not read_url:
|
||||||
|
return None
|
||||||
|
sql = 'select * from dedup where digest_key=%r;' % digest_key.decode('ascii')
|
||||||
|
response = requests.post(read_url, sql)
|
||||||
|
if response.status_code != 200:
|
||||||
|
logging.warn(
|
||||||
|
'unexpected response %r %r %r to sql=%r',
|
||||||
|
response.status_code, response.reason, response.text, sql)
|
||||||
|
return None
|
||||||
|
logging.debug('got %r from query %r', response.text, sql)
|
||||||
|
results = json.loads(response.text)
|
||||||
|
assert len(results) <= 1 # sanity check (digest_key is primary key)
|
||||||
|
if results:
|
||||||
|
result = results[0]
|
||||||
|
result['id'] = result['id'].encode('ascii')
|
||||||
|
result['url'] = result['url'].encode('ascii')
|
||||||
|
result['date'] = result['date'].encode('ascii')
|
||||||
|
self.logger.debug(
|
||||||
|
'trough lookup of key=%r returning %r', digest_key, result)
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def notify(self, recorded_url, records):
|
||||||
|
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
|
||||||
|
and recorded_url.response_recorder.payload_size() > 0):
|
||||||
|
digest_key = warcprox.digest_str(
|
||||||
|
recorded_url.response_recorder.payload_digest,
|
||||||
|
self.options.base32)
|
||||||
|
if recorded_url.warcprox_meta and 'captures-bucket' in recorded_url.warcprox_meta:
|
||||||
|
self.save(
|
||||||
|
digest_key, records[0],
|
||||||
|
bucket=recorded_url.warcprox_meta['captures-bucket'])
|
||||||
|
else:
|
||||||
|
self.save(digest_key, records[0])
|
||||||
|
@ -108,6 +108,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
|||||||
default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
|
default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
|
||||||
group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
|
group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
|
||||||
help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
|
help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
|
||||||
|
group.add_argument('--trough', help='use trough for deduplication 🐷 🐷 🐷 🐷', action='store_true')
|
||||||
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
|
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
|
||||||
help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)')
|
help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)')
|
||||||
arg_parser.add_argument('--rethinkdb-big-table',
|
arg_parser.add_argument('--rethinkdb-big-table',
|
||||||
@ -177,7 +178,10 @@ def init_controller(args):
|
|||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
listeners = []
|
listeners = []
|
||||||
if args.rethinkdb_servers:
|
if args.trough:
|
||||||
|
dedup_db = warcprox.dedup.TroughDedupDb(options)
|
||||||
|
listeners.append(dedup_db)
|
||||||
|
elif args.rethinkdb_servers:
|
||||||
rr = doublethink.Rethinker(
|
rr = doublethink.Rethinker(
|
||||||
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||||
if args.rethinkdb_big_table:
|
if args.rethinkdb_big_table:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user