2014-11-15 03:20:05 -08:00
|
|
|
# vim:set sw=4 et:
|
|
|
|
|
2015-03-18 16:29:44 -07:00
|
|
|
from __future__ import absolute_import
|
|
|
|
|
2014-11-15 03:20:05 -08:00
|
|
|
try:
|
|
|
|
import dbm.gnu as dbm_gnu
|
|
|
|
except ImportError:
|
|
|
|
try:
|
|
|
|
import gdbm as dbm_gnu
|
|
|
|
except ImportError:
|
|
|
|
import anydbm as dbm_gnu
|
|
|
|
|
|
|
|
import logging
|
|
|
|
import os
|
|
|
|
import json
|
|
|
|
from hanzo import warctools
|
2015-07-30 00:12:59 +00:00
|
|
|
import warcprox
|
2015-08-20 21:46:40 +00:00
|
|
|
import rethinkdb
|
|
|
|
r = rethinkdb
|
|
|
|
import random
|
2014-11-15 03:20:05 -08:00
|
|
|
|
|
|
|
class DedupDb(object):
    """Deduplication database stored in a local dbm file.

    Each entry maps a payload digest key to a compact JSON document with the
    ``id``, ``url`` and ``date`` headers of the WARC response record that was
    saved under that key.
    """
    logger = logging.getLogger("warcprox.dedup.DedupDb")

    def __init__(self, dbm_file='./warcprox-dedup.db', options=None):
        """Open the dbm database at `dbm_file`, creating it if missing.

        Args:
            dbm_file: path of the dbm database file.
            options: options object; `notify` reads its ``base32``
                attribute. When None (the default) a fresh
                ``warcprox.Options()`` is created for this instance, rather
                than one instance being built at import time and shared.
        """
        if os.path.exists(dbm_file):
            self.logger.info(
                    'opening existing deduplication database %s', dbm_file)
        else:
            self.logger.info(
                    'creating new deduplication database %s', dbm_file)

        # 'c' opens read-write and creates the file if it does not exist
        self.db = dbm_gnu.open(dbm_file, 'c')
        self.options = options if options is not None else warcprox.Options()

    def close(self):
        """Close the underlying dbm database."""
        self.db.close()

    def sync(self):
        """Ask the underlying database to sync; failures are not fatal.

        This is best effort: any ordinary error (for example a backend
        object without a ``sync`` method) is logged at debug level and
        otherwise ignored. KeyboardInterrupt and SystemExit propagate.
        """
        try:
            self.db.sync()
        except Exception as e:
            self.logger.debug('dedup db sync failed (ignored): %r', e)

    def save(self, key, response_record):
        """Store the id, url and date headers of `response_record` at `key`.

        Args:
            key: database key for the payload digest.
            response_record: WARC record whose ``get_header`` returns the
                header values as bytes.
        """
        # header bytes are decoded as latin1 so that every byte value maps to
        # a character; lookup() reverses this with .encode('latin1')
        record_id = response_record.get_header(
                warctools.WarcRecord.ID).decode('latin1')
        url = response_record.get_header(
                warctools.WarcRecord.URL).decode('latin1')
        date = response_record.get_header(
                warctools.WarcRecord.DATE).decode('latin1')

        py_value = {'id': record_id, 'url': url, 'date': date}
        json_value = json.dumps(py_value, separators=(',', ':'))

        self.db[key] = json_value.encode('utf-8')
        self.logger.debug('dedup db saved %s:%s', key, json_value)

    def lookup(self, key):
        """Return the record stored at `key`, or None if there is none.

        Returns:
            dict with keys ``id``, ``url`` and ``date`` whose values are
            bytes (latin1-encoded, mirroring `save`), or None when `key` is
            not in the database.
        """
        result = None
        try:
            json_result = self.db[key]
        except KeyError:
            json_result = None

        if json_result is not None:
            result = json.loads(json_result.decode('utf-8'))
            for field in ('id', 'url', 'date'):
                result[field] = result[field].encode('latin1')

        self.logger.debug(
                'dedup db lookup of key=%s returning %s', key, result)
        return result

    def notify(self, recorded_url, records):
        """Save dedup info for a newly written response with a payload.

        Only acts when the first record in `records` is a response record
        and the recorded payload size is greater than zero.
        """
        if (records[0].get_header(warctools.WarcRecord.TYPE)
                == warctools.WarcRecord.RESPONSE
                and recorded_url.response_recorder.payload_size() > 0):
            key = warcprox.digest_str(
                    recorded_url.response_recorder.payload_digest,
                    self.options.base32)
            self.save(key, records[0])
|
|
|
|
|
|
|
|
|
2015-07-30 00:12:59 +00:00
|
|
|
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
    """Attach the result of a dedup lookup to `recorded_url.dedup_info`.

    Does nothing unless `recorded_url` has a truthy `response_recorder` with
    a truthy `payload_digest`. Otherwise the digest is turned into a key with
    `warcprox.digest_str(digest, base32)` and whatever `dedup_db.lookup`
    returns for that key is stored on `recorded_url.dedup_info`.
    """
    recorder = recorded_url.response_recorder
    if not recorder:
        return
    digest = recorder.payload_digest
    if not digest:
        return
    dedup_key = warcprox.digest_str(digest, base32)
    recorded_url.dedup_info = dedup_db.lookup(dedup_key)
|
2014-11-15 03:20:05 -08:00
|
|
|
|
2015-08-20 21:46:40 +00:00
|
|
|
class RethinkDedupDb(object):
    """Deduplication database stored in a RethinkDB table.

    Each row has the primary key ``key`` (the payload digest key) plus the
    ``id``, ``url`` and ``date`` headers of the WARC response record saved
    under that key. A new connection to a randomly chosen server is opened
    for every operation.
    """
    logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")

    def __init__(self, servers=None, db="warcprox", table="dedup", shards=3, replicas=3, options=None):
        """Remember the connection settings and make sure the table exists.

        Args:
            servers: list of "host" or "host:port" strings; defaults to
                ``["localhost"]`` when None.
            db: name of the rethinkdb database.
            table: name of the table inside `db`.
            shards: shard count used if the table has to be created.
            replicas: replica count used if the table has to be created.
            options: options object; `notify` reads its ``base32``
                attribute. When None a fresh ``warcprox.Options()`` is
                created for this instance.
        """
        # None sentinels avoid sharing one default list / Options object
        # between every instance constructed with the defaults
        self.servers = servers if servers is not None else ["localhost"]
        self.db = db
        self.table = table
        self.shards = shards
        self.replicas = replicas
        self._ensure_db_table()
        self.options = options if options is not None else warcprox.Options()

    # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py
    # "Best practices: Managing connections: a connection per request"
    def _random_server_connection(self):
        """Open a connection to one randomly chosen configured server."""
        server = random.choice(self.servers)
        try:
            host, port = server.split(":")
        except ValueError:
            # not exactly one colon: pass the whole string as the host
            return r.connect(host=server)
        # connect outside the try so that a ValueError raised while
        # connecting is not mistaken for a malformed server string
        return r.connect(host=host, port=port)

    def _ensure_db_table(self):
        """Create the database and the table if they do not exist yet."""
        with self._random_server_connection() as conn:
            dbs = r.db_list().run(conn)
            if self.db not in dbs:
                self.logger.info(
                        "creating rethinkdb database %s", repr(self.db))
                r.db_create(self.db).run(conn)
            tables = r.db(self.db).table_list().run(conn)
            if self.table not in tables:
                self.logger.info(
                        "creating rethinkdb table %s in database %s",
                        repr(self.table), repr(self.db))
                r.db(self.db).table_create(
                        self.table, primary_key="key", shards=self.shards,
                        replicas=self.replicas).run(conn)

    def close(self):
        """No-op; connections are opened and closed per operation."""
        pass

    def sync(self):
        """No-op; present for interface parity with DedupDb."""
        pass

    def save(self, key, response_record):
        """Insert or replace the row for `key` from `response_record`.

        Args:
            key: payload digest key, bytes (decoded as utf-8) or str.
            response_record: WARC record whose ``get_header`` returns the
                header values as bytes.

        Raises:
            Exception: if the insert result has an unexpected shape and
                reports deleted, skipped or errored documents.
        """
        k = key.decode("utf-8") if isinstance(key, bytes) else key
        # header bytes are decoded as latin1 so that every byte value maps to
        # a character; lookup() reverses this with .encode("latin1")
        record_id = response_record.get_header(
                warctools.WarcRecord.ID).decode('latin1')
        url = response_record.get_header(
                warctools.WarcRecord.URL).decode('latin1')
        date = response_record.get_header(
                warctools.WarcRecord.DATE).decode('latin1')
        record = {'key': k, 'url': url, 'date': date, 'id': record_id}
        with self._random_server_connection() as conn:
            result = r.db(self.db).table(self.table).insert(
                    record, conflict="replace").run(conn)
            # NOTE(review): this only raises when BOTH checks fail; the
            # combination is kept exactly as it was - confirm intent
            if (sorted(result.values()) != [0, 0, 0, 0, 0, 1]
                    and [result["deleted"], result["skipped"],
                         result["errors"]] != [0, 0, 0]):
                raise Exception(
                        "unexpected result %s saving %s" % (result, record))
            self.logger.debug('dedup db saved %s:%s', key, record)

    def lookup(self, key):
        """Return the row stored for `key` with bytes values, or None.

        The ``id``, ``url`` and ``date`` fields are encoded as latin1, the
        inverse of the decoding done in `save`, so the original header bytes
        are returned unchanged. Any other field (including ``key``) is
        encoded as utf-8.
        """
        k = key.decode("utf-8") if isinstance(key, bytes) else key
        with self._random_server_connection() as conn:
            result = r.db(self.db).table(self.table).get(k).run(conn)
            if result:
                for field in result:
                    if field in ("id", "url", "date"):
                        encoding = "latin1"
                    else:
                        encoding = "utf-8"
                    result[field] = result[field].encode(encoding)
            self.logger.debug(
                    'dedup db lookup of key=%s returning %s', key, result)
            return result

    def notify(self, recorded_url, records):
        """Save dedup info for a newly written response with a payload.

        Only acts when the first record in `records` is a response record
        and the recorded payload size is greater than zero.
        """
        if (records[0].get_header(warctools.WarcRecord.TYPE)
                == warctools.WarcRecord.RESPONSE
                and recorded_url.response_recorder.payload_size() > 0):
            key = warcprox.digest_str(
                    recorded_url.response_recorder.payload_digest,
                    self.options.base32)
            self.save(key, records[0])
|