mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
update trough dedup to use new segment manager api to register schema sql
This commit is contained in:
parent
ed49eea4d5
commit
ab99fe52b9
@ -246,36 +246,35 @@ class CdxServerDedup(object):
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class TroughDedupDb(object):
|
class TroughClient(object):
|
||||||
'''
|
logger = logging.getLogger("warcprox.dedup.TroughClient")
|
||||||
https://github.com/jkafader/trough
|
|
||||||
'''
|
|
||||||
logger = logging.getLogger("warcprox.dedup.TroughDedupDb")
|
|
||||||
|
|
||||||
def __init__(self, options=warcprox.Options()):
|
def __init__(self, rethinkdb_trough_db_url):
|
||||||
parsed = doublethink.parse_rethinkdb_url(
|
parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
|
||||||
options.rethinkdb_trough_db_url)
|
|
||||||
self.rr = doublethink.Rethinker(
|
self.rr = doublethink.Rethinker(
|
||||||
servers=parsed.hosts, db=parsed.database)
|
servers=parsed.hosts, db=parsed.database)
|
||||||
self.svcreg = doublethink.ServiceRegistry(self.rr)
|
self.svcreg = doublethink.ServiceRegistry(self.rr)
|
||||||
self.options = options
|
|
||||||
|
|
||||||
def start(self):
|
def segment_manager_url(self):
|
||||||
pass
|
# XXX cache until expired (check last_heartbeat and ttl)
|
||||||
|
|
||||||
def stop(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _write_url(self, bucket):
|
|
||||||
segment_id = 'warcprox-trough-%s' % bucket
|
|
||||||
master_node = self.svcreg.unique_service('trough-sync-master')
|
master_node = self.svcreg.unique_service('trough-sync-master')
|
||||||
response = requests.post(master_node['url'], segment_id)
|
assert master_node
|
||||||
response.raise_for_status()
|
return master_node['url']
|
||||||
write_url = response.text.strip()
|
|
||||||
return write_url
|
|
||||||
|
|
||||||
def _read_url(self, bucket):
|
def write_url(self, segment_id, schema_id='default'):
|
||||||
segment_id = 'warcprox-trough-%s' % bucket
|
provision_url = os.path.join(self.segment_manager_url(), 'provision')
|
||||||
|
payload_dict = {'segment': segment_id, 'schema': schema_id}
|
||||||
|
response = requests.post(provision_url, json=payload_dict)
|
||||||
|
if response.status_code != 200:
|
||||||
|
raise Exception(
|
||||||
|
'Received %s: %r in response to POST %s with data %s' % (
|
||||||
|
response.status_code, response.text, provision_url,
|
||||||
|
json.dumps(payload_dict)))
|
||||||
|
result_dict = response.json()
|
||||||
|
# assert result_dict['schema'] == schema_id # previously provisioned?
|
||||||
|
return result_dict['write_url']
|
||||||
|
|
||||||
|
def read_url(self, segment_id):
|
||||||
reql = self.rr.table('services').get_all(
|
reql = self.rr.table('services').get_all(
|
||||||
segment_id, index='segment').filter(
|
segment_id, index='segment').filter(
|
||||||
{'role':'trough-read'}).filter(
|
{'role':'trough-read'}).filter(
|
||||||
@ -289,6 +288,52 @@ class TroughDedupDb(object):
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def schema_exists(self, schema_id):
|
||||||
|
url = os.path.join(self.segment_manager_url(), 'schema', schema_id)
|
||||||
|
response = requests.get(url)
|
||||||
|
if response.status_code == 200:
|
||||||
|
return True
|
||||||
|
elif response.status_code == 404:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
def register_schema(self, schema_id, sql):
|
||||||
|
url = '%s/schema/%s/sql' % (self.segment_manager_url(), schema_id)
|
||||||
|
response = requests.put(url, sql)
|
||||||
|
if response.status_code not in (201, 204):
|
||||||
|
raise Exception(
|
||||||
|
'Received %s: %r in response to PUT %r with data %r' % (
|
||||||
|
response.status_code, response.text, sql, url))
|
||||||
|
|
||||||
|
class TroughDedupDb(object):
|
||||||
|
'''
|
||||||
|
https://github.com/internetarchive/trough
|
||||||
|
'''
|
||||||
|
logger = logging.getLogger("warcprox.dedup.TroughDedupDb")
|
||||||
|
|
||||||
|
SCHEMA_ID = 'warcprox-dedup-v1'
|
||||||
|
SCHEMA_SQL = ('create table dedup (\n'
|
||||||
|
' digest_key varchar(100) primary key,\n'
|
||||||
|
' url varchar(2100) not null,\n'
|
||||||
|
' date datetime not null,\n'
|
||||||
|
' id varchar(100));\n') # warc record id
|
||||||
|
|
||||||
|
def __init__(self, options=warcprox.Options()):
|
||||||
|
self.options = options
|
||||||
|
self._trough_cli = TroughClient(options.rethinkdb_trough_db_url)
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL)
|
||||||
|
|
||||||
|
def _write_url(self, bucket):
|
||||||
|
segment_id = 'warcprox-trough-%s' % bucket
|
||||||
|
return self._trough_cli.write_url(segment_id, self.SCHEMA_ID)
|
||||||
|
|
||||||
|
def _read_url(self, bucket):
|
||||||
|
segment_id = 'warcprox-trough-%s' % bucket
|
||||||
|
return self._trough_cli.read_url(segment_id)
|
||||||
|
|
||||||
def sql_value(self, x):
|
def sql_value(self, x):
|
||||||
if x is None:
|
if x is None:
|
||||||
return 'null'
|
return 'null'
|
||||||
@ -317,14 +362,7 @@ class TroughDedupDb(object):
|
|||||||
url = response_record.get_header(warctools.WarcRecord.URL)
|
url = response_record.get_header(warctools.WarcRecord.URL)
|
||||||
warc_date = response_record.get_header(warctools.WarcRecord.DATE)
|
warc_date = response_record.get_header(warctools.WarcRecord.DATE)
|
||||||
|
|
||||||
# XXX create table statement here is a temporary hack,
|
sql = ('insert into dedup (digest_key, url, date, id) '
|
||||||
# see https://webarchive.jira.com/browse/AITFIVE-1465
|
|
||||||
sql = ('create table if not exists dedup (\n'
|
|
||||||
' digest_key varchar(100) primary key,\n'
|
|
||||||
' url varchar(2100) not null,\n'
|
|
||||||
' date datetime not null,\n'
|
|
||||||
' id varchar(100));\n' # warc record id
|
|
||||||
'insert into dedup (digest_key, url, date, id) '
|
|
||||||
'values (%s, %s, %s, %s);') % (
|
'values (%s, %s, %s, %s);') % (
|
||||||
self.sql_value(digest_key), self.sql_value(url),
|
self.sql_value(digest_key), self.sql_value(url),
|
||||||
self.sql_value(warc_date), self.sql_value(record_id))
|
self.sql_value(warc_date), self.sql_value(record_id))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user