Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)
get rid of dbm, switch to sqlite, for easier portability, clarity around threading
commit 95dfa54968 (parent 99dd840d20)
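Every change below follows the same pattern: each gdbm/anydbm key-value file becomes a two-column sqlite table holding JSON blobs, accessed through a short-lived connection per operation. A minimal sketch of that pattern, for orientation (the `dedup` table and `./warcprox.sqlite` names come from the diff below; the helper functions themselves are illustrative, not part of warcprox):

    import json
    import sqlite3

    def kv_put(file, key, value):
        # one short-lived connection per operation, nothing shared across threads
        conn = sqlite3.connect(file)
        conn.execute(
                'create table if not exists dedup ('
                '    key varchar(300) primary key,'
                '    value varchar(4000));')
        conn.execute(
                'insert or replace into dedup (key, value) values (?, ?)',
                (key, json.dumps(value)))
        conn.commit()
        conn.close()

    def kv_get(file, key):
        conn = sqlite3.connect(file)
        row = conn.execute(
                'select value from dedup where key = ?', (key,)).fetchone()
        conn.close()
        return json.loads(row[0]) if row else None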
.travis.yml

@@ -18,8 +18,6 @@ matrix:
 addons:
   apt:
     packages:
-    - python-gdbm
-    - python3-gdbm
     - tor

 services:
README.rst

@@ -15,7 +15,7 @@ To install latest release run:

 ::

-    # apt-get install libffi-dev libssl-dev python3-gdbm
+    # apt-get install libffi-dev libssl-dev
     pip install warcprox

 You can also install the latest bleeding edge code:
setup.py

@@ -24,7 +24,6 @@ import sys
 import setuptools
 import setuptools.command.test

-# special class needs to be added to support the pytest written dump-anydbm tests
 class PyTest(setuptools.command.test.test):
     def finalize_options(self):
         setuptools.command.test.test.finalize_options(self)

@@ -68,7 +67,6 @@ setuptools.setup(
             'warcprox=warcprox.main:main',
             ('warcprox-ensure-rethinkdb-tables='
                 'warcprox.main:ensure_rethinkdb_tables'),
-            'dump-anydbm=warcprox.dump_anydbm:main',
         ],
     },
     zip_safe=False,
Dockerfile

@@ -39,9 +39,8 @@ RUN mkdir -vp /etc/service/rethinkdb \
         && chmod a+x /etc/service/rethinkdb/run

 RUN apt-get -y install git
-RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev \
-        libpython3-dev libffi-dev libssl-dev python-setuptools \
-        python3-setuptools
+RUN apt-get -y install libpython2.7-dev libpython3-dev libffi-dev libssl-dev \
+        python-setuptools python3-setuptools
 RUN apt-get -y install gcc

 RUN echo '57ff41e99cb01b6a1c2b0999161589b726f0ec8b /tmp/pip-9.0.1.tar.gz' > /tmp/sha1sums.txt
tests/test_dump-anydbm.py (deleted)

@@ -1,154 +0,0 @@
-#!/usr/bin/env python
-#
-# tests/test_dump-anydbm.py - tests for dump-anydbm
-#
-# Copyright (C) 2013-2016 Internet Archive
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
-# USA.
-#
-
-import pytest
-import os
-import tempfile
-import subprocess # to access the script from shell
-import sys
-import glob
-import distutils
-
-# will try as python 3 then default to python 2 modules
-try:
-    import dbm
-    from dbm import ndbm
-    from dbm import gnu as gdbm
-    from dbm import dumb
-
-    whichdb = dbm.whichdb
-
-    ndbm_type = b"dbm.ndbm"
-    gdbm_type = b"dbm.gnu"
-    dumb_type = b"dbm.dumb"
-
-except:
-    import dbm as ndbm
-    import gdbm
-    import dumbdbm as dumb
-
-    from whichdb import whichdb
-
-    ndbm_type = b"dbm"
-    gdbm_type = b"gdbm"
-    dumb_type = b"dumbdbm"
-
-# global settings
-key1 = 'very first key'
-key2 = 'second key'
-val1 = 'very first value'
-val2 = 'second value'
-
-py = sys.executable
-dump_anydbm_loc = distutils.spawn.find_executable("dump-anydbm")
-
-@pytest.fixture(scope="function")
-def gdbm_test_db(request):
-    temp_file = tempfile.NamedTemporaryFile(delete=False)
-    print("creating test gdbm file {}".format(temp_file.name))
-    test_db = gdbm.open(temp_file.name, "n")
-    test_db[key1] = val1
-    test_db[key2] = val2
-    test_db.close()
-
-    def delete_gdbm_test_db():
-        temp_file.close()
-        for f in glob.glob("{}*".format(temp_file.name)):
-            print("deleting test gdbm file {}".format(f))
-            os.remove(f)
-
-    request.addfinalizer(delete_gdbm_test_db)
-    return temp_file.name
-
-
-@pytest.fixture(scope="function")
-def ndbm_test_db(request):
-    temp_file = tempfile.NamedTemporaryFile(delete=False)
-    test_db = ndbm.open(temp_file.name, "n")
-    test_db[key1] = val1
-    test_db[key2] = val2
-    test_db.close()
-
-    def delete_test_ndbm():
-        temp_file.close()
-        for f in glob.glob("{}*".format(temp_file.name)):
-            print("deleting test ndbm file {}".format(f))
-            os.remove(f)
-
-    request.addfinalizer(delete_test_ndbm)
-    return temp_file.name
-
-
-@pytest.fixture(scope="function")
-def dumbdbm_test_db(request):
-    temp_file = tempfile.NamedTemporaryFile(delete=False)
-    print("creating test dumbdbm file {}".format(temp_file.name))
-    test_db = dumb.open(temp_file.name, "n")
-    test_db[key1] = val1
-    test_db[key2] = val2
-    test_db.close()
-
-    def delete_dumbdbm_test_db():
-        temp_file.close()
-        for f in glob.glob("{}*".format(temp_file.name)):
-            print("deleting test dumbdbm file {}".format(f))
-            os.remove(f)
-
-    request.addfinalizer(delete_dumbdbm_test_db)
-    return temp_file.name
-
-
-def test_dumpanydbm_identify_gdbm(gdbm_test_db):
-    print("running test_dumpanydbm_identify_gdbm")
-    output = subprocess.check_output([py, dump_anydbm_loc, gdbm_test_db])
-    print("script printout: ")
-    print(output)
-    print("check_one: ")
-    print(gdbm_test_db.encode(encoding='UTF-8') + b' is a ' + gdbm_type + b' db\nvery first key:very first value\nsecond key:second value\n')
-
-    assert (output == gdbm_test_db.encode(encoding='UTF-8') + b' is a ' + gdbm_type + b' db\nvery first key:very first value\nsecond key:second value\n' or
-            output == gdbm_test_db.encode(encoding='UTF-8') + b' is a ' + gdbm_type + b' db\nsecond key:second value\nvery first key:very first value\n')
-
-
-def test_dumpanydbm_identify_ndbm(ndbm_test_db):
-    print("running test_dumpanydbm_identify_ndbm")
-    output = subprocess.check_output([py, dump_anydbm_loc, ndbm_test_db])
-    print("script printout: ")
-    print(output)
-    print("check_one: ")
-    print(ndbm_test_db.encode(encoding='UTF-8') + b' is a ' + ndbm_type + b' db\nvery first key:very first value\nsecond key:second value\n')
-
-    assert (output == ndbm_test_db.encode(encoding='UTF-8') + b' is a ' + ndbm_type + b' db\nvery first key:very first value\nsecond key:second value\n' or
-            output == ndbm_test_db.encode(encoding='UTF-8') + b' is a ' + ndbm_type + b' db\nsecond key:second value\nvery first key:very first value\n')
-
-
-def test_dumpanydbm_identify_dumbdbm(dumbdbm_test_db):
-    print("running test_dumpanydbm_identify_dumbdbm")
-
-    output = subprocess.check_output([py, dump_anydbm_loc, dumbdbm_test_db])
-    print("script printout: ")
-    print(output)
-    print("check_one: ")
-    print(dumbdbm_test_db.encode(encoding='UTF-8') + b' is a ' + dumb_type + b' db\nvery first key:very first value\nsecond key:second value\n')
-
-    assert (output == dumbdbm_test_db.encode(encoding='UTF-8') + b' is a ' + dumb_type + b' db\nvery first key:very first value\nsecond key:second value\n' or
-            output == dumbdbm_test_db.encode(encoding='UTF-8') + b' is a ' + dumb_type + b' db\nsecond key:second value\nvery first key:very first value\n')
tests/test_warcprox.py

@@ -83,7 +83,7 @@ def _send(self, data):
 # http_client.HTTPConnection.send = _send

 logging.basicConfig(
-        stream=sys.stdout, level=logging.INFO, # level=warcprox.TRACE,
+        stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE,
         format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
         '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
 logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)

@@ -322,9 +322,9 @@ def stats_db(request, rethinkdb_servers):
             logging.info('dropping rethinkdb database {}'.format(db))
             result = sdb.rr.db_drop(db).run()
             logging.info("result=%s", result)
-        else:
-            logging.info('deleting file {}'.format(stats_db_file))
-            os.unlink(stats_db_file)
+        # else:
+        #     logging.info('deleting file {}'.format(stats_db_file))
+        #     os.unlink(stats_db_file)
     request.addfinalizer(fin)

     return sdb
warcprox/dedup.py

@@ -1,5 +1,5 @@
 '''
-warcprox/dedup.py - identical payload digest deduplication
+warcprox/dedup.py - identical payload digest deduplication using sqlite db

 Copyright (C) 2013-2017 Internet Archive

@@ -27,61 +27,71 @@ import json
 from hanzo import warctools
 import warcprox
 import random
+import sqlite3
+import threading

 class DedupDb(object):
     logger = logging.getLogger("warcprox.dedup.DedupDb")

-    def __init__(self, dbm_file='./warcprox-dedup.db', options=warcprox.Options()):
-        try:
-            import dbm.gnu as dbm_gnu
-        except ImportError:
-            try:
-                import gdbm as dbm_gnu
-            except ImportError:
-                import anydbm as dbm_gnu
-
-        if os.path.exists(dbm_file):
-            self.logger.info('opening existing deduplication database {}'.format(dbm_file))
-        else:
-            self.logger.info('creating new deduplication database {}'.format(dbm_file))
-
-        self.db = dbm_gnu.open(dbm_file, 'c')
+    def __init__(
+            self, file='./warcprox.sqlite', options=warcprox.Options()):
+        self.file = file
+        self.options = options

     def start(self):
-        pass
+        if os.path.exists(self.file):
+            self.logger.info(
+                    'opening existing deduplication database %s',
+                    self.file)
+        else:
+            self.logger.info(
+                    'creating new deduplication database %s', self.file)
+
+        conn = sqlite3.connect(self.file)
+        conn.execute(
+                'create table if not exists dedup ('
+                '    key varchar(300) primary key,'
+                '    value varchar(4000)'
+                ');')
+        conn.commit()
+        conn.close()

     def stop(self):
-        self.close()
+        pass

     def close(self):
-        self.db.close()
+        pass

     def sync(self):
-        try:
-            self.db.sync()
-        except:
-            pass
+        pass

     def save(self, digest_key, response_record, bucket=""):
         record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
         url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
         date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')

-        key = digest_key + b"|" + bucket.encode("utf-8")
+        key = digest_key.decode('utf-8') + "|" + bucket

         py_value = {'id':record_id, 'url':url, 'date':date}
         json_value = json.dumps(py_value, separators=(',',':'))

-        self.db[key] = json_value.encode('utf-8')
+        conn = sqlite3.connect(self.file)
+        conn.execute(
+                'insert into dedup (key, value) values (?, ?);',
+                (key, json_value))
+        conn.commit()
+        conn.close()
         self.logger.debug('dedup db saved %s:%s', key, json_value)

     def lookup(self, digest_key, bucket=""):
         result = None
-        key = digest_key + b"|" + bucket.encode("utf-8")
-        if key in self.db:
-            json_result = self.db[key]
-            result = json.loads(json_result.decode('utf-8'))
+        key = digest_key.decode('utf-8') + '|' + bucket
+        conn = sqlite3.connect(self.file)
+        cursor = conn.execute('select value from dedup where key = ?', (key,))
+        result_tuple = cursor.fetchone()
+        conn.close()
+        if result_tuple:
+            result = json.loads(result_tuple[0])
             result['id'] = result['id'].encode('latin1')
             result['url'] = result['url'].encode('latin1')
             result['date'] = result['date'].encode('latin1')

@@ -91,10 +101,13 @@ class DedupDb(object):
     def notify(self, recorded_url, records):
         if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
                 and recorded_url.response_recorder.payload_size() > 0):
-            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
-                    self.options.base32)
+            digest_key = warcprox.digest_str(
+                    recorded_url.response_recorder.payload_digest,
+                    self.options.base32)
             if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
-                self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
+                self.save(
+                        digest_key, records[0],
+                        bucket=recorded_url.warcprox_meta["captures-bucket"])
             else:
                 self.save(digest_key, records[0])
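For orientation, a hypothetical round trip against the new sqlite-backed DedupDb; the file path and digest key are made up, and this assumes warcprox at this commit is importable:

    from warcprox.dedup import DedupDb

    dedup_db = DedupDb(file='/tmp/example.sqlite')
    dedup_db.start()   # creates the 'dedup' table on first use
    # lookup() before any save() returns None; the digest key is a fake value
    print(dedup_db.lookup(b'sha1:0123456789abcdef', bucket='my-bucket'))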
warcprox/dump_anydbm.py (deleted)

@@ -1,80 +0,0 @@
-#!/usr/bin/env python
-'''
-dump-anydbm - dumps contents of dbm file to stdout
-
-Dump contents of database to stdout. Database can be any file that the anydbm
-module can read. Included with warcprox because it's useful for inspecting a
-deduplication database or a playback index database, but it is a generic tool.
-
-Copyright (C) 2013-2016 Internet Archive
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
-USA.
-'''
-
-try:
-    import dbm
-    from dbm import ndbm
-    whichdb = dbm.whichdb
-
-except:
-    import anydbm
-    dbm = anydbm
-    from whichdb import whichdb
-
-import sys
-import os.path
-
-def main():
-    if len(sys.argv) != 2:
-        sys.stderr.write("usage: {} DBM_FILE\n".format(sys.argv[0]))
-        exit(1)
-
-    filename = sys.argv[1]
-    which = whichdb(filename)
-
-    # if which returns none and the file does not exist, print usage line
-    if which == None and not os.path.exists(sys.argv[1]):
-        sys.stderr.write('No such file {}\n\n'.format(sys.argv[1]))
-        sys.stderr.write("usage: {} DBM_FILE\n".format(sys.argv[0]))
-        exit(1)
-
-    # covers case where an ndbm is checked with its extension & identified incorrectly
-    elif 'bsd' in which:
-        correct_file = filename.split(".db")[0]
-        correct_which = whichdb(correct_file)
-        if correct_which in ('dbm', 'dbm.ndbm'):
-            filename = correct_file
-            which = correct_which
-
-    elif which == '':
-        sys.stderr.write("{} is an unrecognized database type\n".format(sys.argv[1]))
-        sys.stderr.write("Try the file again by removing the extension\n")
-        exit(1)
-
-    try:
-        out = sys.stdout.buffer
-
-    except AttributeError:
-        out = sys.stdout
-
-    out.write(filename.encode('UTF-8') + b' is a ' + which.encode('UTF-8') + b' db\n')
-
-    db = dbm.open(filename, 'r')
-    for key in db.keys():
-        out.write(key + b":" + db[key] + b"\n")
-
-if __name__ == "__main__":
-    main()
warcprox/main.py

@@ -97,7 +97,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
     arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD',
             action='append', help='only record requests with the given http method(s) (can be used more than once)')
     arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
-            default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
+            default='./warcprox.sqlite', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
     arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
             type=int, default=None, help='port to listen on for instant playback')
     arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',

@@ -105,7 +105,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
             help='playback index database file (only used if --playback-port is specified)')
     group = arg_parser.add_mutually_exclusive_group()
     group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
-            default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
+            default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
     group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
             help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
     arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
warcprox/playback.py

@@ -2,7 +2,7 @@
 warcprox/playback.py - rudimentary support for playback of urls archived by
 warcprox (not much used or maintained)

-Copyright (C) 2013-2016 Internet Archive
+Copyright (C) 2013-2017 Internet Archive

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License

@@ -40,6 +40,7 @@ import traceback
 import re
 from warcprox.mitmproxy import MitmProxyHandler
 import warcprox
+import sqlite3

 class PlaybackProxyHandler(MitmProxyHandler):
     logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler")

@@ -49,10 +50,9 @@ class PlaybackProxyHandler(MitmProxyHandler):
         # don't connect to any remote server!
         pass

-
     # @Override
     def _proxy_request(self):
-        date, location = self.server.playback_index_db.lookup_latest(self.url.encode('utf-8'))
+        date, location = self.server.playback_index_db.lookup_latest(self.url)
         self.logger.debug('lookup_latest returned {}:{}'.format(date, location))

         status = None

@@ -82,7 +82,8 @@ class PlaybackProxyHandler(MitmProxyHandler):
         sz = len(headers) + len(payload)

         self.log_message('"%s" %s %s %s',
-                self.requestline, str(status), str(sz), repr(location) if location else '-')
+                self.requestline, str(status), str(sz),
+                repr(location) if location else '-')


     def _open_warc_at_offset(self, warcfilename, offset):

@@ -99,7 +100,6 @@ class PlaybackProxyHandler(MitmProxyHandler):

         return warctools.warc.WarcRecord.open_archive(filename=warcpath, mode='rb', offset=offset)

-
     def _send_response(self, headers, payload_fh):
         status = '-'
         m = re.match(br'^HTTP/\d\.\d (\d{3})', headers)

@@ -118,8 +118,10 @@ class PlaybackProxyHandler(MitmProxyHandler):
         return status, sz


-    def _send_headers_and_refd_payload(self, headers, refers_to, refers_to_target_uri, refers_to_date):
-        location = self.server.playback_index_db.lookup_exact(refers_to_target_uri, refers_to_date, record_id=refers_to)
+    def _send_headers_and_refd_payload(
+            self, headers, refers_to, refers_to_target_uri, refers_to_date):
+        location = self.server.playback_index_db.lookup_exact(
+                refers_to_target_uri, refers_to_date, record_id=refers_to)
         self.logger.debug('loading http payload from {}'.format(location))

         fh = self._open_warc_at_offset(location['f'], location['o'])

@@ -174,12 +176,20 @@ class PlaybackProxyHandler(MitmProxyHandler):
             if warc_profile != warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST:
                 raise Exception('unknown revisit record profile {}'.format(warc_profile))

-            refers_to = record.get_header(warctools.WarcRecord.REFERS_TO)
-            refers_to_target_uri = record.get_header(warctools.WarcRecord.REFERS_TO_TARGET_URI)
-            refers_to_date = record.get_header(warctools.WarcRecord.REFERS_TO_DATE)
+            refers_to = record.get_header(
+                    warctools.WarcRecord.REFERS_TO).decode('latin1')
+            refers_to_target_uri = record.get_header(
+                    warctools.WarcRecord.REFERS_TO_TARGET_URI).decode(
+                            'latin1')
+            refers_to_date = record.get_header(
+                    warctools.WarcRecord.REFERS_TO_DATE).decode('latin1')

-            self.logger.debug('revisit record references {}:{} capture of {}'.format(refers_to_date, refers_to, refers_to_target_uri))
-            return self._send_headers_and_refd_payload(record.content[1], refers_to, refers_to_target_uri, refers_to_date)
+            self.logger.debug(
+                    'revisit record references %s:%s capture of %s',
+                    refers_to_date, refers_to, refers_to_target_uri)
+            return self._send_headers_and_refd_payload(
+                    record.content[1], refers_to, refers_to_target_uri,
+                    refers_to_date)

         else:
             # send it back raw, whatever it is

@@ -220,30 +230,30 @@ class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
 class PlaybackIndexDb(object):
     logger = logging.getLogger("warcprox.playback.PlaybackIndexDb")

-    def __init__(self, dbm_file='./warcprox-playback-index.db'):
-        try:
-            import dbm.gnu as dbm_gnu
-        except ImportError:
-            try:
-                import gdbm as dbm_gnu
-            except ImportError:
-                import anydbm as dbm_gnu
+    def __init__(self, file='./warcprox.sqlite'):
+        self.file = file

-        if os.path.exists(dbm_file):
-            self.logger.info('opening existing playback index database {}'.format(dbm_file))
+        if os.path.exists(self.file):
+            self.logger.info(
+                    'opening existing playback index database %s', self.file)
         else:
-            self.logger.info('creating new playback index database {}'.format(dbm_file))
+            self.logger.info(
+                    'creating new playback index database %s', self.file)

-        self.db = dbm_gnu.open(dbm_file, 'c')
+        conn = sqlite3.connect(self.file)
+        conn.execute(
+                'create table if not exists playback ('
+                '    url varchar(4000) primary key,'
+                '    value varchar(4000)'
+                ');')
+        conn.commit()
+        conn.close()

     def close(self):
-        self.db.close()
+        pass

     def sync(self):
-        try:
-            self.db.sync()
-        except:
-            pass
+        pass

     def notify(self, recorded_url, records):
         self.save(records[0].warc_filename, records, records[0].offset)

@@ -251,7 +261,7 @@ class PlaybackIndexDb(object):
     def save(self, warcfile, recordset, offset):
         response_record = recordset[0]
         # XXX canonicalize url?
-        url = response_record.get_header(warctools.WarcRecord.URL)
+        url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
         date_str = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
         record_id_str = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')

@@ -259,9 +269,13 @@ class PlaybackIndexDb(object):
         # prescribed as YYYY-MM-DDThh:mm:ssZ, so we have to handle it :-\

         # url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'i':record_id},record2,...],date2:[{...}],...}
-        if url in self.db:
-            existing_json_value = self.db[url].decode('utf-8')
-            py_value = json.loads(existing_json_value)
+        conn = sqlite3.connect(self.file)
+        cursor = conn.execute(
+                'select value from playback where url = ?', (url,))
+        result_tuple = cursor.fetchone()
+        if result_tuple:
+            py_value = json.loads(result_tuple[0])
         else:
             py_value = {}

@@ -272,16 +286,25 @@ class PlaybackIndexDb(object):
         json_value = json.dumps(py_value, separators=(',',':'))

-        self.db[url] = json_value.encode('utf-8')
+        conn.execute(
+                'insert or replace into playback (url, value) values (?, ?)',
+                (url, json_value))
+        conn.commit()
+        conn.close()

         self.logger.debug('playback index saved: {}:{}'.format(url, json_value))


     def lookup_latest(self, url):
-        if url not in self.db:
+        conn = sqlite3.connect(self.file)
+        cursor = conn.execute(
+                'select value from playback where url = ?', (url,))
+        result_tuple = cursor.fetchone()
+        conn.close()
+
+        if not result_tuple:
             return None, None

-        json_value = self.db[url].decode('utf-8')
+        json_value = result_tuple[0]
         self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
         py_value = json.loads(json_value)

@@ -290,26 +313,33 @@ class PlaybackIndexDb(object):
         result['i'] = result['i'].encode('ascii')
         return latest_date, result

-    # in python3 params are bytes
     def lookup_exact(self, url, warc_date, record_id):
-        if url not in self.db:
+        conn = sqlite3.connect(self.file)
+        cursor = conn.execute(
+                'select value from playback where url = ?', (url,))
+        result_tuple = cursor.fetchone()
+        conn.close()
+
+        if not result_tuple:
             return None

-        json_value = self.db[url].decode('utf-8')
-        self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
+        json_value = result_tuple[0]
+        self.logger.debug("%s:%s", repr(url), repr(json_value))
         py_value = json.loads(json_value)

-        warc_date_str = warc_date.decode('ascii')
-
-        if warc_date_str in py_value:
-            for record in py_value[warc_date_str]:
-                if record['i'].encode('ascii') == record_id:
-                    self.logger.debug("found exact match for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
-                    record['i'] = record['i'].encode('ascii')
+        if warc_date in py_value:
+            for record in py_value[warc_date]:
+                if record['i'] == record_id:
+                    self.logger.debug(
+                            "found exact match for (%s,%s,%s)",
+                            repr(warc_date), repr(record_id), repr(url))
+                    record['i'] = record['i']
                     return record
         else:
-            self.logger.info("match not found for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
+            self.logger.info(
+                    "match not found for (%s,%s,%s)", repr(warc_date),
+                    repr(record_id), repr(url))
            return None
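Note that PlaybackIndexDb.save() merges the per-url JSON in python and then relies on sqlite's `insert or replace` to upsert, since `url` is the table's primary key. A standalone sketch of that semantics:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute(
            'create table playback ('
            '    url varchar(4000) primary key, value varchar(4000))')
    # a second write with the same primary key replaces the first row
    for value in ('{"2017":"v1"}', '{"2017":"v2"}'):
        conn.execute(
                'insert or replace into playback (url, value) values (?, ?)',
                ('http://example.com/', value))
    print(conn.execute('select value from playback').fetchall())
    # [('{"2017":"v2"}',)]
    conn.close()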
warcprox/stats.py

@@ -31,6 +31,7 @@ import threading
 import rethinkdb as r
 import datetime
 import urlcanon
+import sqlite3

 def _empty_bucket(bucket):
     return {

@@ -52,53 +53,52 @@ def _empty_bucket(bucket):
 class StatsDb:
     logger = logging.getLogger("warcprox.stats.StatsDb")

-    def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()):
-        try:
-            import dbm.gnu as dbm_gnu
-        except ImportError:
-            try:
-                import gdbm as dbm_gnu
-            except ImportError:
-                import anydbm as dbm_gnu
-
-        if os.path.exists(dbm_file):
-            self.logger.info('opening existing stats database {}'.format(dbm_file))
-        else:
-            self.logger.info('creating new stats database {}'.format(dbm_file))
-
-        self.db = dbm_gnu.open(dbm_file, 'c')
+    def __init__(self, file='./warcprox.sqlite', options=warcprox.Options()):
+        self.file = file
+        self.options = options

     def start(self):
-        # method only exists to match RethinkStatsDb
-        pass
+        if os.path.exists(self.file):
+            self.logger.info(
+                    'opening existing stats database %s', self.file)
+        else:
+            self.logger.info(
+                    'creating new stats database %s', self.file)
+
+        conn = sqlite3.connect(self.file)
+        conn.execute(
+                'create table if not exists buckets_of_stats ('
+                '    bucket varchar(300) primary key,'
+                '    stats varchar(4000)'
+                ');')
+        conn.commit()
+        conn.close()
+
+        self.logger.info('created table buckets_of_stats in %s', self.file)

     def stop(self):
-        self.close()
+        pass

     def close(self):
-        self.db.close()
+        pass

     def sync(self):
-        try:
-            self.db.sync()
-        except:
-            pass
+        pass

     def value(self, bucket0="__all__", bucket1=None, bucket2=None):
-        # Gdbm wants str/bytes keys in python2, str/unicode keys in python3.
-        # This ugliness deals with keys that arrive as unicode in py2.
-        b0 = bucket0.encode("utf-8") if bucket0 and not isinstance(bucket0, str) else bucket0
-        b1 = bucket1.encode("utf-8") if bucket1 and not isinstance(bucket1, str) else bucket1
-        b2 = bucket2.encode("utf-8") if bucket2 and not isinstance(bucket2, str) else bucket2
-
-        if b0 in self.db:
-            bucket0_stats = json.loads(self.db[b0].decode("utf-8"))
-            if b1:
-                if b2:
-                    return bucket0_stats[b1][b2]
+        conn = sqlite3.connect(self.file)
+        cursor = conn.execute(
+                'select stats from buckets_of_stats where bucket = ?',
+                (bucket0,))
+        result_tuple = cursor.fetchone()
+        conn.close()
+        if result_tuple:
+            bucket0_stats = json.loads(result_tuple[0])
+            if bucket1:
+                if bucket2:
+                    return bucket0_stats[bucket1][bucket2]
                 else:
-                    return bucket0_stats[b1]
+                    return bucket0_stats[bucket1]
             else:
                 return bucket0_stats
         else:

@@ -115,7 +115,7 @@ class StatsDb:
         with key 'bucket' whose value is the name of the bucket. The other
         currently recognized item is 'tally-domains', which if supplied should
         be a list of domains. This instructs warcprox to additionally tally
-        substats of the given bucket by domain. Host stats are stored in the
+        substats of the given bucket by domain. Host stats are stored in the
         stats table under the key '{parent-bucket}:{domain(normalized)}'.

         Example Warcprox-Meta header (a real one will likely have other

@@ -150,14 +150,27 @@ class StatsDb:
         return buckets

     def tally(self, recorded_url, records):
+        conn = sqlite3.connect(self.file)
+
+        i = 0
         for bucket in self.buckets(recorded_url):
-            # Gdbm wants str/bytes keys in python2, str/unicode keys in python3.
-            # This ugliness deals with keys that arrive as unicode in py2.
-            b = bucket.encode("utf-8") if bucket and not isinstance(bucket, str) else bucket
-            if b in self.db:
-                bucket_stats = json.loads(self.db[b].decode("utf-8"))
+            try:
+                cursor = conn.execute(
+                        'select stats from buckets_of_stats where bucket=?',
+                        (bucket,))
+            except:
+                logging.info(
+                        'i=%s bucket=%s self.file=%s', i, repr(bucket),
+                        repr(self.file), exc_info=1)
+                raise
+            i += 1
+
+            result_tuple = cursor.fetchone()
+            cursor.close()
+            if result_tuple:
+                bucket_stats = json.loads(result_tuple[0])
             else:
-                bucket_stats = _empty_bucket(b)
+                bucket_stats = _empty_bucket(bucket)

             bucket_stats["total"]["urls"] += 1
             bucket_stats["total"]["wire_bytes"] += recorded_url.size

@@ -169,7 +182,13 @@ class StatsDb:
             bucket_stats["new"]["urls"] += 1
             bucket_stats["new"]["wire_bytes"] += recorded_url.size

-            self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8")
+            json_value = json.dumps(bucket_stats, separators=(',',':'))
+            conn.execute(
+                    'insert or replace into buckets_of_stats(bucket, stats) '
+                    'values (?, ?)', (bucket, json_value))
+            conn.commit()
+
+        conn.close()

 class RethinkStatsDb(StatsDb):
     """Updates database in batch every 2.0 seconds"""
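On the "clarity around threading" point from the commit message: the old code shared one gdbm handle among warcprox's threads, while every method above opens its own connection and closes it before returning. That sidesteps sqlite3's default same-thread check (using a connection from a thread other than its creator raises ProgrammingError). A small sketch of why per-call connections are thread-safe, assuming a throwaway file path:

    import sqlite3
    import threading

    def tally(file, bucket):
        # each thread creates and uses its own connection, as StatsDb.tally() does
        conn = sqlite3.connect(file)
        conn.execute(
                'create table if not exists buckets_of_stats ('
                '    bucket varchar(300) primary key, stats varchar(4000))')
        conn.execute(
                'insert or replace into buckets_of_stats (bucket, stats) '
                'values (?, ?)', (bucket, '{}'))
        conn.commit()
        conn.close()

    threads = [threading.Thread(target=tally, args=('/tmp/stats.sqlite', b))
               for b in ('__all__', 'example.com')]
    for th in threads:
        th.start()
    for th in threads:
        th.join()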