From ac959c6db5ecee843012eaf62afb8fc745a8a6f2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 19 Nov 2019 13:33:59 -0800 Subject: [PATCH 1/4] change trough dedup `date` type to varchar This is a backwards-compatible change whose purpose is to clarify the existing usage. In sqlite (and therefore trough), the datatypes of columns are just suggestions. In fact the values can have any type. See https://sqlite.org/datatype3.html. `datetime` isn't even a real sqlite type. Warcprox stores a string formatted like '2019-11-19T01:23:45Z' in that field. When it pulls it out of the database and writes a revisit record, it sticks the raw value in the `WARC-Date` header of that record. Warcprox never parses the string value. Since we use the raw textual value of the field, it makes sense to use a textual datatype to store it. --- warcprox/dedup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 0e09239..ce1ab1f 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -500,7 +500,7 @@ class TroughDedupDb(DedupDb, DedupableMixin): SCHEMA_SQL = ('create table dedup (\n' ' digest_key varchar(100) primary key,\n' ' url varchar(2100) not null,\n' - ' date datetime not null,\n' + ' date varchar(100) not null,\n' ' id varchar(100));\n') # warc record id WRITE_SQL_TMPL = ('insert or ignore into dedup\n' '(digest_key, url, date, id)\n' From f54e1b37c7548cd7a85df8c67e61772acc815cff Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 7 Jan 2020 14:40:58 -0800 Subject: [PATCH 2/4] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7c7185f..27f5720 100755 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.21', + version='2.4.22', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 91fcc054c4eb106ec876521271120059d57f8508 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 7 Jan 2020 14:42:40 -0800 Subject: [PATCH 3/4] bump version after merge --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 27f5720..3add2e8 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - setuptools installation configuration for warcprox -Copyright (C) 2013-2019 Internet Archive +Copyright (C) 2013-2020 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.22', + version='2.4.23', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 469b41773a71f7fa037a12cabf2c186d746c21aa Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 7 Jan 2020 15:19:03 -0800 Subject: [PATCH 4/4] fix logging config which trough interfered with --- setup.py | 2 +- warcprox/controller.py | 2 +- warcprox/dedup.py | 2 +- warcprox/main.py | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 3add2e8..9881520 100755 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.23', + version='2.4.24', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/controller.py b/warcprox/controller.py index 84c3b93..8d670cb 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -111,7 +111,7 @@ class Factory: assert hasattr(plugin, 'notify') ^ hasattr(plugin, '_startup') return plugin except Exception as e: - logging.fatal('problem with plugin class %r: %s', qualname, e) + logging.fatal('problem with plugin class %r', qualname, exc_info=1) sys.exit(1) @staticmethod diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 5613ec7..8227aa7 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -26,7 +26,6 @@ import os import json from hanzo import warctools import warcprox -import trough.client import sqlite3 import doublethink import datetime @@ -507,6 +506,7 @@ class TroughDedupDb(DedupDb, DedupableMixin): 'values (%s, %s, %s, %s);') def __init__(self, options=warcprox.Options()): + import trough.client DedupableMixin.__init__(self, options) self.options = options self._trough_cli = trough.client.TroughClient( diff --git a/warcprox/main.py b/warcprox/main.py index d61e6b1..3afc1bb 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -302,6 +302,7 @@ def main(argv=None): else: loglevel = logging.INFO + logging.root.handlers = [] logging.basicConfig( stream=sys.stdout, level=loglevel, format=( '%(asctime)s %(process)d %(levelname)s %(threadName)s '