From ac959c6db5ecee843012eaf62afb8fc745a8a6f2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 19 Nov 2019 13:33:59 -0800 Subject: [PATCH] change trough dedup `date` type to varchar This is a backwards-compatible change whose purpose is to clarify the existing usage. In sqlite (and therefore trough), the datatypes of columns are just suggestions. In fact the values can have any type. See https://sqlite.org/datatype3.html. `datetime` isn't even a real sqlite type. Warcprox stores a string formatted like '2019-11-19T01:23:45Z' in that field. When it pulls it out of the database and writes a revisit record, it sticks the raw value in the `WARC-Date` header of that record. Warcprox never parses the string value. Since we use the raw textual value of the field, it makes sense to use a textual datatype to store it. --- warcprox/dedup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 0e09239..ce1ab1f 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -500,7 +500,7 @@ class TroughDedupDb(DedupDb, DedupableMixin): SCHEMA_SQL = ('create table dedup (\n' ' digest_key varchar(100) primary key,\n' ' url varchar(2100) not null,\n' - ' date datetime not null,\n' + ' date varchar(100) not null,\n' ' id varchar(100));\n') # warc record id WRITE_SQL_TMPL = ('insert or ignore into dedup\n' '(digest_key, url, date, id)\n'