diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 0587cf9..917edab 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -66,12 +66,14 @@ class RethinkCaptures: else: bucket = "__unspecified__" - canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False) + canon_surt = surt.surt(recorded_url.url.decode("utf-8"), + trailing_comma=True, host_massage=False, with_scheme=True) entry = { # id only specified for rethinkdb partitioning "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), "abbr_canon_surt": canon_surt[:150], + "canon_surt": canon_surt, # "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")), "timestamp": records[0].date.decode("utf-8"), "url": recorded_url.url.decode("utf-8"), diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b1772ac..01f94cd 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -148,11 +148,12 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): def _enforce_limits(self, warcprox_meta): if warcprox_meta and "limits" in warcprox_meta: - # self.logger.info("warcprox_meta['limits']=%s", warcprox_meta['limits']) for item in warcprox_meta["limits"].items(): key, limit = item bucket0, bucket1, bucket2 = key.rsplit(".", 2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) + # self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s", + # warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize()) if value and value >= limit: body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8") self.send_response(420, "Reached limit") @@ -243,9 +244,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) except socket.timeout as e: - self.logger.warn("%s proxying %s", repr(e), self.url) + self.logger.warn("%s proxying %s %s", repr(e), self.command, self.url) except BaseException as e: - self.logger.error("%s proxying %s", repr(e), self.url, exc_info=True) + self.logger.error("%s proxying %s %s", repr(e), self.command, self.url, exc_info=True) finally: # Let's close off the remote end if prox_rec_res: diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 11f3e3f..8da6c11 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -38,6 +38,7 @@ class WarcWriterThread(threading.Thread): def run(self): try: + # XXX warcprox can shut down with urls to archive left in the queue self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid())) while not self.stop.is_set(): try: