For the captures table, generate the canonical SURT with the scheme:// prefix

This commit is contained in:
Noah Levitt 2015-09-16 01:14:13 +00:00
parent 686a297f98
commit 12432b23ae
3 changed files with 8 additions and 4 deletions

View File

@@ -66,12 +66,14 @@ class RethinkCaptures:
else:
bucket = "__unspecified__"
canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False)
canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
trailing_comma=True, host_massage=False, with_scheme=True)
entry = {
# id only specified for rethinkdb partitioning
"id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
"abbr_canon_surt": canon_surt[:150],
"canon_surt": canon_surt,
# "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")),
"timestamp": records[0].date.decode("utf-8"),
"url": recorded_url.url.decode("utf-8"),

View File

@@ -148,11 +148,12 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
def _enforce_limits(self, warcprox_meta):
if warcprox_meta and "limits" in warcprox_meta:
# self.logger.info("warcprox_meta['limits']=%s", warcprox_meta['limits'])
for item in warcprox_meta["limits"].items():
key, limit = item
bucket0, bucket1, bucket2 = key.rsplit(".", 2)
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
# self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s",
# warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize())
if value and value >= limit:
body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8")
self.send_response(420, "Reached limit")
@@ -243,9 +244,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
except socket.timeout as e:
self.logger.warn("%s proxying %s", repr(e), self.url)
self.logger.warn("%s proxying %s %s", repr(e), self.command, self.url)
except BaseException as e:
self.logger.error("%s proxying %s", repr(e), self.url, exc_info=True)
self.logger.error("%s proxying %s %s", repr(e), self.command, self.url, exc_info=True)
finally:
# Let's close off the remote end
if prox_rec_res:

View File

@@ -38,6 +38,7 @@ class WarcWriterThread(threading.Thread):
def run(self):
try:
# XXX warcprox can shut down with urls to archive left in the queue
self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid()))
while not self.stop.is_set():
try: