mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
For the captures table, generate the canonical SURT with the scheme included (scheme://)
This commit is contained in:
parent
686a297f98
commit
12432b23ae
@ -66,12 +66,14 @@ class RethinkCaptures:
|
||||
else:
|
||||
bucket = "__unspecified__"
|
||||
|
||||
canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False)
|
||||
canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
|
||||
trailing_comma=True, host_massage=False, with_scheme=True)
|
||||
|
||||
entry = {
|
||||
# id only specified for rethinkdb partitioning
|
||||
"id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
|
||||
"abbr_canon_surt": canon_surt[:150],
|
||||
"canon_surt": canon_surt,
|
||||
# "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")),
|
||||
"timestamp": records[0].date.decode("utf-8"),
|
||||
"url": recorded_url.url.decode("utf-8"),
|
||||
|
@ -148,11 +148,12 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
|
||||
def _enforce_limits(self, warcprox_meta):
|
||||
if warcprox_meta and "limits" in warcprox_meta:
|
||||
# self.logger.info("warcprox_meta['limits']=%s", warcprox_meta['limits'])
|
||||
for item in warcprox_meta["limits"].items():
|
||||
key, limit = item
|
||||
bucket0, bucket1, bucket2 = key.rsplit(".", 2)
|
||||
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
||||
# self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s",
|
||||
# warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize())
|
||||
if value and value >= limit:
|
||||
body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8")
|
||||
self.send_response(420, "Reached limit")
|
||||
@ -243,9 +244,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
|
||||
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
||||
except socket.timeout as e:
|
||||
self.logger.warn("%s proxying %s", repr(e), self.url)
|
||||
self.logger.warn("%s proxying %s %s", repr(e), self.command, self.url)
|
||||
except BaseException as e:
|
||||
self.logger.error("%s proxying %s", repr(e), self.url, exc_info=True)
|
||||
self.logger.error("%s proxying %s %s", repr(e), self.command, self.url, exc_info=True)
|
||||
finally:
|
||||
# Let's close off the remote end
|
||||
if prox_rec_res:
|
||||
|
@ -38,6 +38,7 @@ class WarcWriterThread(threading.Thread):
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
# XXX warcprox can shut down with urls to archive left in the queue
|
||||
self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid()))
|
||||
while not self.stop.is_set():
|
||||
try:
|
||||
|
Loading…
x
Reference in New Issue
Block a user