From decb985250fb101418061b26204fcbe8e0d4a643 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 1 Sep 2015 00:53:38 +0000 Subject: [PATCH] add length field to each record in big captures table (size in bytes of compressed warc record) because pywayback needs it --- warcprox/bigtable.py | 10 +++++++++- warcprox/warcproxy.py | 6 +++--- warcprox/writer.py | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 2ca7ba8..ea38cc9 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -63,6 +63,13 @@ class RethinkCaptures: bucket = "__unspecified__" canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False) + + mimetype = recorded_url.content_type + if mimetype: + n = mimetype.find(";") + if n >= 0: + mimetype = mimetype[:n] + entry = { # id only specified for rethinkdb partitioning "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), @@ -75,10 +82,11 @@ class RethinkCaptures: "warc_type": records[0].type.decode("utf-8"), "warc_id": records[0].id.decode("utf-8"), "sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"), - "content_type": recorded_url.content_type, + "content_type": mimetype, "response_code": recorded_url.status, "http_method": recorded_url.method, "bucket": bucket, + "length": records[0].length, } result = self.r.run(r.table(self.table).insert(entry)) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 1b56e4b..9b82ac5 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -147,25 +147,25 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") def _enforce_limits(self, warcprox_meta): - if (warcprox_meta and "limits" in warcprox_meta): + if warcprox_meta and "limits" in warcprox_meta: # self.logger.info("warcprox_meta['limits']=%s", warcprox_meta['limits']) for item in warcprox_meta["limits"].items(): key, limit = item bucket0, bucket1, bucket2 = key.rsplit(".", 2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and value >= limit: - self.logger.info('sending "420 Reached limit" %s=%s', key, limit) body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8") self.send_response(420, "Reached limit") self.send_header("Content-Type", "text/plain;charset=utf-8") self.send_header("Connection", "close") self.send_header("Content-Length", len(body)) - response_meta = {"reached-limit":{key:limit}, "stats":{bucket0: self.server.stats_db.value(bucket0)}} + response_meta = {"reached-limit":{key:limit}, "stats":{bucket0:self.server.stats_db.value(bucket0)}} self.send_header("Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) self.end_headers() if self.command != "HEAD": self.wfile.write(body) self.connection.close() + self.logger.info("%s 420 %s %s -- reached limit %s=%s", self.client_address[0], self.command, self.url, key, limit) return True return False diff --git a/warcprox/writer.py b/warcprox/writer.py index 21ae23f..4603c0c 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -89,6 +89,7 @@ class WarcWriter: offset = writer.tell() record.write_to(writer, gzip=self.gzip) record.offset = offset + record.length = writer.tell() - offset record.warc_filename = self._f_finalname self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d', record.get_header(warctools.WarcRecord.TYPE),