mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
use %r instead of calling repr()
This commit is contained in:
parent
2f93cdcad9
commit
1500341875
2
setup.py
2
setup.py
@ -50,7 +50,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.1b1.dev87',
|
version='2.1b1.dev88',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -77,7 +77,7 @@ def _send(self, data):
|
|||||||
logging.root.handlers[0].stream.write(data)
|
logging.root.handlers[0].stream.write(data)
|
||||||
logging.root.handlers[0].stream.write('\n')
|
logging.root.handlers[0].stream.write('\n')
|
||||||
else:
|
else:
|
||||||
logging.info('sending data from %s', repr(data))
|
logging.info('sending data from %r', data)
|
||||||
orig_send(self, data)
|
orig_send(self, data)
|
||||||
### uncomment this to block see raw requests going over the wire
|
### uncomment this to block see raw requests going over the wire
|
||||||
# http_client.HTTPConnection.send = _send
|
# http_client.HTTPConnection.send = _send
|
||||||
|
@ -102,14 +102,13 @@ class RethinkCaptures:
|
|||||||
def _ensure_db_table(self):
|
def _ensure_db_table(self):
|
||||||
dbs = self.rr.db_list().run()
|
dbs = self.rr.db_list().run()
|
||||||
if not self.rr.dbname in dbs:
|
if not self.rr.dbname in dbs:
|
||||||
self.logger.info(
|
self.logger.info("creating rethinkdb database %r", self.rr.dbname)
|
||||||
"creating rethinkdb database %s", repr(self.rr.dbname))
|
|
||||||
self.rr.db_create(self.rr.dbname).run()
|
self.rr.db_create(self.rr.dbname).run()
|
||||||
tables = self.rr.table_list().run()
|
tables = self.rr.table_list().run()
|
||||||
if not self.table in tables:
|
if not self.table in tables:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"creating rethinkdb table %s in database %s",
|
"creating rethinkdb table %r in database %r",
|
||||||
repr(self.table), repr(self.rr.dbname))
|
self.table, self.rr.dbname)
|
||||||
self.rr.table_create(self.table, shards=self.shards, replicas=self.replicas).run()
|
self.rr.table_create(self.table, shards=self.shards, replicas=self.replicas).run()
|
||||||
self.rr.table(self.table).index_create(
|
self.rr.table(self.table).index_create(
|
||||||
"abbr_canon_surt_timestamp",
|
"abbr_canon_surt_timestamp",
|
||||||
@ -120,7 +119,7 @@ class RethinkCaptures:
|
|||||||
def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
|
def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
|
||||||
if algo != "sha1":
|
if algo != "sha1":
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"digest type is %s but big captures table is indexed by "
|
"digest type is %r but big captures table is indexed by "
|
||||||
"sha1" % algo)
|
"sha1" % algo)
|
||||||
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
||||||
results_iter = self.rr.table(self.table).get_all(
|
results_iter = self.rr.table(self.table).get_all(
|
||||||
@ -130,11 +129,14 @@ class RethinkCaptures:
|
|||||||
results = list(results_iter)
|
results = list(results_iter)
|
||||||
if len(results) > 0:
|
if len(results) > 0:
|
||||||
if len(results) > 1:
|
if len(results) > 1:
|
||||||
self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
|
self.logger.debug(
|
||||||
|
"expected 0 or 1 but found %r results for "
|
||||||
|
"sha1base32=%r bucket=%r (will use first result)",
|
||||||
|
len(results), sha1base32, bucket)
|
||||||
result = results[0]
|
result = results[0]
|
||||||
else:
|
else:
|
||||||
result = None
|
result = None
|
||||||
self.logger.debug("returning %s for sha1base32=%s bucket=%s",
|
self.logger.debug("returning %r for sha1base32=%r bucket=%r",
|
||||||
result, sha1base32, bucket)
|
result, sha1base32, bucket)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@ -146,7 +148,7 @@ class RethinkCaptures:
|
|||||||
).decode("utf-8")
|
).decode("utf-8")
|
||||||
else:
|
else:
|
||||||
self.logger.warn(
|
self.logger.warn(
|
||||||
"digest type is %s but big captures table is indexed "
|
"digest type is %r but big captures table is indexed "
|
||||||
"by sha1",
|
"by sha1",
|
||||||
recorded_url.response_recorder.payload_digest.name)
|
recorded_url.response_recorder.payload_digest.name)
|
||||||
else:
|
else:
|
||||||
|
@ -135,15 +135,14 @@ class RethinkDedupDb:
|
|||||||
def _ensure_db_table(self):
|
def _ensure_db_table(self):
|
||||||
dbs = self.rr.db_list().run()
|
dbs = self.rr.db_list().run()
|
||||||
if not self.rr.dbname in dbs:
|
if not self.rr.dbname in dbs:
|
||||||
self.logger.info(
|
self.logger.info("creating rethinkdb database %r", self.rr.dbname)
|
||||||
"creating rethinkdb database %s", repr(self.rr.dbname))
|
|
||||||
self.rr.db_create(self.rr.dbname).run()
|
self.rr.db_create(self.rr.dbname).run()
|
||||||
tables = self.rr.table_list().run()
|
tables = self.rr.table_list().run()
|
||||||
if not self.table in tables:
|
if not self.table in tables:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"creating rethinkdb table %s in database %s shards=%s "
|
"creating rethinkdb table %r in database %r shards=%r "
|
||||||
"replicas=%s", repr(self.table), repr(self.rr.dbname),
|
"replicas=%r", self.table, self.rr.dbname, self.shards,
|
||||||
self.shards, self.replicas)
|
self.replicas)
|
||||||
self.rr.table_create(
|
self.rr.table_create(
|
||||||
self.table, primary_key="key", shards=self.shards,
|
self.table, primary_key="key", shards=self.shards,
|
||||||
replicas=self.replicas).run()
|
replicas=self.replicas).run()
|
||||||
|
@ -97,7 +97,7 @@ class CaptureFeed:
|
|||||||
d[k] = v
|
d[k] = v
|
||||||
|
|
||||||
msg = json.dumps(d, separators=(',', ':')).encode('utf-8')
|
msg = json.dumps(d, separators=(',', ':')).encode('utf-8')
|
||||||
self.logger.debug('feeding kafka topic=%s msg=%s', repr(topic), msg)
|
self.logger.debug('feeding kafka topic=%r msg=%r', topic, msg)
|
||||||
p = self._producer()
|
p = self._producer()
|
||||||
if p:
|
if p:
|
||||||
p.send(topic, msg)
|
p.send(topic, msg)
|
||||||
|
@ -209,8 +209,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
u = urllib_parse.urlparse(self.url)
|
u = urllib_parse.urlparse(self.url)
|
||||||
if u.scheme != 'http':
|
if u.scheme != 'http':
|
||||||
raise Exception(
|
raise Exception(
|
||||||
'unable to parse request %s as a proxy request' % (
|
'unable to parse request %r as a proxy request' % (
|
||||||
repr(self.requestline)))
|
self.requestline))
|
||||||
host = u.hostname
|
host = u.hostname
|
||||||
self.port = u.port or 80
|
self.port = u.port or 80
|
||||||
self.path = urllib_parse.urlunparse(
|
self.path = urllib_parse.urlunparse(
|
||||||
@ -294,7 +294,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
try:
|
try:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"problem handling %s: %s", repr(self.requestline), e)
|
"problem handling %r: %r", self.requestline, e)
|
||||||
if type(e) is socket.timeout:
|
if type(e) is socket.timeout:
|
||||||
self.send_error(504, str(e))
|
self.send_error(504, str(e))
|
||||||
else:
|
else:
|
||||||
@ -328,7 +328,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
|
|
||||||
def do_COMMAND(self):
|
def do_COMMAND(self):
|
||||||
self.logger.trace(
|
self.logger.trace(
|
||||||
'request from %s:%s: %s', self.client_address[0],
|
'request from %s:%s: %r', self.client_address[0],
|
||||||
self.client_address[1], self.requestline)
|
self.client_address[1], self.requestline)
|
||||||
try:
|
try:
|
||||||
if self.is_connect:
|
if self.is_connect:
|
||||||
@ -341,12 +341,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
self._connect_to_remote_server()
|
self._connect_to_remote_server()
|
||||||
except warcprox.RequestBlockedByRule as e:
|
except warcprox.RequestBlockedByRule as e:
|
||||||
# limit enforcers have already sent the appropriate response
|
# limit enforcers have already sent the appropriate response
|
||||||
self.logger.info("%s: %s", repr(self.requestline), e)
|
self.logger.info("%r: %r", self.requestline, e)
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"problem processing request %s: %s",
|
"problem processing request %r: %r",
|
||||||
repr(self.requestline), e, exc_info=True)
|
self.requestline, e, exc_info=True)
|
||||||
self.send_error(500, str(e))
|
self.send_error(500, str(e))
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -393,7 +393,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
req += self.rfile.read(int(self.headers['Content-Length']))
|
req += self.rfile.read(int(self.headers['Content-Length']))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.logger.debug('sending to remote server req=%s', repr(req))
|
self.logger.debug('sending to remote server req=%r', req)
|
||||||
|
|
||||||
# Send it down the pipe!
|
# Send it down the pipe!
|
||||||
self._remote_server_sock.sendall(req)
|
self._remote_server_sock.sendall(req)
|
||||||
@ -411,7 +411,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"%s proxying %s %s", repr(e), self.command, self.url,
|
"%r proxying %s %s", e, self.command, self.url,
|
||||||
exc_info=True)
|
exc_info=True)
|
||||||
finally:
|
finally:
|
||||||
# Let's close off the remote end
|
# Let's close off the remote end
|
||||||
|
@ -82,7 +82,7 @@ class PlaybackProxyHandler(MitmProxyHandler):
|
|||||||
self.connection.sendall(payload)
|
self.connection.sendall(payload)
|
||||||
sz = len(headers) + len(payload)
|
sz = len(headers) + len(payload)
|
||||||
|
|
||||||
self.log_message('"%s" %s %s %s',
|
self.log_message('%r %s %s %s',
|
||||||
self.requestline, str(status), str(sz),
|
self.requestline, str(status), str(sz),
|
||||||
repr(location) if location else '-')
|
repr(location) if location else '-')
|
||||||
|
|
||||||
@ -310,7 +310,7 @@ class PlaybackIndexDb(object):
|
|||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
json_value = result_tuple[0]
|
json_value = result_tuple[0]
|
||||||
self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
|
self.logger.debug('%r:%r', url, json_value)
|
||||||
py_value = json.loads(json_value)
|
py_value = json.loads(json_value)
|
||||||
|
|
||||||
latest_date = max(py_value)
|
latest_date = max(py_value)
|
||||||
@ -330,21 +330,19 @@ class PlaybackIndexDb(object):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
json_value = result_tuple[0]
|
json_value = result_tuple[0]
|
||||||
self.logger.debug("%s:%s", repr(url), repr(json_value))
|
self.logger.debug('%r:%r', url, json_value)
|
||||||
py_value = json.loads(json_value)
|
py_value = json.loads(json_value)
|
||||||
|
|
||||||
if warc_date in py_value:
|
if warc_date in py_value:
|
||||||
for record in py_value[warc_date]:
|
for record in py_value[warc_date]:
|
||||||
if record['i'] == record_id:
|
if record['i'] == record_id:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
"found exact match for (%s,%s,%s)",
|
"found exact match for (%r,%r,%r)",
|
||||||
repr(warc_date), repr(record_id), repr(url))
|
warc_date, record_id, url)
|
||||||
record['i'] = record['i']
|
record['i'] = record['i']
|
||||||
return record
|
return record
|
||||||
else:
|
else:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"match not found for (%s,%s,%s)", repr(warc_date),
|
"match not found for (%r,%r,%r)", warc_date, record_id, url)
|
||||||
repr(record_id), repr(url))
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
@ -256,14 +256,14 @@ class RethinkStatsDb(StatsDb):
|
|||||||
dbs = self.rr.db_list().run()
|
dbs = self.rr.db_list().run()
|
||||||
if not self.rr.dbname in dbs:
|
if not self.rr.dbname in dbs:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"creating rethinkdb database %s", repr(self.rr.dbname))
|
"creating rethinkdb database %r", self.rr.dbname)
|
||||||
self.rr.db_create(self.rr.dbname).run()
|
self.rr.db_create(self.rr.dbname).run()
|
||||||
tables = self.rr.table_list().run()
|
tables = self.rr.table_list().run()
|
||||||
if not self.table in tables:
|
if not self.table in tables:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"creating rethinkdb table %s in database %s shards=%s "
|
"creating rethinkdb table %r in database %r shards=%r "
|
||||||
"replicas=%s", repr(self.table), repr(self.rr.dbname),
|
"replicas=%r", self.table, self.rr.dbname, self.shards,
|
||||||
self.shards, self.replicas)
|
self.replicas)
|
||||||
self.rr.table_create(
|
self.rr.table_create(
|
||||||
self.table, primary_key="bucket", shards=self.shards,
|
self.table, primary_key="bucket", shards=self.shards,
|
||||||
replicas=self.replicas).run()
|
replicas=self.replicas).run()
|
||||||
|
@ -1,23 +1,23 @@
|
|||||||
#
|
'''
|
||||||
# warcprox/writer.py - warc writer, manages and writes records to warc files
|
warcprox/writer.py - warc writer, manages and writes records to warc files
|
||||||
#
|
|
||||||
# Copyright (C) 2013-2016 Internet Archive
|
Copyright (C) 2013-2017 Internet Archive
|
||||||
#
|
|
||||||
# This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
# modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
# as published by the Free Software Foundation; either version 2
|
as published by the Free Software Foundation; either version 2
|
||||||
# of the License, or (at your option) any later version.
|
of the License, or (at your option) any later version.
|
||||||
#
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
This program is distributed in the hope that it will be useful,
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
# GNU General Public License for more details.
|
GNU General Public License for more details.
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU General Public License
|
You should have received a copy of the GNU General Public License
|
||||||
# along with this program; if not, write to the Free Software
|
along with this program; if not, write to the Free Software
|
||||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||||
# USA.
|
USA.
|
||||||
#
|
'''
|
||||||
|
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user