From 403404f59093dc447ec65bc959b384c14d2e23e5 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 14 Jul 2015 15:58:15 -0700
Subject: [PATCH 001/146] custom PUTMETA http verb for writing warc metadata
 records; code borrowed from Ilya's fork https://github.com/ikreymer/warcprox

---
 warcprox/mitmproxy.py  |  9 +++++++++
 warcprox/warcprox.py   | 29 ++++++++++++++++++++++++++++-
 warcprox/warcwriter.py | 14 ++++++++++++--
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 0ed3211..cbd3992 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -116,6 +116,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
 
     def do_COMMAND(self):
         if not self.is_connect:
+            if self.command == 'PUTMETA':
+                self._handle_custom_record(type_='metadata')
+                return
+            # if self.command == 'PUTRES':
+            #     self._handle_custom_record(type_='resource')
+            #     return
+
             try:
                 # Connect to destination
                 self._determine_host_port()
@@ -130,6 +137,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
 
         self._proxy_request()
 
+    def _handle_custom_record(self, type_):
+        raise Exception('Not supported')
 
     def _proxy_request(self):
         raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py
index 7d98293..10e5b12 100644
--- a/warcprox/warcprox.py
+++ b/warcprox/warcprox.py
@@ -215,9 +215,33 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
         return recorded_url
 
+    def _handle_custom_record(self, type_):
+        self.url = self.path
+
+        if 'Content-Length' in self.headers and 'Content-Type' in self.headers:
+            request_data = self.rfile.read(int(self.headers['Content-Length']))
+
+            warcprox_meta = self.headers.get('Warcprox-Meta')
+
+            rec_custom = RecordedUrl(url=self.url,
+                    request_data=request_data,
+                    response_recorder=None,
+                    remote_ip=b'',
+                    warcprox_meta=warcprox_meta,
+                    content_type=self.headers['Content-Type'].encode('latin1'),
+                    custom_type=type_)
+
+            self.server.recorded_url_q.put(rec_custom)
+            self.send_response(204, 'OK')
+        else:
+            self.send_error(400, 'Bad request')
+
+        self.end_headers()
+
 class RecordedUrl(object):
-    def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None):
+    def __init__(self, url, request_data, response_recorder, remote_ip,
+            warcprox_meta=None, content_type=None, custom_type=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -238,6 +262,9 @@ class RecordedUrl(object):
         else:
             self.warcprox_meta = {}
 
+        self.content_type = content_type
+        self.custom_type = custom_type
+
 class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
     logger = logging.getLogger("warcprox.warcprox.WarcProxy")
 
diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py
index 6af6733..d92f98a 100644
--- a/warcprox/warcwriter.py
+++ b/warcprox/warcwriter.py
@@ -56,6 +56,16 @@ class WarcWriter:
         warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
 
         dedup_info = None
+
+        # metadata special case
+        if recorded_url.custom_type == 'metadata':
+            metadata_rec = self.build_warc_record(url=recorded_url.url,
+                    warc_date=warc_date,
+                    data=recorded_url.request_data,
+                    warc_type=warctools.WarcRecord.METADATA,
+                    content_type=recorded_url.content_type)
+            return [metadata_rec]
+
         if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
             key = self.digest_str(recorded_url.response_recorder.payload_digest)
             dedup_info = self.dedup_db.lookup(key)
@@ -230,7 +240,8 @@ class WarcWriter:
         if self.playback_index_db is not None:
             self.playback_index_db.save(self._f_finalname, recordset, recordset_offset)
 
-        recorded_url.response_recorder.tempfile.close()
+        if recorded_url.response_recorder is not None:
+            recorded_url.response_recorder.tempfile.close()
 
     def write_records(self, recorded_url):
         recordset = self.build_warc_records(recorded_url)
@@ -252,7 +263,6 @@ class WarcWriter:
 
         self._final_tasks(recorded_url, recordset, recordset_offset)
 
-
 class WarcWriterThread(threading.Thread):
     logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
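[editor's note: patch 001 teaches the proxy a custom PUTMETA verb — instead of being proxied to the remote server, the request body is queued as a RecordedUrl with custom_type='metadata' and written out as a WARC metadata record whose WARC-Target-URI is the request URL. A minimal client-side sketch, assuming warcprox is listening on localhost:8000 (the port, target URL, and payload here are hypothetical):

    import requests

    # requests passes arbitrary method names through verbatim; warcprox
    # intercepts PUTMETA itself rather than forwarding it upstream
    response = requests.request(
            'PUTMETA', 'http://example.com/some/page',
            proxies={'http': 'http://localhost:8000'},
            headers={'Content-Type': 'application/warc-fields'},
            data=b'obtained-from: manual-annotation\r\n')
    assert response.status_code == 204  # warcprox acks the write with "204 OK"
]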
From 0647c0c76d2a83ba39af7285e43d5e584d657342 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 17 Jul 2015 21:49:43 -0700
Subject: [PATCH 002/146] support for writing to different warcs based on
 Warcprox-Meta http request header warc-prefix setting

---
 warcprox/controller.py |  4 +-
 warcprox/main.py       |  8 ++--
 warcprox/mitmproxy.py  |  2 +-
 warcprox/warcprox.py   | 10 ++++-
 warcprox/warcwriter.py | 87 ++++++++++++++++++++++++++++--------------
 5 files changed, 73 insertions(+), 38 deletions(-)

diff --git a/warcprox/controller.py b/warcprox/controller.py
index 185ce9f..26e88fc 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -69,8 +69,8 @@ class WarcproxController(object):
         self.proxy.shutdown()
         self.proxy.server_close()
 
-        if self.warc_writer_thread.warc_writer.dedup_db is not None:
-            self.warc_writer_thread.warc_writer.dedup_db.close()
+        if self.warc_writer_thread.default_warc_writer.dedup_db is not None:
+            self.warc_writer_thread.default_warc_writer.dedup_db.close()
 
         if self.playback_proxy is not None:
             self.playback_proxy.shutdown()
diff --git a/warcprox/main.py b/warcprox/main.py
index 04156d3..147a030 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -123,14 +123,14 @@ def main(argv=sys.argv):
         playback_index_db = None
         playback_proxy = None
 
-    warc_writer = warcprox.warcwriter.WarcWriter(directory=args.directory,
+    default_warc_writer = warcprox.warcwriter.WarcWriter(directory=args.directory,
             gzip=args.gzip, prefix=args.prefix, port=int(args.port),
             rollover_size=int(args.size), base32=args.base32,
             dedup_db=dedup_db, digest_algorithm=args.digest_algorithm,
-            playback_index_db=playback_index_db)
-    warc_writer_thread = warcprox.warcwriter.WarcWriterThread(
-            recorded_url_q=recorded_url_q, warc_writer=warc_writer,
+            playback_index_db=playback_index_db,
             rollover_idle_time=int(args.rollover_idle_time) if args.rollover_idle_time is not None else None)
+    warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q,
+            default_warc_writer=default_warc_writer)
 
     controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy)
     controller.run_until_shutdown()
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index cbd3992..24758f0 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -152,7 +152,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self.log_date_time_string(), fmt % args))
 
     def log_message(self, fmt, *args):
-        self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__,
+        self.logger.debug("{} {} - - [{}] {}".format(self.__class__.__name__,
             self.address_string(), self.log_date_time_string(), fmt % args))
 
diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py
index 10e5b12..68184b2 100644
--- a/warcprox/warcprox.py
+++ b/warcprox/warcprox.py
@@ -210,7 +210,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
         recorded_url = RecordedUrl(url=self.url, request_data=req,
                 response_recorder=h.recorder, remote_ip=remote_ip,
-                warcprox_meta=warcprox_meta)
+                warcprox_meta=warcprox_meta, method=self.command,
+                status=h.status, size=h.recorder.len)
         self.server.recorded_url_q.put(recorded_url)
 
         return recorded_url
@@ -241,7 +242,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
 class RecordedUrl(object):
     def __init__(self, url, request_data, response_recorder, remote_ip,
-            warcprox_meta=None, content_type=None, custom_type=None):
+            warcprox_meta=None, content_type=None, custom_type=None,
+            method=None, status=None, size=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -265,6 +267,10 @@ class RecordedUrl(object):
         self.content_type = content_type
         self.custom_type = custom_type
 
+        self.method = method
+        self.status = status
+        self.size = size
+
 class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
     logger = logging.getLogger("warcprox.warcprox.WarcProxy")
 
diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py
index d92f98a..4736a86 100644
--- a/warcprox/warcwriter.py
+++ b/warcprox/warcwriter.py
@@ -26,9 +26,11 @@ class WarcWriter:
     def __init__(self, directory='./warcs', rollover_size=1000000000,
             gzip=False, prefix='WARCPROX', port=0,
             digest_algorithm='sha1', base32=False, dedup_db=None,
-            playback_index_db=None):
+            playback_index_db=None, rollover_idle_time=None):
 
         self.rollover_size = rollover_size
+        self.rollover_idle_time = rollover_idle_time
+        self._last_activity = time.time()
 
         self.gzip = gzip
         self.digest_algorithm = digest_algorithm
@@ -50,7 +52,6 @@ class WarcWriter:
             self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory))
             os.mkdir(directory)
 
-
     # returns a tuple (principal_record, request_record) where principal_record is either a response or revisit record
     def build_warc_records(self, recorded_url):
         warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
 
         dedup_info = None
@@ -107,11 +108,9 @@ class WarcWriter:
 
         return principal_record, request_record
 
-
     def digest_str(self, hash_obj):
         return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii'))
 
-
     def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
             concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
             profile=None, refers_to=None, refers_to_target_uri=None,
@@ -167,7 +166,6 @@ class WarcWriter:
 
         return record
 
-
     def timestamp17(self):
         now = datetime.utcnow()
         return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
@@ -207,7 +205,6 @@ class WarcWriter:
 
         return record
 
-
     #
     def _writer(self):
         if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
@@ -229,7 +226,6 @@ class WarcWriter:
 
         return self._f
 
-
    def _final_tasks(self, recorded_url, recordset, recordset_offset):
         if (self.dedup_db is not None
                 and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
@@ -243,6 +239,15 @@ class WarcWriter:
         if recorded_url.response_recorder is not None:
             recorded_url.response_recorder.tempfile.close()
 
+        self._last_activity = time.time()
+
+        # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
+        self.logger.info("{} {} {} size={} {} {} offset={}".format(
+            recorded_url.status, recorded_url.method,
+            recorded_url.url.decode('utf-8'), recorded_url.size,
+            recordset[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8"),
+            self._f_finalname, recordset_offset))
+
     def write_records(self, recorded_url):
         recordset = self.build_warc_records(recorded_url)
@@ -262,50 +267,74 @@ class WarcWriter:
 
         self._final_tasks(recorded_url, recordset, recordset_offset)
 
+    def maybe_idle_rollover(self):
+        if (self._fpath is not None
+                and self.rollover_idle_time is not None
+                and self.rollover_idle_time > 0
+                and time.time() - self._last_activity > self.rollover_idle_time):
+            self.logger.debug('rolling over {} after {} seconds idle'.format(self._f_finalname, time.time() - self._last_activity))
+            self.close_writer()
 
 class WarcWriterThread(threading.Thread):
     logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
 
-    def __init__(self, recorded_url_q=None, warc_writer=None, rollover_idle_time=None):
+    def __init__(self, recorded_url_q=None, default_warc_writer=None):
        """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
         threading.Thread.__init__(self, name='WarcWriterThread')
         self.recorded_url_q = recorded_url_q
-        self.rollover_idle_time = rollover_idle_time
         self.stop = threading.Event()
-        if warc_writer:
-            self.warc_writer = warc_writer
+        if default_warc_writer:
+            self.default_warc_writer = default_warc_writer
         else:
-            self.warc_writer = WarcWriter()
+            self.default_warc_writer = WarcWriter()
+        self.warc_writers = {}  # {prefix:WarcWriter}
+
+    def write_records(self, recorded_url):
+        w = self.default_warc_writer
+        if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta:
+            # self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
+            prefix = recorded_url.warcprox_meta["warc-prefix"]
+            if not prefix in self.warc_writers:
+                self.warc_writers[prefix] = WarcWriter(prefix=prefix,
+                        directory=self.default_warc_writer.directory,
+                        rollover_size=self.default_warc_writer.rollover_size,
+                        rollover_idle_time=self.default_warc_writer.rollover_idle_time,
+                        gzip=self.default_warc_writer.gzip,
+                        port=self.default_warc_writer.port,
+                        digest_algorithm=self.default_warc_writer.digest_algorithm,
+                        base32=self.default_warc_writer.base32,
+                        dedup_db=self.default_warc_writer.dedup_db,
+                        playback_index_db=self.default_warc_writer.playback_index_db)
+            w = self.warc_writers[prefix]
+        w.write_records(recorded_url)
 
     def run(self):
         self.logger.info('WarcWriterThread starting, directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format(
-                os.path.abspath(self.warc_writer.directory), self.warc_writer.gzip, self.warc_writer.rollover_size,
-                self.rollover_idle_time, self.warc_writer.prefix, self.warc_writer.port))
+                os.path.abspath(self.default_warc_writer.directory), self.default_warc_writer.gzip, self.default_warc_writer.rollover_size,
+                self.default_warc_writer.rollover_idle_time, self.default_warc_writer.prefix, self.default_warc_writer.port))
 
-        self._last_sync = self._last_activity = time.time()
+        self._last_sync = time.time()
 
         while not self.stop.is_set():
             try:
                 recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
-                self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
-                self.warc_writer.write_records(recorded_url)
-                self._last_activity = time.time()
+                self.write_records(recorded_url)
             except queue.Empty:
-                if (self.warc_writer._fpath is not None
-                        and self.rollover_idle_time is not None
-                        and self.rollover_idle_time > 0
-                        and time.time() - self._last_activity > self.rollover_idle_time):
-                    self.logger.debug('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity))
-                    self.warc_writer.close_writer()
+                self.default_warc_writer.maybe_idle_rollover()
+                for w in self.warc_writers.values():
+                    w.maybe_idle_rollover()
 
                 # XXX prob doesn't belong here (do we need it at all?)
                 if time.time() - self._last_sync > 60:
-                    if self.warc_writer.dedup_db:
-                        self.warc_writer.dedup_db.sync()
-                    if self.warc_writer.playback_index_db:
-                        self.warc_writer.playback_index_db.sync()
+                    if self.default_warc_writer.dedup_db:
+                        self.default_warc_writer.dedup_db.sync()
+                    if self.default_warc_writer.playback_index_db:
+                        self.default_warc_writer.playback_index_db.sync()
                     self._last_sync = time.time()
 
         self.logger.info('WarcWriterThread shutting down')
-        self.warc_writer.close_writer();
+        self.default_warc_writer.close_writer()
+        for w in self.warc_writers.values():
+            w.close_writer()
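[editor's note: patch 002 routes records to per-prefix WarcWriter instances keyed on the warc-prefix setting of the Warcprox-Meta request header, lazily creating a writer (and so a separate series of WARC files) the first time each prefix is seen. A sketch of how a crawler might select the prefix for one fetch — the proxy address is hypothetical, and Warcprox-Meta is assumed to be a JSON object, as the dict lookups on recorded_url.warcprox_meta above imply:

    import json
    import requests

    response = requests.get(
            'http://example.com/',
            proxies={'http': 'http://localhost:8000'},
            headers={'Warcprox-Meta': json.dumps({'warc-prefix': 'job-123'})})
    # the response and request records for this capture go to WARC files
    # named with the "job-123" prefix instead of the default prefix
]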
From f00602b764fe86ebdceda28cc4df2fb2a12e3043 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 20 Jul 2015 13:40:20 -0700
Subject: [PATCH 003/146] some logging tweaks, etc

---
 warcprox/mitmproxy.py  |  6 +++---
 warcprox/warcprox.py   | 14 +++++++++++---
 warcprox/warcwriter.py | 11 +++++++----
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 24758f0..d6fe96d 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -117,7 +117,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     def do_COMMAND(self):
         if not self.is_connect:
             if self.command == 'PUTMETA':
-                self._handle_custom_record(type_='metadata')
+                self._prepare_custom_record(method=self.command, type_='metadata')
                 return
             # if self.command == 'PUTRES':
             #     self._handle_custom_record(type_='resource')
             #     return
@@ -137,7 +137,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
 
         self._proxy_request()
 
-    def _handle_custom_record(self, type_):
+    def _handle_custom_record(self, method, type_):
         raise Exception('Not supported')
 
     def _proxy_request(self):
@@ -152,7 +152,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self.log_date_time_string(), fmt % args))
 
     def log_message(self, fmt, *args):
-        self.logger.debug("{} {} - - [{}] {}".format(self.__class__.__name__,
+        self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__,
             self.address_string(), self.log_date_time_string(), fmt % args))
 
diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py
index 68184b2..7fcebbd 100644
--- a/warcprox/warcprox.py
+++ b/warcprox/warcprox.py
@@ -216,7 +216,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
         return recorded_url
 
-    def _handle_custom_record(self, type_):
+    def _handle_custom_record(self, method, type_):
         self.url = self.path
 
@@ -230,7 +230,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                     remote_ip=b'',
                     warcprox_meta=warcprox_meta,
                     content_type=self.headers['Content-Type'].encode('latin1'),
-                    custom_type=type_)
+                    custom_type=type_,
+                    method=method,
+                    status=204, size=len(request_data))
 
             self.server.recorded_url_q.put(rec_custom)
             self.send_response(204, 'OK')
@@ -239,6 +241,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
         self.end_headers()
 
+    def log_error(self, fmt, *args):
+        # logging better handled elsewhere?
+        pass
+
+    def log_message(self, fmt, *args):
+        # logging better handled elsewhere?
+        pass
 
 class RecordedUrl(object):
     def __init__(self, url, request_data, response_recorder, remote_ip,
@@ -271,7 +280,6 @@ class RecordedUrl(object):
         self.status = status
         self.size = size
 
-
 class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
     logger = logging.getLogger("warcprox.warcprox.WarcProxy")
 
diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py
index 4736a86..7146904 100644
--- a/warcprox/warcwriter.py
+++ b/warcprox/warcwriter.py
@@ -240,13 +240,16 @@ class WarcWriter:
             recorded_url.response_recorder.tempfile.close()
 
         self._last_activity = time.time()
-
+
+        try:
+            payload_digest = recordset[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
+        except:
+            payload_digest = "-"
         # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
         self.logger.info("{} {} {} size={} {} {} offset={}".format(
-            recorded_url.status, recorded_url.method,
+            recorded_url.status, recorded_url.method,
             recorded_url.url.decode('utf-8'), recorded_url.size,
-            recordset[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8"),
-            self._f_finalname, recordset_offset))
+            payload_digest, self._f_finalname, recordset_offset))
 
     def write_records(self, recorded_url):
         recordset = self.build_warc_records(recorded_url)

From eb7de9d3f9399b0ec851d410d491d2a032bfc964 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 23 Jul 2015 00:37:02 +0000
Subject: [PATCH 004/146] catch exception handling special request (currently
 that means PUTMETA)

---
 warcprox/mitmproxy.py |  6 +++---
 warcprox/warcprox.py  | 43 ++++++++++++++++++++++++------------------
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index d6fe96d..74b6ca7 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -117,10 +117,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     def do_COMMAND(self):
         if not self.is_connect:
             if self.command == 'PUTMETA':
-                self._prepare_custom_record(method=self.command, type_='metadata')
+                self._special_request(method=self.command, type_='metadata')
                 return
             # if self.command == 'PUTRES':
-            #     self._handle_custom_record(type_='resource')
+            #     self._special_request(type_='resource')
             #     return
@@ -137,7 +137,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
 
         self._proxy_request()
 
-    def _handle_custom_record(self, method, type_):
+    def _special_request(self, method, type_):
         raise Exception('Not supported')
 
     def _proxy_request(self):
diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py
index 7fcebbd..59115fb 100644
--- a/warcprox/warcprox.py
+++ b/warcprox/warcprox.py
@@ -216,30 +216,35 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
         return recorded_url
 
-    def _handle_custom_record(self, method, type_):
-        self.url = self.path
+    def _special_request(self, method, type_):
+        try:
+            self.url = self.path
 
-        if 'Content-Length' in self.headers and 'Content-Type' in self.headers:
-            request_data = self.rfile.read(int(self.headers['Content-Length']))
+            if (method == 'PUTMETA' and 'Content-Length' in self.headers
+                    and 'Content-Type' in self.headers):
+                request_data = self.rfile.read(int(self.headers['Content-Length']))
 
-            warcprox_meta = self.headers.get('Warcprox-Meta')
+                warcprox_meta = self.headers.get('Warcprox-Meta')
 
-            rec_custom = RecordedUrl(url=self.url,
-                    request_data=request_data,
-                    response_recorder=None,
-                    remote_ip=b'',
-                    warcprox_meta=warcprox_meta,
-                    content_type=self.headers['Content-Type'].encode('latin1'),
-                    custom_type=type_,
-                    method=method,
-                    status=204, size=len(request_data))
+                rec_custom = RecordedUrl(url=self.url,
+                        request_data=request_data,
+                        response_recorder=None,
+                        remote_ip=b'',
+                        warcprox_meta=warcprox_meta,
+                        content_type=self.headers['Content-Type'].encode('latin1'),
+                        custom_type=type_,
+                        method=method,
+                        status=204, size=len(request_data))
 
-            self.server.recorded_url_q.put(rec_custom)
-            self.send_response(204, 'OK')
-        else:
-            self.send_error(400, 'Bad request')
+                self.server.recorded_url_q.put(rec_custom)
+                self.send_response(204, 'OK')
+            else:
+                self.send_error(400, 'Bad request')
 
-        self.end_headers()
+            self.end_headers()
+        except:
+            self.logger.error("uncaught except in _special_request", exc_info=True)
+            raise
 
     def log_error(self, fmt, *args):
         # logging better handled elsewhere?

From 86eab2119ae66a1b8cdc3a37606f7882399b18e6 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 24 Jul 2015 01:39:11 +0000
Subject: [PATCH 005/146] logging and exception handling tweaks

---
 warcprox/mitmproxy.py  | 18 ++++++++----------
 warcprox/warcprox.py   | 18 ++++++++++++------
 warcprox/warcwriter.py | 23 +++++++++++++++++++----
 3 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 74b6ca7..b1a9a5b 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -32,7 +32,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         self.url = self.path
         u = urllib_parse.urlparse(self.url)
         if u.scheme != 'http':
-            raise Exception('Unknown scheme %s' % repr(u.scheme))
+            raise Exception('unable to parse request "{}" as a proxy request'.format(self.requestline))
         self.hostname = u.hostname
         self.port = u.port or 80
         self.path = urllib_parse.urlunparse(
@@ -83,6 +83,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self._transition_to_ssl()
         except Exception as e:
             try:
+                self.logger.error("problem with connect line {}: {}".format(repr(self.requestline), e))
                 if type(e) is socket.timeout:
                     self.send_error(504, str(e))
                 else:
@@ -129,13 +130,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 self._connect_to_host()
                 assert self.url
             except Exception as e:
+                self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e))
                 self.send_error(500, str(e))
                 return
         else:
             # if self.is_connect we already connected in do_CONNECT
             self.url = self._construct_tunneled_url()
 
-        self._proxy_request()
+        try:
+            self._proxy_request()
+        except:
+            self.logger.error("exception from {}".format(self._proxy_request), exc_info=True)
+            raise
 
     def _special_request(self, method, type_):
         raise Exception('Not supported')
@@ -147,12 +153,4 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             if item.startswith('do_'):
                 return self.do_COMMAND
 
-    def log_error(self, fmt, *args):
-        self.logger.error("{0} - - [{1}] {2}".format(self.address_string(),
-            self.log_date_time_string(), fmt % args))
-
-    def log_message(self, fmt, *args):
-        self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__,
-            self.address_string(), self.log_date_time_string(), fmt % args))
-
diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py
index 59115fb..4f0cafe 100644
--- a/warcprox/warcprox.py
+++ b/warcprox/warcprox.py
@@ -210,8 +210,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
         recorded_url = RecordedUrl(url=self.url, request_data=req,
                 response_recorder=h.recorder, remote_ip=remote_ip,
-                warcprox_meta=warcprox_meta, method=self.command,
-                status=h.status, size=h.recorder.len)
+                warcprox_meta=warcprox_meta,
+                status=h.status, size=h.recorder.len,
+                client_ip=self.client_address[0],
+                content_type=h.getheader("Content-Type"),
+                method=self.command)
         self.server.recorded_url_q.put(recorded_url)
 
         return recorded_url
@@ -233,8 +236,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                     warcprox_meta=warcprox_meta,
                     content_type=self.headers['Content-Type'].encode('latin1'),
                     custom_type=type_,
-                    method=method,
-                    status=204, size=len(request_data))
+                    status=204, size=len(request_data),
+                    client_ip=self.client_address[0],
+                    method=method)
 
                 self.server.recorded_url_q.put(rec_custom)
                 self.send_response(204, 'OK')
@@ -254,10 +258,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         # logging better handled elsewhere?
         pass
 
+
 class RecordedUrl(object):
     def __init__(self, url, request_data, response_recorder, remote_ip,
             warcprox_meta=None, content_type=None, custom_type=None,
-            method=None, status=None, size=None):
+            status=None, size=None, client_ip=None, method=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -281,9 +286,10 @@ class RecordedUrl(object):
 
         self.content_type = content_type
         self.custom_type = custom_type
-        self.method = method
         self.status = status
         self.size = size
+        self.client_ip = client_ip
+        self.method = method
 
 class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
     logger = logging.getLogger("warcprox.warcprox.WarcProxy")
diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py
index 7146904..936e485 100644
--- a/warcprox/warcwriter.py
+++ b/warcprox/warcwriter.py
@@ -226,6 +226,12 @@ class WarcWriter:
 
         return self._f
 
+    def _decode(self, x):
+        if isinstance(x, bytes):
+            return x.decode("utf-8")
+        else:
+            return x
+
     def _final_tasks(self, recorded_url, recordset, recordset_offset):
         if (self.dedup_db is not None
                 and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
@@ -245,11 +251,20 @@ class WarcWriter:
             payload_digest = recordset[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
         except:
             payload_digest = "-"
+        mimetype = self._decode(recorded_url.content_type)
+        mimetype = mimetype[:mimetype.find(";")]
+
         # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
-        self.logger.info("{} {} {} size={} {} {} offset={}".format(
-            recorded_url.status, recorded_url.method,
-            recorded_url.url.decode('utf-8'), recorded_url.size,
-            payload_digest, self._f_finalname, recordset_offset))
+        self.logger.info("{} {} {} {} {} size={} {} {} offset={}".format(
+            self._decode(recorded_url.client_ip),
+            self._decode(recorded_url.status),
+            self._decode(recorded_url.method),
+            self._decode(recorded_url.url),
+            mimetype,
+            recorded_url.size,
+            self._decode(payload_digest),
+            self._decode(self._f_finalname),
+            recordset_offset))
 
     def write_records(self, recorded_url):
         recordset = self.build_warc_records(recorded_url)

From 084bd75ed61c63496c2b15b1c9ab720d288e6ffd Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 24 Jul 2015 20:46:23 +0000
Subject: [PATCH 006/146] dump thread tracebacks on sigquit, more logging and
 exception handling tweaks

---
 warcprox/controller.py | 16 +++-------
 warcprox/main.py       | 21 ++++++++++++++++
 warcprox/mitmproxy.py  |  4 +--
 warcprox/warcprox.py   | 15 +++++++-----
 warcprox/warcwriter.py | 55 +++++++++++++++++++++++-------------------
 5 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/warcprox/controller.py b/warcprox/controller.py
index 26e88fc..89d420d 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -4,7 +4,6 @@ from __future__ import absolute_import
 
 import logging
 import threading
-import signal
 import time
 
 import warcprox.warcprox
@@ -36,12 +35,10 @@ class WarcproxController(object):
 
         self.playback_proxy = playback_proxy
 
-
     def run_until_shutdown(self):
-        """Start warcprox and run until shut down.
-
-        If running in the main thread, SIGTERM initiates a graceful shutdown.
-        Otherwise, call warcprox_controller.stop.set().
+        """
+        Start warcprox and run until shut down. Call
+        warcprox_controller.stop.set() to initiate graceful shutdown.
         """
         proxy_thread = threading.Thread(target=self.proxy.serve_forever, name='ProxyThread')
         proxy_thread.start()
@@ -53,16 +50,11 @@ class WarcproxController(object):
 
         self.stop = threading.Event()
 
-        try:
-            signal.signal(signal.SIGTERM, self.stop.set)
-            self.logger.info('SIGTERM will initiate graceful shutdown')
-        except ValueError:
-            pass
-
         try:
             while not self.stop.is_set():
                 time.sleep(0.5)
         except:
+            self.logger.critical("fatal exception, shutting down", exc_info=1)
             pass
         finally:
             self.warc_writer_thread.stop.set()
diff --git a/warcprox/main.py b/warcprox/main.py
index 147a030..0ab6885 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -14,6 +14,10 @@ import hashlib
 import argparse
 import os
 import socket
+import pprint
+import traceback
+import signal
+import threading
 
 import certauth.certauth
 
@@ -76,6 +80,18 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
 
     return arg_parser
 
+
+def dump_state(signum=None, frame=None):
+    pp = pprint.PrettyPrinter(indent=4)
+    state_strs = []
+
+    for th in threading.enumerate():
+        state_strs.append(str(th))
+        stack = traceback.format_stack(sys._current_frames()[th.ident])
+        state_strs.append("".join(stack))
+
+    logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))
+
+
 def main(argv=sys.argv):
     arg_parser = _build_arg_parser(prog=os.path.basename(argv[0]))
     args = arg_parser.parse_args(args=argv[1:])
@@ -133,6 +149,11 @@ def main(argv=sys.argv):
             default_warc_writer=default_warc_writer)
 
     controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy)
+
+    signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
+    signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
+    signal.signal(signal.SIGQUIT, dump_state)
+
     controller.run_until_shutdown()
 
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index b1a9a5b..edc9657 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -83,13 +83,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self._transition_to_ssl()
         except Exception as e:
             try:
-                self.logger.error("problem with connect line {}: {}".format(repr(self.requestline), e))
+                self.logger.error("problem with connect line {}: {}".format(repr(self.requestline), e), exc_info=True)
                 if type(e) is socket.timeout:
                     self.send_error(504, str(e))
                 else:
                     self.send_error(500, str(e))
             except Exception as f:
-                self.logger.warn("failed to send error response ({}) to proxy client: {}".format(e, f))
+                self.logger.warn("failed to send error response ({}) to proxy client: {}".format(e, f), exc_info=True)
             return
 
         # Reload!
diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py
index 4f0cafe..930a290 100644
--- a/warcprox/warcprox.py
+++ b/warcprox/warcprox.py
@@ -48,7 +48,7 @@ class ProxyingRecorder(object):
 
     logger = logging.getLogger("warcprox.warcprox.ProxyingRecorder")
 
-    def __init__(self, fp, proxy_dest, digest_algorithm='sha1'):
+    def __init__(self, fp, proxy_dest, digest_algorithm='sha1', url=None):
         self.fp = fp
         # "The file has no name, and will cease to exist when it is closed."
         self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
@@ -60,6 +60,7 @@ class ProxyingRecorder(object):
         self._proxy_dest_conn_open = True
         self._prev_hunk_last_two_bytes = b''
         self.len = 0
+        self.url = url
 
     def _update_payload_digest(self, hunk):
         if self.payload_digest is None:
@@ -103,8 +104,8 @@ class ProxyingRecorder(object):
                 self.proxy_dest.sendall(hunk)
             except BaseException as e:
                 self._proxy_dest_conn_open = False
-                self.logger.warn('{} sending data to proxy client'.format(e))
-                self.logger.info('will continue downloading from remote server without sending to client')
+                self.logger.warn('{} sending data to proxy client for url {}'.format(e, self.url))
+                self.logger.info('will continue downloading from remote server without sending to client {}'.format(self.url))
 
         self.len += len(hunk)
@@ -140,12 +141,13 @@ class ProxyingRecorder(object):
 
 class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
 
-    def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1'):
+    def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1', url=None):
         http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method)
+        self.url = url
 
         # Keep around extra reference to self.fp because HTTPResponse sets
         # self.fp=None after it finishes reading, but we still need it
-        self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm)
+        self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm, url=url)
         self.fp = self.recorder
 
@@ -193,7 +195,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
         # Proxy and record the response
         h = ProxyingRecordingHTTPResponse(self._proxy_sock,
                 proxy_dest=self.connection,
-                digest_algorithm=self.server.digest_algorithm)
+                digest_algorithm=self.server.digest_algorithm,
+                url=self.url)
         h.begin()
 
         buf = h.read(8192)
diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py
index 936e485..0d57bda 100644
--- a/warcprox/warcwriter.py
+++ b/warcprox/warcwriter.py
@@ -252,7 +252,10 @@ class WarcWriter:
         except:
             payload_digest = "-"
         mimetype = self._decode(recorded_url.content_type)
-        mimetype = mimetype[:mimetype.find(";")]
+        if mimetype:
+            n = mimetype.find(";")
+            if n >= 0:
+                mimetype = mimetype[:n]
 
         # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
         self.logger.info("{} {} {} {} {} size={} {} {} offset={}".format(
@@ -327,32 +330,34 @@ class WarcWriterThread(threading.Thread):
             w.write_records(recorded_url)
 
     def run(self):
-        self.logger.info('WarcWriterThread starting, directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format(
-                os.path.abspath(self.default_warc_writer.directory), self.default_warc_writer.gzip, self.default_warc_writer.rollover_size,
-                self.default_warc_writer.rollover_idle_time, self.default_warc_writer.prefix, self.default_warc_writer.port))
+        try:
+            self.logger.info('WarcWriterThread starting, directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format(
+                    os.path.abspath(self.default_warc_writer.directory), self.default_warc_writer.gzip, self.default_warc_writer.rollover_size,
+                    self.default_warc_writer.rollover_idle_time, self.default_warc_writer.prefix, self.default_warc_writer.port))
 
-        self._last_sync = time.time()
+            self._last_sync = time.time()
 
-        while not self.stop.is_set():
-            try:
-                recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
-                self.write_records(recorded_url)
-            except queue.Empty:
-                self.default_warc_writer.maybe_idle_rollover()
-                for w in self.warc_writers.values():
-                    w.maybe_idle_rollover()
+            while not self.stop.is_set():
+                try:
+                    recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
+                    self.write_records(recorded_url)
+                except queue.Empty:
+                    self.default_warc_writer.maybe_idle_rollover()
+                    for w in self.warc_writers.values():
+                        w.maybe_idle_rollover()
 
-                # XXX prob doesn't belong here (do we need it at all?)
-                if time.time() - self._last_sync > 60:
-                    if self.default_warc_writer.dedup_db:
-                        self.default_warc_writer.dedup_db.sync()
-                    if self.default_warc_writer.playback_index_db:
-                        self.default_warc_writer.playback_index_db.sync()
-                    self._last_sync = time.time()
-
-        self.logger.info('WarcWriterThread shutting down')
-        self.default_warc_writer.close_writer()
-        for w in self.warc_writers.values():
-            w.close_writer()
+                    # XXX prob doesn't belong here (do we need it at all?)
+                    if time.time() - self._last_sync > 60:
+                        if self.default_warc_writer.dedup_db:
+                            self.default_warc_writer.dedup_db.sync()
+                        if self.default_warc_writer.playback_index_db:
+                            self.default_warc_writer.playback_index_db.sync()
+                        self._last_sync = time.time()
 
+            self.logger.info('WarcWriterThread shutting down')
+            self.default_warc_writer.close_writer()
+            for w in self.warc_writers.values():
+                w.close_writer()
+        except:
+            self.logger.critical("WarcWriterThread shutting down after unexpected error", exc_info=True)
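[editor's note: patch 006 moves signal handling out of the controller and into main(): SIGTERM and SIGINT set controller.stop for graceful shutdown, and SIGQUIT invokes dump_state(), which logs a stack trace for every live thread without stopping the process. To get a thread dump from a running warcprox, one could send the signal from Python (warcprox_pid is hypothetical):

    import os
    import signal

    # warcprox logs each thread's stack and keeps running
    os.kill(warcprox_pid, signal.SIGQUIT)
]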
From 771383d0a6b762cffb11eef2faefc0c5d128e8bd Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 28 Jul 2015 01:05:08 +0000
Subject: [PATCH 007/146] refactor proxy handler to use do_* methods for
 custom http verbs; refactor warc writer thread to use new WarcWriterPool
 class

---
 warcprox/controller.py |   4 +-
 warcprox/main.py       |   3 +-
 warcprox/mitmproxy.py  |   9 +--
 warcprox/warcprox.py   |  20 ++++---
 warcprox/warcwriter.py | 127 +++++++++++++++++++++++------------------
 5 files changed, 90 insertions(+), 73 deletions(-)

diff --git a/warcprox/controller.py b/warcprox/controller.py
index 89d420d..db76135 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -61,8 +61,8 @@ class WarcproxController(object):
         self.proxy.shutdown()
         self.proxy.server_close()
 
-        if self.warc_writer_thread.default_warc_writer.dedup_db is not None:
-            self.warc_writer_thread.default_warc_writer.dedup_db.close()
+        if self.warc_writer_thread.writer_pool.default_warc_writer.dedup_db is not None:
+            self.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.close()
 
         if self.playback_proxy is not None:
             self.playback_proxy.shutdown()
diff --git a/warcprox/main.py b/warcprox/main.py
index 0ab6885..7a9f0b3 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -145,8 +145,9 @@ def main(argv=sys.argv):
             dedup_db=dedup_db, digest_algorithm=args.digest_algorithm,
             playback_index_db=playback_index_db,
             rollover_idle_time=int(args.rollover_idle_time) if args.rollover_idle_time is not None else None)
+    writer_pool=warcprox.warcwriter.WarcWriterPool(default_warc_writer)
     warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q,
-            default_warc_writer=default_warc_writer)
+            writer_pool=writer_pool)
 
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index edc9657..9d57b44 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -83,7 +83,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self._transition_to_ssl()
         except Exception as e:
             try:
-                self.logger.error("problem with connect line {}: {}".format(repr(self.requestline), e), exc_info=True)
+                self.logger.error("problem handling {}: {}".format(repr(self.requestline), e), exc_info=True)
                 if type(e) is socket.timeout:
                     self.send_error(504, str(e))
                 else:
@@ -117,13 +117,6 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
 
     def do_COMMAND(self):
         if not self.is_connect:
-            if self.command == 'PUTMETA':
-                self._special_request(method=self.command, type_='metadata')
-                return
-            # if self.command == 'PUTRES':
-            #     self._special_request(type_='resource')
-            #     return
-
             try:
                 # Connect to destination
                 self._determine_host_port()
diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py
index 930a290..c5d6b4e 100644
--- a/warcprox/warcprox.py
+++ b/warcprox/warcprox.py
@@ -36,6 +36,7 @@ import traceback
 import hashlib
 import json
 import socket
+from hanzo import warctools
 
 from certauth.certauth import CertificateAuthority
 
 import warcprox.mitmproxy
@@ -179,7 +180,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         if 'Content-Length' in self.headers:
             req += self.rfile.read(int(self.headers['Content-Length']))
 
-        self.logger.debug('req={}'.format(repr(req)))
+        self.logger.debug('sending to remote server req={}'.format(repr(req)))
 
         # Send it down the pipe!
         self._proxy_sock.sendall(req)
@@ -222,12 +223,17 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
         return recorded_url
 
-    def _special_request(self, method, type_):
+    # deprecated
+    def do_PUTMETA(self):
+        self.do_WARCPROX_WRITE_RECORD(warc_type=warctools.WarcRecord.METADATA)
+
+    def do_WARCPROX_WRITE_RECORD(self, warc_type=None):
         try:
             self.url = self.path
 
-            if (method == 'PUTMETA' and 'Content-Length' in self.headers
-                    and 'Content-Type' in self.headers):
+            if ('Content-Length' in self.headers and 'Content-Type' in self.headers
+                    and (warc_type or 'WARC-Type' in self.headers)):
+                # stream this?
                 request_data = self.rfile.read(int(self.headers['Content-Length']))
 
                 warcprox_meta = self.headers.get('Warcprox-Meta')
@@ -238,10 +244,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                         remote_ip=b'',
                         warcprox_meta=warcprox_meta,
                         content_type=self.headers['Content-Type'].encode('latin1'),
-                        custom_type=type_,
+                        custom_type=warc_type or self.headers['WARC-Type'],
                         status=204, size=len(request_data),
                         client_ip=self.client_address[0],
-                        method=method)
+                        method=self.command)
 
                 self.server.recorded_url_q.put(rec_custom)
                 self.send_response(204, 'OK')
@@ -250,7 +256,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
 
             self.end_headers()
         except:
-            self.logger.error("uncaught except in _special_request", exc_info=True)
+            self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
             raise
 
     def log_error(self, fmt, *args):
diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py
index 0d57bda..ac69cb9 100644
--- a/warcprox/warcwriter.py
+++ b/warcprox/warcwriter.py
@@ -52,21 +52,8 @@ class WarcWriter:
             self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory))
             os.mkdir(directory)
 
-    # returns a tuple (principal_record, request_record) where principal_record is either a response or revisit record
-    def build_warc_records(self, recorded_url):
-        warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
-
-        dedup_info = None
-
-        # metadata special case
-        if recorded_url.custom_type == 'metadata':
-            metadata_rec = self.build_warc_record(url=recorded_url.url,
-                    warc_date=warc_date,
-                    data=recorded_url.request_data,
-                    warc_type=warctools.WarcRecord.METADATA,
-                    content_type=recorded_url.content_type)
-            return [metadata_rec]
-
+    def _build_response_principal_record(self, recorded_url, warc_date):
+        """Builds response or revisit record, whichever is appropriate."""
         if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
             key = self.digest_str(recorded_url.response_recorder.payload_digest)
             dedup_info = self.dedup_db.lookup(key)
@@ -79,7 +66,7 @@ class WarcWriter:
             else:
                 response_header_block = recorded_url.response_recorder.tempfile.read()
 
-            principal_record = self.build_warc_record(
+            return self.build_warc_record(
                     url=recorded_url.url, warc_date=warc_date,
                     data=response_header_block,
                     warc_type=warctools.WarcRecord.REVISIT,
@@ -92,21 +79,31 @@ class WarcWriter:
                     remote_ip=recorded_url.remote_ip)
         else:
             # response record
-            principal_record = self.build_warc_record(
+            return self.build_warc_record(
                     url=recorded_url.url, warc_date=warc_date,
                     recorder=recorded_url.response_recorder,
                     warc_type=warctools.WarcRecord.RESPONSE,
                     content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                     remote_ip=recorded_url.remote_ip)
 
-        request_record = self.build_warc_record(
-                url=recorded_url.url, warc_date=warc_date,
-                data=recorded_url.request_data,
-                warc_type=warctools.WarcRecord.REQUEST,
-                content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
-                concurrent_to=principal_record.id)
+    # returns a tuple (principal_record, ...)
+    def build_warc_records(self, recorded_url):
+        warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
 
-        return principal_record, request_record
+        if recorded_url.response_recorder:
+            principal_record = self._build_response_principal_record(recorded_url, warc_date)
+            request_record = self.build_warc_record(url=recorded_url.url,
+                    warc_date=warc_date, data=recorded_url.request_data,
+                    warc_type=warctools.WarcRecord.REQUEST,
+                    content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
+                    concurrent_to=principal_record.id)
+            return principal_record, request_record
+        else:
+            principal_record = self.build_warc_record(url=recorded_url.url,
+                    warc_date=warc_date, data=recorded_url.request_data,
+                    warc_type=recorded_url.custom_type,
+                    content_type=recorded_url.content_type)
+            return (principal_record,)
 
     def digest_str(self, hash_obj):
         return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii'))
@@ -278,11 +275,11 @@ class WarcWriter:
         for record in recordset:
             offset = writer.tell()
             record.write_to(writer, gzip=self.gzip)
-            self.logger.debug('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format(
+            self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d',
                     record.get_header(warctools.WarcRecord.TYPE),
                     record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
                     record.get_header(warctools.WarcRecord.URL),
-                    self._fpath, offset))
+                    self._fpath, offset)
 
         self._f.flush()
@@ -296,21 +293,23 @@ class WarcWriter:
             self.logger.debug('rolling over {} after {} seconds idle'.format(self._f_finalname, time.time() - self._last_activity))
             self.close_writer()
 
-class WarcWriterThread(threading.Thread):
+class WarcWriterPool:
     logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
 
-    def __init__(self, recorded_url_q=None, default_warc_writer=None):
-        """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
-        threading.Thread.__init__(self, name='WarcWriterThread')
-        self.recorded_url_q = recorded_url_q
-        self.stop = threading.Event()
+    def __init__(self, default_warc_writer):
         if default_warc_writer:
             self.default_warc_writer = default_warc_writer
         else:
             self.default_warc_writer = WarcWriter()
         self.warc_writers = {}  # {prefix:WarcWriter}
+        self._last_sync = time.time()
 
+        self.logger.info('directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format(
+                os.path.abspath(self.default_warc_writer.directory), self.default_warc_writer.gzip, self.default_warc_writer.rollover_size,
+                self.default_warc_writer.rollover_idle_time, self.default_warc_writer.prefix, self.default_warc_writer.port))
+
+    # chooses writer for filename specified by warcprox_meta["warc-prefix"] if set
+    def _writer(self, recorded_url):
         w = self.default_warc_writer
         if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta:
             # self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
             prefix = recorded_url.warcprox_meta["warc-prefix"]
             if not prefix in self.warc_writers:
                 self.warc_writers[prefix] = WarcWriter(prefix=prefix,
                         directory=self.default_warc_writer.directory,
                         rollover_size=self.default_warc_writer.rollover_size,
                         rollover_idle_time=self.default_warc_writer.rollover_idle_time,
                         gzip=self.default_warc_writer.gzip,
                         port=self.default_warc_writer.port,
                         digest_algorithm=self.default_warc_writer.digest_algorithm,
                         base32=self.default_warc_writer.base32,
                         dedup_db=self.default_warc_writer.dedup_db,
                         playback_index_db=self.default_warc_writer.playback_index_db)
             w = self.warc_writers[prefix]
-        w.write_records(recorded_url)
+        return w
+
+    def write_records(self, recorded_url):
+        self._writer(recorded_url).write_records(recorded_url)
+
+    def maybe_idle_rollover(self):
+        self.default_warc_writer.maybe_idle_rollover()
+        for w in self.warc_writers.values():
+            w.maybe_idle_rollover()
+
+    def sync(self):
+        # XXX prob doesn't belong here (do we need it at all?)
+        if time.time() - self._last_sync > 60:
+            if self.default_warc_writer.dedup_db:
+                self.default_warc_writer.dedup_db.sync()
+            if self.default_warc_writer.playback_index_db:
+                self.default_warc_writer.playback_index_db.sync()
+            self._last_sync = time.time()
+
+    def close_writers(self):
+        self.default_warc_writer.close_writer()
+        for w in self.warc_writers.values():
+            w.close_writer()
+
+class WarcWriterThread(threading.Thread):
+    logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
+
+    def __init__(self, recorded_url_q=None, writer_pool=None):
+        """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
+        threading.Thread.__init__(self, name='WarcWriterThread')
+        self.recorded_url_q = recorded_url_q
+        self.stop = threading.Event()
+        if writer_pool:
+            self.writer_pool = writer_pool
+        else:
+            self.writer_pool = WarcWriterPool()
 
     def run(self):
         try:
             while not self.stop.is_set():
                 try:
                     recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
-                    self.write_records(recorded_url)
+                    self.writer_pool.write_records(recorded_url)
                 except queue.Empty:
-                    self.default_warc_writer.maybe_idle_rollover()
-                    for w in self.warc_writers.values():
-                        w.maybe_idle_rollover()
-
-                    # XXX prob doesn't belong here (do we need it at all?)
-                    if time.time() - self._last_sync > 60:
-                        if self.default_warc_writer.dedup_db:
-                            self.default_warc_writer.dedup_db.sync()
-                        if self.default_warc_writer.playback_index_db:
-                            self.default_warc_writer.playback_index_db.sync()
-                        self._last_sync = time.time()
+                    self.writer_pool.maybe_idle_rollover()
+                    self.writer_pool.sync()
 
             self.logger.info('WarcWriterThread shutting down')
-            self.default_warc_writer.close_writer()
-            for w in self.warc_writers.values():
-                w.close_writer()
+            self.writer_pool.close_writers()
         except:
             self.logger.critical("WarcWriterThread shutting down after unexpected error", exc_info=True)
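[editor's note: patch 007 drops the hardcoded PUTMETA special case in favor of BaseHTTPRequestHandler's standard do_* dispatch: do_WARCPROX_WRITE_RECORD writes a record whose type is taken from the WARC-Type request header, and do_PUTMETA survives only as a deprecated alias for metadata records. A client sketch — the proxy address is hypothetical, and 'resource' is one plausible WARC-Type value:

    import requests

    # png_bytes (hypothetical) becomes the payload of the resource record
    requests.request(
            'WARCPROX_WRITE_RECORD', 'http://example.com/screenshot.png',
            proxies={'http': 'http://localhost:8000'},
            headers={'Content-Type': 'image/png', 'WARC-Type': 'resource'},
            data=png_bytes)
]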
From d38ab08086fb714ba7a1c513863e84ea89fe55e2 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 28 Jul 2015 01:06:28 +0000
Subject: [PATCH 008/146] close connection to proxy client after proxying the
 request, seems to solve hanging connection issue (see comment in code)

---
 warcprox/warcprox.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py
index c5d6b4e..5a0dc58 100644
--- a/warcprox/warcprox.py
+++ b/warcprox/warcprox.py
@@ -212,6 +212,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         h.close()
         self._proxy_sock.close()
 
+        # XXX Close connection to proxy client. Doing this because we were
+        # seeing some connection hangs and this seems to solve that problem.
+        # Not clear what the correct, optimal behavior is.
+        self.connection.close()
+
         recorded_url = RecordedUrl(url=self.url, request_data=req,
                 response_recorder=h.recorder, remote_ip=remote_ip,
                 warcprox_meta=warcprox_meta,

From d3d23f9878d3b264d71d5c4eb2b8cb30a4c925ee Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 29 Jul 2015 20:13:55 +0000
Subject: [PATCH 009/146] convert test_warcprox.py to py.test with fixtures

---
 warcprox/tests/test_warcprox.py | 591 ++++++++++++++++----------------
 1 file changed, 296 insertions(+), 295 deletions(-)

diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py
index f263bef..66b0d0a 100755
--- a/warcprox/tests/test_warcprox.py
+++ b/warcprox/tests/test_warcprox.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # vim: set sw=4 et:
 
-import unittest
+import pytest
 import threading
 import time
 import logging
@@ -13,6 +13,7 @@
 import OpenSSL
 import os
 import shutil
 import requests
+import re
 
 try:
     import http.server as http_server
@@ -32,11 +33,9 @@
 import warcprox.playback
 import warcprox.warcwriter
 import warcprox.dedup
 
-class TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
-    logger = logging.getLogger('TestHttpRequestHandler')
-
+class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
     def do_GET(self):
-        self.logger.info('GET {}'.format(self.path))
+        logging.info('GET {}'.format(self.path))
 
         m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
         if m is not None:
@@ -57,358 +56,360 @@
         self.connection.sendall(headers)
         self.connection.sendall(payload)
 
+@pytest.fixture()
+def cert(request):
+    f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False)
 
-class WarcproxTest(unittest.TestCase):
-    logger = logging.getLogger('WarcproxTest')
+    def fin():
+        logging.info("deleting file %s", f.name)
+        os.unlink(f.name)
+    request.addfinalizer(fin)
 
-    def __init__(self, methodName='runTest'):
-        self.__cert = None
-        unittest.TestCase.__init__(self, methodName)
+    try:
+        key = OpenSSL.crypto.PKey()
+        key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
+        req = OpenSSL.crypto.X509Req()
+        req.get_subject().CN = 'localhost'
+        req.set_pubkey(key)
+        req.sign(key, 'sha1')
+        cert = OpenSSL.crypto.X509()
+        cert.set_subject(req.get_subject())
+        cert.set_serial_number(0)
+        cert.gmtime_adj_notBefore(0)
+        cert.gmtime_adj_notAfter(2*60*60) # valid for 2hrs
+        cert.set_issuer(cert.get_subject())
+        cert.set_pubkey(req.get_pubkey())
+        cert.sign(key, 'sha1')
 
-    @property
-    def _cert(self):
-        if self.__cert is None:
-            f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False)
-            try:
-                key = OpenSSL.crypto.PKey()
-                key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
-                req = OpenSSL.crypto.X509Req()
-                req.get_subject().CN = 'localhost'
-                req.set_pubkey(key)
-                req.sign(key, 'sha1')
-                cert = OpenSSL.crypto.X509()
-                cert.set_subject(req.get_subject())
-                cert.set_serial_number(0)
-                cert.gmtime_adj_notBefore(0)
-                cert.gmtime_adj_notAfter(2*60*60) # valid for 2hrs
-                cert.set_issuer(cert.get_subject())
-                cert.set_pubkey(req.get_pubkey())
-                cert.sign(key, 'sha1')
+        f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
+        f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
 
-                f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
-                f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
-
-                self.logger.info('generated self-signed certificate {}'.format(f.name))
-                self.__cert = f.name
-            finally:
-                f.close()
-
-            return self.__cert
-
-
-    def _start_http_servers(self):
-        self.http_daemon = http_server.HTTPServer(('localhost', 0),
-                RequestHandlerClass=TestHttpRequestHandler)
-        self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1]))
-        self.http_daemon_thread = threading.Thread(name='HttpdThread',
-                target=self.http_daemon.serve_forever)
-        self.http_daemon_thread.start()
-
-        # http://www.piware.de/2011/01/creating-an-https-server-in-python/
-        self.https_daemon = http_server.HTTPServer(('localhost', 0),
-                RequestHandlerClass=TestHttpRequestHandler)
-        # self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
-        self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True)
-        self.logger.info('starting https://{}:{}'.format(self.https_daemon.server_address[0], self.https_daemon.server_address[1]))
-        self.https_daemon_thread = threading.Thread(name='HttpdThread',
-                target=self.https_daemon.serve_forever)
-        self.https_daemon_thread.start()
-
-    def _start_warcprox(self):
-        f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True)
-        f.close() # delete it, or CertificateAuthority will try to read it
-        self._ca_file = f.name
-        self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
-        ca = certauth.certauth.CertificateAuthority(self._ca_file, self._ca_dir, 'warcprox-test')
-
-        recorded_url_q = queue.Queue()
-
-        proxy = warcprox.warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
-                recorded_url_q=recorded_url_q)
-
-        self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-')
-
-        f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False)
+        logging.info('generated self-signed certificate {}'.format(f.name))
+        return f.name
+    finally:
         f.close()
-        self._playback_index_db_file = f.name
-        playback_index_db = warcprox.playback.PlaybackIndexDb(self._playback_index_db_file)
-        playback_proxy = warcprox.playback.PlaybackProxy(server_address=('localhost', 0), ca=ca,
-                playback_index_db=playback_index_db, warcs_dir=self._warcs_dir)
 
-        f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False)
-        f.close()
-        self._dedup_db_file = f.name
-        dedup_db = warcprox.dedup.DedupDb(self._dedup_db_file)
+@pytest.fixture()
+def http_daemon(request):
+    http_daemon = http_server.HTTPServer(('localhost', 0),
+            RequestHandlerClass=_TestHttpRequestHandler)
+    logging.info('starting http://{}:{}'.format(http_daemon.server_address[0], http_daemon.server_address[1]))
+    http_daemon_thread = threading.Thread(name='HttpDaemonThread',
+            target=http_daemon.serve_forever)
+    http_daemon_thread.start()
 
-        warc_writer = warcprox.warcwriter.WarcWriter(directory=self._warcs_dir,
-                port=proxy.server_port, dedup_db=dedup_db,
-                playback_index_db=playback_index_db)
-        warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q,
-                warc_writer=warc_writer)
+    def fin():
+        logging.info("stopping http daemon")
+        http_daemon.shutdown()
+        http_daemon.server_close()
+        http_daemon_thread.join()
+    request.addfinalizer(fin)
 
-        self.warcprox = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy)
-        self.logger.info('starting warcprox')
-        self.warcprox_thread = threading.Thread(name='WarcproxThread',
-                target=self.warcprox.run_until_shutdown)
-        self.warcprox_thread.start()
+    return http_daemon
 
+@pytest.fixture()
+def https_daemon(request, cert):
+    # http://www.piware.de/2011/01/creating-an-https-server-in-python/
+    https_daemon = http_server.HTTPServer(('localhost', 0),
+            RequestHandlerClass=_TestHttpRequestHandler)
+    # https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
+    https_daemon.socket = ssl.wrap_socket(https_daemon.socket, certfile=cert, server_side=True)
+    logging.info('starting https://{}:{}'.format(https_daemon.server_address[0], https_daemon.server_address[1]))
+    https_daemon_thread = threading.Thread(name='HttpsDaemonThread',
+            target=https_daemon.serve_forever)
+    https_daemon_thread.start()
 
-    def setUp(self):
-        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
-                format='%(asctime)s %(levelname)s %(process)d %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
+    def fin():
+        logging.info("stopping https daemon")
+        https_daemon.shutdown()
+        https_daemon.server_close()
+        https_daemon_thread.join()
+    request.addfinalizer(fin)
 
-        self._start_http_servers()
-        self._start_warcprox()
+    return https_daemon
 
-        archiving_proxy = 'http://localhost:{}'.format(self.warcprox.proxy.server_port)
-        self.archiving_proxies = {'http':archiving_proxy, 'https':archiving_proxy}
+@pytest.fixture()
+def warcprox_(request):
+    f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True)
+    f.close() # delete it, or CertificateAuthority will try to read it
+    ca_file = f.name
+    ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
+    ca = certauth.certauth.CertificateAuthority(ca_file, ca_dir, 'warcprox-test')
 
-        playback_proxy = 'http://localhost:{}'.format(self.warcprox.playback_proxy.server_port)
-        self.playback_proxies = {'http':playback_proxy, 'https':playback_proxy}
+    recorded_url_q = queue.Queue()
 
+    proxy = warcprox.warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
+            recorded_url_q=recorded_url_q)
 
-    def tearDown(self):
-        self.logger.info('stopping warcprox')
-        self.warcprox.stop.set()
+    warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-')
 
-        self.logger.info('stopping http and https daemons')
-        self.http_daemon.shutdown()
-        self.https_daemon.shutdown()
-        self.http_daemon.server_close()
-        self.https_daemon.server_close()
+    f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False)
+    f.close()
+    playback_index_db_file = f.name
+    playback_index_db = warcprox.playback.PlaybackIndexDb(playback_index_db_file)
+    playback_proxy = warcprox.playback.PlaybackProxy(server_address=('localhost', 0), ca=ca,
+            playback_index_db=playback_index_db, warcs_dir=warcs_dir)
 
-        # Have to wait for threads to finish or the threads will try to use
-        # variables that no longer exist, resulting in errors like this:
-        #   File "/usr/lib/python2.7/SocketServer.py", line 235, in serve_forever
-        #     r, w, e = _eintr_retry(select.select, [self], [], [],
-        #   AttributeError: 'NoneType' object has no attribute 'select'
-        self.http_daemon_thread.join()
-        self.https_daemon_thread.join()
-        self.warcprox_thread.join()
+    f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False)
+    f.close()
+    dedup_db_file = f.name
+    dedup_db = warcprox.dedup.DedupDb(dedup_db_file)
 
-        for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file):
+    default_warc_writer = warcprox.warcwriter.WarcWriter(directory=warcs_dir,
+            port=proxy.server_port, dedup_db=dedup_db,
+            playback_index_db=playback_index_db)
+    writer_pool = warcprox.warcwriter.WarcWriterPool(default_warc_writer)
+    warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q,
+            writer_pool=writer_pool)
+
+    warcprox_ = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy)
+    logging.info('starting warcprox')
+    warcprox_thread = threading.Thread(name='WarcproxThread',
+            target=warcprox_.run_until_shutdown)
+    warcprox_thread.start()
+
+    def fin():
+        logging.info('stopping warcprox')
+        warcprox_.stop.set()
+        warcprox_thread.join()
+        for f in (ca_file, ca_dir, warcs_dir, playback_index_db_file, dedup_db_file):
             if os.path.isdir(f):
-                self.logger.info('deleting directory {}'.format(f))
+                logging.info('deleting directory {}'.format(f))
                 shutil.rmtree(f)
             else:
-                self.logger.info('deleting file {}'.format(f))
+                logging.info('deleting file {}'.format(f))
                 os.unlink(f)
+    request.addfinalizer(fin)
 
+    return warcprox_
 
+@pytest.fixture()
+def archiving_proxies(warcprox_):
+    archiving_proxy = 'http://localhost:{}'.format(warcprox_.proxy.server_port)
+    return {'http':archiving_proxy, 'https':archiving_proxy}
+
+@pytest.fixture()
+def playback_proxies(warcprox_):
+    playback_proxy = 'http://localhost:{}'.format(warcprox_.playback_proxy.server_port)
+    return {'http':playback_proxy, 'https':playback_proxy}
+
+# def tearDown(self):
+#     logging.info('stopping warcprox')
+#     self.warcprox.stop.set()
+#
+#     logging.info('stopping http and https daemons')
+#     self.http_daemon.shutdown()
+#     self.https_daemon.shutdown()
+#     self.http_daemon.server_close()
+#     self.https_daemon.server_close()
+#
+#     self.http_daemon_thread.join()
+#     self.https_daemon_thread.join()
+#     self.warcprox_thread.join()
+#
+#     for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file):
+#         if os.path.isdir(f):
+#             logging.info('deleting directory {}'.format(f))
+#             shutil.rmtree(f)
+#         else:
+#             logging.info('deleting file {}'.format(f))
+#             os.unlink(f)
 
-    def _test_httpds_no_proxy(self):
-        url = 'http://localhost:{}/'.format(self.http_daemon.server_port)
-        response = requests.get(url)
-        self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, b'404 Not Found\n')
 
-        url = 'https://localhost:{}/'.format(self.https_daemon.server_port)
-        response = requests.get(url, verify=False)
-        self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, b'404 Not Found\n')
 
-        url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
-        response = requests.get(url)
-        self.assertEqual(response.status_code, 200)
-        self.assertEqual(response.headers['warcprox-test-header'], 'a!')
-        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
 
-        url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
-        response = requests.get(url, verify=False)
-        self.assertEqual(response.status_code, 200)
-        self.assertEqual(response.headers['warcprox-test-header'], 'c!')
-        self.assertEqual(response.content, b'I am the warcprox test payload!
dddddddddd!\n') +def test_httpds_no_proxy(http_daemon, https_daemon): + url = 'http://localhost:{}/'.format(http_daemon.server_port) + response = requests.get(url) + assert response.status_code == 404 + assert response.content == b'404 Not Found\n' + url = 'https://localhost:{}/'.format(https_daemon.server_port) + response = requests.get(url, verify=False) + assert response.status_code == 404 + assert response.content == b'404 Not Found\n' - def poll_playback_until(self, url, status, timeout_sec): - start = time.time() - # check playback (warc writing is asynchronous, give it up to 10 sec) - while time.time() - start < timeout_sec: - response = requests.get(url, proxies=self.playback_proxies, verify=False) - if response.status_code == status: - break - time.sleep(0.5) + url = 'http://localhost:{}/a/b'.format(http_daemon.server_port) + response = requests.get(url) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'a!' + assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' - return response + url = 'https://localhost:{}/c/d'.format(https_daemon.server_port) + response = requests.get(url, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! dddddddddd!\n' +def _poll_playback_until(playback_proxies, url, status, timeout_sec): + start = time.time() + # check playback (warc writing is asynchronous, give it up to 10 sec) + while time.time() - start < timeout_sec: + response = requests.get(url, proxies=playback_proxies, verify=False) + if response.status_code == status: + break + time.sleep(0.5) - def _test_archive_and_playback_http_url(self): - url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port) + return response - # ensure playback fails before archiving - response = requests.get(url, proxies=self.playback_proxies) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not in Archive\n') +def test_archive_and_playback_http_url(http_daemon, archiving_proxies, playback_proxies): + url = 'http://localhost:{}/a/b'.format(http_daemon.server_port) - # archive - response = requests.get(url, proxies=self.archiving_proxies) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'a!') - self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n') + # ensure playback fails before archiving + response = requests.get(url, proxies=playback_proxies) + assert response.status_code == 404 + assert response.content == b'404 Not in Archive\n' - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'a!') - self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n') + # archive + response = requests.get(url, proxies=archiving_proxies) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'a!' + assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'a!' + assert response.content == b'I am the warcprox test payload! 
bbbbbbbbbb!\n' - def _test_archive_and_playback_https_url(self): - url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port) +def test_archive_and_playback_https_url(https_daemon, archiving_proxies, playback_proxies): + url = 'https://localhost:{}/c/d'.format(https_daemon.server_port) - # ensure playback fails before archiving - response = requests.get(url, proxies=self.playback_proxies, verify=False) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not in Archive\n') + # ensure playback fails before archiving + response = requests.get(url, proxies=playback_proxies, verify=False) + assert response.status_code == 404 + assert response.content == b'404 Not in Archive\n' - # fetch & archive response - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'c!') - self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n') + # fetch & archive response + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! dddddddddd!\n' - # test playback - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'c!') - self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n') + # test playback + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! dddddddddd!\n' +# test dedup of same http url with same payload +def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies): + url = 'http://localhost:{}/e/f'.format(http_daemon.server_port) - # test dedup of same http url with same payload - def _test_dedup_http(self): - url = 'http://localhost:{}/e/f'.format(self.http_daemon.server_port) + # ensure playback fails before archiving + response = requests.get(url, proxies=playback_proxies, verify=False) + assert response.status_code == 404 + assert response.content == b'404 Not in Archive\n' - # ensure playback fails before archiving - response = requests.get(url, proxies=self.playback_proxies, verify=False) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not in Archive\n') + # check not in dedup db + dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + assert dedup_lookup is None - # check not in dedup db - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') - self.assertIsNone(dedup_lookup) + # archive + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'e!' + assert response.content == b'I am the warcprox test payload! 
ffffffffff!\n' - # archive - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'e!') - self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n') + # test playback + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'e!' + assert response.content == b'I am the warcprox test payload! ffffffffff!\n' - # test playback - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'e!') - self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n') + # check in dedup db + # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} + dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + assert dedup_lookup['u'] == url.encode('ascii') + assert re.match(br'^$', dedup_lookup['i']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['d']) + record_id = dedup_lookup['i'] + dedup_date = dedup_lookup['d'] - # check in dedup db - # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') - self.assertEqual(dedup_lookup['u'], url.encode('ascii')) - self.assertRegexpMatches(dedup_lookup['i'], br'^$') - self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$') - record_id = dedup_lookup['i'] - dedup_date = dedup_lookup['d'] + # need revisit to have a later timestamp than original, else playing + # back the latest record might not hit the revisit + time.sleep(1.5) - # need revisit to have a later timestamp than original, else playing - # back the latest record might not hit the revisit - time.sleep(1.5) + # fetch & archive revisit + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'e!' + assert response.content == b'I am the warcprox test payload! ffffffffff!\n' - # fetch & archive revisit - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'e!') - self.assertEqual(response.content, b'I am the warcprox test payload! 
ffffffffff!\n') + # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ + time.sleep(2.0) - # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ - time.sleep(2.0) + # check in dedup db (no change from prev) + dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + assert dedup_lookup['u'] == url.encode('ascii') + assert dedup_lookup['i'] == record_id + assert dedup_lookup['d'] == dedup_date - # check in dedup db (no change from prev) - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') - self.assertEqual(dedup_lookup['u'], url.encode('ascii')) - self.assertEqual(dedup_lookup['i'], record_id) - self.assertEqual(dedup_lookup['d'], dedup_date) + # test playback + logging.debug('testing playback of revisit of {}'.format(url)) + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'e!' + assert response.content == b'I am the warcprox test payload! ffffffffff!\n' + # XXX how to check dedup was used? - # test playback - self.logger.debug('testing playback of revisit of {}'.format(url)) - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'e!') - self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n') - # XXX how to check dedup was used? +# test dedup of same https url with same payload +def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxies): + url = 'https://localhost:{}/g/h'.format(https_daemon.server_port) + # ensure playback fails before archiving + response = requests.get(url, proxies=playback_proxies, verify=False) + assert response.status_code == 404 + assert response.content == b'404 Not in Archive\n' - # test dedup of same https url with same payload - def _test_dedup_https(self): - url = 'https://localhost:{}/g/h'.format(self.https_daemon.server_port) + # check not in dedup db + dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + assert dedup_lookup is None - # ensure playback fails before archiving - response = requests.get(url, proxies=self.playback_proxies, verify=False) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not in Archive\n') + # archive + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'g!' + assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' - # check not in dedup db - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') - self.assertIsNone(dedup_lookup) + # test playback + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'g!' + assert response.content == b'I am the warcprox test payload! 
hhhhhhhhhh!\n' - # archive - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'g!') - self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n') + # check in dedup db + # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} + dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + assert dedup_lookup['u'] == url.encode('ascii') + assert re.match(br'^$', dedup_lookup['i']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['d']) + record_id = dedup_lookup['i'] + dedup_date = dedup_lookup['d'] - # test playback - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'g!') - self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n') + # need revisit to have a later timestamp than original, else playing + # back the latest record might not hit the revisit + time.sleep(1.5) - # check in dedup db - # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') - self.assertEqual(dedup_lookup['u'], url.encode('ascii')) - self.assertRegexpMatches(dedup_lookup['i'], br'^$') - self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$') - record_id = dedup_lookup['i'] - dedup_date = dedup_lookup['d'] + # fetch & archive revisit + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'g!' + assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' - # need revisit to have a later timestamp than original, else playing - # back the latest record might not hit the revisit - time.sleep(1.5) + # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ + time.sleep(2.0) - # fetch & archive revisit - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'g!') - self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n') - - # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ - time.sleep(2.0) - - # check in dedup db (no change from prev) - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') - self.assertEqual(dedup_lookup['u'], url.encode('ascii')) - self.assertEqual(dedup_lookup['i'], record_id) - self.assertEqual(dedup_lookup['d'], dedup_date) - - # test playback - self.logger.debug('testing playback of revisit of {}'.format(url)) - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'g!') - self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n') - # XXX how to check dedup was used? 
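Aside on the dedup keys these tests pass to dedup_db.lookup(): they are payload digests in the form the warc writer uses as its dedup key, i.e. the hash algorithm name, a colon, and the hex digest of the response payload (base32-encoded instead when base32=True). A minimal sketch, not part of the patch, of deriving the key for the /e/f test payload, assuming the default hex form:

    import hashlib

    payload = b'I am the warcprox test payload! ffffffffff!\n'
    h = hashlib.sha1(payload)
    key = h.name.encode('utf-8') + b':' + h.hexdigest().encode('ascii')
    # the same key the test above looks up before and after archiving
    assert key == b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc'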
- - - # run everything from here, otherwise it wants to setUp() and tearDown - # around each test - def runTest(self): - self._test_httpds_no_proxy() - self._test_archive_and_playback_http_url() - self._test_archive_and_playback_https_url() - self._test_dedup_http() - self._test_dedup_https() - # self._test_dedup_mixed_http() - # self._test_dedup_mixed_https() + # check in dedup db (no change from prev) + dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + assert dedup_lookup['u'] == url.encode('ascii') + assert dedup_lookup['i'] == record_id + assert dedup_lookup['d'] == dedup_date + # test playback + logging.debug('testing playback of revisit of {}'.format(url)) + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'g!' + assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' + # XXX how to check dedup was used? if __name__ == '__main__': - unittest.main() + pytest.main() From 89fab332959ab2972b5a8c552f1d98d890c49261 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 29 Jul 2015 20:16:03 +0000 Subject: [PATCH 010/146] remove old unused, commented out tearDown method --- warcprox/tests/test_warcprox.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 66b0d0a..0b05080 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -193,28 +193,6 @@ def playback_proxies(warcprox_): playback_proxy = 'http://localhost:{}'.format(warcprox_.playback_proxy.server_port) return {'http':playback_proxy, 'https':playback_proxy} -# def tearDown(self): -# logging.info('stopping warcprox') -# self.warcprox.stop.set() -# -# logging.info('stopping http and https daemons') -# self.http_daemon.shutdown() -# self.https_daemon.shutdown() -# self.http_daemon.server_close() -# self.https_daemon.server_close() -# -# self.http_daemon_thread.join() -# self.https_daemon_thread.join() -# self.warcprox_thread.join() -# -# for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file): -# if os.path.isdir(f): -# logging.info('deleting directory {}'.format(f)) -# shutil.rmtree(f) -# else: -# logging.info('deleting file {}'.format(f)) -# os.unlink(f) - def test_httpds_no_proxy(http_daemon, https_daemon): url = 'http://localhost:{}/'.format(http_daemon.server_port) response = requests.get(url) From 10c724637f34eb8e7002c6a6b24685f0e66ecf7c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 29 Jul 2015 21:23:46 +0000 Subject: [PATCH 011/146] factor out warc record building into its own class --- warcprox/warcwriter.py | 107 ++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 50 deletions(-) diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py index ac69cb9..f219abf 100644 --- a/warcprox/warcwriter.py +++ b/warcprox/warcwriter.py @@ -19,38 +19,13 @@ import hanzo.httptools from hanzo import warctools import warcprox -class WarcWriter: - logger = logging.getLogger("warcprox.warcwriter.WarcWriter") +class WarcRecordBuilder: + logger = logging.getLogger("warcprox.warcwriter.WarcRecordBuilder") - # port is only used for warc filename - def __init__(self, directory='./warcs', rollover_size=1000000000, - gzip=False, prefix='WARCPROX', port=0, - digest_algorithm='sha1', base32=False, 
dedup_db=None, - playback_index_db=None, rollover_idle_time=None): - - self.rollover_size = rollover_size - self.rollover_idle_time = rollover_idle_time - self._last_activity = time.time() - - self.gzip = gzip + def __init__(self, dedup_db=None, digest_algorithm="sha1", base32=False): + self.dedup_db = dedup_db self.digest_algorithm = digest_algorithm self.base32 = base32 - self.dedup_db = dedup_db - - self.playback_index_db = playback_index_db - - # warc path and filename stuff - self.directory = directory - self.prefix = prefix - self.port = port - - self._f = None - self._fpath = None - self._serial = 0 - - if not os.path.exists(directory): - self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory)) - os.mkdir(directory) def _build_response_principal_record(self, recorded_url, warc_date): """Builds response or revisit record, whichever is appropriate.""" @@ -163,21 +138,7 @@ class WarcWriter: return record - def timestamp17(self): - now = datetime.utcnow() - return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) - - def close_writer(self): - if self._fpath: - self.logger.info('closing {0}'.format(self._f_finalname)) - self._f.close() - finalpath = os.path.sep.join([self.directory, self._f_finalname]) - os.rename(self._fpath, finalpath) - - self._fpath = None - self._f = None - - def _build_warcinfo_record(self, filename): + def build_warcinfo_record(self, filename): warc_record_date = warctools.warc.warc_datetime_str(datetime.utcnow()) record_id = warctools.WarcRecord.random_warc_uuid() @@ -202,6 +163,52 @@ class WarcWriter: return record +class WarcWriter: + logger = logging.getLogger("warcprox.warcwriter.WarcWriter") + + # port is only used for warc filename + def __init__(self, directory='./warcs', rollover_size=1000000000, + gzip=False, prefix='WARCPROX', port=0, + digest_algorithm='sha1', base32=False, dedup_db=None, + playback_index_db=None, rollover_idle_time=None): + + self.rollover_size = rollover_size + self.rollover_idle_time = rollover_idle_time + self._last_activity = time.time() + + self.gzip = gzip + self.record_builder = WarcRecordBuilder(dedup_db=dedup_db, digest_algorithm=digest_algorithm, base32=base32) + self.dedup_db = dedup_db + + self.playback_index_db = playback_index_db + + # warc path and filename stuff + self.directory = directory + self.prefix = prefix + self.port = port + + self._f = None + self._fpath = None + self._serial = 0 + + if not os.path.exists(directory): + self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory)) + os.mkdir(directory) + + def timestamp17(self): + now = datetime.utcnow() + return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) + + def close_writer(self): + if self._fpath: + self.logger.info('closing {0}'.format(self._f_finalname)) + self._f.close() + finalpath = os.path.sep.join([self.directory, self._f_finalname]) + os.rename(self._fpath, finalpath) + + self._fpath = None + self._f = None + # def _writer(self): if self._fpath and os.path.getsize(self._fpath) > self.rollover_size: @@ -215,7 +222,7 @@ class WarcWriter: self._f = open(self._fpath, 'wb') - warcinfo_record = self._build_warcinfo_record(self._f_finalname) + warcinfo_record = self.record_builder.build_warcinfo_record(self._f_finalname) self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers)) warcinfo_record.write_to(self._f, gzip=self.gzip) @@ -233,7 +240,7 @@ class WarcWriter: if (self.dedup_db is not None and 
recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE and recorded_url.response_recorder.payload_size() > 0): - key = self.digest_str(recorded_url.response_recorder.payload_digest) + key = self.record_builder.digest_str(recorded_url.response_recorder.payload_digest) self.dedup_db.save(key, recordset[0], recordset_offset) if self.playback_index_db is not None: @@ -267,7 +274,7 @@ class WarcWriter: recordset_offset)) def write_records(self, recorded_url): - recordset = self.build_warc_records(recorded_url) + recordset = self.record_builder.build_warc_records(recorded_url) writer = self._writer() recordset_offset = writer.tell() @@ -294,7 +301,7 @@ class WarcWriter: self.close_writer() class WarcWriterPool: - logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread") + logger = logging.getLogger("warcprox.warcwriter.WarcWriterPool") def __init__(self, default_warc_writer): if default_warc_writer: @@ -321,8 +328,8 @@ class WarcWriterPool: rollover_idle_time=self.default_warc_writer.rollover_idle_time, gzip=self.default_warc_writer.gzip, port=self.default_warc_writer.port, - digest_algorithm=self.default_warc_writer.digest_algorithm, - base32=self.default_warc_writer.base32, + digest_algorithm=self.default_warc_writer.record_builder.digest_algorithm, + base32=self.default_warc_writer.record_builder.base32, dedup_db=self.default_warc_writer.dedup_db, playback_index_db=self.default_warc_writer.playback_index_db) w = self.warc_writers[prefix] From 274a2f6b1d0cad560269191514c5ee2842d49a7f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 30 Jul 2015 00:12:59 +0000 Subject: [PATCH 012/146] refactor warc writing, deduplication for somewhat cleaner separation of concerns --- setup.py | 2 +- warcprox/__init__.py | 15 ++ warcprox/controller.py | 8 +- warcprox/dedup.py | 13 +- warcprox/main.py | 22 +-- warcprox/warc.py | 149 +++++++++++++++++ warcprox/warcprox.py | 17 +- warcprox/warcproxy.py | 345 +++++++++++++++++++++++++++++++++++++++ warcprox/writer.py | 158 ++++++++++++++++++ warcprox/writerthread.py | 112 +++++++++++++ 10 files changed, 812 insertions(+), 29 deletions(-) create mode 100644 warcprox/warc.py create mode 100644 warcprox/warcproxy.py create mode 100644 warcprox/writer.py create mode 100644 warcprox/writerthread.py diff --git a/setup.py b/setup.py index d8afc87..5bde72b 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools.command.test import test as TestCommand import sys import setuptools -VERSION_BYTES = b'1.4' +VERSION_BYTES = b'1.5' def full_version_bytes(): import subprocess, time diff --git a/warcprox/__init__.py b/warcprox/__init__.py index e061a70..c3379c6 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,3 +1,18 @@ +# vim:set sw=4 et: + +import warcprox.controller as controller +import warcprox.playback as playback +import warcprox.dedup as dedup +import warcprox.warcproxy as warcproxy +import warcprox.mitmproxy as mitmproxy +import warcprox.writer as writer +import warcprox.warc as warc +import warcprox.writerthread as writerthread + +def digest_str(hash_obj, base32): + import base64 + return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if base32 else hash_obj.hexdigest().encode('ascii')) + def _read_version_bytes(): import os version_txt = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['version.txt']) diff --git a/warcprox/controller.py b/warcprox/controller.py index db76135..ba73859 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -5,9 +5,7 @@ 
from __future__ import absolute_import import logging import threading import time - -import warcprox.warcprox -import warcprox.warcwriter +import warcprox class WarcproxController(object): logger = logging.getLogger("warcprox.controller.WarcproxController") @@ -61,8 +59,8 @@ class WarcproxController(object): self.proxy.shutdown() self.proxy.server_close() - if self.warc_writer_thread.writer_pool.default_warc_writer.dedup_db is not None: - self.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.close() + if self.warc_writer_thread.dedup_db is not None: + self.warc_writer_thread.dedup_db.close() if self.playback_proxy is not None: self.playback_proxy.shutdown() diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 99a8d55..65962f9 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -14,6 +14,7 @@ import logging import os import json from hanzo import warctools +import warcprox class DedupDb(object): logger = logging.getLogger("warcprox.dedup.DedupDb") @@ -44,17 +45,21 @@ class DedupDb(object): json_value = json.dumps(py_value, separators=(',',':')) self.db[key] = json_value.encode('utf-8') - self.logger.debug('dedup db saved {}:{}'.format(key, json_value)) + self.logger.debug('dedup db saved %s:%s', key, json_value) def lookup(self, key): + result = None if key in self.db: json_result = self.db[key] result = json.loads(json_result.decode('utf-8')) result['i'] = result['i'].encode('latin1') result['u'] = result['u'].encode('latin1') result['d'] = result['d'].encode('latin1') - return result - else: - return None + self.logger.debug('dedup db lookup of key=%s returning %s', key, result) + return result +def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): + if recorded_url.response_recorder.payload_digest: + key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) + recorded_url.dedup_info = dedup_db.lookup(key) diff --git a/warcprox/main.py b/warcprox/main.py index 7a9f0b3..a98691d 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -18,14 +18,8 @@ import pprint import traceback import signal import threading - import certauth.certauth - -import warcprox.playback -import warcprox.dedup -import warcprox.warcwriter -import warcprox.warcprox -import warcprox.controller +import warcprox def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser = argparse.ArgumentParser(prog=prog, @@ -124,7 +118,7 @@ def main(argv=sys.argv): ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir, ca_name=ca_name) - proxy = warcprox.warcprox.WarcProxy( + proxy = warcprox.warcproxy.WarcProxy( server_address=(args.address, int(args.port)), ca=ca, recorded_url_q=recorded_url_q, digest_algorithm=args.digest_algorithm) @@ -139,15 +133,15 @@ def main(argv=sys.argv): playback_index_db = None playback_proxy = None - default_warc_writer = warcprox.warcwriter.WarcWriter(directory=args.directory, + default_warc_writer = warcprox.writer.WarcWriter(directory=args.directory, gzip=args.gzip, prefix=args.prefix, port=int(args.port), rollover_size=int(args.size), base32=args.base32, - dedup_db=dedup_db, digest_algorithm=args.digest_algorithm, - playback_index_db=playback_index_db, + digest_algorithm=args.digest_algorithm, rollover_idle_time=int(args.rollover_idle_time) if args.rollover_idle_time is not None else None) - writer_pool=warcprox.warcwriter.WarcWriterPool(default_warc_writer) - warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q, - writer_pool=writer_pool) + 
writer_pool=warcprox.writer.WarcWriterPool(default_warc_writer) + warc_writer_thread = warcprox.writerthread.WarcWriterThread( + recorded_url_q=recorded_url_q, writer_pool=writer_pool, + dedup_db=dedup_db, playback_index_db=playback_index_db) controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) diff --git a/warcprox/warc.py b/warcprox/warc.py new file mode 100644 index 0000000..91843a7 --- /dev/null +++ b/warcprox/warc.py @@ -0,0 +1,149 @@ +# vim:set sw=4 et: + +from __future__ import absolute_import + +import logging +import warcprox +import hashlib +import socket +import hanzo.httptools +from hanzo import warctools +import warcprox +from datetime import datetime + +class WarcRecordBuilder: + logger = logging.getLogger("warcprox.warc.WarcRecordBuilder") + + def __init__(self, digest_algorithm="sha1", base32=False): + self.digest_algorithm = digest_algorithm + self.base32 = base32 + + def _build_response_principal_record(self, recorded_url, warc_date): + """Builds response or revisit record, whichever is appropriate.""" + if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info: + # revisit record + recorded_url.response_recorder.tempfile.seek(0) + if recorded_url.response_recorder.payload_offset is not None: + response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset) + else: + response_header_block = recorded_url.response_recorder.tempfile.read() + + return self.build_warc_record( + url=recorded_url.url, warc_date=warc_date, + data=response_header_block, + warc_type=warctools.WarcRecord.REVISIT, + refers_to=recorded_url.dedup_info['i'], + refers_to_target_uri=recorded_url.dedup_info['u'], + refers_to_date=recorded_url.dedup_info['d'], + payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32), + profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST, + content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, + remote_ip=recorded_url.remote_ip) + else: + # response record + return self.build_warc_record( + url=recorded_url.url, warc_date=warc_date, + recorder=recorded_url.response_recorder, + warc_type=warctools.WarcRecord.RESPONSE, + content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, + remote_ip=recorded_url.remote_ip) + + def build_warc_records(self, recorded_url): + """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)""" + warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) + + if recorded_url.response_recorder: + principal_record = self._build_response_principal_record(recorded_url, warc_date) + request_record = self.build_warc_record(url=recorded_url.url, + warc_date=warc_date, data=recorded_url.request_data, + warc_type=warctools.WarcRecord.REQUEST, + content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE, + concurrent_to=principal_record.id) + return principal_record, request_record + else: + principal_record = self.build_warc_record(url=recorded_url.url, + warc_date=warc_date, data=recorded_url.request_data, + warc_type=recorded_url.custom_type, + content_type=recorded_url.content_type) + return (principal_record,) + + def build_warc_record(self, url, warc_date=None, recorder=None, data=None, + concurrent_to=None, warc_type=None, content_type=None, remote_ip=None, + profile=None, refers_to=None, refers_to_target_uri=None, + refers_to_date=None, payload_digest=None): + + if warc_date is None: + warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) + + record_id = 
warctools.WarcRecord.random_warc_uuid() + + headers = [] + if warc_type is not None: + headers.append((warctools.WarcRecord.TYPE, warc_type)) + headers.append((warctools.WarcRecord.ID, record_id)) + headers.append((warctools.WarcRecord.DATE, warc_date)) + headers.append((warctools.WarcRecord.URL, url)) + if remote_ip is not None: + headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) + if profile is not None: + headers.append((warctools.WarcRecord.PROFILE, profile)) + if refers_to is not None: + headers.append((warctools.WarcRecord.REFERS_TO, refers_to)) + if refers_to_target_uri is not None: + headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri)) + if refers_to_date is not None: + headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date)) + if concurrent_to is not None: + headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to)) + if content_type is not None: + headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type)) + if payload_digest is not None: + headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) + + if recorder is not None: + headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1'))) + headers.append((warctools.WarcRecord.BLOCK_DIGEST, + warcprox.digest_str(recorder.block_digest, self.base32))) + if recorder.payload_digest is not None: + headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, + warcprox.digest_str(recorder.payload_digest, self.base32))) + + recorder.tempfile.seek(0) + record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile) + + else: + headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1'))) + block_digest = hashlib.new(self.digest_algorithm, data) + headers.append((warctools.WarcRecord.BLOCK_DIGEST, + warcprox.digest_str(block_digest, self.base32))) + + content_tuple = content_type, data + record = warctools.WarcRecord(headers=headers, content=content_tuple) + + return record + + def build_warcinfo_record(self, filename): + warc_record_date = warctools.warc.warc_datetime_str(datetime.utcnow()) + record_id = warctools.WarcRecord.random_warc_uuid() + + headers = [] + headers.append((warctools.WarcRecord.ID, record_id)) + headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO)) + headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1'))) + headers.append((warctools.WarcRecord.DATE, warc_record_date)) + + warcinfo_fields = [] + warcinfo_fields.append(b'software: warcprox ' + warcprox.version_bytes) + hostname = socket.gethostname() + warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1')) + warcinfo_fields.append('ip: {}'.format(socket.gethostbyname(hostname)).encode('latin1')) + warcinfo_fields.append(b'format: WARC File Format 1.0') + # warcinfo_fields.append('robots: ignore') + # warcinfo_fields.append('description: {0}'.format(self.description)) + # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of)) + data = b'\r\n'.join(warcinfo_fields) + b'\r\n' + + record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data)) + + return record + diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py index 5a0dc58..19e207f 100644 --- a/warcprox/warcprox.py +++ b/warcprox/warcprox.py @@ -39,7 +39,7 @@ import socket from hanzo import warctools from certauth.certauth import CertificateAuthority -import warcprox.mitmproxy +import warcprox class ProxyingRecorder(object): """ @@ -47,7 +47,7 @@ class ProxyingRecorder(object): 
calculating digests, and sending them on to the proxy client. """ - logger = logging.getLogger("warcprox.warcprox.ProxyingRecorder") + logger = logging.getLogger("warcprox.warcproxy.ProxyingRecorder") def __init__(self, fp, proxy_dest, digest_algorithm='sha1', url=None): self.fp = fp @@ -153,7 +153,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): - logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") + logger = logging.getLogger("warcprox.warcproxy.WarcProxyHandler") def _proxy_request(self): # Build request @@ -273,7 +273,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): pass -class RecordedUrl(object): +class RecordedUrl: def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None): @@ -305,8 +305,15 @@ class RecordedUrl(object): self.client_ip = client_ip self.method = method + def __del__(self): + self.logger.info("finished with %s", self) + if self.response_recorder: + self.response_recorder.tempfile.close() + self.response_recorder = None + + class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): - logger = logging.getLogger("warcprox.warcprox.WarcProxy") + logger = logging.getLogger("warcprox.warcproxy.WarcProxy") def __init__(self, server_address=('localhost', 8000), req_handler_class=WarcProxyHandler, bind_and_activate=True, diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py new file mode 100644 index 0000000..c47e11c --- /dev/null +++ b/warcprox/warcproxy.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python +# vim:set sw=4 et: +# +""" +WARC writing MITM HTTP/S proxy + +See README.rst or https://github.com/internetarchive/warcprox +""" + +from __future__ import absolute_import + +try: + import http.server as http_server +except ImportError: + import BaseHTTPServer as http_server + +try: + import socketserver +except ImportError: + import SocketServer as socketserver + +try: + import queue +except ImportError: + import Queue as queue + +try: + import http.client as http_client +except ImportError: + import httplib as http_client + +import logging +import re +import tempfile +import traceback +import hashlib +import json +import socket +from hanzo import warctools + +from certauth.certauth import CertificateAuthority +import warcprox.mitmproxy + +class ProxyingRecorder(object): + """ + Wraps a socket._fileobject, recording the bytes as they are read, + calculating digests, and sending them on to the proxy client. + """ + + logger = logging.getLogger("warcprox.warcproxy.ProxyingRecorder") + + def __init__(self, fp, proxy_dest, digest_algorithm='sha1', url=None): + self.fp = fp + # "The file has no name, and will cease to exist when it is closed." 
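+        # (SpooledTemporaryFile keeps the recorded bytes in memory until they
+        # exceed max_size, then rolls over to a real temporary file on disk,
+        # so each in-flight response costs at most ~512 KiB of RAM)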
+ self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024) + self.digest_algorithm = digest_algorithm + self.block_digest = hashlib.new(digest_algorithm) + self.payload_offset = None + self.payload_digest = None + self.proxy_dest = proxy_dest + self._proxy_dest_conn_open = True + self._prev_hunk_last_two_bytes = b'' + self.len = 0 + self.url = url + + def _update_payload_digest(self, hunk): + if self.payload_digest is None: + # convoluted handling of two newlines crossing hunks + # XXX write tests for this + if self._prev_hunk_last_two_bytes.endswith(b'\n'): + if hunk.startswith(b'\n'): + self.payload_digest = hashlib.new(self.digest_algorithm) + self.payload_digest.update(hunk[1:]) + self.payload_offset = self.len + 1 + elif hunk.startswith(b'\r\n'): + self.payload_digest = hashlib.new(self.digest_algorithm) + self.payload_digest.update(hunk[2:]) + self.payload_offset = self.len + 2 + elif self._prev_hunk_last_two_bytes == b'\n\r': + if hunk.startswith(b'\n'): + self.payload_digest = hashlib.new(self.digest_algorithm) + self.payload_digest.update(hunk[1:]) + self.payload_offset = self.len + 1 + else: + m = re.search(br'\n\r?\n', hunk) + if m is not None: + self.payload_digest = hashlib.new(self.digest_algorithm) + self.payload_digest.update(hunk[m.end():]) + self.payload_offset = self.len + m.end() + + # if we still haven't found start of payload hold on to these bytes + if self.payload_digest is None: + self._prev_hunk_last_two_bytes = hunk[-2:] + else: + self.payload_digest.update(hunk) + + def _update(self, hunk): + self._update_payload_digest(hunk) + self.block_digest.update(hunk) + + self.tempfile.write(hunk) + + if self._proxy_dest_conn_open: + try: + self.proxy_dest.sendall(hunk) + except BaseException as e: + self._proxy_dest_conn_open = False + self.logger.warn('{} sending data to proxy client for url {}'.format(e, self.url)) + self.logger.info('will continue downloading from remote server without sending to client {}'.format(self.url)) + + self.len += len(hunk) + + def read(self, size=-1): + hunk = self.fp.read(size) + self._update(hunk) + return hunk + + def readinto(self, b): + n = self.fp.readinto(b) + self._update(b[:n]) + return n + + def readline(self, size=-1): + # XXX depends on implementation details of self.fp.readline(), in + # particular that it doesn't call self.fp.read() + hunk = self.fp.readline(size) + self._update(hunk) + return hunk + + def close(self): + return self.fp.close() + + def __len__(self): + return self.len + + def payload_size(self): + if self.payload_offset is not None: + return self.len - self.payload_offset + else: + return 0 + + +class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): + + def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1', url=None): + http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method) + self.url = url + + # Keep around extra reference to self.fp because HTTPResponse sets + # self.fp=None after it finishes reading, but we still need it + self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm, url=url) + self.fp = self.recorder + + +class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): + logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") + + def _proxy_request(self): + # Build request + req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version) + + warcprox_meta = self.headers.get('Warcprox-Meta') + + # Swallow headers that don't make sense to forward on, i.e. 
most + # hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5 + # self.headers is an email.message.Message, which is case-insensitive + # and doesn't throw KeyError in __delitem__ + for h in ('Connection', 'Proxy-Connection', 'Keep-Alive', + 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade', + 'Warcprox-Meta'): + del self.headers[h] + + # Add headers to the request + # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( + req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items()) + + req = req_str.encode('utf-8') + b'\r\n\r\n' + + # Append message body if present to the request + if 'Content-Length' in self.headers: + req += self.rfile.read(int(self.headers['Content-Length'])) + + self.logger.debug('sending to remote server req={}'.format(repr(req))) + + # Send it down the pipe! + self._proxy_sock.sendall(req) + + # We want HTTPResponse's smarts about http and handling of + # non-compliant servers. But HTTPResponse.read() doesn't return the raw + # bytes read from the server, it unchunks them if they're chunked, and + # might do other stuff. We want to send the raw bytes back to the + # client. So we ignore the values returned by h.read() below. Instead + # the ProxyingRecordingHTTPResponse takes care of sending the raw bytes + # to the proxy client. + + # Proxy and record the response + h = ProxyingRecordingHTTPResponse(self._proxy_sock, + proxy_dest=self.connection, + digest_algorithm=self.server.digest_algorithm, + url=self.url) + h.begin() + + buf = h.read(8192) + while buf != b'': + buf = h.read(8192) + + self.log_request(h.status, h.recorder.len) + + remote_ip = self._proxy_sock.getpeername()[0] + + # Let's close off the remote end + h.close() + self._proxy_sock.close() + + # XXX Close connection to proxy client. Doing this because we were + # seeing some connection hangs and this seems to solve that problem. + # Not clear what the correct, optimal behavior is. + self.connection.close() + + recorded_url = RecordedUrl(url=self.url, request_data=req, + response_recorder=h.recorder, remote_ip=remote_ip, + warcprox_meta=warcprox_meta, + status=h.status, size=h.recorder.len, + client_ip=self.client_address[0], + content_type=h.getheader("Content-Type"), + method=self.command) + self.server.recorded_url_q.put(recorded_url) + + # deprecated + def do_PUTMETA(self): + self.do_WARCPROX_WRITE_RECORD(warc_type=warctools.WarcRecord.METADATA) + + def do_WARCPROX_WRITE_RECORD(self, warc_type=None): + try: + self.url = self.path + + if ('Content-Length' in self.headers and 'Content-Type' in self.headers + and (warc_type or 'WARC-Type' in self.headers)): + # stream this? + request_data = self.rfile.read(int(self.headers['Content-Length'])) + + warcprox_meta = self.headers.get('Warcprox-Meta') + + rec_custom = RecordedUrl(url=self.url, + request_data=request_data, + response_recorder=None, + remote_ip=b'', + warcprox_meta=warcprox_meta, + content_type=self.headers['Content-Type'].encode('latin1'), + custom_type=warc_type or self.headers['WARC-Type'], + status=204, size=len(request_data), + client_ip=self.client_address[0], + method=self.command) + + self.server.recorded_url_q.put(rec_custom) + self.send_response(204, 'OK') + else: + self.send_error(400, 'Bad request') + + self.end_headers() + except: + self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True) + raise + + def log_error(self, fmt, *args): + # logging better handled elsewhere? 
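+        # (overriding log_error()/log_message() keeps BaseHTTPRequestHandler
+        # from printing its default per-request lines to stderr; warcprox
+        # logs through the logging module instead)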
+ pass + + def log_message(self, fmt, *args): + # logging better handled elsewhere? + pass + + +class RecordedUrl: + logger = logging.getLogger("warcprox.warcproxy.RecordedUrl") + + def __init__(self, url, request_data, response_recorder, remote_ip, + warcprox_meta=None, content_type=None, custom_type=None, + status=None, size=None, client_ip=None, method=None): + # XXX should test what happens with non-ascii url (when does + # url-encoding happen?) + if type(url) is not bytes: + self.url = url.encode('ascii') + else: + self.url = url + + if type(remote_ip) is not bytes: + self.remote_ip = remote_ip.encode('ascii') + else: + self.remote_ip = remote_ip + + self.request_data = request_data + self.response_recorder = response_recorder + + if warcprox_meta: + self.warcprox_meta = json.loads(warcprox_meta) + else: + self.warcprox_meta = {} + + self.content_type = content_type + self.custom_type = custom_type + + self.status = status + self.size = size + self.client_ip = client_ip + self.method = method + + def __del__(self): + self.logger.debug("finished with %s", self) + if self.response_recorder: + self.response_recorder.tempfile.close() + self.response_recorder = None + + +class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): + logger = logging.getLogger("warcprox.warcproxy.WarcProxy") + + def __init__(self, server_address=('localhost', 8000), + req_handler_class=WarcProxyHandler, bind_and_activate=True, + ca=None, recorded_url_q=None, digest_algorithm='sha1'): + http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate) + + self.digest_algorithm = digest_algorithm + + if ca is not None: + self.ca = ca + else: + ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] + self.ca = CertificateAuthority(ca_file='warcprox-ca.pem', + certs_dir='./warcprox-ca', + ca_name=ca_name) + + if recorded_url_q is not None: + self.recorded_url_q = recorded_url_q + else: + self.recorded_url_q = queue.Queue() + + def server_activate(self): + http_server.HTTPServer.server_activate(self) + self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1])) + + def server_close(self): + self.logger.info('WarcProxy shutting down') + http_server.HTTPServer.server_close(self) + diff --git a/warcprox/writer.py b/warcprox/writer.py new file mode 100644 index 0000000..02dee72 --- /dev/null +++ b/warcprox/writer.py @@ -0,0 +1,158 @@ +# vim:set sw=4 et: + +from __future__ import absolute_import + +import logging +from datetime import datetime +from hanzo import warctools +import time +import warcprox +import os +import socket + +class WarcWriter: + logger = logging.getLogger("warcprox.writer.WarcWriter") + + # port is only used for warc filename + def __init__(self, directory='./warcs', rollover_size=1000000000, + gzip=False, prefix='WARCPROX', port=0, digest_algorithm='sha1', + base32=False, rollover_idle_time=None): + + self.rollover_size = rollover_size + self.rollover_idle_time = rollover_idle_time + self._last_activity = time.time() + + self.gzip = gzip + self.record_builder = warcprox.warc.WarcRecordBuilder(digest_algorithm=digest_algorithm, base32=base32) + + # warc path and filename stuff + self.directory = directory + self.prefix = prefix + self.port = port + + self._f = None + self._fpath = None + self._f_finalname = None + self._serial = 0 + + if not os.path.exists(directory): + self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory)) + os.mkdir(directory) + + def 
timestamp17(self): + now = datetime.utcnow() + return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) + + def close_writer(self): + if self._fpath: + self.logger.info('closing {0}'.format(self._f_finalname)) + self._f.close() + finalpath = os.path.sep.join([self.directory, self._f_finalname]) + os.rename(self._fpath, finalpath) + + self._fpath = None + self._f = None + + # + def _writer(self): + if self._fpath and os.path.getsize(self._fpath) > self.rollover_size: + self.close_writer() + + if self._f == None: + self._f_finalname = '{}-{}-{:05d}-{}-{}-{}.warc{}'.format( + self.prefix, self.timestamp17(), self._serial, os.getpid(), + socket.gethostname(), self.port, '.gz' if self.gzip else '') + self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open']) + + self._f = open(self._fpath, 'wb') + + warcinfo_record = self.record_builder.build_warcinfo_record(self._f_finalname) + self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers)) + warcinfo_record.write_to(self._f, gzip=self.gzip) + + self._serial += 1 + + return self._f + + def write_records(self, recorded_url): + """Returns tuple of records written, which are instances of + hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and + "offset" attributes.""" + records = self.record_builder.build_warc_records(recorded_url) + + writer = self._writer() + recordset_offset = writer.tell() + + for record in records: + offset = writer.tell() + record.write_to(writer, gzip=self.gzip) + record.offset = offset + record.warc_filename = self._f_finalname + self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d', + record.get_header(warctools.WarcRecord.TYPE), + record.get_header(warctools.WarcRecord.CONTENT_LENGTH), + record.get_header(warctools.WarcRecord.URL), + self._fpath, record.offset) + + self._f.flush() + self._last_activity = time.time() + + return records + + def maybe_idle_rollover(self): + if (self._fpath is not None + and self.rollover_idle_time is not None + and self.rollover_idle_time > 0 + and time.time() - self._last_activity > self.rollover_idle_time): + self.logger.debug('rolling over {} after {} seconds idle'.format(self._f_finalname, time.time() - self._last_activity)) + self.close_writer() + +class WarcWriterPool: + logger = logging.getLogger("warcprox.writer.WarcWriterPool") + + def __init__(self, default_warc_writer=None): + if default_warc_writer: + self.default_warc_writer = default_warc_writer + else: + self.default_warc_writer = WarcWriter() + self.warc_writers = {} # {prefix:WarcWriter} + self._last_sync = time.time() + + self.logger.info('directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format( + os.path.abspath(self.default_warc_writer.directory), self.default_warc_writer.gzip, self.default_warc_writer.rollover_size, + self.default_warc_writer.rollover_idle_time, self.default_warc_writer.prefix, self.default_warc_writer.port)) + + # chooses writer for filename specified by warcprox_meta["warc-prefix"] if set + def _writer(self, recorded_url): + w = self.default_warc_writer + if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta: + # self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url)) + prefix = recorded_url.warcprox_meta["warc-prefix"] + if not prefix in self.warc_writers: + self.warc_writers[prefix] = WarcWriter(prefix=prefix, + directory=self.default_warc_writer.directory, + 
rollover_size=self.default_warc_writer.rollover_size, + rollover_idle_time=self.default_warc_writer.rollover_idle_time, + gzip=self.default_warc_writer.gzip, + port=self.default_warc_writer.port, + digest_algorithm=self.default_warc_writer.record_builder.digest_algorithm, + base32=self.default_warc_writer.record_builder.base32) + w = self.warc_writers[prefix] + return w + + def write_records(self, recorded_url): + """Returns tuple of records written, which are instances of + hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and + "offset" attributes.""" + return self._writer(recorded_url).write_records(recorded_url) + + def maybe_idle_rollover(self): + self.default_warc_writer.maybe_idle_rollover() + for w in self.warc_writers.values(): + w.maybe_idle_rollover() + + def close_writers(self): + self.default_warc_writer.close_writer() + for w in self.warc_writers.values(): + w.close_writer() + diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py new file mode 100644 index 0000000..ceb34cd --- /dev/null +++ b/warcprox/writerthread.py @@ -0,0 +1,112 @@ +# vim:set sw=4 et: + +from __future__ import absolute_import + +try: + import queue +except ImportError: + import Queue as queue + +import logging +import threading +import os +import hashlib +import time +import socket +import base64 +from datetime import datetime +import hanzo.httptools +from hanzo import warctools +import warcprox + +class WarcWriterThread(threading.Thread): + logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread") + + def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, playback_index_db=None): + """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl.""" + threading.Thread.__init__(self, name='WarcWriterThread') + self.recorded_url_q = recorded_url_q + self.stop = threading.Event() + if writer_pool: + self.writer_pool = writer_pool + else: + self.writer_pool = WarcWriterPool() + self.dedup_db = dedup_db + self.playback_index_db = playback_index_db + self._last_sync = time.time() + + def run(self): + try: + while not self.stop.is_set(): + try: + recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) + if self.dedup_db: + warcprox.dedup.decorate_with_dedup_info(self.dedup_db, recorded_url, + base32=self.writer_pool.default_warc_writer.record_builder.base32) + records = self.writer_pool.write_records(recorded_url) + self._final_tasks(recorded_url, records) + except queue.Empty: + self.writer_pool.maybe_idle_rollover() + self._sync() + + self.logger.info('WarcWriterThread shutting down') + self.writer_pool.close_writers() + except: + self.logger.critical("WarcWriterThread shutting down after unexpected error", exc_info=True) + + def _sync(self): + # XXX prob doesn't belong here (do we need it at all?) 
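+        # the index databases buffer writes in memory; flush them to disk
+        # at most once a minute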
+ if time.time() - self._last_sync > 60: + if self.dedup_db: + self.dedup_db.sync() + if self.playback_index_db: + self.playback_index_db.sync() + self._last_sync = time.time() + + def _save_dedup_info(self, recorded_url, records): + if (self.dedup_db + and records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE + and recorded_url.response_recorder.payload_size() > 0): + key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + self.writer_pool.default_warc_writer.record_builder.base32) + self.dedup_db.save(key, records[0], records[0].offset) + + def _save_playback_info(self, recorded_url, records): + if self.playback_index_db is not None: + self.playback_index_db.save(records[0].warc_filename, records, records[0].offset) + + # closest thing we have to heritrix crawl log at the moment + def _log(self, recorded_url, records): + def _decode(x): + if isinstance(x, bytes): + return x.decode("utf-8") + else: + return x + + try: + payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8") + except: + payload_digest = "-" + mimetype = _decode(recorded_url.content_type) + if mimetype: + n = mimetype.find(";") + if n >= 0: + mimetype = mimetype[:n] + + # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} + self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format( + _decode(recorded_url.client_ip), + _decode(recorded_url.status), + _decode(recorded_url.method), + _decode(recorded_url.url), + mimetype, + recorded_url.size, + _decode(payload_digest), + _decode(records[0].get_header(warctools.WarcRecord.TYPE)), + _decode(records[0].warc_filename), + records[0].offset)) + + def _final_tasks(self, recorded_url, records): + self._save_dedup_info(recorded_url, records) + self._save_playback_info(recorded_url, records) + self._log(recorded_url, records) From 1f864515ce894deab432c4017c74206793dfd852 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 30 Jul 2015 00:13:15 +0000 Subject: [PATCH 013/146] refactor warc writing, deduplication for somewhat cleaner separation of concerns --- warcprox/warcwriter.py | 387 ----------------------------------------- 1 file changed, 387 deletions(-) delete mode 100644 warcprox/warcwriter.py diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py deleted file mode 100644 index f219abf..0000000 --- a/warcprox/warcwriter.py +++ /dev/null @@ -1,387 +0,0 @@ -# vim:set sw=4 et: - -from __future__ import absolute_import - -try: - import queue -except ImportError: - import Queue as queue - -import logging -import threading -import os -import hashlib -import time -import socket -import base64 -from datetime import datetime -import hanzo.httptools -from hanzo import warctools -import warcprox - -class WarcRecordBuilder: - logger = logging.getLogger("warcprox.warcwriter.WarcRecordBuilder") - - def __init__(self, dedup_db=None, digest_algorithm="sha1", base32=False): - self.dedup_db = dedup_db - self.digest_algorithm = digest_algorithm - self.base32 = base32 - - def _build_response_principal_record(self, recorded_url, warc_date): - """Builds response or revisit record, whichever is appropriate.""" - if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None: - key = 
self.digest_str(recorded_url.response_recorder.payload_digest) - dedup_info = self.dedup_db.lookup(key) - - if dedup_info is not None: - # revisit record - recorded_url.response_recorder.tempfile.seek(0) - if recorded_url.response_recorder.payload_offset is not None: - response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset) - else: - response_header_block = recorded_url.response_recorder.tempfile.read() - - return self.build_warc_record( - url=recorded_url.url, warc_date=warc_date, - data=response_header_block, - warc_type=warctools.WarcRecord.REVISIT, - refers_to=dedup_info['i'], - refers_to_target_uri=dedup_info['u'], - refers_to_date=dedup_info['d'], - payload_digest=self.digest_str(recorded_url.response_recorder.payload_digest), - profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST, - content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, - remote_ip=recorded_url.remote_ip) - else: - # response record - return self.build_warc_record( - url=recorded_url.url, warc_date=warc_date, - recorder=recorded_url.response_recorder, - warc_type=warctools.WarcRecord.RESPONSE, - content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, - remote_ip=recorded_url.remote_ip) - - # returns a tuple (principal_record, ...) - def build_warc_records(self, recorded_url): - warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) - - if recorded_url.response_recorder: - principal_record = self._build_response_principal_record(recorded_url, warc_date) - request_record = self.build_warc_record(url=recorded_url.url, - warc_date=warc_date, data=recorded_url.request_data, - warc_type=warctools.WarcRecord.REQUEST, - content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE, - concurrent_to=principal_record.id) - return principal_record, request_record - else: - principal_record = self.build_warc_record(url=recorded_url.url, - warc_date=warc_date, data=recorded_url.request_data, - warc_type=recorded_url.custom_type, - content_type=recorded_url.content_type) - return (principal_record,) - - def digest_str(self, hash_obj): - return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii')) - - def build_warc_record(self, url, warc_date=None, recorder=None, data=None, - concurrent_to=None, warc_type=None, content_type=None, remote_ip=None, - profile=None, refers_to=None, refers_to_target_uri=None, - refers_to_date=None, payload_digest=None): - - if warc_date is None: - warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) - - record_id = warctools.WarcRecord.random_warc_uuid() - - headers = [] - if warc_type is not None: - headers.append((warctools.WarcRecord.TYPE, warc_type)) - headers.append((warctools.WarcRecord.ID, record_id)) - headers.append((warctools.WarcRecord.DATE, warc_date)) - headers.append((warctools.WarcRecord.URL, url)) - if remote_ip is not None: - headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) - if profile is not None: - headers.append((warctools.WarcRecord.PROFILE, profile)) - if refers_to is not None: - headers.append((warctools.WarcRecord.REFERS_TO, refers_to)) - if refers_to_target_uri is not None: - headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri)) - if refers_to_date is not None: - headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date)) - if concurrent_to is not None: - headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to)) - if content_type is not None: - 
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type)) - if payload_digest is not None: - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) - - if recorder is not None: - headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1'))) - headers.append((warctools.WarcRecord.BLOCK_DIGEST, - self.digest_str(recorder.block_digest))) - if recorder.payload_digest is not None: - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, - self.digest_str(recorder.payload_digest))) - - recorder.tempfile.seek(0) - record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile) - - else: - headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1'))) - block_digest = hashlib.new(self.digest_algorithm, data) - headers.append((warctools.WarcRecord.BLOCK_DIGEST, - self.digest_str(block_digest))) - - content_tuple = content_type, data - record = warctools.WarcRecord(headers=headers, content=content_tuple) - - return record - - def build_warcinfo_record(self, filename): - warc_record_date = warctools.warc.warc_datetime_str(datetime.utcnow()) - record_id = warctools.WarcRecord.random_warc_uuid() - - headers = [] - headers.append((warctools.WarcRecord.ID, record_id)) - headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO)) - headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1'))) - headers.append((warctools.WarcRecord.DATE, warc_record_date)) - - warcinfo_fields = [] - warcinfo_fields.append(b'software: warcprox ' + warcprox.version_bytes) - hostname = socket.gethostname() - warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1')) - warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)).encode('latin1')) - warcinfo_fields.append(b'format: WARC File Format 1.0') - # warcinfo_fields.append('robots: ignore') - # warcinfo_fields.append('description: {0}'.format(self.description)) - # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of)) - data = b'\r\n'.join(warcinfo_fields) + b'\r\n' - - record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data)) - - return record - -class WarcWriter: - logger = logging.getLogger("warcprox.warcwriter.WarcWriter") - - # port is only used for warc filename - def __init__(self, directory='./warcs', rollover_size=1000000000, - gzip=False, prefix='WARCPROX', port=0, - digest_algorithm='sha1', base32=False, dedup_db=None, - playback_index_db=None, rollover_idle_time=None): - - self.rollover_size = rollover_size - self.rollover_idle_time = rollover_idle_time - self._last_activity = time.time() - - self.gzip = gzip - self.record_builder = WarcRecordBuilder(dedup_db=dedup_db, digest_algorithm=digest_algorithm, base32=base32) - self.dedup_db = dedup_db - - self.playback_index_db = playback_index_db - - # warc path and filename stuff - self.directory = directory - self.prefix = prefix - self.port = port - - self._f = None - self._fpath = None - self._serial = 0 - - if not os.path.exists(directory): - self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory)) - os.mkdir(directory) - - def timestamp17(self): - now = datetime.utcnow() - return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) - - def close_writer(self): - if self._fpath: - self.logger.info('closing {0}'.format(self._f_finalname)) - self._f.close() - finalpath = os.path.sep.join([self.directory, self._f_finalname]) - os.rename(self._fpath, finalpath) - - self._fpath = None - 
self._f = None - - # - def _writer(self): - if self._fpath and os.path.getsize(self._fpath) > self.rollover_size: - self.close_writer() - - if self._f == None: - self._f_finalname = '{}-{}-{:05d}-{}-{}-{}.warc{}'.format( - self.prefix, self.timestamp17(), self._serial, os.getpid(), - socket.gethostname(), self.port, '.gz' if self.gzip else '') - self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open']) - - self._f = open(self._fpath, 'wb') - - warcinfo_record = self.record_builder.build_warcinfo_record(self._f_finalname) - self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers)) - warcinfo_record.write_to(self._f, gzip=self.gzip) - - self._serial += 1 - - return self._f - - def _decode(self, x): - if isinstance(x, bytes): - return x.decode("utf-8") - else: - return x - - def _final_tasks(self, recorded_url, recordset, recordset_offset): - if (self.dedup_db is not None - and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE - and recorded_url.response_recorder.payload_size() > 0): - key = self.record_builder.digest_str(recorded_url.response_recorder.payload_digest) - self.dedup_db.save(key, recordset[0], recordset_offset) - - if self.playback_index_db is not None: - self.playback_index_db.save(self._f_finalname, recordset, recordset_offset) - - if recorded_url.response_recorder is not None: - recorded_url.response_recorder.tempfile.close() - - self._last_activity = time.time() - - try: - payload_digest = recordset[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8") - except: - payload_digest = "-" - mimetype = self._decode(recorded_url.content_type) - if mimetype: - n = mimetype.find(";") - if n >= 0: - mimetype = mimetype[:n] - - # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} - self.logger.info("{} {} {} {} {} size={} {} {} offset={}".format( - self._decode(recorded_url.client_ip), - self._decode(recorded_url.status), - self._decode(recorded_url.method), - self._decode(recorded_url.url), - mimetype, - recorded_url.size, - self._decode(payload_digest), - self._decode(self._f_finalname), - recordset_offset)) - - def write_records(self, recorded_url): - recordset = self.record_builder.build_warc_records(recorded_url) - - writer = self._writer() - recordset_offset = writer.tell() - - for record in recordset: - offset = writer.tell() - record.write_to(writer, gzip=self.gzip) - self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d', - record.get_header(warctools.WarcRecord.TYPE), - record.get_header(warctools.WarcRecord.CONTENT_LENGTH), - record.get_header(warctools.WarcRecord.URL), - self._fpath, offset) - - self._f.flush() - - self._final_tasks(recorded_url, recordset, recordset_offset) - - def maybe_idle_rollover(self): - if (self._fpath is not None - and self.rollover_idle_time is not None - and self.rollover_idle_time > 0 - and time.time() - self._last_activity > self.rollover_idle_time): - self.logger.debug('rolling over {} after {} seconds idle'.format(self._f_finalname, time.time() - self._last_activity)) - self.close_writer() - -class WarcWriterPool: - logger = logging.getLogger("warcprox.warcwriter.WarcWriterPool") - - def __init__(self, default_warc_writer): - if default_warc_writer: - 
self.default_warc_writer = default_warc_writer - else: - self.default_warc_writer = WarcWriter() - self.warc_writers = {} # {prefix:WarcWriter} - self._last_sync = time.time() - - self.logger.info('directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format( - os.path.abspath(self.default_warc_writer.directory), self.default_warc_writer.gzip, self.default_warc_writer.rollover_size, - self.default_warc_writer.rollover_idle_time, self.default_warc_writer.prefix, self.default_warc_writer.port)) - - # chooses writer for filename specified by warcprox_meta["warc-prefix"] if set - def _writer(self, recorded_url): - w = self.default_warc_writer - if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta: - # self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url)) - prefix = recorded_url.warcprox_meta["warc-prefix"] - if not prefix in self.warc_writers: - self.warc_writers[prefix] = WarcWriter(prefix=prefix, - directory=self.default_warc_writer.directory, - rollover_size=self.default_warc_writer.rollover_size, - rollover_idle_time=self.default_warc_writer.rollover_idle_time, - gzip=self.default_warc_writer.gzip, - port=self.default_warc_writer.port, - digest_algorithm=self.default_warc_writer.record_builder.digest_algorithm, - base32=self.default_warc_writer.record_builder.base32, - dedup_db=self.default_warc_writer.dedup_db, - playback_index_db=self.default_warc_writer.playback_index_db) - w = self.warc_writers[prefix] - return w - - def write_records(self, recorded_url): - self._writer(recorded_url).write_records(recorded_url) - - def maybe_idle_rollover(self): - self.default_warc_writer.maybe_idle_rollover() - for w in self.warc_writers.values(): - w.maybe_idle_rollover() - - def sync(self): - # XXX prob doesn't belong here (do we need it at all?) 
- if time.time() - self._last_sync > 60: - if self.default_warc_writer.dedup_db: - self.default_warc_writer.dedup_db.sync() - if self.default_warc_writer.playback_index_db: - self.default_warc_writer.playback_index_db.sync() - self._last_sync = time.time() - - def close_writers(self): - self.default_warc_writer.close_writer() - for w in self.warc_writers.values(): - w.close_writer() - -class WarcWriterThread(threading.Thread): - logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread") - - def __init__(self, recorded_url_q=None, writer_pool=None): - """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl.""" - threading.Thread.__init__(self, name='WarcWriterThread') - self.recorded_url_q = recorded_url_q - self.stop = threading.Event() - if writer_pool: - self.writer_pool = writer_pool - else: - self.writer_pool = WarcWriterPool() - - def run(self): - try: - while not self.stop.is_set(): - try: - recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) - self.writer_pool.write_records(recorded_url) - except queue.Empty: - self.writer_pool.maybe_idle_rollover() - self.writer_pool.sync() - - self.logger.info('WarcWriterThread shutting down') - self.writer_pool.close_writers() - except: - self.logger.critical("WarcWriterThread shutting down after unexpected error", exc_info=True) - From 03c0fc848cf0140007f1a2ad68876c4ed9f8daf1 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 30 Jul 2015 00:14:17 +0000 Subject: [PATCH 014/146] fix old tests to work with refactored code; new test test_limits() (fails now, limits not implemented) --- warcprox/tests/test_warcprox.py | 56 ++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 0b05080..57ad613 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -14,6 +14,7 @@ import os import shutil import requests import re +import json try: import http.server as http_server @@ -27,11 +28,10 @@ except ImportError: import certauth.certauth -import warcprox.controller -import warcprox.warcprox -import warcprox.playback -import warcprox.warcwriter -import warcprox.dedup +import warcprox + +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): def do_GET(self): @@ -138,7 +138,7 @@ def warcprox_(request): recorded_url_q = queue.Queue() - proxy = warcprox.warcprox.WarcProxy(server_address=('localhost', 0), ca=ca, + proxy = warcprox.warcproxy.WarcProxy(server_address=('localhost', 0), ca=ca, recorded_url_q=recorded_url_q) warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-') @@ -155,12 +155,12 @@ def warcprox_(request): dedup_db_file = f.name dedup_db = warcprox.dedup.DedupDb(dedup_db_file) - default_warc_writer = warcprox.warcwriter.WarcWriter(directory=warcs_dir, - port=proxy.server_port, dedup_db=dedup_db, - playback_index_db=playback_index_db) - writer_pool = warcprox.warcwriter.WarcWriterPool(default_warc_writer) - warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q, - writer_pool=writer_pool) + default_warc_writer = warcprox.writer.WarcWriter(directory=warcs_dir, + port=proxy.server_port) + writer_pool = warcprox.writer.WarcWriterPool(default_warc_writer) + warc_writer_thread = warcprox.writerthread.WarcWriterThread( + recorded_url_q=recorded_url_q, 
writer_pool=writer_pool, + dedup_db=dedup_db, playback_index_db=playback_index_db) warcprox_ = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) logging.info('starting warcprox') @@ -224,7 +224,6 @@ def _poll_playback_until(playback_proxies, url, status, timeout_sec): if response.status_code == status: break time.sleep(0.5) - return response def test_archive_and_playback_http_url(http_daemon, archiving_proxies, playback_proxies): @@ -276,7 +275,7 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) assert response.content == b'404 Not in Archive\n' # check not in dedup db - dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') assert dedup_lookup is None # archive @@ -293,7 +292,7 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) # check in dedup db # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} - dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') assert dedup_lookup['u'] == url.encode('ascii') assert re.match(br'^$', dedup_lookup['i']) assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['d']) @@ -314,7 +313,7 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) time.sleep(2.0) # check in dedup db (no change from prev) - dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') assert dedup_lookup['u'] == url.encode('ascii') assert dedup_lookup['i'] == record_id assert dedup_lookup['d'] == dedup_date @@ -337,7 +336,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.content == b'404 Not in Archive\n' # check not in dedup db - dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') assert dedup_lookup is None # archive @@ -354,7 +353,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie # check in dedup db # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} - dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') assert dedup_lookup['u'] == url.encode('ascii') assert re.match(br'^$', dedup_lookup['i']) assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['d']) @@ -375,7 +374,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie time.sleep(2.0) # check in dedup db (no change from prev) - dedup_lookup = warcprox_.warc_writer_thread.writer_pool.default_warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + dedup_lookup = 
warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') assert dedup_lookup['u'] == url.encode('ascii') assert dedup_lookup['i'] == record_id assert dedup_lookup['d'] == dedup_date @@ -388,6 +387,25 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' # XXX how to check dedup was used? +def test_limits(http_daemon, archiving_proxies): + url = 'http://localhost:{}/a/b'.format(http_daemon.server_port) + request_meta = {"stats":{"classifiers":["job1"]},"limits":{"job1.total.urls":10}} + headers = {"Warcprox-Meta": json.dumps(request_meta)} + + for i in range(10): + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'a!' + assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' + + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 420 + assert response.reason == "Limit Reached" + response_meta = {"stats":{"job1":{"total":{"urls":10},"new":{"urls":1},"revisit":{"urls":9}}}} + assert json.loads(headers["warcprox-meta"]) == response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached limit job1.total.urls=10\n" + if __name__ == '__main__': pytest.main() From d37d2d71e319c28449dc87532941052cba0e1f01 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 30 Jul 2015 01:09:51 +0000 Subject: [PATCH 015/146] meant to remove warcprox.py --- warcprox/warcprox.py | 345 ------------------------------------------ warcprox/warcproxy.py | 2 + 2 files changed, 2 insertions(+), 345 deletions(-) delete mode 100644 warcprox/warcprox.py diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py deleted file mode 100644 index 19e207f..0000000 --- a/warcprox/warcprox.py +++ /dev/null @@ -1,345 +0,0 @@ -#!/usr/bin/env python -# vim:set sw=4 et: -# -""" -WARC writing MITM HTTP/S proxy - -See README.rst or https://github.com/internetarchive/warcprox -""" - -from __future__ import absolute_import - -try: - import http.server as http_server -except ImportError: - import BaseHTTPServer as http_server - -try: - import socketserver -except ImportError: - import SocketServer as socketserver - -try: - import queue -except ImportError: - import Queue as queue - -try: - import http.client as http_client -except ImportError: - import httplib as http_client - -import logging -import re -import tempfile -import traceback -import hashlib -import json -import socket -from hanzo import warctools - -from certauth.certauth import CertificateAuthority -import warcprox - -class ProxyingRecorder(object): - """ - Wraps a socket._fileobject, recording the bytes as they are read, - calculating digests, and sending them on to the proxy client. - """ - - logger = logging.getLogger("warcprox.warcproxy.ProxyingRecorder") - - def __init__(self, fp, proxy_dest, digest_algorithm='sha1', url=None): - self.fp = fp - # "The file has no name, and will cease to exist when it is closed." 
- self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024) - self.digest_algorithm = digest_algorithm - self.block_digest = hashlib.new(digest_algorithm) - self.payload_offset = None - self.payload_digest = None - self.proxy_dest = proxy_dest - self._proxy_dest_conn_open = True - self._prev_hunk_last_two_bytes = b'' - self.len = 0 - self.url = url - - def _update_payload_digest(self, hunk): - if self.payload_digest is None: - # convoluted handling of two newlines crossing hunks - # XXX write tests for this - if self._prev_hunk_last_two_bytes.endswith(b'\n'): - if hunk.startswith(b'\n'): - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_digest.update(hunk[1:]) - self.payload_offset = self.len + 1 - elif hunk.startswith(b'\r\n'): - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_digest.update(hunk[2:]) - self.payload_offset = self.len + 2 - elif self._prev_hunk_last_two_bytes == b'\n\r': - if hunk.startswith(b'\n'): - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_digest.update(hunk[1:]) - self.payload_offset = self.len + 1 - else: - m = re.search(br'\n\r?\n', hunk) - if m is not None: - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_digest.update(hunk[m.end():]) - self.payload_offset = self.len + m.end() - - # if we still haven't found start of payload hold on to these bytes - if self.payload_digest is None: - self._prev_hunk_last_two_bytes = hunk[-2:] - else: - self.payload_digest.update(hunk) - - def _update(self, hunk): - self._update_payload_digest(hunk) - self.block_digest.update(hunk) - - self.tempfile.write(hunk) - - if self._proxy_dest_conn_open: - try: - self.proxy_dest.sendall(hunk) - except BaseException as e: - self._proxy_dest_conn_open = False - self.logger.warn('{} sending data to proxy client for url {}'.format(e, self.url)) - self.logger.info('will continue downloading from remote server without sending to client {}'.format(self.url)) - - self.len += len(hunk) - - def read(self, size=-1): - hunk = self.fp.read(size) - self._update(hunk) - return hunk - - def readinto(self, b): - n = self.fp.readinto(b) - self._update(b[:n]) - return n - - def readline(self, size=-1): - # XXX depends on implementation details of self.fp.readline(), in - # particular that it doesn't call self.fp.read() - hunk = self.fp.readline(size) - self._update(hunk) - return hunk - - def close(self): - return self.fp.close() - - def __len__(self): - return self.len - - def payload_size(self): - if self.payload_offset is not None: - return self.len - self.payload_offset - else: - return 0 - - -class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): - - def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1', url=None): - http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method) - self.url = url - - # Keep around extra reference to self.fp because HTTPResponse sets - # self.fp=None after it finishes reading, but we still need it - self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm, url=url) - self.fp = self.recorder - - -class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): - logger = logging.getLogger("warcprox.warcproxy.WarcProxyHandler") - - def _proxy_request(self): - # Build request - req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version) - - warcprox_meta = self.headers.get('Warcprox-Meta') - - # Swallow headers that don't make sense to forward on, i.e. 
most - # hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5 - # self.headers is an email.message.Message, which is case-insensitive - # and doesn't throw KeyError in __delitem__ - for h in ('Connection', 'Proxy-Connection', 'Keep-Alive', - 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade', - 'Warcprox-Meta'): - del self.headers[h] - - # Add headers to the request - # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( - req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items()) - - req = req_str.encode('utf-8') + b'\r\n\r\n' - - # Append message body if present to the request - if 'Content-Length' in self.headers: - req += self.rfile.read(int(self.headers['Content-Length'])) - - self.logger.debug('sending to remote server req={}'.format(repr(req))) - - # Send it down the pipe! - self._proxy_sock.sendall(req) - - # We want HTTPResponse's smarts about http and handling of - # non-compliant servers. But HTTPResponse.read() doesn't return the raw - # bytes read from the server, it unchunks them if they're chunked, and - # might do other stuff. We want to send the raw bytes back to the - # client. So we ignore the values returned by h.read() below. Instead - # the ProxyingRecordingHTTPResponse takes care of sending the raw bytes - # to the proxy client. - - # Proxy and record the response - h = ProxyingRecordingHTTPResponse(self._proxy_sock, - proxy_dest=self.connection, - digest_algorithm=self.server.digest_algorithm, - url=self.url) - h.begin() - - buf = h.read(8192) - while buf != b'': - buf = h.read(8192) - - self.log_request(h.status, h.recorder.len) - - remote_ip = self._proxy_sock.getpeername()[0] - - # Let's close off the remote end - h.close() - self._proxy_sock.close() - - # XXX Close connection to proxy client. Doing this because we were - # seeing some connection hangs and this seems to solve that problem. - # Not clear what the correct, optimal behavior is. - self.connection.close() - - recorded_url = RecordedUrl(url=self.url, request_data=req, - response_recorder=h.recorder, remote_ip=remote_ip, - warcprox_meta=warcprox_meta, - status=h.status, size=h.recorder.len, - client_ip=self.client_address[0], - content_type=h.getheader("Content-Type"), - method=self.command) - self.server.recorded_url_q.put(recorded_url) - - return recorded_url - - # deprecated - def do_PUTMETA(self): - self.do_WARCPROX_WRITE_RECORD(warc_type=warctools.WarcRecord.METADATA) - - def do_WARCPROX_WRITE_RECORD(self, warc_type=None): - try: - self.url = self.path - - if ('Content-Length' in self.headers and 'Content-Type' in self.headers - and (warc_type or 'WARC-Type' in self.headers)): - # stream this? - request_data = self.rfile.read(int(self.headers['Content-Length'])) - - warcprox_meta = self.headers.get('Warcprox-Meta') - - rec_custom = RecordedUrl(url=self.url, - request_data=request_data, - response_recorder=None, - remote_ip=b'', - warcprox_meta=warcprox_meta, - content_type=self.headers['Content-Type'].encode('latin1'), - custom_type=warc_type or self.headers['WARC-Type'], - status=204, size=len(request_data), - client_ip=self.client_address[0], - method=self.command) - - self.server.recorded_url_q.put(rec_custom) - self.send_response(204, 'OK') - else: - self.send_error(400, 'Bad request') - - self.end_headers() - except: - self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True) - raise - - def log_error(self, fmt, *args): - # logging better handled elsewhere? 
- pass - - def log_message(self, fmt, *args): - # logging better handled elsewhere? - pass - - -class RecordedUrl: - def __init__(self, url, request_data, response_recorder, remote_ip, - warcprox_meta=None, content_type=None, custom_type=None, - status=None, size=None, client_ip=None, method=None): - # XXX should test what happens with non-ascii url (when does - # url-encoding happen?) - if type(url) is not bytes: - self.url = url.encode('ascii') - else: - self.url = url - - if type(remote_ip) is not bytes: - self.remote_ip = remote_ip.encode('ascii') - else: - self.remote_ip = remote_ip - - self.request_data = request_data - self.response_recorder = response_recorder - - if warcprox_meta: - self.warcprox_meta = json.loads(warcprox_meta) - else: - self.warcprox_meta = {} - - self.content_type = content_type - self.custom_type = custom_type - - self.status = status - self.size = size - self.client_ip = client_ip - self.method = method - - def __del__(self): - self.logger.info("finished with %s", self) - if self.response_recorder: - self.response_recorder.tempfile.close() - self.response_recorder = None - - -class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): - logger = logging.getLogger("warcprox.warcproxy.WarcProxy") - - def __init__(self, server_address=('localhost', 8000), - req_handler_class=WarcProxyHandler, bind_and_activate=True, - ca=None, recorded_url_q=None, digest_algorithm='sha1'): - http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate) - - self.digest_algorithm = digest_algorithm - - if ca is not None: - self.ca = ca - else: - ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] - self.ca = CertificateAuthority(ca_file='warcprox-ca.pem', - certs_dir='./warcprox-ca', - ca_name=ca_name) - - if recorded_url_q is not None: - self.recorded_url_q = recorded_url_q - else: - self.recorded_url_q = queue.Queue() - - def server_activate(self): - http_server.HTTPServer.server_activate(self) - self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1])) - - def server_close(self): - self.logger.info('WarcProxy shutting down') - http_server.HTTPServer.server_close(self) - diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index c47e11c..d81ba87 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -226,6 +226,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): method=self.command) self.server.recorded_url_q.put(recorded_url) + return recorded_url + # deprecated def do_PUTMETA(self): self.do_WARCPROX_WRITE_RECORD(warc_type=warctools.WarcRecord.METADATA) From 4ce89e6d038d42845b84c8456fe7d7941d633181 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 30 Jul 2015 01:59:48 +0000 Subject: [PATCH 016/146] basic limits enforcement is working --- warcprox/__init__.py | 1 + warcprox/main.py | 14 ++++- warcprox/stats.py | 97 +++++++++++++++++++++++++++++++++ warcprox/tests/test_warcprox.py | 24 +++++--- warcprox/warcproxy.py | 39 +++++++++++-- warcprox/writerthread.py | 8 ++- 6 files changed, 167 insertions(+), 16 deletions(-) create mode 100644 warcprox/stats.py diff --git a/warcprox/__init__.py b/warcprox/__init__.py index c3379c6..7235056 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -8,6 +8,7 @@ import warcprox.mitmproxy as mitmproxy import warcprox.writer as writer import warcprox.warc as warc import warcprox.writerthread as writerthread +import warcprox.stats as stats def digest_str(hash_obj, base32): import base64 diff --git 
a/warcprox/main.py b/warcprox/main.py index a98691d..58d6a77 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -57,6 +57,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default=False, help='write digests in Base32 instead of hex') arg_parser.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication') + arg_parser.add_argument('--stats-db-file', dest='stats_db_file', + default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables deduplication') arg_parser.add_argument('-P', '--playback-port', dest='playback_port', default=None, help='port to listen on for instant playback') arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', @@ -112,6 +114,12 @@ def main(argv=sys.argv): else: dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file) + if args.stats_db_file in (None, '', '/dev/null'): + logging.info('statistics tracking disabled') + stats_db = None + else: + stats_db = warcprox.stats.StatsDb(args.stats_db_file) + recorded_url_q = queue.Queue() ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] @@ -121,7 +129,8 @@ def main(argv=sys.argv): proxy = warcprox.warcproxy.WarcProxy( server_address=(args.address, int(args.port)), ca=ca, recorded_url_q=recorded_url_q, - digest_algorithm=args.digest_algorithm) + digest_algorithm=args.digest_algorithm, + stats_db=stats_db) if args.playback_port is not None: playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file) @@ -141,7 +150,8 @@ def main(argv=sys.argv): writer_pool=warcprox.writer.WarcWriterPool(default_warc_writer) warc_writer_thread = warcprox.writerthread.WarcWriterThread( recorded_url_q=recorded_url_q, writer_pool=writer_pool, - dedup_db=dedup_db, playback_index_db=playback_index_db) + dedup_db=dedup_db, playback_index_db=playback_index_db, + stats_db=stats_db) controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) diff --git a/warcprox/stats.py b/warcprox/stats.py new file mode 100644 index 0000000..6ad3ca4 --- /dev/null +++ b/warcprox/stats.py @@ -0,0 +1,97 @@ +# vim:set sw=4 et: + +from __future__ import absolute_import + +try: + import dbm.gnu as dbm_gnu +except ImportError: + try: + import gdbm as dbm_gnu + except ImportError: + import anydbm as dbm_gnu + +import logging +import os +import json +from hanzo import warctools + +class StatsDb: + logger = logging.getLogger("warcprox.stats.StatsDb") + + def __init__(self, dbm_file='./warcprox-stats.db'): + if os.path.exists(dbm_file): + self.logger.info('opening existing stats database {}'.format(dbm_file)) + else: + self.logger.info('creating new stats database {}'.format(dbm_file)) + + self.db = dbm_gnu.open(dbm_file, 'c') + + def close(self): + self.db.close() + + def sync(self): + try: + self.db.sync() + except: + pass + + def _empty_bucket(self): + return { + "total": { + "urls": 0, + "wire_bytes": 0, + # "warc_bytes": 0, + }, + "new": { + "urls": 0, + "wire_bytes": 0, + # "warc_bytes": 0, + }, + "revisit": { + "urls": 0, + "wire_bytes": 0, + # "warc_bytes": 0, + }, + } + + def value(self, bucket0="__all__", bucket1=None, bucket2=None): + if bucket0 in self.db: + bucket0_stats = json.loads(self.db[bucket0].decode("utf-8")) + if bucket1: + if bucket2: + return bucket0_stats[bucket1][bucket2] + else: + return bucket0_stats[bucket1] + else: + return bucket0_stats + else: + return None + + def 
tally(self, recorded_url, records): + buckets = ["__all__"] + + if (recorded_url.warcprox_meta + and "stats" in recorded_url.warcprox_meta + and "buckets" in recorded_url.warcprox_meta["stats"]): + buckets.extend(recorded_url.warcprox_meta["stats"]["buckets"]) + else: + buckets.append("__unspecified__") + + for bucket in buckets: + if bucket in self.db: + bucket_stats = json.loads(self.db[bucket].decode("utf-8")) + else: + bucket_stats = self._empty_bucket() + + bucket_stats["total"]["urls"] += 1 + bucket_stats["total"]["wire_bytes"] += recorded_url.size + + if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT: + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += recorded_url.size + + self.db[bucket] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8") + diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 57ad613..33a01bb 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -138,8 +138,13 @@ def warcprox_(request): recorded_url_q = queue.Queue() + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-stats-', suffix='.db', delete=False) + f.close() + stats_db_file = f.name + stats_db = warcprox.stats.StatsDb(stats_db_file) + proxy = warcprox.warcproxy.WarcProxy(server_address=('localhost', 0), ca=ca, - recorded_url_q=recorded_url_q) + recorded_url_q=recorded_url_q, stats_db=stats_db) warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-') @@ -160,7 +165,8 @@ def warcprox_(request): writer_pool = warcprox.writer.WarcWriterPool(default_warc_writer) warc_writer_thread = warcprox.writerthread.WarcWriterThread( recorded_url_q=recorded_url_q, writer_pool=writer_pool, - dedup_db=dedup_db, playback_index_db=playback_index_db) + dedup_db=dedup_db, playback_index_db=playback_index_db, + stats_db=stats_db) warcprox_ = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) logging.info('starting warcprox') @@ -172,7 +178,7 @@ def warcprox_(request): logging.info('stopping warcprox') warcprox_.stop.set() warcprox_thread.join() - for f in (ca_file, ca_dir, warcs_dir, playback_index_db_file, dedup_db_file): + for f in (ca_file, ca_dir, warcs_dir, playback_index_db_file, dedup_db_file, stats_db_file): if os.path.isdir(f): logging.info('deleting directory {}'.format(f)) shutil.rmtree(f) @@ -389,7 +395,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie def test_limits(http_daemon, archiving_proxies): url = 'http://localhost:{}/a/b'.format(http_daemon.server_port) - request_meta = {"stats":{"classifiers":["job1"]},"limits":{"job1.total.urls":10}} + request_meta = {"stats":{"buckets":["job1"],"limits":{"job1.total.urls":10}}} headers = {"Warcprox-Meta": json.dumps(request_meta)} for i in range(10): @@ -400,11 +406,11 @@ def test_limits(http_daemon, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 - assert response.reason == "Limit Reached" - response_meta = {"stats":{"job1":{"total":{"urls":10},"new":{"urls":1},"revisit":{"urls":9}}}} - assert json.loads(headers["warcprox-meta"]) == response_meta - assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit job1.total.urls=10\n" + assert response.reason == "Limit reached" + # 
response_meta = {"stats":{"job1":{"total":{"urls":10},"new":{"urls":1},"revisit":{"urls":9}}}} + # assert json.loads(headers["warcprox-meta"]) == response_meta + # assert response.headers["content-type"] == "text/plain;charset=utf-8" + # assert response.raw.data == b"request rejected by warcprox: reached limit job1.total.urls=10\n" if __name__ == '__main__': pytest.main() diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index d81ba87..b2a7345 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -153,13 +153,38 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): + # self.server is WarcProxy logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") + def _enforce_limits(self, warcprox_meta): + self.logger.info("warcprox_meta=%s", warcprox_meta) + if (warcprox_meta and "stats" in warcprox_meta + and "limits" in warcprox_meta["stats"]): + self.logger.info("warcprox_meta['stats']['limits']=%s", warcprox_meta['stats']['limits']) + for item in warcprox_meta["stats"]["limits"].items(): + self.logger.info("item=%s", item) + key, limit = item + self.logger.info("limit %s=%d", key, limit) + bucket0, bucket1, bucket2 = key.rsplit(".", 2) + self.logger.info("%s::%s::%s", bucket0, bucket1, bucket2) + value = self.server.stats_db.value(bucket0, bucket1, bucket2) + self.logger.info("stats value is %s", value) + if value and value >= limit: + self.send_error(420, "Limit reached") + self.connection.close() + return + def _proxy_request(self): # Build request req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version) - warcprox_meta = self.headers.get('Warcprox-Meta') + warcprox_meta = None + raw_warcprox_meta = self.headers.get('Warcprox-Meta') + if raw_warcprox_meta: + warcprox_meta = json.loads(raw_warcprox_meta) + + if self._enforce_limits(warcprox_meta): + return # Swallow headers that don't make sense to forward on, i.e. most # hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5 @@ -241,7 +266,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # stream this? 
request_data = self.rfile.read(int(self.headers['Content-Length'])) - warcprox_meta = self.headers.get('Warcprox-Meta') + warcprox_meta = None + raw_warcprox_meta = self.headers.get('Warcprox-Meta') + if raw_warcprox_meta: + warcprox_meta = json.loads(raw_warcprox_meta) rec_custom = RecordedUrl(url=self.url, request_data=request_data, @@ -295,7 +323,7 @@ class RecordedUrl: self.response_recorder = response_recorder if warcprox_meta: - self.warcprox_meta = json.loads(warcprox_meta) + self.warcprox_meta = warcprox_meta else: self.warcprox_meta = {} @@ -319,7 +347,8 @@ class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): def __init__(self, server_address=('localhost', 8000), req_handler_class=WarcProxyHandler, bind_and_activate=True, - ca=None, recorded_url_q=None, digest_algorithm='sha1'): + ca=None, recorded_url_q=None, digest_algorithm='sha1', + stats_db=None): http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate) self.digest_algorithm = digest_algorithm @@ -337,6 +366,8 @@ class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): else: self.recorded_url_q = queue.Queue() + self.stats_db = stats_db + def server_activate(self): http_server.HTTPServer.server_activate(self) self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1])) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index ceb34cd..68c5676 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -22,7 +22,7 @@ import warcprox class WarcWriterThread(threading.Thread): logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread") - def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, playback_index_db=None): + def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, playback_index_db=None, stats_db=None): """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl.""" threading.Thread.__init__(self, name='WarcWriterThread') self.recorded_url_q = recorded_url_q @@ -33,6 +33,7 @@ class WarcWriterThread(threading.Thread): self.writer_pool = WarcWriterPool() self.dedup_db = dedup_db self.playback_index_db = playback_index_db + self.stats_db = stats_db self._last_sync = time.time() def run(self): @@ -106,7 +107,12 @@ class WarcWriterThread(threading.Thread): _decode(records[0].warc_filename), records[0].offset)) + def _update_stats(self, recorded_url, records): + if self.stats_db: + self.stats_db.tally(recorded_url, records) + def _final_tasks(self, recorded_url, records): self._save_dedup_info(recorded_url, records) self._save_playback_info(recorded_url, records) + self._update_stats(recorded_url, records) self._log(recorded_url, records) From aa36ff2958940cb96f2848358e30e56743c5af0e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 30 Jul 2015 21:18:27 +0000 Subject: [PATCH 017/146] include Warcprox-Meta response header with relevant info json, and an informative text/plain body, in "420 Limit reached" response --- warcprox/tests/test_warcprox.py | 11 +++++++---- warcprox/warcproxy.py | 22 ++++++++++++++-------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 33a01bb..0e6c3b9 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -404,13 +404,16 @@ def test_limits(http_daemon, archiving_proxies): assert response.headers['warcprox-test-header'] == 'a!' assert response.content == b'I am the warcprox test payload! 
bbbbbbbbbb!\n' + # XXX give warc writer thread a chance to update stats + time.sleep(2.0) + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 assert response.reason == "Limit reached" - # response_meta = {"stats":{"job1":{"total":{"urls":10},"new":{"urls":1},"revisit":{"urls":9}}}} - # assert json.loads(headers["warcprox-meta"]) == response_meta - # assert response.headers["content-type"] == "text/plain;charset=utf-8" - # assert response.raw.data == b"request rejected by warcprox: reached limit job1.total.urls=10\n" + expected_response_meta = {'reached-limit': {'job1.total.urls': 10}, 'stats': {'job1': {'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached limit job1.total.urls=10\n" if __name__ == '__main__': pytest.main() diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b2a7345..6e5ccff 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -157,22 +157,28 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") def _enforce_limits(self, warcprox_meta): - self.logger.info("warcprox_meta=%s", warcprox_meta) if (warcprox_meta and "stats" in warcprox_meta and "limits" in warcprox_meta["stats"]): - self.logger.info("warcprox_meta['stats']['limits']=%s", warcprox_meta['stats']['limits']) + # self.logger.info("warcprox_meta['stats']['limits']=%s", warcprox_meta['stats']['limits']) for item in warcprox_meta["stats"]["limits"].items(): - self.logger.info("item=%s", item) key, limit = item - self.logger.info("limit %s=%d", key, limit) bucket0, bucket1, bucket2 = key.rsplit(".", 2) - self.logger.info("%s::%s::%s", bucket0, bucket1, bucket2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) - self.logger.info("stats value is %s", value) if value and value >= limit: - self.send_error(420, "Limit reached") + self.logger.info('sending "420 Limit reached" %s=%s', key, limit) + body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8") + self.send_response(420, "Limit reached") + self.send_header("Content-Type", "text/plain;charset=utf-8") + self.send_header("Connection", "close") + self.send_header("Content-Length", len(body)) + response_meta = {"reached-limit":{key:limit}, "stats":{bucket0: self.server.stats_db.value(bucket0)}} + self.send_header("Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) + self.end_headers() + if self.command != "HEAD": + self.wfile.write(body) self.connection.close() - return + return True + return False def _proxy_request(self): # Build request From a87615202686a761c27bb393865caedf5a5c4b9e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Sat, 1 Aug 2015 00:08:01 +0000 Subject: [PATCH 018/146] fix exception, make some tweaks --- warcprox/dedup.py | 2 +- warcprox/main.py | 2 +- warcprox/tests/test_warcprox.py | 2 +- warcprox/warcproxy.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 65962f9..2a99358 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -59,7 +59,7 @@ class DedupDb(object): return result def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): - if 
recorded_url.response_recorder.payload_digest: + if recorded_url.response_recorder and recorded_url.response_recorder.payload_digest: key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) recorded_url.dedup_info = dedup_db.lookup(key) diff --git a/warcprox/main.py b/warcprox/main.py index 58d6a77..52553f9 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -58,7 +58,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication') arg_parser.add_argument('--stats-db-file', dest='stats_db_file', - default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables deduplication') + default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking') arg_parser.add_argument('-P', '--playback-port', dest='playback_port', default=None, help='port to listen on for instant playback') arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 0e6c3b9..3b05a56 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -409,7 +409,7 @@ def test_limits(http_daemon, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 - assert response.reason == "Limit reached" + assert response.reason == "Reached limit" expected_response_meta = {'reached-limit': {'job1.total.urls': 10}, 'stats': {'job1': {'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 6e5ccff..91891d1 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -165,9 +165,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): bucket0, bucket1, bucket2 = key.rsplit(".", 2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and value >= limit: - self.logger.info('sending "420 Limit reached" %s=%s', key, limit) + self.logger.info('sending "420 Reached limit" %s=%s', key, limit) body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8") - self.send_response(420, "Limit reached") + self.send_response(420, "Reached limit") self.send_header("Content-Type", "text/plain;charset=utf-8") self.send_header("Connection", "close") self.send_header("Content-Length", len(body)) From 89e5991f7b90e280612e34d04628dca5acfb3be3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 7 Aug 2015 18:34:50 +0000 Subject: [PATCH 019/146] move limits to toplevel of warcprox-meta json object --- warcprox/tests/test_warcprox.py | 2 +- warcprox/warcproxy.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 3b05a56..62b3553 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -395,7 +395,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie def test_limits(http_daemon, archiving_proxies): url = 
'http://localhost:{}/a/b'.format(http_daemon.server_port) - request_meta = {"stats":{"buckets":["job1"],"limits":{"job1.total.urls":10}}} + request_meta = {"stats":{"buckets":["job1"]},"limits":{"job1.total.urls":10}} headers = {"Warcprox-Meta": json.dumps(request_meta)} for i in range(10): diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 91891d1..d0d4c97 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -157,10 +157,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") def _enforce_limits(self, warcprox_meta): - if (warcprox_meta and "stats" in warcprox_meta - and "limits" in warcprox_meta["stats"]): - # self.logger.info("warcprox_meta['stats']['limits']=%s", warcprox_meta['stats']['limits']) - for item in warcprox_meta["stats"]["limits"].items(): + if (warcprox_meta and "limits" in warcprox_meta): + # self.logger.info("warcprox_meta['limits']=%s", warcprox_meta['limits']) + for item in warcprox_meta["limits"].items(): key, limit = item bucket0, bucket1, bucket2 = key.rsplit(".", 2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) From 0ce8022ea927b44d28176fcdb39755db8a8792a0 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 7 Aug 2015 22:28:18 +0000 Subject: [PATCH 020/146] better(?) handling of exceptions raised while proxying urls --- warcprox/warcproxy.py | 89 ++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index d0d4c97..3212086 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -127,6 +127,9 @@ class ProxyingRecorder(object): self._update(hunk) return hunk + def flush(self): + return self.fp.flush() + def close(self): return self.fp.close() @@ -195,10 +198,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5 # self.headers is an email.message.Message, which is case-insensitive # and doesn't throw KeyError in __delitem__ - for h in ('Connection', 'Proxy-Connection', 'Keep-Alive', + for key in ('Connection', 'Proxy-Connection', 'Keep-Alive', 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade', 'Warcprox-Meta'): - del self.headers[h] + del self.headers[key] # Add headers to the request # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( @@ -210,51 +213,59 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): if 'Content-Length' in self.headers: req += self.rfile.read(int(self.headers['Content-Length'])) - self.logger.debug('sending to remote server req={}'.format(repr(req))) + try: + self.logger.debug('sending to remote server req=%s', repr(req)) - # Send it down the pipe! - self._proxy_sock.sendall(req) + # Send it down the pipe! + self._proxy_sock.sendall(req) - # We want HTTPResponse's smarts about http and handling of - # non-compliant servers. But HTTPResponse.read() doesn't return the raw - # bytes read from the server, it unchunks them if they're chunked, and - # might do other stuff. We want to send the raw bytes back to the - # client. So we ignore the values returned by h.read() below. Instead - # the ProxyingRecordingHTTPResponse takes care of sending the raw bytes - # to the proxy client. + # We want HTTPResponse's smarts about http and handling of + # non-compliant servers. But HTTPResponse.read() doesn't return the raw + # bytes read from the server, it unchunks them if they're chunked, and + # might do other stuff. 
We want to send the raw bytes back to the + # client. So we ignore the values returned by prox_rec_res.read() below. Instead + # the ProxyingRecordingHTTPResponse takes care of sending the raw bytes + # to the proxy client. - # Proxy and record the response - h = ProxyingRecordingHTTPResponse(self._proxy_sock, - proxy_dest=self.connection, - digest_algorithm=self.server.digest_algorithm, - url=self.url) - h.begin() + # Proxy and record the response + prox_rec_res = ProxyingRecordingHTTPResponse(self._proxy_sock, + proxy_dest=self.connection, + digest_algorithm=self.server.digest_algorithm, + url=self.url) + prox_rec_res.begin() - buf = h.read(8192) - while buf != b'': - buf = h.read(8192) + remote_ip=self._proxy_sock.getpeername()[0] - self.log_request(h.status, h.recorder.len) + buf = prox_rec_res.read(8192) + while buf != b'': + buf = prox_rec_res.read(8192) - remote_ip = self._proxy_sock.getpeername()[0] + recorded_url = RecordedUrl(url=self.url, request_data=req, + response_recorder=prox_rec_res.recorder, + remote_ip=remote_ip, warcprox_meta=warcprox_meta, + status=prox_rec_res.status, size=prox_rec_res.recorder.len, + client_ip=self.client_address[0], + content_type=prox_rec_res.getheader("Content-Type"), + method=self.command) + self.server.recorded_url_q.put(recorded_url) - # Let's close off the remote end - h.close() - self._proxy_sock.close() + self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) + except socket.timeout as e: + self.logger.warn("%s proxying %s", repr(e), self.url) + except BaseException as e: + self.logger.error("%s proxying %s", repr(e), self.url, exc_info=True) + finally: + # Let's close off the remote end + prox_rec_res.close() + self._proxy_sock.close() - # XXX Close connection to proxy client. Doing this because we were - # seeing some connection hangs and this seems to solve that problem. - # Not clear what the correct, optimal behavior is. - self.connection.close() - - recorded_url = RecordedUrl(url=self.url, request_data=req, - response_recorder=h.recorder, remote_ip=remote_ip, - warcprox_meta=warcprox_meta, - status=h.status, size=h.recorder.len, - client_ip=self.client_address[0], - content_type=h.getheader("Content-Type"), - method=self.command) - self.server.recorded_url_q.put(recorded_url) + # XXX Close connection to proxy client. Doing this because we were + # seeing some connection hangs and this seems to solve that problem. + # Not clear what the correct, optimal behavior is. One thing we + # should probably(?) do is add "Connection: close" to response + # headers. Right now the response is passed through blindly as raw + # bytes so it's not completely trivial to do that. 
+        self.connection.close()
 
         return recorded_url
 
From d3df48b97e8cef36878ff581847095fc5b63529a Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 11 Aug 2015 18:05:45 +0000
Subject: [PATCH 021/146] shorten warc filename template

---
 warcprox/writer.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/warcprox/writer.py b/warcprox/writer.py
index 02dee72..c8c1b44 100644
--- a/warcprox/writer.py
+++ b/warcprox/writer.py
@@ -9,6 +9,8 @@ import time
 import warcprox
 import os
 import socket
+import string
+import random
 
 class WarcWriter:
     logger = logging.getLogger("warcprox.writer.WarcWriter")
@@ -35,6 +37,8 @@ class WarcWriter:
         self._f_finalname = None
         self._serial = 0
 
+        self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8))
+
         if not os.path.exists(directory):
             self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory))
             os.mkdir(directory)
@@ -53,15 +57,15 @@ class WarcWriter:
         self._fpath = None
         self._f = None
 
-    #
+    # h3 default
+    # ${prefix}-${timestamp17}-${randomtoken}-${serialno}.warc.gz
     def _writer(self):
         if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
             self.close_writer()
 
         if self._f == None:
-            self._f_finalname = '{}-{}-{:05d}-{}-{}-{}.warc{}'.format(
-                    self.prefix, self.timestamp17(), self._serial, os.getpid(),
-                    socket.gethostname(), self.port, '.gz' if self.gzip else '')
+            self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
+                    self.prefix, self.timestamp17(), self._serial, self._randomtoken, '.gz' if self.gzip else '')
             self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open'])
 
             self._f = open(self._fpath, 'wb')

From 3073d59303d2c859b3e49262bf7083714707e483 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 11 Aug 2015 18:06:17 +0000
Subject: [PATCH 022/146] skip stack trace for normal-ish problems

---
 warcprox/mitmproxy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 9d57b44..d60ecd1 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -83,13 +83,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self._transition_to_ssl()
         except Exception as e:
             try:
-                self.logger.error("problem handling {}: {}".format(repr(self.requestline), e), exc_info=True)
+                self.logger.error("problem handling {}: {}".format(repr(self.requestline), e))
                 if type(e) is socket.timeout:
                     self.send_error(504, str(e))
                 else:
                     self.send_error(500, str(e))
             except Exception as f:
-                self.logger.warn("failed to send error response ({}) to proxy client: {}".format(e, f), exc_info=True)
+                self.logger.warn("failed to send error response ({}) to proxy client: {}".format(e, f))
             return
 
         # Reload! 
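
A brief aside on PATCH 021: warcs now come out named
{prefix}-{timestamp17}-{serial:05d}-{randomtoken}.warc.gz. The snippet below
is a minimal sketch reproducing that scheme outside warcprox, not the actual
writer code; the prefix and serial values are made up, and it assumes
timestamp17 is the usual 17-digit UTC timestamp (YYYYmmddHHMMSS plus
milliseconds):

    import random
    import string
    from datetime import datetime

    prefix = 'WARCPROX'  # hypothetical --prefix value
    serial = 0           # first warc written by this writer
    # eight distinct digits/lowercase letters, chosen once per writer
    randomtoken = ''.join(random.Random().sample(
            string.digits + string.ascii_lowercase, 8))
    now = datetime.utcnow()
    timestamp17 = '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond // 1000)
    filename = '{}-{}-{:05d}-{}.warc{}'.format(
            prefix, timestamp17, serial, randomtoken, '.gz')
    # e.g. WARCPROX-20150811180545123-00000-3fk0slcx.warc.gz

Unlike the old template, the filename no longer embeds pid, hostname, and
port; the random token is what keeps concurrent writers from colliding.
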
From 0e7a7fdd69589c99835782d8f502beba824e487d Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 19 Aug 2015 22:56:05 +0000
Subject: [PATCH 023/146] remove unused method; fix exception at shutdown time

---
 warcprox/mitmproxy.py | 3 ---
 warcprox/warcproxy.py | 4 +++-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index d60ecd1..154d30e 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -136,9 +136,6 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self.logger.error("exception from {}".format(self._proxy_request), exc_info=True)
             raise
 
-    def _special_request(self, method, type_):
-        raise Exception('Not supported')
-
     def _proxy_request(self):
         raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
 
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index 3212086..6839ddc 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -213,6 +213,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         if 'Content-Length' in self.headers:
             req += self.rfile.read(int(self.headers['Content-Length']))
 
+        prox_rec_res = None
         try:
             self.logger.debug('sending to remote server req=%s', repr(req))
@@ -256,7 +257,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
             self.logger.error("%s proxying %s", repr(e), self.url, exc_info=True)
         finally:
             # Let's close off the remote end
-            prox_rec_res.close()
+            if prox_rec_res:
+                prox_rec_res.close()
             self._proxy_sock.close()

From e66dc3a9fba672d13bbc8f0a58f124609dc34ff2 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 20 Aug 2015 21:46:40 +0000
Subject: [PATCH 024/146] rethinkdb dedup

---
 setup.py                 |  2 +-
 warcprox/dedup.py        | 72 +++++++++++++++++++++++++++++++++++++---
 warcprox/main.py         | 18 ++++++++--
 warcprox/warc.py         |  6 ++--
 warcprox/writerthread.py |  2 +-
 5 files changed, 88 insertions(+), 12 deletions(-)

diff --git a/setup.py b/setup.py
index 5bde72b..e5b71d5 100755
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@ setuptools.setup(name='warcprox',
         license='GPL',
         packages=['warcprox'],
         package_data={'warcprox':['version.txt']},
-        install_requires=['certauth>=1.1.0', 'warctools>=4.8.3'], # gdbm not in pip :(
+        install_requires=['certauth>=1.1.0', 'warctools>=4.8.3', 'rethinkdb'], # gdbm not in pip :(
         dependency_links=['git+https://github.com/internetarchive/warctools.git#egg=warctools-4.8.3'],
         tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
         cmdclass = {'test': PyTest},
diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 2a99358..a715b01 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -15,6 +15,9 @@ import os
 import json
 from hanzo import warctools
 import warcprox
+import rethinkdb
+r = rethinkdb
+import random
 
 class DedupDb(object):
     logger = logging.getLogger("warcprox.dedup.DedupDb")
@@ -36,12 +39,12 @@ class DedupDb(object):
         except:
             pass
 
-    def save(self, key, response_record, offset):
+    def save(self, key, response_record):
         record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
         url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
         date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
 
-        py_value = {'i':record_id, 'u':url, 'd':date}
+        py_value = {'id':record_id, 'url':url, 'date':date}
         json_value = json.dumps(py_value, separators=(',',':'))
 
         self.db[key] = 
json_value.encode('utf-8')
 
@@ -52,9 +55,9 @@ class DedupDb(object):
         if key in self.db:
             json_result = self.db[key]
             result = json.loads(json_result.decode('utf-8'))
-            result['i'] = result['i'].encode('latin1')
-            result['u'] = result['u'].encode('latin1')
-            result['d'] = result['d'].encode('latin1')
+            result['id'] = result['id'].encode('latin1')
+            result['url'] = result['url'].encode('latin1')
+            result['date'] = result['date'].encode('latin1')
 
         self.logger.debug('dedup db lookup of key=%s returning %s', key, result)
         return result
@@ -63,3 +66,62 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
         key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
         recorded_url.dedup_info = dedup_db.lookup(key)
 
+class RethinkDedupDb:
+    logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
+
+    def __init__(self, servers=["localhost"], db="warcprox", table="dedup", shards=3, replicas=3):
+        self.servers = servers
+        self.db = db
+        self.table = table
+        self.shards = shards
+        self.replicas = replicas
+        self._ensure_db_table()
+
+    # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py
+    # "Best practices: Managing connections: a connection per request"
+    def _random_server_connection(self):
+        server = random.choice(self.servers)
+        try:
+            host, port = server.split(":")
+            return r.connect(host=host, port=port)
+        except ValueError:
+            return r.connect(host=server)
+
+    def _ensure_db_table(self):
+        with self._random_server_connection() as conn:
+            dbs = r.db_list().run(conn)
+            if not self.db in dbs:
+                self.logger.info("creating rethinkdb database %s", repr(self.db))
+                r.db_create(self.db).run(conn)
+            tables = r.db(self.db).table_list().run(conn)
+            if not self.table in tables:
+                self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.db))
+                r.db(self.db).table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run(conn)
+
+    def close(self):
+        pass
+
+    def sync(self):
+        pass
+
+    def save(self, key, response_record):
+        k = key.decode("utf-8") if isinstance(key, bytes) else key
+        record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
+        url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
+        date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
+        record = {'key':k,'url':url,'date':date,'id':record_id}
+        with self._random_server_connection() as conn:
+            result = r.db(self.db).table(self.table).insert(record,conflict="replace").run(conn)
+            if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
+                raise Exception("unexpected result {} saving {}".format(result, record))
+            self.logger.debug('dedup db saved %s:%s', key, record)
+
+    def lookup(self, key):
+        k = key.decode("utf-8") if isinstance(key, bytes) else key
+        with self._random_server_connection() as conn:
+            result = r.db(self.db).table(self.table).get(k).run(conn)
+            if result:
+                for x in result:
+                    result[x] = result[x].encode("utf-8")
+            self.logger.debug('dedup db lookup of key=%s returning %s', key, result)
+            return result
diff --git a/warcprox/main.py b/warcprox/main.py
index 52553f9..f6bf322 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -20,6 +20,7 @@ import signal
 import threading
 import certauth.certauth
 import warcprox
+import re
 
 def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
     arg_parser = argparse.ArgumentParser(prog=prog,
@@ -55,7 +56,10 @@ def 
_build_arg_parser(prog=os.path.basename(sys.argv[0])): default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos))) arg_parser.add_argument('--base32', dest='base32', action='store_true', default=False, help='write digests in Base32 instead of hex') - arg_parser.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', + group = arg_parser.add_mutually_exclusive_group() + group.add_argument('--dedup-rethinkdb-url', dest='dedup_rethinkdb_url', + help='persistent deduplication rethink db url, e.g. rethinkdb://db0.foo.org,db0.foo.org:38015,db1.foo.org/warcprox/dedup') + group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication') arg_parser.add_argument('--stats-db-file', dest='stats_db_file', default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking') @@ -108,7 +112,17 @@ def main(argv=sys.argv): logging.fatal(e) exit(1) - if args.dedup_db_file in (None, '', '/dev/null'): + if args.dedup_rethinkdb_url: + m = re.fullmatch(r"rethinkdb://([^/]+)/([^/]+)/([^/]+)", args.dedup_rethinkdb_url) + if m: + servers = m.group(1).split(",") + db = m.group(2) + table = m.group(3) + dedup_db = warcprox.dedup.RethinkDedupDb(servers, db, table) + else: + logging.fatal("failed to parse dedup rethinkdb url %s", args.dedup_rethinkdb_url) + exit(1) + elif args.dedup_db_file in (None, '', '/dev/null'): logging.info('deduplication disabled') dedup_db = None else: diff --git a/warcprox/warc.py b/warcprox/warc.py index 91843a7..eaeeedf 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -32,9 +32,9 @@ class WarcRecordBuilder: url=recorded_url.url, warc_date=warc_date, data=response_header_block, warc_type=warctools.WarcRecord.REVISIT, - refers_to=recorded_url.dedup_info['i'], - refers_to_target_uri=recorded_url.dedup_info['u'], - refers_to_date=recorded_url.dedup_info['d'], + refers_to=recorded_url.dedup_info['id'], + refers_to_target_uri=recorded_url.dedup_info['url'], + refers_to_date=recorded_url.dedup_info['date'], payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32), profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST, content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 68c5676..df70d63 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -70,7 +70,7 @@ class WarcWriterThread(threading.Thread): and recorded_url.response_recorder.payload_size() > 0): key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.writer_pool.default_warc_writer.record_builder.base32) - self.dedup_db.save(key, records[0], records[0].offset) + self.dedup_db.save(key, records[0]) def _save_playback_info(self, recorded_url, records): if self.playback_index_db is not None: From 3d90b9c2e9543472038da2c5898f196340051114 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 20 Aug 2015 22:51:29 +0000 Subject: [PATCH 025/146] py.test option --rethinkdb-servers to run tests using rethinkdb --- warcprox/tests/conftest.py | 11 +++++ warcprox/tests/test_warcprox.py | 73 ++++++++++++++++++++++----------- 2 files changed, 59 insertions(+), 25 deletions(-) create mode 100644 warcprox/tests/conftest.py diff --git a/warcprox/tests/conftest.py b/warcprox/tests/conftest.py new file mode 100644 index 0000000..7d42917 --- /dev/null +++ b/warcprox/tests/conftest.py 
@@ -0,0 +1,11 @@ +# vim:set sw=4 et: +import pytest + +def pytest_addoption(parser): + parser.addoption('--rethinkdb-servers', dest='rethinkdb_servers', + help='rethink db servers for dedup, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org') + +@pytest.fixture +def rethinkdb_servers(request): + return request.config.getoption("--rethinkdb-servers") + diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 62b3553..88e0e6a 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -15,6 +15,9 @@ import shutil import requests import re import json +import rethinkdb +r = rethinkdb +import random try: import http.server as http_server @@ -129,7 +132,32 @@ def https_daemon(request, cert): return https_daemon @pytest.fixture() -def warcprox_(request): +def dedup_db(request, rethinkdb_servers): + if rethinkdb_servers: + servers = rethinkdb_servers.split(",") + db = 'warcprox_test_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) + ddb = warcprox.dedup.RethinkDedupDb(servers, db) + else: + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False) + f.close() + dedup_db_file = f.name + ddb = warcprox.dedup.DedupDb(dedup_db_file) + + def fin(): + if rethinkdb_servers: + logging.info('dropping rethinkdb database {}'.format(db)) + with ddb._random_server_connection() as conn: + result = r.db_drop(db).run(conn) + logging.info("result=%s", result) + else: + logging.info('deleting file {}'.format(dedup_db_file)) + os.unlink(dedup_db_file) + request.addfinalizer(fin) + + return ddb + +@pytest.fixture() +def warcprox_(request, dedup_db): f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True) f.close() # delete it, or CertificateAuthority will try to read it ca_file = f.name @@ -155,11 +183,6 @@ def warcprox_(request): playback_proxy = warcprox.playback.PlaybackProxy(server_address=('localhost', 0), ca=ca, playback_index_db=playback_index_db, warcs_dir=warcs_dir) - f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False) - f.close() - dedup_db_file = f.name - dedup_db = warcprox.dedup.DedupDb(dedup_db_file) - default_warc_writer = warcprox.writer.WarcWriter(directory=warcs_dir, port=proxy.server_port) writer_pool = warcprox.writer.WarcWriterPool(default_warc_writer) @@ -178,7 +201,7 @@ def warcprox_(request): logging.info('stopping warcprox') warcprox_.stop.set() warcprox_thread.join() - for f in (ca_file, ca_dir, warcs_dir, playback_index_db_file, dedup_db_file, stats_db_file): + for f in (ca_file, ca_dir, warcs_dir, playback_index_db_file, stats_db_file): if os.path.isdir(f): logging.info('deleting directory {}'.format(f)) shutil.rmtree(f) @@ -297,13 +320,13 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) assert response.content == b'I am the warcprox test payload! 
ffffffffff!\n' # check in dedup db - # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} + # {u'id': u'', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') - assert dedup_lookup['u'] == url.encode('ascii') - assert re.match(br'^$', dedup_lookup['i']) - assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['d']) - record_id = dedup_lookup['i'] - dedup_date = dedup_lookup['d'] + assert dedup_lookup['url'] == url.encode('ascii') + assert re.match(br'^$', dedup_lookup['id']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date']) + record_id = dedup_lookup['id'] + dedup_date = dedup_lookup['date'] # need revisit to have a later timestamp than original, else playing # back the latest record might not hit the revisit @@ -320,9 +343,9 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) # check in dedup db (no change from prev) dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') - assert dedup_lookup['u'] == url.encode('ascii') - assert dedup_lookup['i'] == record_id - assert dedup_lookup['d'] == dedup_date + assert dedup_lookup['url'] == url.encode('ascii') + assert dedup_lookup['id'] == record_id + assert dedup_lookup['date'] == dedup_date # test playback logging.debug('testing playback of revisit of {}'.format(url)) @@ -358,13 +381,13 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' # check in dedup db - # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} + # {u'id': u'', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') - assert dedup_lookup['u'] == url.encode('ascii') - assert re.match(br'^$', dedup_lookup['i']) - assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['d']) - record_id = dedup_lookup['i'] - dedup_date = dedup_lookup['d'] + assert dedup_lookup['url'] == url.encode('ascii') + assert re.match(br'^$', dedup_lookup['id']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date']) + record_id = dedup_lookup['id'] + dedup_date = dedup_lookup['date'] # need revisit to have a later timestamp than original, else playing # back the latest record might not hit the revisit @@ -381,9 +404,9 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie # check in dedup db (no change from prev) dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') - assert dedup_lookup['u'] == url.encode('ascii') - assert dedup_lookup['i'] == record_id - assert dedup_lookup['d'] == dedup_date + assert dedup_lookup['url'] == url.encode('ascii') + assert dedup_lookup['id'] == record_id + assert dedup_lookup['date'] == dedup_date # test playback logging.debug('testing playback of revisit of {}'.format(url)) From 788bc69f478abe81bb3b5c166fdfff8386e95b2c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 20 Aug 2015 22:58:30 +0000 Subject: [PATCH 026/146] set up fixtures once for all tests --- warcprox/tests/conftest.py | 2 +- warcprox/tests/test_warcprox.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 
deletions(-) diff --git a/warcprox/tests/conftest.py b/warcprox/tests/conftest.py index 7d42917..db27210 100644 --- a/warcprox/tests/conftest.py +++ b/warcprox/tests/conftest.py @@ -5,7 +5,7 @@ def pytest_addoption(parser): parser.addoption('--rethinkdb-servers', dest='rethinkdb_servers', help='rethink db servers for dedup, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org') -@pytest.fixture +@pytest.fixture(scope="module") def rethinkdb_servers(request): return request.config.getoption("--rethinkdb-servers") diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 88e0e6a..5d9141c 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -59,7 +59,7 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): self.connection.sendall(headers) self.connection.sendall(payload) -@pytest.fixture() +@pytest.fixture(scope="module") def cert(request): f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False) @@ -92,7 +92,7 @@ def cert(request): finally: f.close() -@pytest.fixture() +@pytest.fixture(scope="module") def http_daemon(request): http_daemon = http_server.HTTPServer(('localhost', 0), RequestHandlerClass=_TestHttpRequestHandler) @@ -110,7 +110,7 @@ def http_daemon(request): return http_daemon -@pytest.fixture() +@pytest.fixture(scope="module") def https_daemon(request, cert): # http://www.piware.de/2011/01/creating-an-https-server-in-python/ https_daemon = http_server.HTTPServer(('localhost', 0), @@ -131,7 +131,7 @@ def https_daemon(request, cert): return https_daemon -@pytest.fixture() +@pytest.fixture(scope="module") def dedup_db(request, rethinkdb_servers): if rethinkdb_servers: servers = rethinkdb_servers.split(",") @@ -156,7 +156,7 @@ def dedup_db(request, rethinkdb_servers): return ddb -@pytest.fixture() +@pytest.fixture(scope="module") def warcprox_(request, dedup_db): f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True) f.close() # delete it, or CertificateAuthority will try to read it @@ -212,12 +212,12 @@ def warcprox_(request, dedup_db): return warcprox_ -@pytest.fixture() +@pytest.fixture(scope="module") def archiving_proxies(warcprox_): archiving_proxy = 'http://localhost:{}'.format(warcprox_.proxy.server_port) return {'http':archiving_proxy, 'https':archiving_proxy} -@pytest.fixture() +@pytest.fixture(scope="module") def playback_proxies(warcprox_): playback_proxy = 'http://localhost:{}'.format(warcprox_.playback_proxy.server_port) return {'http':playback_proxy, 'https':playback_proxy} @@ -417,15 +417,15 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie # XXX how to check dedup was used? def test_limits(http_daemon, archiving_proxies): - url = 'http://localhost:{}/a/b'.format(http_daemon.server_port) + url = 'http://localhost:{}/i/j'.format(http_daemon.server_port) request_meta = {"stats":{"buckets":["job1"]},"limits":{"job1.total.urls":10}} headers = {"Warcprox-Meta": json.dumps(request_meta)} for i in range(10): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 - assert response.headers['warcprox-test-header'] == 'a!' - assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' + assert response.headers['warcprox-test-header'] == 'i!' + assert response.content == b'I am the warcprox test payload! 
jjjjjjjjjj!\n' # XXX give warc writer thread a chance to update stats time.sleep(2.0) From df38cf856d30b265e29db5a88e3f2d28518ff2ae Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 21 Aug 2015 00:27:30 +0000 Subject: [PATCH 027/146] rethinkdb for stats --- warcprox/main.py | 29 ++++---- warcprox/stats.py | 124 ++++++++++++++++++++++++++------ warcprox/tests/test_warcprox.py | 38 +++++++--- 3 files changed, 146 insertions(+), 45 deletions(-) diff --git a/warcprox/main.py b/warcprox/main.py index f6bf322..e7bfee2 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -56,11 +56,6 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos))) arg_parser.add_argument('--base32', dest='base32', action='store_true', default=False, help='write digests in Base32 instead of hex') - group = arg_parser.add_mutually_exclusive_group() - group.add_argument('--dedup-rethinkdb-url', dest='dedup_rethinkdb_url', - help='persistent deduplication rethink db url, e.g. rethinkdb://db0.foo.org,db0.foo.org:38015,db1.foo.org/warcprox/dedup') - group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', - default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication') arg_parser.add_argument('--stats-db-file', dest='stats_db_file', default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking') arg_parser.add_argument('-P', '--playback-port', dest='playback_port', @@ -68,6 +63,13 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', default='./warcprox-playback-index.db', help='playback index database file (only used if --playback-port is specified)') + group = arg_parser.add_mutually_exclusive_group() + group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', + default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication') + group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', + help='rethinkdb servers, used for dedup and stats if specified; e.g. 
db0.foo.org,db0.foo.org:38015,db1.foo.org') + arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="warcprox", + help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)') arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.version_str)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') @@ -112,23 +114,18 @@ def main(argv=sys.argv): logging.fatal(e) exit(1) - if args.dedup_rethinkdb_url: - m = re.fullmatch(r"rethinkdb://([^/]+)/([^/]+)/([^/]+)", args.dedup_rethinkdb_url) - if m: - servers = m.group(1).split(",") - db = m.group(2) - table = m.group(3) - dedup_db = warcprox.dedup.RethinkDedupDb(servers, db, table) - else: - logging.fatal("failed to parse dedup rethinkdb url %s", args.dedup_rethinkdb_url) - exit(1) + if args.rethinkdb_servers: + dedup_db = warcprox.dedup.RethinkDedupDb(args.rethinkdb_servers.split(","), args.rethinkdb_db) elif args.dedup_db_file in (None, '', '/dev/null'): logging.info('deduplication disabled') dedup_db = None else: dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file) - if args.stats_db_file in (None, '', '/dev/null'): + + if args.rethinkdb_servers: + stats_db = warcprox.stats.RethinkStatsDb(args.rethinkdb_servers.split(","), args.rethinkdb_db) + elif args.stats_db_file in (None, '', '/dev/null'): logging.info('statistics tracking disabled') stats_db = None else: diff --git a/warcprox/stats.py b/warcprox/stats.py index 6ad3ca4..f700cca 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -14,6 +14,29 @@ import logging import os import json from hanzo import warctools +import rethinkdb +r = rethinkdb +import random + +def _empty_bucket(bucket): + return { + "bucket": bucket, + "total": { + "urls": 0, + "wire_bytes": 0, + # "warc_bytes": 0, + }, + "new": { + "urls": 0, + "wire_bytes": 0, + # "warc_bytes": 0, + }, + "revisit": { + "urls": 0, + "wire_bytes": 0, + # "warc_bytes": 0, + }, + } class StatsDb: logger = logging.getLogger("warcprox.stats.StatsDb") @@ -35,25 +58,6 @@ class StatsDb: except: pass - def _empty_bucket(self): - return { - "total": { - "urls": 0, - "wire_bytes": 0, - # "warc_bytes": 0, - }, - "new": { - "urls": 0, - "wire_bytes": 0, - # "warc_bytes": 0, - }, - "revisit": { - "urls": 0, - "wire_bytes": 0, - # "warc_bytes": 0, - }, - } - def value(self, bucket0="__all__", bucket1=None, bucket2=None): if bucket0 in self.db: bucket0_stats = json.loads(self.db[bucket0].decode("utf-8")) @@ -81,7 +85,7 @@ class StatsDb: if bucket in self.db: bucket_stats = json.loads(self.db[bucket].decode("utf-8")) else: - bucket_stats = self._empty_bucket() + bucket_stats = _empty_bucket(bucket) bucket_stats["total"]["urls"] += 1 bucket_stats["total"]["wire_bytes"] += recorded_url.size @@ -95,3 +99,83 @@ class StatsDb: self.db[bucket] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8") +class RethinkStatsDb: + logger = logging.getLogger("warcprox.stats.RethinkStatsDb") + + def __init__(self, servers=["localhost"], db="warcprox", table="stats", shards=3, replicas=3): + self.servers = servers + self.db = db + self.table = table + self.shards = shards + self.replicas = replicas + self._ensure_db_table() + + # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py + # "Best practices: Managing connections: a connection per request" + def _random_server_connection(self): + server = random.choice(self.servers) + try: + host, port = server.split(":") + return r.connect(host=host, port=port) + except ValueError: + return 
r.connect(host=server)
+
+    def _ensure_db_table(self):
+        with self._random_server_connection() as conn:
+            dbs = r.db_list().run(conn)
+            if not self.db in dbs:
+                self.logger.info("creating rethinkdb database %s", repr(self.db))
+                r.db_create(self.db).run(conn)
+            tables = r.db(self.db).table_list().run(conn)
+            if not self.table in tables:
+                self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.db))
+                r.db(self.db).table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run(conn)
+
+    def close(self):
+        pass
+
+    def sync(self):
+        pass
+
+    def value(self, bucket0="__all__", bucket1=None, bucket2=None):
+        # XXX use pluck?
+        with self._random_server_connection() as conn:
+            bucket0_stats = r.db(self.db).table(self.table).get(bucket0).run(conn)
+            self.logger.debug('stats db lookup of bucket=%s returned %s', bucket0, bucket0_stats)
+            if bucket0_stats:
+                if bucket1:
+                    if bucket2:
+                        return bucket0_stats[bucket1][bucket2]
+                    else:
+                        return bucket0_stats[bucket1]
+            return bucket0_stats
+
+    def tally(self, recorded_url, records):
+        buckets = ["__all__"]
+
+        if (recorded_url.warcprox_meta
+                and "stats" in recorded_url.warcprox_meta
+                and "buckets" in recorded_url.warcprox_meta["stats"]):
+            buckets.extend(recorded_url.warcprox_meta["stats"]["buckets"])
+        else:
+            buckets.append("__unspecified__")
+
+        with self._random_server_connection() as conn:
+            for bucket in buckets:
+                bucket_stats = self.value(bucket) or _empty_bucket(bucket)
+
+                bucket_stats["total"]["urls"] += 1
+                bucket_stats["total"]["wire_bytes"] += recorded_url.size
+
+                if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT:
+                    bucket_stats["revisit"]["urls"] += 1
+                    bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
+                else:
+                    bucket_stats["new"]["urls"] += 1
+                    bucket_stats["new"]["wire_bytes"] += recorded_url.size
+
+                self.logger.info("saving %s", bucket_stats)
+                result = r.db(self.db).table(self.table).insert(bucket_stats, conflict="replace").run(conn)
+                if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
+                    raise Exception("unexpected result {} saving {}".format(result, bucket_stats))
+
diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py
index 5d9141c..2489807 100755
--- a/warcprox/tests/test_warcprox.py
+++ b/warcprox/tests/test_warcprox.py
@@ -135,7 +135,7 @@ def https_daemon(request, cert):
 def dedup_db(request, rethinkdb_servers):
     if rethinkdb_servers:
         servers = rethinkdb_servers.split(",")
-        db = 'warcprox_test_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
+        db = 'warcprox_test_dedup_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
         ddb = warcprox.dedup.RethinkDedupDb(servers, db)
     else:
         f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False)
@@ -157,7 +157,32 @@ def dedup_db(request, rethinkdb_servers):
     return ddb
 
 @pytest.fixture(scope="module")
-def warcprox_(request, dedup_db):
+def stats_db(request, rethinkdb_servers):
+    if rethinkdb_servers:
+        servers = rethinkdb_servers.split(",")
+        db = 'warcprox_test_stats_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
+        sdb = warcprox.stats.RethinkStatsDb(servers, db)
+    else:
+        f = tempfile.NamedTemporaryFile(prefix='warcprox-test-stats-', suffix='.db', delete=False)
+        f.close()
+        stats_db_file = f.name
+        sdb = warcprox.stats.StatsDb(stats_db_file)
+
+    def fin():
+        if rethinkdb_servers:
+            logging.info('dropping 
rethinkdb database {}'.format(db))
+            with sdb._random_server_connection() as conn:
+                result = r.db_drop(db).run(conn)
+            logging.info("result=%s", result)
+        else:
+            logging.info('deleting file {}'.format(stats_db_file))
+            os.unlink(stats_db_file)
+    request.addfinalizer(fin)
+
+    return sdb
+
+@pytest.fixture(scope="module")
+def warcprox_(request, dedup_db, stats_db):
     f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True)
     f.close() # delete it, or CertificateAuthority will try to read it
     ca_file = f.name
@@ -166,11 +191,6 @@ def warcprox_(request, dedup_db):
 
     recorded_url_q = queue.Queue()
 
-    f = tempfile.NamedTemporaryFile(prefix='warcprox-test-stats-', suffix='.db', delete=False)
-    f.close()
-    stats_db_file = f.name
-    stats_db = warcprox.stats.StatsDb(stats_db_file)
-
     proxy = warcprox.warcproxy.WarcProxy(server_address=('localhost', 0), ca=ca,
             recorded_url_q=recorded_url_q, stats_db=stats_db)
 
@@ -201,7 +221,7 @@ def warcprox_(request, dedup_db):
         logging.info('stopping warcprox')
         warcprox_.stop.set()
         warcprox_thread.join()
-        for f in (ca_file, ca_dir, warcs_dir, playback_index_db_file, stats_db_file):
+        for f in (ca_file, ca_dir, warcs_dir, playback_index_db_file):
             if os.path.isdir(f):
                 logging.info('deleting directory {}'.format(f))
                 shutil.rmtree(f)
@@ -433,7 +453,7 @@ def test_limits(http_daemon, archiving_proxies):
     response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 420
     assert response.reason == "Reached limit"
-    expected_response_meta = {'reached-limit': {'job1.total.urls': 10}, 'stats': {'job1': {'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}}
+    expected_response_meta = {'reached-limit': {'job1.total.urls': 10}, 'stats': {'job1': {'bucket': 'job1', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
     assert response.raw.data == b"request rejected by warcprox: reached limit job1.total.urls=10\n"

From f000d413a22de5a714641582c86ab6846623f3b8 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 21 Aug 2015 00:29:39 +0000
Subject: [PATCH 028/146] quiet stats logging

---
 warcprox/stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warcprox/stats.py b/warcprox/stats.py
index f700cca..38dabd6 100644
--- a/warcprox/stats.py
+++ b/warcprox/stats.py
@@ -174,7 +174,7 @@ class RethinkStatsDb:
                     bucket_stats["new"]["urls"] += 1
                     bucket_stats["new"]["wire_bytes"] += recorded_url.size
 
-                self.logger.info("saving %s", bucket_stats)
+                self.logger.debug("saving %s", bucket_stats)
                 result = r.db(self.db).table(self.table).insert(bucket_stats, conflict="replace").run(conn)
                 if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
                     raise Exception("unexpected result {} saving {}".format(result, bucket_stats))

From cc71c331a1095735a8d014b6eb628d932ea87983 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 24 Aug 2015 20:44:35 +0000
Subject: [PATCH 029/146] modify response headers from server, always send connection:close to proxy client

---
 warcprox/bigtable.py  | 47 +++++++++++++++++++++++++++++++
 warcprox/warcproxy.py | 64 +++++++++++++++++--------------------------
 2 files changed, 72 insertions(+), 39 deletions(-)
 create mode 100644 
warcprox/bigtable.py

diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py
new file mode 100644
index 0000000..60be77a
--- /dev/null
+++ b/warcprox/bigtable.py
@@ -0,0 +1,47 @@
+# vim:set sw=4 et:
+
+from __future__ import absolute_import
+
+import logging
+from hanzo import warctools
+import rethinkdb
+r = rethinkdb
+import random
+
+class RethinkCaptures:
+    logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
+
+    def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3):
+        self.servers = servers
+        self.db = db
+        self.table = table
+        self.shards = shards
+        self.replicas = replicas
+        self._ensure_db_table()
+
+    # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py
+    # "Best practices: Managing connections: a connection per request"
+    def _random_server_connection(self):
+        server = random.choice(self.servers)
+        try:
+            host, port = server.split(":")
+            return r.connect(host=host, port=port)
+        except ValueError:
+            return r.connect(host=server)
+
+    def _ensure_db_table(self):
+        with self._random_server_connection() as conn:
+            dbs = r.db_list().run(conn)
+            if not self.db in dbs:
+                self.logger.info("creating rethinkdb database %s", repr(self.db))
+                r.db_create(self.db).run(conn)
+            tables = r.db(self.db).table_list().run(conn)
+            if not self.table in tables:
+                self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.db))
+                r.db(self.db).table_create(self.table, shards=self.shards, replicas=self.replicas).run(conn)
+                r.db(self.db).table(self.table).index_create("abbr_canon_surt_timestamp", [r.row["abbr_canon_surt"], r.row["timestamp"]]).run(conn)
+                r.db(self.db).table(self.table).index_create("sha1_warc_type", [r.row["sha1base32"], r.row["warc_type"]]).run(conn)
+                # r.db(self.db).table_create(self.table, primary_key="canon_surt", shards=self.shards, replicas=self.replicas).run(conn)
+                # r.db(self.db).table(self.table).index_create("timestamp").run(conn)
+                # r.db(self.db).table(self.table).index_create("sha1base32").run(conn)
+
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index 6839ddc..21c72b8 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -63,35 +63,12 @@ class ProxyingRecorder(object):
         self.len = 0
         self.url = url
 
-    def _update_payload_digest(self, hunk):
-        if self.payload_digest is None:
-            # convoluted handling of two newlines crossing hunks
-            # XXX write tests for this
-            if self._prev_hunk_last_two_bytes.endswith(b'\n'):
-                if hunk.startswith(b'\n'):
-                    self.payload_digest = hashlib.new(self.digest_algorithm)
-                    self.payload_digest.update(hunk[1:])
-                    self.payload_offset = self.len + 1
-                elif hunk.startswith(b'\r\n'):
-                    self.payload_digest = hashlib.new(self.digest_algorithm)
-                    self.payload_digest.update(hunk[2:])
-                    self.payload_offset = self.len + 2
-            elif self._prev_hunk_last_two_bytes == b'\n\r':
-                if hunk.startswith(b'\n'):
-                    self.payload_digest = hashlib.new(self.digest_algorithm)
-                    self.payload_digest.update(hunk[1:])
-                    self.payload_offset = self.len + 1
-            else:
-                m = re.search(br'\n\r?\n', hunk)
-                if m is not None:
-                    self.payload_digest = hashlib.new(self.digest_algorithm)
-                    self.payload_digest.update(hunk[m.end():])
-                    self.payload_offset = self.len + m.end()
+    def payload_starts_now(self):
+        self.payload_digest = hashlib.new(self.digest_algorithm)
+        self.payload_offset = self.len
 
-            # if we still haven't found start of payload hold on to these bytes
-            if self.payload_digest is None:
-                self._prev_hunk_last_two_bytes = hunk[-2:]
-        else:
+    def _update_payload_digest(self, hunk):
+        if self.payload_digest: 
self.payload_digest.update(hunk) def _update(self, hunk): @@ -100,7 +77,7 @@ class ProxyingRecorder(object): self.tempfile.write(hunk) - if self._proxy_dest_conn_open: + if self.payload_digest and self._proxy_dest_conn_open: try: self.proxy_dest.sendall(hunk) except BaseException as e: @@ -147,6 +124,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1', url=None): http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method) + self.proxy_dest = proxy_dest self.url = url # Keep around extra reference to self.fp because HTTPResponse sets @@ -154,6 +132,19 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm, url=url) self.fp = self.recorder + def begin(self): + http_client.HTTPResponse.begin(self) # reads status line, headers + + status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(self.status, self.reason) + for k,v in self.msg.items(): + if k.lower() not in ('connection', 'proxy-connection', 'keep-alive', + 'proxy-authenticate', 'proxy-authorization', 'upgrade'): + status_and_headers += '{}: {}\r\n'.format(k, v) + status_and_headers += 'Connection: close\r\n\r\n' + self.proxy_dest.sendall(status_and_headers.encode('latin1')) + + self.recorder.payload_starts_now() + class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # self.server is WarcProxy @@ -207,7 +198,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items()) - req = req_str.encode('utf-8') + b'\r\n\r\n' + req = req_str.encode('latin1') + b'\r\n\r\n' # Append message body if present to the request if 'Content-Length' in self.headers: @@ -261,14 +252,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): prox_rec_res.close() self._proxy_sock.close() - # XXX Close connection to proxy client. Doing this because we were - # seeing some connection hangs and this seems to solve that problem. - # Not clear what the correct, optimal behavior is. One thing we - # should probably(?) do is add "Connection: close" to response - # headers. Right now the response is passed through blindly as raw - # bytes so it's not completely trivial to do that. 
-        self.connection.close()
 
         return recorded_url
 
     # deprecated
@@ -360,7 +343,7 @@ class RecordedUrl:
 
         self.response_recorder = None
 
-class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
+class SingleThreadedWarcProxy(http_server.HTTPServer):
     logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
 
     def __init__(self, server_address=('localhost', 8000),
@@ -394,3 +377,6 @@ class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
         self.logger.info('WarcProxy shutting down')
         http_server.HTTPServer.server_close(self)
+
+class WarcProxy(socketserver.ThreadingMixIn, SingleThreadedWarcProxy):
+    pass

From c430f81883d740c978e805226958d08f3bf1ca20 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 24 Aug 2015 23:53:11 +0000
Subject: [PATCH 030/146] some refactoring to prep for big rethinkdb capture
 table

---
 requirements.txt                |  4 +++
 setup.py                        |  2 --
 warcprox/__init__.py            | 27 ++++++++++------
 warcprox/bigtable.py            | 20 +++++++++++
 warcprox/controller.py          |  3 +-
 warcprox/dedup.py               | 22 +++++++++++--
 warcprox/main.py                | 55 +++++++++++++++++++--------------
 warcprox/playback.py            |  3 ++
 warcprox/stats.py               | 13 ++++++--
 warcprox/tests/test_warcprox.py |  3 +-
 warcprox/warcproxy.py           |  6 ++--
 warcprox/writer.py              | 25 +++++----------
 warcprox/writerthread.py        | 43 +++++---------------------
 13 files changed, 131 insertions(+), 95 deletions(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..dcc1f62
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+certauth>=1.1.0
+rethinkdb
+git+https://github.com/internetarchive/warctools.git
+git+https://github.com/nlevitt/surt.git@py3
diff --git a/setup.py b/setup.py
index e5b71d5..ab42452 100755
--- a/setup.py
+++ b/setup.py
@@ -47,8 +47,6 @@ setuptools.setup(name='warcprox',
         license='GPL',
         packages=['warcprox'],
         package_data={'warcprox':['version.txt']},
-        install_requires=['certauth>=1.1.0', 'warctools>=4.8.3', 'rethinkdb'], # gdbm not in pip :(
-        dependency_links=['git+https://github.com/internetarchive/warctools.git#egg=warctools-4.8.3'],
         tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
         cmdclass = {'test': PyTest},
         test_suite='warcprox.tests',
diff --git a/warcprox/__init__.py b/warcprox/__init__.py
index 7235056..4f8ad91 100644
--- a/warcprox/__init__.py
+++ b/warcprox/__init__.py
@@ -1,14 +1,6 @@
 # vim:set sw=4 et:
 
-import warcprox.controller as controller
-import warcprox.playback as playback
-import warcprox.dedup as dedup
-import warcprox.warcproxy as warcproxy
-import warcprox.mitmproxy as mitmproxy
-import warcprox.writer as writer
-import warcprox.warc as warc
-import warcprox.writerthread as writerthread
-import warcprox.stats as stats
+from argparse import Namespace as _Namespace
 
 def digest_str(hash_obj, base32):
     import base64
@@ -20,5 +12,22 @@ def _read_version_bytes():
     with open(version_txt, 'rb') as fin:
         return fin.read().strip()
 
+class Options(_Namespace):
+    def __getattr__(self, name):
+        try:
+            return super(Options, self).__getattr__(name)
+        except AttributeError:
+            return None
+
 version_bytes = _read_version_bytes().strip()
 version_str = version_bytes.decode('utf-8')
+
+import warcprox.controller as controller
+import warcprox.playback as playback
+import warcprox.dedup as dedup
+import warcprox.warcproxy as warcproxy
+import warcprox.mitmproxy as mitmproxy
+import warcprox.writer as writer
+import warcprox.warc as warc
+import warcprox.writerthread as writerthread
+import warcprox.stats as 
stats

diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py
index 60be77a..787aa9b 100644
--- a/warcprox/bigtable.py
+++ b/warcprox/bigtable.py
@@ -5,6 +5,9 @@
 import logging
+import os
+import re
+import surt
 from hanzo import warctools
 import rethinkdb
 r = rethinkdb
 import random
 
@@ -45,3 +48,20 @@ class RethinkCaptures:
             # r.db(self.db).table_create(self.table, primary_key="canon_surt", shards=self.shards, replicas=self.replicas).run(conn)
             # r.db(self.db).table(self.table).index_create("timestamp").run(conn)
             # r.db(self.db).table(self.table).index_create("sha1base32").run(conn)
+    def notify(self, recorded_url, records):
+        canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False)
+        entry = {
+            # id only specified for rethinkdb partitioning
+            "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
+            "abbr_canon_surt": canon_surt[:150],
+            "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")),
+            "url": records[0].url.decode("utf-8"),
+            "offset": records[0].offset,
+            "filename": os.path.basename(records[0].warc_filename),
+            "warc_type": records[0].type.decode("utf-8"),
+            "warc_id": records[0].id.decode("utf-8"),
+            "sha1base32": records[0].get_header(b'WARC-Payload-Digest').decode("utf-8")[5:],
+            # mimetype
+            # response_code
+            # http_method
+        }
diff --git a/warcprox/controller.py b/warcprox/controller.py
index ba73859..e198006 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -10,7 +10,7 @@ import warcprox
 class WarcproxController(object):
     logger = logging.getLogger("warcprox.controller.WarcproxController")
 
-    def __init__(self, proxy=None, warc_writer_thread=None, playback_proxy=None):
+    def __init__(self, proxy=None, warc_writer_thread=None, playback_proxy=None, options=warcprox.Options()):
         """
         Create warcprox controller.
 
@@ -32,6 +32,7 @@ class WarcproxController(object):
             self.warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=self.proxy.recorded_url_q)
 
         self.playback_proxy = playback_proxy
+        self.options = options
 
     def run_until_shutdown(self):
         """
diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index a715b01..adf2c44 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -22,13 +22,14 @@ import random
 class DedupDb(object):
     logger = logging.getLogger("warcprox.dedup.DedupDb")
 
-    def __init__(self, dbm_file='./warcprox-dedup.db'):
+    def __init__(self, dbm_file='./warcprox-dedup.db', options=warcprox.Options()):
         if os.path.exists(dbm_file):
             self.logger.info('opening existing deduplication database {}'.format(dbm_file))
         else:
             self.logger.info('creating new deduplication database {}'.format(dbm_file))
 
         self.db = dbm_gnu.open(dbm_file, 'c')
+        self.options = options
 
     def close(self):
         self.db.close()
@@ -61,6 +62,15 @@ class DedupDb(object):
         self.logger.debug('dedup db lookup of key=%s returning %s', key, result)
         return result
 
+    def notify(self, recorded_url, records):
+        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
+                and recorded_url.response_recorder.payload_size() > 0):
+            key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
+                    self.options.base32)
+            self.save(key, records[0])
+
+
+
 def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
     if recorded_url.response_recorder and recorded_url.response_recorder.payload_digest:
         key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
@@ -69,13 +79,14 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
 class RethinkDedupDb:
     logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
 
-    def __init__(self, servers=["localhost"], db="warcprox", table="dedup", shards=3, replicas=3):
+    def __init__(self, servers=["localhost"], db="warcprox", table="dedup", shards=3, replicas=3, options=warcprox.Options()):
         self.servers = servers
         self.db = db 
self.table = table self.shards = shards self.replicas = replicas self._ensure_db_table() + self.options = options # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py # "Best practices: Managing connections: a connection per request" @@ -125,3 +136,10 @@ class RethinkDedupDb: result[x] = result[x].encode("utf-8") self.logger.debug('dedup db lookup of key=%s returning %s', key, result) return result + + def notify(self, recorded_url, records): + if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE + and recorded_url.response_recorder.payload_size() > 0): + key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + self.options.base32) + self.save(key, records[0]) diff --git a/warcprox/main.py b/warcprox/main.py index e7bfee2..3a2d032 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -27,7 +27,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): description='warcprox - WARC writing MITM HTTP/S proxy', formatter_class=argparse.ArgumentDefaultsHelpFormatter) arg_parser.add_argument('-p', '--port', dest='port', default='8000', - help='port to listen on') + type=int, help='port to listen on') arg_parser.add_argument('-b', '--address', dest='address', default='localhost', help='address to listen on') arg_parser.add_argument('-c', '--cacert', dest='cacert', @@ -59,7 +59,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('--stats-db-file', dest='stats_db_file', default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking') arg_parser.add_argument('-P', '--playback-port', dest='playback_port', - default=None, help='port to listen on for instant playback') + type=int, default=None, help='port to listen on for instant playback') arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', default='./warcprox-playback-index.db', help='playback index database file (only used if --playback-port is specified)') @@ -70,6 +70,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): help='rethinkdb servers, used for dedup and stats if specified; e.g. 
db0.foo.org,db0.foo.org:38015,db1.foo.org') arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="warcprox", help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)') + arg_parser.add_argument('--rethinkdb-big-table', + dest='rethinkdb_big_table', action='store_true', default=False, + help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)') arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.version_str)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') @@ -97,6 +100,7 @@ def dump_state(signum=None, frame=None): def main(argv=sys.argv): arg_parser = _build_arg_parser(prog=os.path.basename(argv[0])) args = arg_parser.parse_args(args=argv[1:]) + options = warcprox.Options(**vars(args)) if args.verbose: loglevel = logging.DEBUG @@ -114,22 +118,31 @@ def main(argv=sys.argv): logging.fatal(e) exit(1) + listeners = [] if args.rethinkdb_servers: - dedup_db = warcprox.dedup.RethinkDedupDb(args.rethinkdb_servers.split(","), args.rethinkdb_db) + if args.rethinkdb_big_table: + captures_db = warcprox.bigtable.RethinkCaptures(args.rethinkdb_servers.split(","), args.rethinkdb_db, options=options) + dedup_db = warcprox.bigtable.RethinkCapturesDedup(bigtable, options=options) + listeners.append(captures_db) + else: + dedup_db = warcprox.dedup.RethinkDedupDb(args.rethinkdb_servers.split(","), args.rethinkdb_db, options=options) + listeners.append(dedup_db) elif args.dedup_db_file in (None, '', '/dev/null'): logging.info('deduplication disabled') dedup_db = None else: - dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file) - + dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options) + listeners.append(dedup_db) if args.rethinkdb_servers: - stats_db = warcprox.stats.RethinkStatsDb(args.rethinkdb_servers.split(","), args.rethinkdb_db) + stats_db = warcprox.stats.RethinkStatsDb(args.rethinkdb_servers.split(","), args.rethinkdb_db, options=options) + listeners.append(stats_db) elif args.stats_db_file in (None, '', '/dev/null'): logging.info('statistics tracking disabled') stats_db = None else: - stats_db = warcprox.stats.StatsDb(args.stats_db_file) + stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options) + listeners.append(stats_db) recorded_url_q = queue.Queue() @@ -138,33 +151,29 @@ def main(argv=sys.argv): ca_name=ca_name) proxy = warcprox.warcproxy.WarcProxy( - server_address=(args.address, int(args.port)), ca=ca, + server_address=(args.address, args.port), ca=ca, recorded_url_q=recorded_url_q, digest_algorithm=args.digest_algorithm, - stats_db=stats_db) + stats_db=stats_db, options=options) if args.playback_port is not None: - playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file) - playback_server_address=(args.address, int(args.playback_port)) - playback_proxy = warcprox.playback.PlaybackProxy(server_address=playback_server_address, - ca=ca, playback_index_db=playback_index_db, - warcs_dir=args.directory) + playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file, options=options) + playback_proxy = warcprox.playback.PlaybackProxy( + server_address=(args.address, args.playback_port), ca=ca, + playback_index_db=playback_index_db, warcs_dir=args.directory, + options=options) + listeners.append(playback_index_db) else: playback_index_db = None playback_proxy = None - 
default_warc_writer = warcprox.writer.WarcWriter(directory=args.directory, - gzip=args.gzip, prefix=args.prefix, port=int(args.port), - rollover_size=int(args.size), base32=args.base32, - digest_algorithm=args.digest_algorithm, - rollover_idle_time=int(args.rollover_idle_time) if args.rollover_idle_time is not None else None) - writer_pool=warcprox.writer.WarcWriterPool(default_warc_writer) + default_warc_writer = warcprox.writer.WarcWriter(args.prefix, options=options) + writer_pool = warcprox.writer.WarcWriterPool(default_warc_writer, options=options) warc_writer_thread = warcprox.writerthread.WarcWriterThread( recorded_url_q=recorded_url_q, writer_pool=writer_pool, - dedup_db=dedup_db, playback_index_db=playback_index_db, - stats_db=stats_db) + dedup_db=dedup_db, listeners=listeners, options=options) - controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) + controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy, options=options) signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) diff --git a/warcprox/playback.py b/warcprox/playback.py index 9fae6e1..3424337 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -217,6 +217,9 @@ class PlaybackIndexDb(object): except: pass + def notify(self, recorded_url, records): + self.save(records[0].warc_filename, records, records[0].offset) + def save(self, warcfile, recordset, offset): response_record = recordset[0] # XXX canonicalize url? diff --git a/warcprox/stats.py b/warcprox/stats.py index 38dabd6..d246d69 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -17,6 +17,7 @@ from hanzo import warctools import rethinkdb r = rethinkdb import random +import warcprox def _empty_bucket(bucket): return { @@ -41,13 +42,14 @@ def _empty_bucket(bucket): class StatsDb: logger = logging.getLogger("warcprox.stats.StatsDb") - def __init__(self, dbm_file='./warcprox-stats.db'): + def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()): if os.path.exists(dbm_file): self.logger.info('opening existing stats database {}'.format(dbm_file)) else: self.logger.info('creating new stats database {}'.format(dbm_file)) self.db = dbm_gnu.open(dbm_file, 'c') + self.options = options def close(self): self.db.close() @@ -71,6 +73,9 @@ class StatsDb: else: return None + def notify(self, recorded_url, records): + self.tally(recorded_url, records) + def tally(self, recorded_url, records): buckets = ["__all__"] @@ -102,13 +107,14 @@ class StatsDb: class RethinkStatsDb: logger = logging.getLogger("warcprox.stats.RethinkStatsDb") - def __init__(self, servers=["localhost"], db="warcprox", table="stats", shards=3, replicas=3): + def __init__(self, servers=["localhost"], db="warcprox", table="stats", shards=3, replicas=3, options=warcprox.Options()): self.servers = servers self.db = db self.table = table self.shards = shards self.replicas = replicas self._ensure_db_table() + self.options = options # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py # "Best practices: Managing connections: a connection per request" @@ -179,3 +185,6 @@ class RethinkStatsDb: if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: raise Exception("unexpected result %s saving %s", result, record) + def notify(self, recorded_url, records): + self.tally(recorded_url, records) + diff --git a/warcprox/tests/test_warcprox.py 
b/warcprox/tests/test_warcprox.py index 2489807..7477d05 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -208,8 +208,7 @@ def warcprox_(request, dedup_db, stats_db): writer_pool = warcprox.writer.WarcWriterPool(default_warc_writer) warc_writer_thread = warcprox.writerthread.WarcWriterThread( recorded_url_q=recorded_url_q, writer_pool=writer_pool, - dedup_db=dedup_db, playback_index_db=playback_index_db, - stats_db=stats_db) + dedup_db=dedup_db, listeners=[dedup_db, playback_index_db, stats_db]) warcprox_ = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) logging.info('starting warcprox') diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 21c72b8..ee3de49 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -39,7 +39,7 @@ import socket from hanzo import warctools from certauth.certauth import CertificateAuthority -import warcprox.mitmproxy +import warcprox class ProxyingRecorder(object): """ @@ -349,7 +349,7 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): def __init__(self, server_address=('localhost', 8000), req_handler_class=WarcProxyHandler, bind_and_activate=True, ca=None, recorded_url_q=None, digest_algorithm='sha1', - stats_db=None): + stats_db=None, options=warcprox.Options()): http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate) self.digest_algorithm = digest_algorithm @@ -369,6 +369,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): self.stats_db = stats_db + self.options = options + def server_activate(self): http_server.HTTPServer.server_activate(self) self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1])) diff --git a/warcprox/writer.py b/warcprox/writer.py index c8c1b44..6f58809 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -16,9 +16,10 @@ class WarcWriter: logger = logging.getLogger("warcprox.writer.WarcWriter") # port is only used for warc filename - def __init__(self, directory='./warcs', rollover_size=1000000000, - gzip=False, prefix='WARCPROX', port=0, digest_algorithm='sha1', - base32=False, rollover_idle_time=None): + def __init__(self, prefix='WARCPROX', directory='./warcs', + rollover_size=1000000000, gzip=False, port=0, + digest_algorithm='sha1', base32=False, rollover_idle_time=None, + options=warcprox.Options()): self.rollover_size = rollover_size self.rollover_idle_time = rollover_idle_time @@ -114,17 +115,14 @@ class WarcWriter: class WarcWriterPool: logger = logging.getLogger("warcprox.writer.WarcWriterPool") - def __init__(self, default_warc_writer=None): + def __init__(self, default_warc_writer=None, options=warcprox.Options()): if default_warc_writer: self.default_warc_writer = default_warc_writer else: - self.default_warc_writer = WarcWriter() + self.default_warc_writer = WarcWriter(options=options) self.warc_writers = {} # {prefix:WarcWriter} self._last_sync = time.time() - - self.logger.info('directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format( - os.path.abspath(self.default_warc_writer.directory), self.default_warc_writer.gzip, self.default_warc_writer.rollover_size, - self.default_warc_writer.rollover_idle_time, self.default_warc_writer.prefix, self.default_warc_writer.port)) + self.options = options # chooses writer for filename specified by warcprox_meta["warc-prefix"] if set def _writer(self, recorded_url): @@ -133,14 +131,7 @@ class WarcWriterPool: # self.logger.info("recorded_url.warcprox_meta={} for 
{}".format(recorded_url.warcprox_meta, recorded_url.url)) prefix = recorded_url.warcprox_meta["warc-prefix"] if not prefix in self.warc_writers: - self.warc_writers[prefix] = WarcWriter(prefix=prefix, - directory=self.default_warc_writer.directory, - rollover_size=self.default_warc_writer.rollover_size, - rollover_idle_time=self.default_warc_writer.rollover_idle_time, - gzip=self.default_warc_writer.gzip, - port=self.default_warc_writer.port, - digest_algorithm=self.default_warc_writer.record_builder.digest_algorithm, - base32=self.default_warc_writer.record_builder.base32) + self.warc_writers[prefix] = WarcWriter(prefix=prefix, options=self.options) w = self.warc_writers[prefix] return w diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index df70d63..a766f6c 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -22,7 +22,7 @@ import warcprox class WarcWriterThread(threading.Thread): logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread") - def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, playback_index_db=None, stats_db=None): + def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, listeners=None, options=warcprox.Options()): """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl.""" threading.Thread.__init__(self, name='WarcWriterThread') self.recorded_url_q = recorded_url_q @@ -32,9 +32,8 @@ class WarcWriterThread(threading.Thread): else: self.writer_pool = WarcWriterPool() self.dedup_db = dedup_db - self.playback_index_db = playback_index_db - self.stats_db = stats_db - self._last_sync = time.time() + self.listeners = listeners + self.options = options def run(self): try: @@ -42,40 +41,18 @@ class WarcWriterThread(threading.Thread): try: recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) if self.dedup_db: - warcprox.dedup.decorate_with_dedup_info(self.dedup_db, recorded_url, - base32=self.writer_pool.default_warc_writer.record_builder.base32) + warcprox.dedup.decorate_with_dedup_info(self.dedup_db, + recorded_url, base32=self.options.base32) records = self.writer_pool.write_records(recorded_url) self._final_tasks(recorded_url, records) except queue.Empty: self.writer_pool.maybe_idle_rollover() - self._sync() self.logger.info('WarcWriterThread shutting down') self.writer_pool.close_writers() except: self.logger.critical("WarcWriterThread shutting down after unexpected error", exc_info=True) - def _sync(self): - # XXX prob doesn't belong here (do we need it at all?) 
- if time.time() - self._last_sync > 60: - if self.dedup_db: - self.dedup_db.sync() - if self.playback_index_db: - self.playback_index_db.sync() - self._last_sync = time.time() - - def _save_dedup_info(self, recorded_url, records): - if (self.dedup_db - and records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE - and recorded_url.response_recorder.payload_size() > 0): - key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, - self.writer_pool.default_warc_writer.record_builder.base32) - self.dedup_db.save(key, records[0]) - - def _save_playback_info(self, recorded_url, records): - if self.playback_index_db is not None: - self.playback_index_db.save(records[0].warc_filename, records, records[0].offset) - # closest thing we have to heritrix crawl log at the moment def _log(self, recorded_url, records): def _decode(x): @@ -107,12 +84,8 @@ class WarcWriterThread(threading.Thread): _decode(records[0].warc_filename), records[0].offset)) - def _update_stats(self, recorded_url, records): - if self.stats_db: - self.stats_db.tally(recorded_url, records) - def _final_tasks(self, recorded_url, records): - self._save_dedup_info(recorded_url, records) - self._save_playback_info(recorded_url, records) - self._update_stats(recorded_url, records) + if self.listeners: + for listener in self.listeners: + listener.notify(recorded_url, records) self._log(recorded_url, records) From ab4e90c4b865468ba33341d0b8c796d94daa306d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 25 Aug 2015 00:04:07 +0000 Subject: [PATCH 031/146] make warc-date follow warc spec "timestamp shall represent the instant that data capture for record creation began" --- warcprox/warc.py | 8 ++++---- warcprox/warcproxy.py | 19 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/warcprox/warc.py b/warcprox/warc.py index eaeeedf..1c535ae 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -9,7 +9,7 @@ import socket import hanzo.httptools from hanzo import warctools import warcprox -from datetime import datetime +import datetime class WarcRecordBuilder: logger = logging.getLogger("warcprox.warc.WarcRecordBuilder") @@ -50,7 +50,7 @@ class WarcRecordBuilder: def build_warc_records(self, recorded_url): """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)""" - warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) + warc_date = warctools.warc.warc_datetime_str(recorded_url.timestamp) if recorded_url.response_recorder: principal_record = self._build_response_principal_record(recorded_url, warc_date) @@ -73,7 +73,7 @@ class WarcRecordBuilder: refers_to_date=None, payload_digest=None): if warc_date is None: - warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) + warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow()) record_id = warctools.WarcRecord.random_warc_uuid() @@ -123,7 +123,7 @@ class WarcRecordBuilder: return record def build_warcinfo_record(self, filename): - warc_record_date = warctools.warc.warc_datetime_str(datetime.utcnow()) + warc_record_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow()) record_id = warctools.WarcRecord.random_warc_uuid() headers = [] diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index ee3de49..84702dc 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -13,22 +13,18 @@ try: import http.server as http_server except ImportError: import BaseHTTPServer as http_server - try: import socketserver except ImportError: import SocketServer 
as socketserver - try: import queue except ImportError: import Queue as queue - try: import http.client as http_client except ImportError: import httplib as http_client - import logging import re import tempfile @@ -37,9 +33,9 @@ import hashlib import json import socket from hanzo import warctools - from certauth.certauth import CertificateAuthority import warcprox +import datetime class ProxyingRecorder(object): """ @@ -208,6 +204,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): try: self.logger.debug('sending to remote server req=%s', repr(req)) + # warc-date "shall represent the instant that data capture for record creation began" + timestamp = datetime.datetime.utcnow() + # Send it down the pipe! self._proxy_sock.sendall(req) @@ -238,7 +237,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): status=prox_rec_res.status, size=prox_rec_res.recorder.len, client_ip=self.client_address[0], content_type=prox_rec_res.getheader("Content-Type"), - method=self.command) + method=self.command, timestamp=timestamp) self.server.recorded_url_q.put(recorded_url) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) @@ -264,6 +263,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): if ('Content-Length' in self.headers and 'Content-Type' in self.headers and (warc_type or 'WARC-Type' in self.headers)): + timestamp = datetime.datetime.utcnow() + # stream this? request_data = self.rfile.read(int(self.headers['Content-Length'])) @@ -281,7 +282,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): custom_type=warc_type or self.headers['WARC-Type'], status=204, size=len(request_data), client_ip=self.client_address[0], - method=self.command) + method=self.command, timestamp=timestamp) self.server.recorded_url_q.put(rec_custom) self.send_response(204, 'OK') @@ -307,7 +308,8 @@ class RecordedUrl: def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None, content_type=None, custom_type=None, - status=None, size=None, client_ip=None, method=None): + status=None, size=None, client_ip=None, method=None, + timestamp=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) 
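The upshot of this patch is that WARC-Date now reflects the instant data capture began rather than the instant the record was serialized, which can differ by however long the recorded url sits in the queue. A condensed sketch of the flow, using warctools' warc_datetime_str as in the diffs above:

    import datetime
    from hanzo import warctools

    timestamp = datetime.datetime.utcnow()   # taken just before the request is sent
    # ... response is recorded, recorded url waits in the writer queue ...
    warc_date = warctools.warc.warc_datetime_str(timestamp)
    # previously the date was stamped at record-building time instead:
    # warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())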
if type(url) is not bytes: @@ -335,6 +337,7 @@ class RecordedUrl: self.size = size self.client_ip = client_ip self.method = method + self.timestamp = timestamp def __del__(self): self.logger.debug("finished with %s", self) From 6d673ee35f1e7ae3ce00724fd8f1c07c524de848 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 25 Aug 2015 01:26:51 +0000 Subject: [PATCH 032/146] tests pass with big rethinkdb captures table --- warcprox/__init__.py | 1 + warcprox/bigtable.py | 93 ++++++++++++++++++++++++++------- warcprox/dedup.py | 3 +- warcprox/main.py | 2 +- warcprox/tests/conftest.py | 8 +++ warcprox/tests/test_warcprox.py | 61 ++++++++++++++++----- 6 files changed, 135 insertions(+), 33 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 4f8ad91..994b919 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -31,3 +31,4 @@ import warcprox.writer as writer import warcprox.warc as warc import warcprox.writerthread as writerthread import warcprox.stats as stats +import warcprox.bigtable as bigtable diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 787aa9b..f1494d6 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -7,16 +7,21 @@ from hanzo import warctools import rethinkdb r = rethinkdb import random +import warcprox +import base64 +import surt +import os class RethinkCaptures: - logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") + logger = logging.getLogger("warcprox.dedup.RethinkCaptures") - def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3): + def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3, options=warcprox.Options()): self.servers = servers self.db = db self.table = table self.shards = shards self.replicas = replicas + self.options = options self._ensure_db_table() # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py @@ -38,27 +43,79 @@ class RethinkCaptures: tables = r.db(self.db).table_list().run(conn) if not self.table in tables: self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.db)) - r.db(db).table_create(table, shards=3, replicas=3).run(conn) - r.db(db).table(table).index_create("abbr_canon_surt_timesamp", [r.row["abbr_canon_surt"], r.row["timestamp"]]).run(conn) - r.db(db).table(table).index_create("sha1_warc_type", [r.row["sha1base32"], r.row["warc_type"]]).run(conn) - # r.db(self.db).table_create(self.table, primary_key="canon_surt", shards=self.shards, replicas=self.replicas).run(conn) + r.db(self.db).table_create(self.table, shards=self.shards, replicas=self.replicas).run(conn) + r.db(self.db).table(self.table).index_create("abbr_canon_surt_timesamp", [r.row["abbr_canon_surt"], r.row["timestamp"]]).run(conn) + r.db(self.db).table(self.table).index_create("sha1_warc_type", [r.row["sha1base32"], r.row["warc_type"]]).run(conn) + # r.dself.b(self.db).table_create(self.table, primary_key="canon_surt", shards=self.shards, replicas=self.replicas).run(conn) # r.db(self.db).table(self.table).index_create("timestamp").run(conn) # r.db(self.db).table(self.table).index_create("sha1base32").run(conn) + def find_response_by_digest(self, algo, raw_digest): + if algo != "sha1": + raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo)) + sha1base32 = base64.b32encode(raw_digest).decode("utf-8") + with self._random_server_connection() as conn: + cursor = r.db(self.db).table(self.table).get_all([sha1base32, "response"], 
index="sha1_warc_type").run(conn) + results = list(cursor) + if len(results) > 1: + raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32) + elif len(results) == 1: + result = results[0] + else: + result = None + self.logger.info("returning %s for sha1base32=%s", result, sha1base32) + return result + def notify(self, recorded_url, records): - canon_surt = surt.surt(recorded_url.url, trailing_comma=True, host_massage=False) + if recorded_url.response_recorder.payload_digest.name != "sha1": + self.logger.warn("digest type is %s but big capture table is indexed by sha1", recorded_url.response_recorder.payload_digest.name) + + canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False) entry = { # id only specified for rethinkdb partitioning - "id": "{} {}".format(canon_surt[:20], record.id.decode("utf-8")[10:-1]), + "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), "abbr_canon_surt": canon_surt[:150], - "timestamp": re.sub(r"[^0-9]", "", record.date.decode("utf-8")), - "url": record.url.decode("utf-8"), - "offset": offset, - "filename": os.path.basename(warc_file), - "warc_type": record.type.decode("utf-8"), - "warc_id": record.id.decode("utf-8"), - "sha1base32": record.get_header(b'WARC-Payload-Digest').decode("utf-8")[5:], - # mimetype - # response_code - # http_method + # "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")), + "timestamp": records[0].date.decode("utf-8"), + "url": recorded_url.url.decode("utf-8"), + "offset": records[0].offset, + "filename": os.path.basename(records[0].warc_filename), + "warc_type": records[0].type.decode("utf-8"), + "warc_id": records[0].id.decode("utf-8"), + "sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"), + "content_type": recorded_url.content_type, + "response_code": recorded_url.status, + "http_method": recorded_url.method, } + + with self._random_server_connection() as conn: + result = r.db(self.db).table(self.table).insert(entry).run(conn) + if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]: + raise Exception("unexpected result %s saving %s", result, entry) + self.logger.info('big capture table db saved %s', entry) + +class RethinkCapturesDedup: + logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") + + def __init__(self, captures_db, options=warcprox.Options()): + self.captures_db = captures_db + self.options = options + + def lookup(self, digest_key): + k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key + algo, value_str = k.split(":") + self.logger.info("(algo,value_str)=(%s,%s)", algo, value_str) + if self.options.base32: + raw_digest = base64.b32decode(value_str, casefold=True) + else: + raw_digest = base64.b16decode(value_str, casefold=True) + entry = self.captures_db.find_response_by_digest(algo, raw_digest) + if entry: + dedup_info = {"url":entry["url"].encode("utf-8"), "date":entry["timestamp"].encode("utf-8"), "id":entry["warc_id"].encode("utf-8")} + self.logger.info("returning %s for digest_key=%s", dedup_info, digest_key) + return dedup_info + else: + return None + + def close(self): + pass diff --git a/warcprox/dedup.py b/warcprox/dedup.py index adf2c44..44c5503 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -70,7 +70,6 @@ class DedupDb(object): self.save(key, records[0]) - def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): if recorded_url.response_recorder and 
recorded_url.response_recorder.payload_digest: key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) @@ -123,7 +122,7 @@ class RethinkDedupDb: record = {'key':k,'url':url,'date':date,'id':record_id} with self._random_server_connection() as conn: result = r.db(self.db).table(self.table).insert(record,conflict="replace").run(conn) - if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: + if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: raise Exception("unexpected result %s saving %s", result, record) self.logger.debug('dedup db saved %s:%s', key, record) diff --git a/warcprox/main.py b/warcprox/main.py index 3a2d032..eb20db6 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -122,7 +122,7 @@ def main(argv=sys.argv): if args.rethinkdb_servers: if args.rethinkdb_big_table: captures_db = warcprox.bigtable.RethinkCaptures(args.rethinkdb_servers.split(","), args.rethinkdb_db, options=options) - dedup_db = warcprox.bigtable.RethinkCapturesDedup(bigtable, options=options) + dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db, options=options) listeners.append(captures_db) else: dedup_db = warcprox.dedup.RethinkDedupDb(args.rethinkdb_servers.split(","), args.rethinkdb_db, options=options) diff --git a/warcprox/tests/conftest.py b/warcprox/tests/conftest.py index db27210..f417fed 100644 --- a/warcprox/tests/conftest.py +++ b/warcprox/tests/conftest.py @@ -4,8 +4,16 @@ import pytest def pytest_addoption(parser): parser.addoption('--rethinkdb-servers', dest='rethinkdb_servers', help='rethink db servers for dedup, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org') + parser.addoption('--rethinkdb-big-table', + dest='rethinkdb_big_table', action='store_true', default=False, + help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)') @pytest.fixture(scope="module") def rethinkdb_servers(request): return request.config.getoption("--rethinkdb-servers") +@pytest.fixture(scope="module") +def rethinkdb_big_table(request): + return request.config.getoption("--rethinkdb-big-table") + + diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 7477d05..477ce6f 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -131,25 +131,62 @@ def https_daemon(request, cert): return https_daemon +# @pytest.fixture(scope="module") +# def options(request): +# return warcprox.Options(base32=True) + @pytest.fixture(scope="module") -def dedup_db(request, rethinkdb_servers): +def captures_db(request, rethinkdb_servers, rethinkdb_big_table): + captures_db = None if rethinkdb_servers: servers = rethinkdb_servers.split(",") - db = 'warcprox_test_dedup_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) - ddb = warcprox.dedup.RethinkDedupDb(servers, db) - else: + if rethinkdb_big_table: + db = 'warcprox_test_captures_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) + captures_db = warcprox.bigtable.RethinkCaptures(servers, db) + + def fin(): + if captures_db: + logging.info('dropping rethinkdb database {}'.format(db)) + with captures_db._random_server_connection() as conn: + result = r.db_drop(db).run(conn) + logging.info("result=%s", result) + request.addfinalizer(fin) + + return captures_db + +@pytest.fixture(scope="module") +def 
rethink_dedup_db(request, rethinkdb_servers, captures_db): + ddb = None + if rethinkdb_servers: + if captures_db: + ddb = warcprox.bigtable.RethinkCapturesDedup(captures_db) + else: + servers = rethinkdb_servers.split(",") + db = 'warcprox_test_dedup_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) + ddb = warcprox.dedup.RethinkDedupDb(servers, db) + + def fin(): + if not captures_db: + logging.info('dropping rethinkdb database {}'.format(db)) + with ddb._random_server_connection() as conn: + result = r.db_drop(db).run(conn) + logging.info("result=%s", result) + request.addfinalizer(fin) + + return ddb + +@pytest.fixture(scope="module") +def dedup_db(request, rethink_dedup_db): + dedup_db_file = None + ddb = rethink_dedup_db + if not ddb: f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False) f.close() dedup_db_file = f.name ddb = warcprox.dedup.DedupDb(dedup_db_file) def fin(): - if rethinkdb_servers: - logging.info('dropping rethinkdb database {}'.format(db)) - with ddb._random_server_connection() as conn: - result = r.db_drop(db).run(conn) - logging.info("result=%s", result) - else: + if dedup_db_file: logging.info('deleting file {}'.format(dedup_db_file)) os.unlink(dedup_db_file) request.addfinalizer(fin) @@ -182,7 +219,7 @@ def stats_db(request, rethinkdb_servers): return sdb @pytest.fixture(scope="module") -def warcprox_(request, dedup_db, stats_db): +def warcprox_(request, captures_db, dedup_db, stats_db): f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True) f.close() # delete it, or CertificateAuthority will try to read it ca_file = f.name @@ -208,7 +245,7 @@ def warcprox_(request, dedup_db, stats_db): writer_pool = warcprox.writer.WarcWriterPool(default_warc_writer) warc_writer_thread = warcprox.writerthread.WarcWriterThread( recorded_url_q=recorded_url_q, writer_pool=writer_pool, - dedup_db=dedup_db, listeners=[dedup_db, playback_index_db, stats_db]) + dedup_db=dedup_db, listeners=[captures_db or dedup_db, playback_index_db, stats_db]) warcprox_ = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) logging.info('starting warcprox') From 44a62111fb90c9c20f9fad285261d809678d2052 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 27 Aug 2015 20:09:21 +0000 Subject: [PATCH 033/146] support for deduplication buckets specified in warcprox-meta header {"captures-bucket":...,...} --- warcprox/bigtable.py | 24 +++-- warcprox/dedup.py | 42 +++++--- warcprox/main.py | 12 +-- warcprox/playback.py | 12 ++- warcprox/tests/test_warcprox.py | 168 +++++++++++++++++++++++++++----- warcprox/warcproxy.py | 10 +- warcprox/writer.py | 43 ++++---- warcprox/writerthread.py | 3 + 8 files changed, 226 insertions(+), 88 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index f1494d6..aecb4ed 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -45,17 +45,14 @@ class RethinkCaptures: self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.db)) r.db(self.db).table_create(self.table, shards=self.shards, replicas=self.replicas).run(conn) r.db(self.db).table(self.table).index_create("abbr_canon_surt_timesamp", [r.row["abbr_canon_surt"], r.row["timestamp"]]).run(conn) - r.db(self.db).table(self.table).index_create("sha1_warc_type", [r.row["sha1base32"], r.row["warc_type"]]).run(conn) - # r.dself.b(self.db).table_create(self.table, primary_key="canon_surt", shards=self.shards, replicas=self.replicas).run(conn) - # 
r.db(self.db).table(self.table).index_create("timestamp").run(conn) - # r.db(self.db).table(self.table).index_create("sha1base32").run(conn) + r.db(self.db).table(self.table).index_create("sha1_warc_type", [r.row["sha1base32"], r.row["warc_type"], r.row["bucket"]]).run(conn) - def find_response_by_digest(self, algo, raw_digest): + def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): if algo != "sha1": raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo)) sha1base32 = base64.b32encode(raw_digest).decode("utf-8") with self._random_server_connection() as conn: - cursor = r.db(self.db).table(self.table).get_all([sha1base32, "response"], index="sha1_warc_type").run(conn) + cursor = r.db(self.db).table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run(conn) results = list(cursor) if len(results) > 1: raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32) @@ -67,9 +64,17 @@ class RethinkCaptures: return result def notify(self, recorded_url, records): + if not recorded_url.response_recorder: + return + if recorded_url.response_recorder.payload_digest.name != "sha1": self.logger.warn("digest type is %s but big capture table is indexed by sha1", recorded_url.response_recorder.payload_digest.name) + if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + bucket = recorded_url.warcprox_meta["captures-bucket"] + else: + bucket = "__unspecified__" + canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False) entry = { # id only specified for rethinkdb partitioning @@ -86,13 +91,14 @@ class RethinkCaptures: "content_type": recorded_url.content_type, "response_code": recorded_url.status, "http_method": recorded_url.method, + "bucket": bucket, } with self._random_server_connection() as conn: result = r.db(self.db).table(self.table).insert(entry).run(conn) if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]: raise Exception("unexpected result %s saving %s", result, entry) - self.logger.info('big capture table db saved %s', entry) + self.logger.info("big capture table db saved %s", entry) class RethinkCapturesDedup: logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") @@ -101,7 +107,7 @@ class RethinkCapturesDedup: self.captures_db = captures_db self.options = options - def lookup(self, digest_key): + def lookup(self, digest_key, bucket="__unspecified__"): k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key algo, value_str = k.split(":") self.logger.info("(algo,value_str)=(%s,%s)", algo, value_str) @@ -109,7 +115,7 @@ class RethinkCapturesDedup: raw_digest = base64.b32decode(value_str, casefold=True) else: raw_digest = base64.b16decode(value_str, casefold=True) - entry = self.captures_db.find_response_by_digest(algo, raw_digest) + entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket) if entry: dedup_info = {"url":entry["url"].encode("utf-8"), "date":entry["timestamp"].encode("utf-8"), "id":entry["warc_id"].encode("utf-8")} self.logger.info("returning %s for digest_key=%s", dedup_info, digest_key) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 44c5503..7148773 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -40,19 +40,22 @@ class DedupDb(object): except: pass - def save(self, key, response_record): + def save(self, digest_key, response_record, bucket=""): record_id = 
response_record.get_header(warctools.WarcRecord.ID).decode('latin1') url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') + key = digest_key + b"|" + bucket.encode("utf-8") + py_value = {'id':record_id, 'url':url, 'date':date} json_value = json.dumps(py_value, separators=(',',':')) self.db[key] = json_value.encode('utf-8') self.logger.debug('dedup db saved %s:%s', key, json_value) - def lookup(self, key): + def lookup(self, digest_key, bucket=""): result = None + key = digest_key + b"|" + bucket.encode("utf-8") if key in self.db: json_result = self.db[key] result = json.loads(json_result.decode('utf-8')) @@ -65,15 +68,21 @@ class DedupDb(object): def notify(self, recorded_url, records): if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE and recorded_url.response_recorder.payload_size() > 0): - key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.options.base32) - self.save(key, records[0]) + if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) + else: + self.save(digest_key, records[0]) def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): if recorded_url.response_recorder and recorded_url.response_recorder.payload_digest: - key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) - recorded_url.dedup_info = dedup_db.lookup(key) + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) + if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"]) + else: + recorded_url.dedup_info = dedup_db.lookup(digest_key) class RethinkDedupDb: logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") @@ -114,8 +123,9 @@ class RethinkDedupDb: def sync(self): pass - def save(self, key, response_record): - k = key.decode("utf-8") if isinstance(key, bytes) else key + def save(self, digest_key, response_record, bucket=""): + k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key + k = "{}|{}".format(k, bucket) record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') @@ -124,21 +134,25 @@ class RethinkDedupDb: result = r.db(self.db).table(self.table).insert(record,conflict="replace").run(conn) if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: raise Exception("unexpected result %s saving %s", result, record) - self.logger.debug('dedup db saved %s:%s', key, record) + self.logger.debug('dedup db saved %s:%s', k, record) - def lookup(self, key): - k = key.decode("utf-8") if isinstance(key, bytes) else key + def lookup(self, digest_key, bucket=""): + k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key + k = "{}|{}".format(k, bucket) with self._random_server_connection() as conn: result = r.db(self.db).table(self.table).get(k).run(conn) if result: for x in result: result[x] = result[x].encode("utf-8") - self.logger.debug('dedup db lookup of key=%s returning %s', key, 
result) + self.logger.debug('dedup db lookup of key=%s returning %s', k, result) return result def notify(self, recorded_url, records): if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE and recorded_url.response_recorder.payload_size() > 0): - key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.options.base32) - self.save(key, records[0]) + if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) + else: + self.save(digest_key, records[0]) diff --git a/warcprox/main.py b/warcprox/main.py index eb20db6..397f4db 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -43,10 +43,10 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('-n', '--prefix', dest='prefix', default='WARCPROX', help='WARC filename prefix') arg_parser.add_argument('-s', '--size', dest='size', - default=1000*1000*1000, + default=1000*1000*1000, type=int, help='WARC file rollover size threshold in bytes') arg_parser.add_argument('--rollover-idle-time', - dest='rollover_idle_time', default=None, + dest='rollover_idle_time', default=None, type=int, help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)") try: hash_algos = hashlib.algorithms_guaranteed @@ -150,10 +150,7 @@ def main(argv=sys.argv): ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir, ca_name=ca_name) - proxy = warcprox.warcproxy.WarcProxy( - server_address=(args.address, args.port), ca=ca, - recorded_url_q=recorded_url_q, - digest_algorithm=args.digest_algorithm, + proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q, stats_db=stats_db, options=options) if args.playback_port is not None: @@ -167,8 +164,7 @@ def main(argv=sys.argv): playback_index_db = None playback_proxy = None - default_warc_writer = warcprox.writer.WarcWriter(args.prefix, options=options) - writer_pool = warcprox.writer.WarcWriterPool(default_warc_writer, options=options) + writer_pool = warcprox.writer.WarcWriterPool(options=options) warc_writer_thread = warcprox.writerthread.WarcWriterThread( recorded_url_q=recorded_url_q, writer_pool=writer_pool, dedup_db=dedup_db, listeners=listeners, options=options) diff --git a/warcprox/playback.py b/warcprox/playback.py index 3424337..30a5cb8 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -27,6 +27,7 @@ import json import traceback import re from warcprox.mitmproxy import MitmProxyHandler +import warcprox class PlaybackProxyHandler(MitmProxyHandler): logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler") @@ -180,13 +181,14 @@ class PlaybackProxyHandler(MitmProxyHandler): class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): logger = logging.getLogger("warcprox.playback.PlaybackProxy") - def __init__(self, server_address, req_handler_class=PlaybackProxyHandler, - bind_and_activate=True, ca=None, playback_index_db=None, - warcs_dir=None): - http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate) + + def __init__(self, ca=None, playback_index_db=None, options=warcprox.Options()): + server_address = (options.address or 'localhost', options.playback_port if options.playback_port is not None else 8001) + http_server.HTTPServer.__init__(self, 
server_address, PlaybackProxyHandler, bind_and_activate=True) self.ca = ca self.playback_index_db = playback_index_db - self.warcs_dir = warcs_dir + self.warcs_dir = options.directory + self.options = options def server_activate(self): http_server.HTTPServer.server_activate(self) diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 477ce6f..e588754 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -18,6 +18,7 @@ import json import rethinkdb r = rethinkdb import random +from hanzo import warctools try: import http.server as http_server @@ -166,11 +167,12 @@ def rethink_dedup_db(request, rethinkdb_servers, captures_db): ddb = warcprox.dedup.RethinkDedupDb(servers, db) def fin(): - if not captures_db: - logging.info('dropping rethinkdb database {}'.format(db)) - with ddb._random_server_connection() as conn: - result = r.db_drop(db).run(conn) - logging.info("result=%s", result) + if rethinkdb_servers: + if not captures_db: + logging.info('dropping rethinkdb database {}'.format(db)) + with ddb._random_server_connection() as conn: + result = r.db_drop(db).run(conn) + logging.info("result=%s", result) request.addfinalizer(fin) return ddb @@ -228,26 +230,27 @@ def warcprox_(request, captures_db, dedup_db, stats_db): recorded_url_q = queue.Queue() - proxy = warcprox.warcproxy.WarcProxy(server_address=('localhost', 0), ca=ca, - recorded_url_q=recorded_url_q, stats_db=stats_db) + options = warcprox.Options(port=0, playback_port=0) + proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q, + stats_db=stats_db, options=options) + options.port = proxy.server_port - warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-') + options.directory = tempfile.mkdtemp(prefix='warcprox-test-warcs-') f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False) f.close() playback_index_db_file = f.name playback_index_db = warcprox.playback.PlaybackIndexDb(playback_index_db_file) - playback_proxy = warcprox.playback.PlaybackProxy(server_address=('localhost', 0), ca=ca, - playback_index_db=playback_index_db, warcs_dir=warcs_dir) + playback_proxy = warcprox.playback.PlaybackProxy(ca=ca, + playback_index_db=playback_index_db, options=options) + options.playback_proxy = playback_proxy.server_port - default_warc_writer = warcprox.writer.WarcWriter(directory=warcs_dir, - port=proxy.server_port) - writer_pool = warcprox.writer.WarcWriterPool(default_warc_writer) + writer_pool = warcprox.writer.WarcWriterPool(options) warc_writer_thread = warcprox.writerthread.WarcWriterThread( recorded_url_q=recorded_url_q, writer_pool=writer_pool, dedup_db=dedup_db, listeners=[captures_db or dedup_db, playback_index_db, stats_db]) - warcprox_ = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) + warcprox_ = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy, options) logging.info('starting warcprox') warcprox_thread = threading.Thread(name='WarcproxThread', target=warcprox_.run_until_shutdown) @@ -257,7 +260,7 @@ def warcprox_(request, captures_db, dedup_db, stats_db): logging.info('stopping warcprox') warcprox_.stop.set() warcprox_thread.join() - for f in (ca_file, ca_dir, warcs_dir, playback_index_db_file): + for f in (ca_file, ca_dir, options.directory, playback_index_db_file): if os.path.isdir(f): logging.info('deleting directory {}'.format(f)) shutil.rmtree(f) @@ -394,8 +397,9 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, 
playback_proxies) assert response.headers['warcprox-test-header'] == 'e!' assert response.content == b'I am the warcprox test payload! ffffffffff!\n' - # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ - time.sleep(2.0) + # wait for writer thread to process + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) # check in dedup db (no change from prev) dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') @@ -455,8 +459,9 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.headers['warcprox-test-header'] == 'g!' assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' - # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ - time.sleep(2.0) + # wait for writer thread to process + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) # check in dedup db (no change from prev) dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') @@ -472,7 +477,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' # XXX how to check dedup was used? -def test_limits(http_daemon, archiving_proxies): +def test_limits(http_daemon, warcprox_, archiving_proxies): url = 'http://localhost:{}/i/j'.format(http_daemon.server_port) request_meta = {"stats":{"buckets":["job1"]},"limits":{"job1.total.urls":10}} headers = {"Warcprox-Meta": json.dumps(request_meta)} @@ -483,8 +488,9 @@ def test_limits(http_daemon, archiving_proxies): assert response.headers['warcprox-test-header'] == 'i!' assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n' - # XXX give warc writer thread a chance to update stats - time.sleep(2.0) + # wait for writer thread to process + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 @@ -494,6 +500,124 @@ def test_limits(http_daemon, archiving_proxies): assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached limit job1.total.urls=10\n" +def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies): + url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port) + url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port) + + # archive url1 bucket_a + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_a"})} + response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'k!' + assert response.content == b'I am the warcprox test payload! 
llllllllll!\n' + + # wait for writer thread to process + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + + # check url1 in dedup db bucket_a + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_a") + assert dedup_lookup['url'] == url1.encode('ascii') + assert re.match(br'^$', dedup_lookup['id']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date']) + record_id = dedup_lookup['id'] + dedup_date = dedup_lookup['date'] + + # check url1 not in dedup db bucket_b + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") + assert dedup_lookup is None + + # archive url2 bucket_b + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_b"})} + response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'k!' + assert response.content == b'I am the warcprox test payload! llllllllll!\n' + + # wait for writer thread to process + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + + # check url2 in dedup db bucket_b + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") + assert dedup_lookup['url'] == url2.encode('ascii') + assert re.match(br'^$', dedup_lookup['id']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date']) + record_id = dedup_lookup['id'] + dedup_date = dedup_lookup['date'] + + # archive url2 bucket_a + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_a"})} + response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'k!' + assert response.content == b'I am the warcprox test payload! llllllllll!\n' + + # archive url1 bucket_b + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_b"})} + response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'k!' + assert response.content == b'I am the warcprox test payload! llllllllll!\n' + + # wait for writer thread to process + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + + # close the warc + assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"] + writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"] + warc_path = os.path.join(writer.directory, writer._f_finalname) + warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer() + assert os.path.exists(warc_path) + + # read the warc + fh = warctools.ArchiveRecord.open_archive(warc_path) + record_iter = fh.read_records(limit=None, offsets=True) + try: + (offset, record, errors) = next(record_iter) + assert record.type == b'warcinfo' + + # url1 bucket_a + (offset, record, errors) = next(record_iter) + assert record.type == b'response' + assert record.url == url1.encode('ascii') + assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! 
llllllllll!\n' + (offset, record, errors) = next(record_iter) + assert record.type == b'request' + + # url2 bucket_b + (offset, record, errors) = next(record_iter) + assert record.type == b'response' + assert record.url == url2.encode('ascii') + assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n' + (offset, record, errors) = next(record_iter) + assert record.type == b'request' + + # url2 bucket_a (revisit) + (offset, record, errors) = next(record_iter) + assert record.type == b'revisit' + assert record.url == url2.encode('ascii') + assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n' + (offset, record, errors) = next(record_iter) + assert record.type == b'request' + + # url1 bucket_b (revisit) + (offset, record, errors) = next(record_iter) + assert record.type == b'revisit' + assert record.url == url1.encode('ascii') + assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n' + (offset, record, errors) = next(record_iter) + assert record.type == b'request' + + # that's all folks + assert next(record_iter)[1] == None + assert next(record_iter, None) == None + + finally: + fh.close() + + if __name__ == '__main__': pytest.main() diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 84702dc..1b56e4b 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -349,13 +349,11 @@ class RecordedUrl: class SingleThreadedWarcProxy(http_server.HTTPServer): logger = logging.getLogger("warcprox.warcproxy.WarcProxy") - def __init__(self, server_address=('localhost', 8000), - req_handler_class=WarcProxyHandler, bind_and_activate=True, - ca=None, recorded_url_q=None, digest_algorithm='sha1', - stats_db=None, options=warcprox.Options()): - http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate) + def __init__(self, ca=None, recorded_url_q=None, stats_db=None, options=warcprox.Options()): + server_address = (options.address or 'localhost', options.port if options.port is not None else 8000) + http_server.HTTPServer.__init__(self, server_address, WarcProxyHandler, bind_and_activate=True) - self.digest_algorithm = digest_algorithm + self.digest_algorithm = options.digest_algorithm or 'sha1' if ca is not None: self.ca = ca diff --git a/warcprox/writer.py b/warcprox/writer.py index 6f58809..21ae23f 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -13,25 +13,22 @@ import string import random class WarcWriter: - logger = logging.getLogger("warcprox.writer.WarcWriter") + logger = logging.getLogger('warcprox.writer.WarcWriter') - # port is only used for warc filename - def __init__(self, prefix='WARCPROX', directory='./warcs', - rollover_size=1000000000, gzip=False, port=0, - digest_algorithm='sha1', base32=False, rollover_idle_time=None, - options=warcprox.Options()): + def __init__(self, options=warcprox.Options()): - self.rollover_size = rollover_size - self.rollover_idle_time = rollover_idle_time + self.rollover_size = options.rollover_size or 1000000000 + self.rollover_idle_time = options.rollover_idle_time or None self._last_activity = time.time() - self.gzip = gzip + self.gzip = options.gzip or False + digest_algorithm = options.digest_algorithm or 'sha1' + base32 = options.base32 self.record_builder = warcprox.warc.WarcRecordBuilder(digest_algorithm=digest_algorithm, 
base32=base32) # warc path and filename stuff - self.directory = directory - self.prefix = prefix - self.port = port + self.directory = options.directory or './warcs' + self.prefix = options.prefix or 'warcprox' self._f = None self._fpath = None @@ -40,9 +37,9 @@ class WarcWriter: self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8)) - if not os.path.exists(directory): - self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory)) - os.mkdir(directory) + if not os.path.exists(self.directory): + self.logger.info("warc destination directory {} doesn't exist, creating it".format(self.directory)) + os.mkdir(self.directory) def timestamp17(self): now = datetime.utcnow() @@ -115,11 +112,8 @@ class WarcWriter: class WarcWriterPool: logger = logging.getLogger("warcprox.writer.WarcWriterPool") - def __init__(self, default_warc_writer=None, options=warcprox.Options()): - if default_warc_writer: - self.default_warc_writer = default_warc_writer - else: - self.default_warc_writer = WarcWriter(options=options) + def __init__(self, options=warcprox.Options()): + self.default_warc_writer = WarcWriter(options=options) self.warc_writers = {} # {prefix:WarcWriter} self._last_sync = time.time() self.options = options @@ -129,10 +123,11 @@ class WarcWriterPool: w = self.default_warc_writer if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta: # self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url)) - prefix = recorded_url.warcprox_meta["warc-prefix"] - if not prefix in self.warc_writers: - self.warc_writers[prefix] = WarcWriter(prefix=prefix, options=self.options) - w = self.warc_writers[prefix] + options = warcprox.Options(**vars(self.options)) + options.prefix = recorded_url.warcprox_meta["warc-prefix"] + if not options.prefix in self.warc_writers: + self.warc_writers[options.prefix] = WarcWriter(options=options) + w = self.warc_writers[options.prefix] return w def write_records(self, recorded_url): diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index a766f6c..182835f 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -34,18 +34,21 @@ class WarcWriterThread(threading.Thread): self.dedup_db = dedup_db self.listeners = listeners self.options = options + self.idle = None def run(self): try: while not self.stop.is_set(): try: recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) + self.idle = None if self.dedup_db: warcprox.dedup.decorate_with_dedup_info(self.dedup_db, recorded_url, base32=self.options.base32) records = self.writer_pool.write_records(recorded_url) self._final_tasks(recorded_url, records) except queue.Empty: + self.idle = time.time() self.writer_pool.maybe_idle_rollover() self.logger.info('WarcWriterThread shutting down') From 022f6e72157218520955d4aac2f4be1e759ba59d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 27 Aug 2015 23:57:12 +0000 Subject: [PATCH 034/146] wrap rethinkdb operations and retry if appropriate (as best as we can tell) --- warcprox/__init__.py | 36 ++++++++++++++- warcprox/bigtable.py | 64 +++++++++++---------------- warcprox/dedup.py | 52 ++++++++-------------- warcprox/stats.py | 78 ++++++++++++++------------------- warcprox/tests/test_warcprox.py | 15 +++---- 5 files changed, 116 insertions(+), 129 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 994b919..703952e 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,5 +1,3 @@ 
-# vim:set sw=4 et: - from argparse import Namespace as _Namespace def digest_str(hash_obj, base32): @@ -19,6 +17,40 @@ class Options(_Namespace): except AttributeError: return None +class Rethinker: + import logging + logger = logging.getLogger("warcprox.Rethinker") + + def __init__(self, servers=["localhost"], db=None): + self.servers = servers + self.db = db + + # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py + # "Best practices: Managing connections: a connection per request" + def _random_server_connection(self): + import rethinkdb as r + import random + while True: + server = random.choice(self.servers) + try: + try: + host, port = server.split(":") + return r.connect(host=host, port=port) + except ValueError: + return r.connect(host=server) + except Exception as e: + self.logger.error("will keep trying to get a connection after failure connecting to %s", server, exc_info=True) + import time + time.sleep(0.5) + + def run(self, query): + while True: + with self._random_server_connection() as conn: + try: + return query.run(conn, db=self.db) + except (ReqlAvailabilityError, ReqlTimeoutError) as e: + self.logger.error("will retry rethinkdb query/operation %s which failed like so:", exc_info=True) + version_bytes = _read_version_bytes().strip() version_str = version_bytes.decode('utf-8') diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index aecb4ed..3c695db 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -16,52 +16,39 @@ class RethinkCaptures: logger = logging.getLogger("warcprox.dedup.RethinkCaptures") def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3, options=warcprox.Options()): - self.servers = servers - self.db = db + self.r = warcprox.Rethinker(servers, db) self.table = table self.shards = shards self.replicas = replicas self.options = options self._ensure_db_table() - # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py - # "Best practices: Managing connections: a connection per request" - def _random_server_connection(self): - server = random.choice(self.servers) - try: - host, port = server.split(":") - return r.connect(host=host, port=port) - except ValueError: - return r.connect(host=server) - def _ensure_db_table(self): - with self._random_server_connection() as conn: - dbs = r.db_list().run(conn) - if not self.db in dbs: - self.logger.info("creating rethinkdb database %s", repr(self.db)) - r.db_create(self.db).run(conn) - tables = r.db(self.db).table_list().run(conn) - if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.db)) - r.db(self.db).table_create(self.table, shards=self.shards, replicas=self.replicas).run(conn) - r.db(self.db).table(self.table).index_create("abbr_canon_surt_timesamp", [r.row["abbr_canon_surt"], r.row["timestamp"]]).run(conn) - r.db(self.db).table(self.table).index_create("sha1_warc_type", [r.row["sha1base32"], r.row["warc_type"], r.row["bucket"]]).run(conn) + dbs = self.r.run(r.db_list()) + if not self.r.db in dbs: + self.logger.info("creating rethinkdb database %s", repr(self.r.db)) + self.r.run(r.db_create(self.r.db)) + tables = self.r.run(r.table_list()) + if not self.table in tables: + self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) + self.r.run(r.table_create(self.table, shards=self.shards, replicas=self.replicas)) + self.r.run(r.table(self.table).index_create("abbr_canon_surt_timesamp", 
[r.row["abbr_canon_surt"], r.row["timestamp"]])) + self.r.run(r.table(self.table).index_create("sha1_warc_type", [r.row["sha1base32"], r.row["warc_type"], r.row["bucket"]])) def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): if algo != "sha1": raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo)) sha1base32 = base64.b32encode(raw_digest).decode("utf-8") - with self._random_server_connection() as conn: - cursor = r.db(self.db).table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run(conn) - results = list(cursor) - if len(results) > 1: - raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32) - elif len(results) == 1: - result = results[0] - else: - result = None - self.logger.info("returning %s for sha1base32=%s", result, sha1base32) - return result + cursor = self.r.run(r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type")) + results = list(cursor) + if len(results) > 1: + raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32) + elif len(results) == 1: + result = results[0] + else: + result = None + self.logger.info("returning %s for sha1base32=%s", result, sha1base32) + return result def notify(self, recorded_url, records): if not recorded_url.response_recorder: @@ -94,11 +81,10 @@ class RethinkCaptures: "bucket": bucket, } - with self._random_server_connection() as conn: - result = r.db(self.db).table(self.table).insert(entry).run(conn) - if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]: - raise Exception("unexpected result %s saving %s", result, entry) - self.logger.info("big capture table db saved %s", entry) + result = self.r.run(r.table(self.table).insert(entry)) + if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]: + raise Exception("unexpected result %s saving %s", result, entry) + self.logger.info("big capture table db saved %s", entry) class RethinkCapturesDedup: logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 7148773..4eea112 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -88,34 +88,22 @@ class RethinkDedupDb: logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") def __init__(self, servers=["localhost"], db="warcprox", table="dedup", shards=3, replicas=3, options=warcprox.Options()): - self.servers = servers - self.db = db + self.r = warcprox.Rethinker(servers, db) self.table = table self.shards = shards self.replicas = replicas self._ensure_db_table() self.options = options - # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py - # "Best practices: Managing connections: a connection per request" - def _random_server_connection(self): - server = random.choice(self.servers) - try: - host, port = server.split(":") - return r.connect(host=host, port=port) - except ValueError: - return r.connect(host=server) - def _ensure_db_table(self): - with self._random_server_connection() as conn: - dbs = r.db_list().run(conn) - if not self.db in dbs: - self.logger.info("creating rethinkdb database %s", repr(self.db)) - r.db_create(self.db).run(conn) - tables = r.db(self.db).table_list().run(conn) - if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.db)) - r.db(self.db).table_create(self.table, primary_key="key", shards=self.shards, 
replicas=self.replicas).run(conn) + dbs = self.r.run(r.db_list()) + if not self.r.db in dbs: + self.logger.info("creating rethinkdb database %s", repr(self.r.db)) + self.r.run(r.db_create(self.r.db)) + tables = self.r.run(r.table_list()) + if not self.table in tables: + self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) + self.r.run(r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas)) def close(self): pass @@ -130,22 +118,20 @@ class RethinkDedupDb: url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') record = {'key':k,'url':url,'date':date,'id':record_id} - with self._random_server_connection() as conn: - result = r.db(self.db).table(self.table).insert(record,conflict="replace").run(conn) - if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: - raise Exception("unexpected result %s saving %s", result, record) - self.logger.debug('dedup db saved %s:%s', k, record) + result = self.r.run(r.table(self.table).insert(record,conflict="replace")) + if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: + raise Exception("unexpected result %s saving %s", result, record) + self.logger.debug('dedup db saved %s:%s', k, record) def lookup(self, digest_key, bucket=""): k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key k = "{}|{}".format(k, bucket) - with self._random_server_connection() as conn: - result = r.db(self.db).table(self.table).get(k).run(conn) - if result: - for x in result: - result[x] = result[x].encode("utf-8") - self.logger.debug('dedup db lookup of key=%s returning %s', k, result) - return result + result = self.r.run(r.table(self.table).get(k)) + if result: + for x in result: + result[x] = result[x].encode("utf-8") + self.logger.debug('dedup db lookup of key=%s returning %s', k, result) + return result def notify(self, recorded_url, records): if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE diff --git a/warcprox/stats.py b/warcprox/stats.py index d246d69..6f6c04d 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -108,34 +108,22 @@ class RethinkStatsDb: logger = logging.getLogger("warcprox.stats.RethinkStatsDb") def __init__(self, servers=["localhost"], db="warcprox", table="stats", shards=3, replicas=3, options=warcprox.Options()): - self.servers = servers - self.db = db + self.r = warcprox.Rethinker(servers, db) self.table = table self.shards = shards self.replicas = replicas self._ensure_db_table() self.options = options - # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py - # "Best practices: Managing connections: a connection per request" - def _random_server_connection(self): - server = random.choice(self.servers) - try: - host, port = server.split(":") - return r.connect(host=host, port=port) - except ValueError: - return r.connect(host=server) - def _ensure_db_table(self): - with self._random_server_connection() as conn: - dbs = r.db_list().run(conn) - if not self.db in dbs: - self.logger.info("creating rethinkdb database %s", repr(self.db)) - r.db_create(self.db).run(conn) - tables = r.db(self.db).table_list().run(conn) - if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.db)) - 
r.db(self.db).table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run(conn) + dbs = self.r.run(r.db_list()) + if not self.r.db in dbs: + self.logger.info("creating rethinkdb database %s", repr(self.r.db)) + self.r.run(r.db_create(self.r.db)) + tables = self.r.run(r.table_list()) + if not self.table in tables: + self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) + self.r.run(r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas)) def close(self): pass @@ -145,16 +133,15 @@ class RethinkStatsDb: def value(self, bucket0="__all__", bucket1=None, bucket2=None): # XXX use pluck? - with self._random_server_connection() as conn: - bucket0_stats = r.db(self.db).table(self.table).get(bucket0).run(conn) - self.logger.debug('stats db lookup of bucket=%s returned %s', bucket0, bucket0_stats) - if bucket0_stats: - if bucket1: - if bucket2: - return bucket0_stats[bucket1][bucket2] - else: - return bucket0_stats[bucket1] - return bucket0_stats + bucket0_stats = self.r.run(r.table(self.table).get(bucket0)) + self.logger.debug('stats db lookup of bucket=%s returned %s', bucket0, bucket0_stats) + if bucket0_stats: + if bucket1: + if bucket2: + return bucket0_stats[bucket1][bucket2] + else: + return bucket0_stats[bucket1] + return bucket0_stats def tally(self, recorded_url, records): buckets = ["__all__"] @@ -166,24 +153,23 @@ class RethinkStatsDb: else: buckets.append("__unspecified__") - with self._random_server_connection() as conn: - for bucket in buckets: - bucket_stats = self.value(bucket) or _empty_bucket(bucket) + for bucket in buckets: + bucket_stats = self.value(bucket) or _empty_bucket(bucket) - bucket_stats["total"]["urls"] += 1 - bucket_stats["total"]["wire_bytes"] += recorded_url.size + bucket_stats["total"]["urls"] += 1 + bucket_stats["total"]["wire_bytes"] += recorded_url.size - if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT: - bucket_stats["revisit"]["urls"] += 1 - bucket_stats["revisit"]["wire_bytes"] += recorded_url.size - else: - bucket_stats["new"]["urls"] += 1 - bucket_stats["new"]["wire_bytes"] += recorded_url.size + if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT: + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += recorded_url.size - self.logger.debug("saving %s", bucket_stats) - result = r.db(self.db).table(self.table).insert(bucket_stats, conflict="replace").run(conn) - if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: - raise Exception("unexpected result %s saving %s", result, record) + self.logger.debug("saving %s", bucket_stats) + result = self.r.run(r.table(self.table).insert(bucket_stats, conflict="replace")) + if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: + raise Exception("unexpected result %s saving %s", result, record) def notify(self, recorded_url, records): self.tally(recorded_url, records) diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index e588754..0331c7b 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -148,9 +148,8 @@ def captures_db(request, rethinkdb_servers, rethinkdb_big_table): def fin(): if captures_db: logging.info('dropping rethinkdb 
database {}'.format(db)) - with captures_db._random_server_connection() as conn: - result = r.db_drop(db).run(conn) - logging.info("result=%s", result) + result = captures_db.r.run(r.db_drop(db)) + logging.info("result=%s", result) request.addfinalizer(fin) return captures_db @@ -170,9 +169,8 @@ def rethink_dedup_db(request, rethinkdb_servers, captures_db): if rethinkdb_servers: if not captures_db: logging.info('dropping rethinkdb database {}'.format(db)) - with ddb._random_server_connection() as conn: - result = r.db_drop(db).run(conn) - logging.info("result=%s", result) + result = ddb.r.run(r.db_drop(db)) + logging.info("result=%s", result) request.addfinalizer(fin) return ddb @@ -210,9 +208,8 @@ def stats_db(request, rethinkdb_servers): def fin(): if rethinkdb_servers: logging.info('dropping rethinkdb database {}'.format(db)) - with sdb._random_server_connection() as conn: - result = r.db_drop(db).run(conn) - logging.info("result=%s", result) + result = sdb.r.run(r.db_drop(db)) + logging.info("result=%s", result) else: logging.info('deleting file {}'.format(stats_db_file)) os.unlink(stats_db_file) From a9986e4ce3c5c946710da1f2accdb85367f17887 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 28 Aug 2015 00:36:21 +0000 Subject: [PATCH 035/146] fix NameError, quiet logging --- warcprox/__init__.py | 2 +- warcprox/bigtable.py | 8 +++----- warcprox/stats.py | 2 -- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 703952e..5dc9073 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -48,7 +48,7 @@ class Rethinker: with self._random_server_connection() as conn: try: return query.run(conn, db=self.db) - except (ReqlAvailabilityError, ReqlTimeoutError) as e: + except (r.ReqlAvailabilityError, r.ReqlTimeoutError) as e: self.logger.error("will retry rethinkdb query/operation %s which failed like so:", exc_info=True) version_bytes = _read_version_bytes().strip() diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 3c695db..2ca7ba8 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -13,7 +13,7 @@ import surt import os class RethinkCaptures: - logger = logging.getLogger("warcprox.dedup.RethinkCaptures") + logger = logging.getLogger("warcprox.bigtables.RethinkCaptures") def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3, options=warcprox.Options()): self.r = warcprox.Rethinker(servers, db) @@ -47,7 +47,7 @@ class RethinkCaptures: result = results[0] else: result = None - self.logger.info("returning %s for sha1base32=%s", result, sha1base32) + self.logger.debug("returning %s for sha1base32=%s bucket=%s", result, sha1base32, bucket) return result def notify(self, recorded_url, records): @@ -84,7 +84,7 @@ class RethinkCaptures: result = self.r.run(r.table(self.table).insert(entry)) if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]: raise Exception("unexpected result %s saving %s", result, entry) - self.logger.info("big capture table db saved %s", entry) + self.logger.debug("big capture table db saved %s", entry) class RethinkCapturesDedup: logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") @@ -96,7 +96,6 @@ class RethinkCapturesDedup: def lookup(self, digest_key, bucket="__unspecified__"): k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key algo, value_str = k.split(":") - self.logger.info("(algo,value_str)=(%s,%s)", algo, value_str) if self.options.base32: raw_digest = base64.b32decode(value_str, 
casefold=True) else: @@ -104,7 +103,6 @@ class RethinkCapturesDedup: entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket) if entry: dedup_info = {"url":entry["url"].encode("utf-8"), "date":entry["timestamp"].encode("utf-8"), "id":entry["warc_id"].encode("utf-8")} - self.logger.info("returning %s for digest_key=%s", dedup_info, digest_key) return dedup_info else: return None diff --git a/warcprox/stats.py b/warcprox/stats.py index 6f6c04d..852975f 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -1,5 +1,3 @@ -# vim:set sw=4 et: - from __future__ import absolute_import try: From decb985250fb101418061b26204fcbe8e0d4a643 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 1 Sep 2015 00:53:38 +0000 Subject: [PATCH 036/146] add length field to each record in big captures table (size in bytes of compressed warc record) because pywayback needs it --- warcprox/bigtable.py | 10 +++++++++- warcprox/warcproxy.py | 6 +++--- warcprox/writer.py | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 2ca7ba8..ea38cc9 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -63,6 +63,13 @@ class RethinkCaptures: bucket = "__unspecified__" canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False) + + mimetype = recorded_url.content_type + if mimetype: + n = mimetype.find(";") + if n >= 0: + mimetype = mimetype[:n] + entry = { # id only specified for rethinkdb partitioning "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), @@ -75,10 +82,11 @@ class RethinkCaptures: "warc_type": records[0].type.decode("utf-8"), "warc_id": records[0].id.decode("utf-8"), "sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"), - "content_type": recorded_url.content_type, + "content_type": mimetype, "response_code": recorded_url.status, "http_method": recorded_url.method, "bucket": bucket, + "length": records[0].length, } result = self.r.run(r.table(self.table).insert(entry)) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 1b56e4b..9b82ac5 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -147,25 +147,25 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") def _enforce_limits(self, warcprox_meta): - if (warcprox_meta and "limits" in warcprox_meta): + if warcprox_meta and "limits" in warcprox_meta: # self.logger.info("warcprox_meta['limits']=%s", warcprox_meta['limits']) for item in warcprox_meta["limits"].items(): key, limit = item bucket0, bucket1, bucket2 = key.rsplit(".", 2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and value >= limit: - self.logger.info('sending "420 Reached limit" %s=%s', key, limit) body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8") self.send_response(420, "Reached limit") self.send_header("Content-Type", "text/plain;charset=utf-8") self.send_header("Connection", "close") self.send_header("Content-Length", len(body)) - response_meta = {"reached-limit":{key:limit}, "stats":{bucket0: self.server.stats_db.value(bucket0)}} + response_meta = {"reached-limit":{key:limit}, "stats":{bucket0:self.server.stats_db.value(bucket0)}} self.send_header("Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) self.end_headers() if self.command != "HEAD": self.wfile.write(body) self.connection.close() + self.logger.info("%s 
420 %s %s -- reached limit %s=%s", self.client_address[0], self.command, self.url, key, limit) return True return False diff --git a/warcprox/writer.py b/warcprox/writer.py index 21ae23f..4603c0c 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -89,6 +89,7 @@ class WarcWriter: offset = writer.tell() record.write_to(writer, gzip=self.gzip) record.offset = offset + record.length = writer.tell() - offset record.warc_filename = self._f_finalname self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d', record.get_header(warctools.WarcRecord.TYPE), From fee200c72c4ae706636b9af855d27c38908ab6d5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 4 Sep 2015 01:30:16 +0000 Subject: [PATCH 037/146] get rid of silly _decode because we know which fields are bytes and which str --- warcprox/writerthread.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 182835f..d656951 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -58,34 +58,17 @@ class WarcWriterThread(threading.Thread): # closest thing we have to heritrix crawl log at the moment def _log(self, recorded_url, records): - def _decode(x): - if isinstance(x, bytes): - return x.decode("utf-8") - else: - return x - try: payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8") except: payload_digest = "-" - mimetype = _decode(recorded_url.content_type) - if mimetype: - n = mimetype.find(";") - if n >= 0: - mimetype = mimetype[:n] # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format( - _decode(recorded_url.client_ip), - _decode(recorded_url.status), - _decode(recorded_url.method), - _decode(recorded_url.url), - mimetype, - recorded_url.size, - _decode(payload_digest), - _decode(records[0].get_header(warctools.WarcRecord.TYPE)), - _decode(records[0].warc_filename), - records[0].offset)) + recorded_url.client_ip, recorded_url.status, recorded_url.method, + recorded_url.url.decode("utf-8"), recorded_url.mimetype, + recorded_url.size, payload_digest, records[0].type.decode("utf-8"), + records[0].warc_filename, records[0].offset)) def _final_tasks(self, recorded_url, records): if self.listeners: From b30218027e0cc5ad17c52ce839279e4775d70cd7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 4 Sep 2015 01:32:55 +0000 Subject: [PATCH 038/146] get "mimetype" (without ;params) from content-type in one place in RecordedUrl, and also note host and duration (time spent serving request) --- warcprox/bigtable.py | 8 +------- warcprox/warc.py | 2 +- warcprox/warcproxy.py | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index ea38cc9..a799d78 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -64,12 +64,6 @@ class RethinkCaptures: canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False) - mimetype = recorded_url.content_type - if mimetype: - n = mimetype.find(";") - if n >= 0: - mimetype = mimetype[:n] - entry = { # id only specified for rethinkdb partitioning "id": "{} {}".format(canon_surt[:20], 
records[0].id.decode("utf-8")[10:-1]), @@ -82,7 +76,7 @@ class RethinkCaptures: "warc_type": records[0].type.decode("utf-8"), "warc_id": records[0].id.decode("utf-8"), "sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"), - "content_type": mimetype, + "content_type": recorded_url.mimetype, "response_code": recorded_url.status, "http_method": recorded_url.method, "bucket": bucket, diff --git a/warcprox/warc.py b/warcprox/warc.py index 1c535ae..bea4a89 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -64,7 +64,7 @@ class WarcRecordBuilder: principal_record = self.build_warc_record(url=recorded_url.url, warc_date=warc_date, data=recorded_url.request_data, warc_type=recorded_url.custom_type, - content_type=recorded_url.content_type) + content_type=recorded_url.content_type.encode("latin1")) return (principal_record,) def build_warc_record(self, url, warc_date=None, recorder=None, data=None, diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 9b82ac5..ef95d28 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -237,7 +237,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): status=prox_rec_res.status, size=prox_rec_res.recorder.len, client_ip=self.client_address[0], content_type=prox_rec_res.getheader("Content-Type"), - method=self.command, timestamp=timestamp) + method=self.command, timestamp=timestamp, + host=self.hostname, duration=datetime.datetime.utcnow()-timestamp) self.server.recorded_url_q.put(recorded_url) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) @@ -278,7 +279,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): response_recorder=None, remote_ip=b'', warcprox_meta=warcprox_meta, - content_type=self.headers['Content-Type'].encode('latin1'), + content_type=self.headers['Content-Type'], custom_type=warc_type or self.headers['WARC-Type'], status=204, size=len(request_data), client_ip=self.client_address[0], @@ -309,7 +310,7 @@ class RecordedUrl: def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, - timestamp=None): + timestamp=None, host=None, duration=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) 
if type(url) is not bytes: @@ -330,14 +331,24 @@ class RecordedUrl: else: self.warcprox_meta = {} + if isinstance(content_type, bytes): + raise Exception("content_type is not supposed to be bytes!") self.content_type = content_type - self.custom_type = custom_type + self.mimetype = content_type + if self.mimetype: + n = self.mimetype.find(";") + if n >= 0: + self.mimetype = self.mimetype[:n] + + self.custom_type = custom_type self.status = status self.size = size self.client_ip = client_ip self.method = method self.timestamp = timestamp + self.host = host + self.duration = duration def __del__(self): self.logger.debug("finished with %s", self) From d98f03012b99cdd224a0881c4891fcfdc8fbeb4a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 4 Sep 2015 01:33:19 +0000 Subject: [PATCH 039/146] kafka capture feed, for druid --- requirements.txt | 1 + warcprox/__init__.py | 1 + warcprox/kafkafeed.py | 56 +++++++++++++++++++++++++++++++++++++++++++ warcprox/main.py | 14 +++++++---- 4 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 warcprox/kafkafeed.py diff --git a/requirements.txt b/requirements.txt index dcc1f62..a320b31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ certauth>=1.1.0 rethinkdb git+https://github.com/internetarchive/warctools.git git+https://github.com/nlevitt/surt.git@py3 +kafka-python diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 5dc9073..ce56e59 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -64,3 +64,4 @@ import warcprox.warc as warc import warcprox.writerthread as writerthread import warcprox.stats as stats import warcprox.bigtable as bigtable +import warcprox.kafkafeed as kafkafeed diff --git a/warcprox/kafkafeed.py b/warcprox/kafkafeed.py new file mode 100644 index 0000000..65f3f1d --- /dev/null +++ b/warcprox/kafkafeed.py @@ -0,0 +1,56 @@ +import kafka +import datetime +import json +import logging +from hanzo import warctools + +class CaptureFeed: + logger = logging.getLogger('warcprox.kafkafeed.CaptureFeed') + + def __init__(self, broker_list, topic): + self.broker_list = broker_list + self.topic = topic.encode('utf-8') + self._producer = kafka.SimpleProducer(kafka.KafkaClient(broker_list)) + + def notify(self, recorded_url, records): + if records[0].type not in ('revisit', 'response'): + return + + try: + payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode('utf-8') + except: + payload_digest = '-' + + # {"status_code":200,"content_digest":"sha1:3VU56HI3BTMDZBL2TP7SQYXITT7VEAJQ","host":"www.kaosgl.com","via":"http://www.kaosgl.com/sayfa.php?id=4427","account_id":"877","seed":"http://www.kaosgl.com/","warc_filename":"ARCHIVEIT-6003-WEEKLY-JOB171310-20150903100014694-00002.warc.gz","url":"http://www.kaosgl.com/resim/HomofobiKarsitiBulusma/trabzon05.jpg","size":29700,"start_time_plus_duration":"20150903175709637+1049","timestamp":"2015-09-03T17:57:10.707Z","mimetype":"image/jpeg","collection_id":"6003","is_test_crawl":"false","job_name":"6003-20150902172136074","warc_offset":856320200,"thread":6,"hop_path":"RLLLLLE","extra_info":{},"annotations":"duplicate:digest","content_length":29432} + + now = datetime.datetime.utcnow() + d = { + 'timestamp': '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000), + 'size': recorded_url.size, + 'status_code': recorded_url.status, + 'url': recorded_url.url.decode('utf-8'), + 'mimetype': recorded_url.mimetype, + 'content_digest': payload_digest, + 'warc_filename': records[0].warc_filename, + 'warc_offset': 
records[0].offset, + 'host': recorded_url.host, + 'annotations': 'duplicate:digest' if records[0].type == 'revisit' else '', + 'content_length': recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset, + 'start_time_plus_duration': '{:%Y%m%d%H%M%S}{:03d}+{}'.format( + recorded_url.timestamp, recorded_url.timestamp.microsecond//1000, + int(recorded_url.duration.total_seconds() * 1000)), + # 'hop_path': ? # only used for seed redirects, which are n/a to brozzler (?) + # 'via': ? + # 'thread': ? # not needed + } + + # fields expected to be populated here are (for archive-it): + # account_id, collection_id, is_test_crawl, seed, job_name + if recorded_url.warcprox_meta and 'capture-feed-extra-fields' in recorded_url.warcprox_meta: + for (k,v) in recorded_url.warcprox_meta['capture-feed-extra-fields'].items(): + d[k] = v + + msg = json.dumps(d, separators=(',', ':')).encode('utf-8') + self.logger.debug('feeding kafka %s', msg) + self._producer.send_messages(self.topic, msg) + diff --git a/warcprox/main.py b/warcprox/main.py index 397f4db..cd0cbbe 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -68,19 +68,19 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication') group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org') - arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="warcprox", + arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox', help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)') arg_parser.add_argument('--rethinkdb-big-table', dest='rethinkdb_big_table', action='store_true', default=False, help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)') + arg_parser.add_argument('--kafka-broker-list', dest='kafka_broker_list', + default=None, help='kafka broker list for capture feed') + arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', + default=None, help='kafka capture feed topic') arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.version_str)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') - # [--ispartof=warcinfo ispartof] - # [--description=warcinfo description] - # [--operator=warcinfo operator] - # [--httpheader=warcinfo httpheader] return arg_parser @@ -144,6 +144,10 @@ def main(argv=sys.argv): stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options) listeners.append(stats_db) + if args.kafka_broker_list and args.kafka_capture_feed_topic: + kafka_capture_feed = warcprox.kafkafeed.CaptureFeed(args.kafka_broker_list, args.kafka_capture_feed_topic) + listeners.append(kafka_capture_feed) + recorded_url_q = queue.Queue() ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] From 67beec4b8088da713a2a009f1abbd393991ecaa3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 4 Sep 2015 21:00:04 +0000 Subject: [PATCH 040/146] fix handling of rethinkdb exception --- warcprox/__init__.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/warcprox/__init__.py 
b/warcprox/__init__.py
index ce56e59..875bea2 100644
--- a/warcprox/__init__.py
+++ b/warcprox/__init__.py
@@ -44,16 +44,27 @@ class Rethinker:
                 time.sleep(0.5)
 
     def run(self, query):
+        import rethinkdb as r
         while True:
             with self._random_server_connection() as conn:
                 try:
                     return query.run(conn, db=self.db)
                 except (r.ReqlAvailabilityError, r.ReqlTimeoutError) as e:
-                    self.logger.error("will retry rethinkdb query/operation %s which failed like so:", exc_info=True)
+                    self.logger.error("will retry rethinkdb query/operation %s which failed like so:", query, exc_info=True)
 
 version_bytes = _read_version_bytes().strip()
 version_str = version_bytes.decode('utf-8')
 
+def gettid():
+    try:
+        import ctypes
+        libc = ctypes.cdll.LoadLibrary('libc.so.6')
+        SYS_gettid = 186
+        tid = libc.syscall(SYS_gettid)
+        return tid
+    except:
+        logging.warn("gettid failed?")
+
 import warcprox.controller as controller
 import warcprox.playback as playback
 import warcprox.dedup as dedup

From 44792151c995564adc2c11e8acd075ff2a163f02 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 4 Sep 2015 21:00:48 +0000
Subject: [PATCH 041/146] tiny fix to make it work!

---
 warcprox/kafkafeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warcprox/kafkafeed.py b/warcprox/kafkafeed.py
index 65f3f1d..dd8a0a8 100644
--- a/warcprox/kafkafeed.py
+++ b/warcprox/kafkafeed.py
@@ -13,7 +13,7 @@ class CaptureFeed:
         self._producer = kafka.SimpleProducer(kafka.KafkaClient(broker_list))
 
     def notify(self, recorded_url, records):
-        if records[0].type not in ('revisit', 'response'):
+        if records[0].type not in (b'revisit', b'response'):
             return
 
         try:

From 6da3dd50acae88d321ae08a3d509a7ebb621b716 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 4 Sep 2015 21:01:18 +0000
Subject: [PATCH 042/146] include thread pid in thread name (linux-specific, not sure what happens on other systems)

---
 warcprox/writerthread.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py
index d656951..11f3e3f 100644
--- a/warcprox/writerthread.py
+++ b/warcprox/writerthread.py
@@ -38,6 +38,7 @@ class WarcWriterThread(threading.Thread):
 
     def run(self):
         try:
+            self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid()))
             while not self.stop.is_set():
                 try:
                     recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)

From c02c98e369b978bfe8f08095422bab3ce6d995c7 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 7 Sep 2015 00:27:17 +0000
Subject: [PATCH 043/146] make sure warc headers are bytes

---
 warcprox/warcproxy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index ef95d28..b1772ac 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -280,7 +280,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 remote_ip=b'',
                 warcprox_meta=warcprox_meta,
                 content_type=self.headers['Content-Type'],
-                custom_type=warc_type or self.headers['WARC-Type'],
+                custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
                 status=204, size=len(request_data),
                 client_ip=self.client_address[0],
                 method=self.command, timestamp=timestamp)

From 686a297f98307d1f7443e32d9443a3df7f1d41b6 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 7 Sep 2015 06:15:15 +0000
Subject: [PATCH 044/146] fixes to let screenshot records be saved in big capture tables for wayback playback

---
 warcprox/bigtable.py | 16 ++++++++++------
 warcprox/warc.py     |  6 ++++--
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/warcprox/bigtable.py 
b/warcprox/bigtable.py index a799d78..0587cf9 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -11,6 +11,7 @@ import warcprox import base64 import surt import os +import hashlib class RethinkCaptures: logger = logging.getLogger("warcprox.bigtables.RethinkCaptures") @@ -51,11 +52,14 @@ class RethinkCaptures: return result def notify(self, recorded_url, records): - if not recorded_url.response_recorder: - return - - if recorded_url.response_recorder.payload_digest.name != "sha1": - self.logger.warn("digest type is %s but big capture table is indexed by sha1", recorded_url.response_recorder.payload_digest.name) + if recorded_url.response_recorder: + if recorded_url.response_recorder.payload_digest.name == "sha1": + sha1base32 = base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8") + else: + self.logger.warn("digest type is %s but big capture table is indexed by sha1", recorded_url.response_recorder.payload_digest.name) + else: + digest = hashlib.new("sha1", records[0].content[1]) + sha1base32 = base64.b32encode(digest.digest()).decode("utf-8") if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: bucket = recorded_url.warcprox_meta["captures-bucket"] @@ -75,7 +79,7 @@ class RethinkCaptures: "filename": os.path.basename(records[0].warc_filename), "warc_type": records[0].type.decode("utf-8"), "warc_id": records[0].id.decode("utf-8"), - "sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"), + "sha1base32": sha1base32, "content_type": recorded_url.mimetype, "response_code": recorded_url.status, "http_method": recorded_url.method, diff --git a/warcprox/warc.py b/warcprox/warc.py index bea4a89..9391890 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -113,9 +113,11 @@ class WarcRecordBuilder: else: headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1'))) - block_digest = hashlib.new(self.digest_algorithm, data) + digest = hashlib.new(self.digest_algorithm, data) headers.append((warctools.WarcRecord.BLOCK_DIGEST, - warcprox.digest_str(block_digest, self.base32))) + warcprox.digest_str(digest, self.base32))) + headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, + warcprox.digest_str(digest, self.base32))) content_tuple = content_type, data record = warctools.WarcRecord(headers=headers, content=content_tuple) From 12432b23ae1ca9968c23c8699861c008b07fac02 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 Sep 2015 01:14:13 +0000 Subject: [PATCH 045/146] for captures table generate canonical surt with scheme:// --- warcprox/bigtable.py | 4 +++- warcprox/warcproxy.py | 7 ++++--- warcprox/writerthread.py | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 0587cf9..917edab 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -66,12 +66,14 @@ class RethinkCaptures: else: bucket = "__unspecified__" - canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False) + canon_surt = surt.surt(recorded_url.url.decode("utf-8"), + trailing_comma=True, host_massage=False, with_scheme=True) entry = { # id only specified for rethinkdb partitioning "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), "abbr_canon_surt": canon_surt[:150], + "canon_surt": canon_surt, # "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")), "timestamp": records[0].date.decode("utf-8"), "url": recorded_url.url.decode("utf-8"), 
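
For context, a rough sketch of what the with_scheme=True change above does to the keys RethinkCaptures stores (this assumes the canonicalization behavior of the surt py3 fork pinned in requirements.txt; the example URL and output strings are illustrative, not verified):

    import surt

    # hypothetical URL; default canonicalization also lowercases and
    # sorts query args
    canon = surt.surt("http://www.example.com/path?b=2&a=1",
            trailing_comma=True, host_massage=False, with_scheme=True)
    # with with_scheme=True this should look something like
    #   "http://(com,example,www,)/path?a=1&b=2"
    # whereas before this patch the "http://(" prefix was absent
    abbr_canon_surt = canon[:150]   # truncated form kept for the index
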
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b1772ac..01f94cd 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -148,11 +148,12 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): def _enforce_limits(self, warcprox_meta): if warcprox_meta and "limits" in warcprox_meta: - # self.logger.info("warcprox_meta['limits']=%s", warcprox_meta['limits']) for item in warcprox_meta["limits"].items(): key, limit = item bucket0, bucket1, bucket2 = key.rsplit(".", 2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) + # self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s", + # warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize()) if value and value >= limit: body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8") self.send_response(420, "Reached limit") @@ -243,9 +244,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) except socket.timeout as e: - self.logger.warn("%s proxying %s", repr(e), self.url) + self.logger.warn("%s proxying %s %s", repr(e), self.command, self.url) except BaseException as e: - self.logger.error("%s proxying %s", repr(e), self.url, exc_info=True) + self.logger.error("%s proxying %s %s", repr(e), self.command, self.url, exc_info=True) finally: # Let's close off the remote end if prox_rec_res: diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 11f3e3f..8da6c11 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -38,6 +38,7 @@ class WarcWriterThread(threading.Thread): def run(self): try: + # XXX warcprox can shut down with urls to archive left in the queue self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid())) while not self.stop.is_set(): try: From 2e482d67cc4447ead7a53d547273f264c08af0fb Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 Sep 2015 18:18:43 +0000 Subject: [PATCH 046/146] more patience waiting for warc writer thread --- warcprox/tests/test_warcprox.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index 0331c7b..a994810 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -395,6 +395,7 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) assert response.content == b'I am the warcprox test payload! ffffffffff!\n' # wait for writer thread to process + time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) @@ -457,6 +458,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n' # wait for writer thread to process + time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) @@ -486,6 +488,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n' # wait for writer thread to process + time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) @@ -509,6 +512,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, assert response.content == b'I am the warcprox test payload! 
llllllllll!\n' # wait for writer thread to process + time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) @@ -532,6 +536,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, assert response.content == b'I am the warcprox test payload! llllllllll!\n' # wait for writer thread to process + time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) @@ -558,6 +563,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, assert response.content == b'I am the warcprox test payload! llllllllll!\n' # wait for writer thread to process + time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) From f90c3a64032c91cdb1c3326b11786063bba69610 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 Sep 2015 18:19:20 +0000 Subject: [PATCH 047/146] Rethinker class moved to its own pyrethink project --- requirements.txt | 10 +++++++++- warcprox/__init__.py | 38 ++------------------------------------ warcprox/bigtable.py | 7 ++++--- warcprox/dedup.py | 3 ++- warcprox/stats.py | 3 ++- 5 files changed, 19 insertions(+), 42 deletions(-) diff --git a/requirements.txt b/requirements.txt index a320b31..810de6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,13 @@ certauth>=1.1.0 rethinkdb git+https://github.com/internetarchive/warctools.git -git+https://github.com/nlevitt/surt.git@py3 kafka-python + +. +# -e . + +git+https://github.com/nlevitt/surt.git@py3 +# -e /home/nlevitt/workspace/surt + +https://github.com/nlevitt/pyrethink.git +# -e /home/nlevitt/workspace/pyrethink diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 875bea2..e437074 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -17,44 +17,10 @@ class Options(_Namespace): except AttributeError: return None -class Rethinker: - import logging - logger = logging.getLogger("warcprox.Rethinker") - - def __init__(self, servers=["localhost"], db=None): - self.servers = servers - self.db = db - - # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py - # "Best practices: Managing connections: a connection per request" - def _random_server_connection(self): - import rethinkdb as r - import random - while True: - server = random.choice(self.servers) - try: - try: - host, port = server.split(":") - return r.connect(host=host, port=port) - except ValueError: - return r.connect(host=server) - except Exception as e: - self.logger.error("will keep trying to get a connection after failure connecting to %s", server, exc_info=True) - import time - time.sleep(0.5) - - def run(self, query): - import rethinkdb as r - while True: - with self._random_server_connection() as conn: - try: - return query.run(conn, db=self.db) - except (r.ReqlAvailabilityError, r.ReqlTimeoutError) as e: - self.logger.error("will retry rethinkdb query/operation %s which failed like so:", query, exc_info=True) - version_bytes = _read_version_bytes().strip() version_str = version_bytes.decode('utf-8') +# XXX linux-specific def gettid(): try: import ctypes @@ -63,7 +29,7 @@ def gettid(): tid = libc.syscall(SYS_gettid) return tid except: - logging.warn("gettid failed?") + logging.warn("gettid failed?", exc_info=True) import warcprox.controller as controller import warcprox.playback as playback diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 917edab..551fdca 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -12,12 +12,13 @@ import base64 import surt import os import hashlib +import 
pyrethink
 
 class RethinkCaptures:
     logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
 
     def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3, options=warcprox.Options()):
-        self.r = warcprox.Rethinker(servers, db)
+        self.r = pyrethink.Rethinker(servers, db)
         self.table = table
         self.shards = shards
         self.replicas = replicas
@@ -40,8 +41,8 @@ class RethinkCaptures:
         if algo != "sha1":
             raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo))
         sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
-        cursor = self.r.run(r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type"))
-        results = list(cursor)
+        results_iter = self.r.results_iter(r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type"))
+        results = list(results_iter)
         if len(results) > 1:
             raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32)
         elif len(results) == 1:
diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 4eea112..368054a 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -18,6 +18,7 @@ import warcprox
 import rethinkdb
 r = rethinkdb
 import random
+import pyrethink
 
 class DedupDb(object):
     logger = logging.getLogger("warcprox.dedup.DedupDb")
@@ -88,7 +89,7 @@ class RethinkDedupDb:
     logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
 
     def __init__(self, servers=["localhost"], db="warcprox", table="dedup", shards=3, replicas=3, options=warcprox.Options()):
-        self.r = warcprox.Rethinker(servers, db)
+        self.r = pyrethink.Rethinker(servers, db)
         self.table = table
         self.shards = shards
         self.replicas = replicas
diff --git a/warcprox/stats.py b/warcprox/stats.py
index 852975f..3c16833 100644
--- a/warcprox/stats.py
+++ b/warcprox/stats.py
@@ -16,6 +16,7 @@ import rethinkdb
 r = rethinkdb
 import random
 import warcprox
+import pyrethink
 
 def _empty_bucket(bucket):
     return {
@@ -106,7 +107,7 @@ class RethinkStatsDb:
     logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
 
     def __init__(self, servers=["localhost"], db="warcprox", table="stats", shards=3, replicas=3, options=warcprox.Options()):
-        self.r = warcprox.Rethinker(servers, db)
+        self.r = pyrethink.Rethinker(servers, db)
         self.table = table
         self.shards = shards
         self.replicas = replicas

From 3b9345e7d7a5304c2865ed5cda0a0e0e4a5cfea6 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 22 Sep 2015 01:31:24 +0000
Subject: [PATCH 048/146] use nicer rethinkstuff.Rethinker api

---
 requirements.txt                | 12 +++++-------
 warcprox/bigtable.py            | 23 ++++++++++-------------
 warcprox/dedup.py               | 21 ++++++++-------------
 warcprox/main.py                | 10 +++++++---
 warcprox/stats.py               | 19 ++++++++-----------
 warcprox/tests/test_warcprox.py | 18 ++++++++++--------
 6 files changed, 48 insertions(+), 55 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 810de6a..b00387c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,11 @@
 certauth>=1.1.0
-rethinkdb
 git+https://github.com/internetarchive/warctools.git
 kafka-python
-.
-# -e .
-
 git+https://github.com/nlevitt/surt.git@py3
-# -e /home/nlevitt/workspace/surt
+git+https://github.com/nlevitt/rethinkstuff.git
+.
-https://github.com/nlevitt/pyrethink.git
-# -e /home/nlevitt/workspace/pyrethink
+# -e /home/nlevitt/workspace/surt
+# -e /home/nlevitt/workspace/rethinkstuff
+# -e .
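
The gist of the API change in this patch, sketched below (rethinkstuff's interface is inferred from how the diffs use it, not from its documentation; the server names come from the --rethinkdb-servers help text):

    import rethinkstuff

    # one shared wrapper, as main.py constructs it below; each .run() picks
    # a healthy server connection instead of the caller managing one
    r = rethinkstuff.Rethinker(["db0.foo.org", "db1.foo.org:38015"], "warcprox")

    # old style (pyrethink / the short-lived warcprox.Rethinker):
    #     result = self.r.run(rethinkdb.table("dedup").get(key))
    # new style: build the query on the wrapper and finish with .run();
    # "sha1:...|bucket_a" stands in for a real digest_key|bucket key
    result = r.table("dedup").get("sha1:...|bucket_a").run()
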
diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 551fdca..a1620e2 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -4,21 +4,18 @@ from __future__ import absolute_import import logging from hanzo import warctools -import rethinkdb -r = rethinkdb import random import warcprox import base64 import surt import os import hashlib -import pyrethink class RethinkCaptures: logger = logging.getLogger("warcprox.bigtables.RethinkCaptures") - def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3, options=warcprox.Options()): - self.r = pyrethink.Rethinker(servers, db) + def __init__(self, r, table="captures", shards=3, replicas=3, options=warcprox.Options()): + self.r = r self.table = table self.shards = shards self.replicas = replicas @@ -26,22 +23,22 @@ class RethinkCaptures: self._ensure_db_table() def _ensure_db_table(self): - dbs = self.r.run(r.db_list()) + dbs = self.r.db_list().run() if not self.r.db in dbs: self.logger.info("creating rethinkdb database %s", repr(self.r.db)) - self.r.run(r.db_create(self.r.db)) - tables = self.r.run(r.table_list()) + self.r.db_create(self.r.db).run() + tables = self.r.table_list().run() if not self.table in tables: self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) - self.r.run(r.table_create(self.table, shards=self.shards, replicas=self.replicas)) - self.r.run(r.table(self.table).index_create("abbr_canon_surt_timesamp", [r.row["abbr_canon_surt"], r.row["timestamp"]])) - self.r.run(r.table(self.table).index_create("sha1_warc_type", [r.row["sha1base32"], r.row["warc_type"], r.row["bucket"]])) + self.r.table_create(self.table, shards=self.shards, replicas=self.replicas).run() + self.r.table(self.table).index_create("abbr_canon_surt_timesamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run() + self.r.table(self.table).index_create("sha1_warc_type", [self.r.row["sha1base32"], self.r.row["warc_type"], self.r.row["bucket"]]).run() def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): if algo != "sha1": raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo)) sha1base32 = base64.b32encode(raw_digest).decode("utf-8") - results_iter = self.r.results_iter(r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type")) + results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run() results = list(results_iter) if len(results) > 1: raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32) @@ -90,7 +87,7 @@ class RethinkCaptures: "length": records[0].length, } - result = self.r.run(r.table(self.table).insert(entry)) + result = self.r.table(self.table).insert(entry).run() if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]: raise Exception("unexpected result %s saving %s", result, entry) self.logger.debug("big capture table db saved %s", entry) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 368054a..33e93af 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -1,5 +1,3 @@ -# vim:set sw=4 et: - from __future__ import absolute_import try: @@ -15,10 +13,7 @@ import os import json from hanzo import warctools import warcprox -import rethinkdb -r = rethinkdb import random -import pyrethink class DedupDb(object): logger = logging.getLogger("warcprox.dedup.DedupDb") @@ -88,8 +83,8 @@ def decorate_with_dedup_info(dedup_db, recorded_url, 
base32=False): class RethinkDedupDb: logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") - def __init__(self, servers=["localhost"], db="warcprox", table="dedup", shards=3, replicas=3, options=warcprox.Options()): - self.r = pyrethink.Rethinker(servers, db) + def __init__(self, r, table="dedup", shards=3, replicas=3, options=warcprox.Options()): + self.r = r self.table = table self.shards = shards self.replicas = replicas @@ -97,14 +92,14 @@ class RethinkDedupDb: self.options = options def _ensure_db_table(self): - dbs = self.r.run(r.db_list()) + dbs = self.r.db_list().run() if not self.r.db in dbs: self.logger.info("creating rethinkdb database %s", repr(self.r.db)) - self.r.run(r.db_create(self.r.db)) - tables = self.r.run(r.table_list()) + self.r.db_create(self.r.db).run() + tables = self.r.table_list().run() if not self.table in tables: self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) - self.r.run(r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas)) + self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run() def close(self): pass @@ -119,7 +114,7 @@ class RethinkDedupDb: url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') record = {'key':k,'url':url,'date':date,'id':record_id} - result = self.r.run(r.table(self.table).insert(record,conflict="replace")) + result = self.r.table(self.table).insert(record,conflict="replace").run() if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: raise Exception("unexpected result %s saving %s", result, record) self.logger.debug('dedup db saved %s:%s', k, record) @@ -127,7 +122,7 @@ class RethinkDedupDb: def lookup(self, digest_key, bucket=""): k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key k = "{}|{}".format(k, bucket) - result = self.r.run(r.table(self.table).get(k)) + result = self.r.table(self.table).get(k).run() if result: for x in result: result[x] = result[x].encode("utf-8") diff --git a/warcprox/main.py b/warcprox/main.py index cd0cbbe..e647a7e 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -21,6 +21,7 @@ import threading import certauth.certauth import warcprox import re +import rethinkstuff def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser = argparse.ArgumentParser(prog=prog, @@ -120,12 +121,13 @@ def main(argv=sys.argv): listeners = [] if args.rethinkdb_servers: + r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db) if args.rethinkdb_big_table: - captures_db = warcprox.bigtable.RethinkCaptures(args.rethinkdb_servers.split(","), args.rethinkdb_db, options=options) + captures_db = warcprox.bigtable.RethinkCaptures(r, options=options) dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db, options=options) listeners.append(captures_db) else: - dedup_db = warcprox.dedup.RethinkDedupDb(args.rethinkdb_servers.split(","), args.rethinkdb_db, options=options) + dedup_db = warcprox.dedup.RethinkDedupDb(r, options=options) listeners.append(dedup_db) elif args.dedup_db_file in (None, '', '/dev/null'): logging.info('deduplication disabled') @@ -135,7 +137,7 @@ def main(argv=sys.argv): listeners.append(dedup_db) if args.rethinkdb_servers: - stats_db = warcprox.stats.RethinkStatsDb(args.rethinkdb_servers.split(","), args.rethinkdb_db, options=options) + stats_db = 
warcprox.stats.RethinkStatsDb(r, options=options) listeners.append(stats_db) elif args.stats_db_file in (None, '', '/dev/null'): logging.info('statistics tracking disabled') @@ -183,5 +185,7 @@ def main(argv=sys.argv): if __name__ == '__main__': + import gc + gc.set_debug(gc.DEBUG_LEAK) main() diff --git a/warcprox/stats.py b/warcprox/stats.py index 3c16833..61c573d 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -12,11 +12,8 @@ import logging import os import json from hanzo import warctools -import rethinkdb -r = rethinkdb import random import warcprox -import pyrethink def _empty_bucket(bucket): return { @@ -106,8 +103,8 @@ class StatsDb: class RethinkStatsDb: logger = logging.getLogger("warcprox.stats.RethinkStatsDb") - def __init__(self, servers=["localhost"], db="warcprox", table="stats", shards=3, replicas=3, options=warcprox.Options()): - self.r = pyrethink.Rethinker(servers, db) + def __init__(self, r, table="stats", shards=3, replicas=3, options=warcprox.Options()): + self.r = r self.table = table self.shards = shards self.replicas = replicas @@ -115,14 +112,14 @@ class RethinkStatsDb: self.options = options def _ensure_db_table(self): - dbs = self.r.run(r.db_list()) + dbs = self.r.db_list().run() if not self.r.db in dbs: self.logger.info("creating rethinkdb database %s", repr(self.r.db)) - self.r.run(r.db_create(self.r.db)) - tables = self.r.run(r.table_list()) + self.r.db_create(self.r.db).run() + tables = self.r.table_list().run() if not self.table in tables: self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) - self.r.run(r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas)) + self.r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run() def close(self): pass @@ -132,7 +129,7 @@ class RethinkStatsDb: def value(self, bucket0="__all__", bucket1=None, bucket2=None): # XXX use pluck? 
- bucket0_stats = self.r.run(r.table(self.table).get(bucket0)) + bucket0_stats = self.r.table(self.table).get(bucket0).run() self.logger.debug('stats db lookup of bucket=%s returned %s', bucket0, bucket0_stats) if bucket0_stats: if bucket1: @@ -166,7 +163,7 @@ class RethinkStatsDb: bucket_stats["new"]["wire_bytes"] += recorded_url.size self.logger.debug("saving %s", bucket_stats) - result = self.r.run(r.table(self.table).insert(bucket_stats, conflict="replace")) + result = self.r.table(self.table).insert(bucket_stats, conflict="replace").run() if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: raise Exception("unexpected result %s saving %s", result, record) diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py index a994810..badf9ed 100755 --- a/warcprox/tests/test_warcprox.py +++ b/warcprox/tests/test_warcprox.py @@ -15,9 +15,8 @@ import shutil import requests import re import json -import rethinkdb -r = rethinkdb import random +import rethinkstuff from hanzo import warctools try: @@ -143,12 +142,13 @@ def captures_db(request, rethinkdb_servers, rethinkdb_big_table): servers = rethinkdb_servers.split(",") if rethinkdb_big_table: db = 'warcprox_test_captures_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) - captures_db = warcprox.bigtable.RethinkCaptures(servers, db) + r = rethinkstuff.Rethinker(servers, db) + captures_db = warcprox.bigtable.RethinkCaptures(r) def fin(): if captures_db: logging.info('dropping rethinkdb database {}'.format(db)) - result = captures_db.r.run(r.db_drop(db)) + result = captures_db.r.db_drop(db).run() logging.info("result=%s", result) request.addfinalizer(fin) @@ -163,13 +163,14 @@ def rethink_dedup_db(request, rethinkdb_servers, captures_db): else: servers = rethinkdb_servers.split(",") db = 'warcprox_test_dedup_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) - ddb = warcprox.dedup.RethinkDedupDb(servers, db) + r = rethinkstuff.Rethinker(servers, db) + ddb = warcprox.dedup.RethinkDedupDb(r) def fin(): if rethinkdb_servers: if not captures_db: logging.info('dropping rethinkdb database {}'.format(db)) - result = ddb.r.run(r.db_drop(db)) + result = ddb.r.db_drop(db).run() logging.info("result=%s", result) request.addfinalizer(fin) @@ -198,7 +199,8 @@ def stats_db(request, rethinkdb_servers): if rethinkdb_servers: servers = rethinkdb_servers.split(",") db = 'warcprox_test_stats_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) - sdb = warcprox.stats.RethinkStatsDb(servers, db) + r = rethinkstuff.Rethinker(servers, db) + sdb = warcprox.stats.RethinkStatsDb(r) else: f = tempfile.NamedTemporaryFile(prefix='warcprox-test-stats-', suffix='.db', delete=False) f.close() @@ -208,7 +210,7 @@ def stats_db(request, rethinkdb_servers): def fin(): if rethinkdb_servers: logging.info('dropping rethinkdb database {}'.format(db)) - result = sdb.r.run(r.db_drop(db)) + result = sdb.r.db_drop(db).run() logging.info("result=%s", result) else: logging.info('deleting file {}'.format(stats_db_file)) From dd1c7b5f7dd66fa2111c6f55a162e4a6d52cd71b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Sep 2015 01:32:49 +0000 Subject: [PATCH 049/146] don't implement __del__, maybe it can cause mem leaks; bunch of logging to try to detect leaks --- warcprox/controller.py | 42 +++++++++++++++++++++++++++++++++++++++- warcprox/warcproxy.py | 10 ++++------ warcprox/writerthread.py | 4 ++++ 3 files changed, 49 insertions(+), 7 deletions(-) diff 
--git a/warcprox/controller.py b/warcprox/controller.py index e198006..1850857 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -6,6 +6,8 @@ import logging import threading import time import warcprox +import sys +import gc class WarcproxController(object): logger = logging.getLogger("warcprox.controller.WarcproxController") @@ -50,10 +52,48 @@ class WarcproxController(object): self.stop = threading.Event() try: + t = time.time() - 30 while not self.stop.is_set(): time.sleep(0.5) + if time.time() - t > 60: + num_unreachable = gc.collect() + all_objects = gc.get_objects() + total_size = 0 + summary = {} + biggest_objects = [None] * 10 + for obj in all_objects: + size = sys.getsizeof(obj) + total_size += size + if not type(obj) in summary: + summary[type(obj)] = {"count":0,"size":0} + summary[type(obj)]["count"] += 1 + summary[type(obj)]["size"] += size + if size > sys.getsizeof(biggest_objects[-1]): + for i in range(len(biggest_objects)): + if size > sys.getsizeof(biggest_objects[i]): + index = i + break + biggest_objects[index+1:] = biggest_objects[index:-1] + biggest_objects[index] = obj + + self.logger.info("%s objects totaling %s bytes", len(all_objects), total_size) + for item in sorted(summary.items(), key=lambda item: item[1]["size"], reverse=True)[:10]: + self.logger.info("%s bytes in %s instances of %s", item[1]["size"], item[1]["count"], item[0]) + for i in range(len(biggest_objects)): + obj = biggest_objects[i] + try: + value = repr(bytes(obj.getbuffer()[:100])) + except: + try: + value = repr(obj)[:100] + except BaseException as e: + value = "<{} getting value>".format(e) + self.logger.info("#%s (%s) (%s bytes) (%s refs) (id=%s): %s", i+1, type(obj), sys.getsizeof(obj), sys.getrefcount(obj), id(obj), value) + self.logger.info("%s unreachable objects totaling %s bytes", len(gc.garbage), sum(sys.getsizeof(x) for x in gc.garbage)) + + t = time.time() except: - self.logger.critical("fatal exception, shutting down", exc_info=1) + self.logger.critical("fatal exception, shutting down", exc_info=True) pass finally: self.warc_writer_thread.stop.set() diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 01f94cd..b4d541a 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -115,7 +115,6 @@ class ProxyingRecorder(object): else: return 0 - class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1', url=None): @@ -351,11 +350,10 @@ class RecordedUrl: self.host = host self.duration = duration - def __del__(self): - self.logger.debug("finished with %s", self) - if self.response_recorder: - self.response_recorder.tempfile.close() - self.response_recorder = None + # def __del__(self): + # self.logger.debug("finished with %s", self) + # if self.response_recorder: + # del self.response_recorder class SingleThreadedWarcProxy(http_server.HTTPServer): diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 8da6c11..a95a68f 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -49,6 +49,10 @@ class WarcWriterThread(threading.Thread): recorded_url, base32=self.options.base32) records = self.writer_pool.write_records(recorded_url) self._final_tasks(recorded_url, records) + + # try to release resources in a timely fashion + if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: + recorded_url.response_recorder.tempfile.close() except queue.Empty: self.idle = time.time() self.writer_pool.maybe_idle_rollover() 
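A note on the __del__ removal above: under cpython 2 (and 3 before 3.4), the garbage collector refuses to free reference cycles that contain objects defining __del__, because it cannot guess a safe order in which to run the finalizers; such objects pile up in gc.garbage instead. A self-contained demonstration of the failure mode (on python 3.4+, PEP 442 makes the cycle collectable and this prints 0):

import gc

class Leaky(object):
    def __del__(self):
        pass

def make_cycle():
    a = Leaky()
    b = Leaky()
    a.other = b  # a references b
    b.other = a  # b references a, completing the cycle

make_cycle()
gc.collect()
# on python 2 both Leaky instances are stranded here forever
print(len(gc.garbage))

Closing the response recorder's tempfile explicitly in the writer thread, as the writerthread.py hunk above does, releases the resource deterministically instead of relying on a destructor that may never run.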
From 4c380dcc41084e197eaec2a762a448b268176235 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Sep 2015 05:45:05 +0000 Subject: [PATCH 050/146] move tests out of installed package dir --- {warcprox/tests => tests}/__init__.py | 0 {warcprox/tests => tests}/conftest.py | 0 {warcprox/tests => tests}/test_dump-anydbm.py | 3 ++- {warcprox/tests => tests}/test_warcprox.py | 0 4 files changed, 2 insertions(+), 1 deletion(-) rename {warcprox/tests => tests}/__init__.py (100%) rename {warcprox/tests => tests}/conftest.py (100%) rename {warcprox/tests => tests}/test_dump-anydbm.py (97%) rename {warcprox/tests => tests}/test_warcprox.py (100%) diff --git a/warcprox/tests/__init__.py b/tests/__init__.py similarity index 100% rename from warcprox/tests/__init__.py rename to tests/__init__.py diff --git a/warcprox/tests/conftest.py b/tests/conftest.py similarity index 100% rename from warcprox/tests/conftest.py rename to tests/conftest.py diff --git a/warcprox/tests/test_dump-anydbm.py b/tests/test_dump-anydbm.py similarity index 97% rename from warcprox/tests/test_dump-anydbm.py rename to tests/test_dump-anydbm.py index 4cca48d..6bb600d 100644 --- a/warcprox/tests/test_dump-anydbm.py +++ b/tests/test_dump-anydbm.py @@ -6,6 +6,7 @@ import tempfile import subprocess # to access the script from shell import sys import glob +import distutils.spawn # will try as python 3 then default to python 2 modules try: @@ -38,7 +39,7 @@ val1 = 'very first value' val2 = 'second value' py = sys.executable -dump_anydbm_loc = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "bin/dump-anydbm") +dump_anydbm_loc = distutils.spawn.find_executable("dump-anydbm") @pytest.fixture(scope="function") def gdbm_test_db(request): diff --git a/warcprox/tests/test_warcprox.py b/tests/test_warcprox.py similarity index 100% rename from warcprox/tests/test_warcprox.py rename to tests/test_warcprox.py From abc2d28787a3e9ec7e163aea94dc543bbd742005 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Sep 2015 18:35:59 +0000 Subject: [PATCH 051/146] report actual exception, avoid incomprehensible error message "TypeError: NoneType object is not callable" in python2 (apparently due to the fact that BaseHTTPServer.BaseHTTPRequestHandler is an old-style class) --- warcprox/mitmproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 154d30e..3c3f95b 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -133,7 +133,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): try: self._proxy_request() except: - self.logger.error("exception from {}".format(self._proxy_request), exc_info=True) + self.logger.error("exception proxying request", exc_info=True) raise def _proxy_request(self): From 0171cdd01daf11f87908fe33598e7d68e85273b4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Sep 2015 19:26:09 +0000 Subject: [PATCH 052/146] fixes for python 2.7 --- warcprox/stats.py | 29 +++++++++++++++++---------- warcprox/warcproxy.py | 7 ------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/warcprox/stats.py b/warcprox/stats.py index 61c573d..da394a7 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -57,13 +57,19 @@ class StatsDb: pass def value(self, bucket0="__all__", bucket1=None, bucket2=None): - if bucket0 in self.db: - bucket0_stats = json.loads(self.db[bucket0].decode("utf-8")) - if bucket1: - if bucket2: - return bucket0_stats[bucket1][bucket2] + # Gdbm wants str/bytes keys in python2, str/unicode keys
in python3. + # This ugliness deals with keys that arrive as unicode in py2. + b0 = bucket0.encode("utf-8") if bucket0 and not isinstance(bucket0, str) else bucket0 + b1 = bucket1.encode("utf-8") if bucket1 and not isinstance(bucket1, str) else bucket1 + b2 = bucket2.encode("utf-8") if bucket2 and not isinstance(bucket2, str) else bucket2 + + if b0 in self.db: + bucket0_stats = json.loads(self.db[b0].decode("utf-8")) + if b1: + if b2: + return bucket0_stats[b1][b2] else: - return bucket0_stats[bucket1] + return bucket0_stats[b1] else: return bucket0_stats else: @@ -83,10 +89,13 @@ class StatsDb: buckets.append("__unspecified__") for bucket in buckets: - if bucket in self.db: - bucket_stats = json.loads(self.db[bucket].decode("utf-8")) + # Gdbm wants str/bytes keys in python2, str/unicode keys in python3. + # This ugliness deals with keys that arrive as unicode in py2. + b = bucket.encode("utf-8") if bucket and not isinstance(bucket, str) else bucket + if b in self.db: + bucket_stats = json.loads(self.db[b].decode("utf-8")) else: - bucket_stats = _empty_bucket(bucket) + bucket_stats = _empty_bucket(b) bucket_stats["total"]["urls"] += 1 bucket_stats["total"]["wire_bytes"] += recorded_url.size @@ -98,7 +107,7 @@ class StatsDb: bucket_stats["new"]["urls"] += 1 bucket_stats["new"]["wire_bytes"] += recorded_url.size - self.db[bucket] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8") + self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8") class RethinkStatsDb: logger = logging.getLogger("warcprox.stats.RethinkStatsDb") diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b4d541a..4e19d4f 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -331,8 +331,6 @@ class RecordedUrl: else: self.warcprox_meta = {} - if isinstance(content_type, bytes): - raise Exception("content_type is not supposed to be bytes!") self.content_type = content_type self.mimetype = content_type @@ -350,11 +348,6 @@ class RecordedUrl: self.host = host self.duration = duration - # def __del__(self): - # self.logger.debug("finished with %s", self) - # if self.response_recorder: - # del self.response_recorder - class SingleThreadedWarcProxy(http_server.HTTPServer): logger = logging.getLogger("warcprox.warcproxy.WarcProxy") From 69d641cd50f53dfc93ee7baeee513b20c537bf53 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Sep 2015 20:45:09 +0000 Subject: [PATCH 053/146] avoid attempting to create tables with more shards or replicas than the number of servers --- warcprox/bigtable.py | 6 +++--- warcprox/dedup.py | 9 +++++---- warcprox/stats.py | 9 +++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index a1620e2..3c610e4 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -14,11 +14,11 @@ import hashlib class RethinkCaptures: logger = logging.getLogger("warcprox.bigtables.RethinkCaptures") - def __init__(self, r, table="captures", shards=3, replicas=3, options=warcprox.Options()): + def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()): self.r = r self.table = table - self.shards = shards - self.replicas = replicas + self.shards = shards or len(r.servers) + self.replicas = replicas or min(3, len(r.servers)) self.options = options self._ensure_db_table() diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 33e93af..358af05 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -83,11 +83,11 @@ def decorate_with_dedup_info(dedup_db, recorded_url, 
base32=False): class RethinkDedupDb: logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") - def __init__(self, r, table="dedup", shards=3, replicas=3, options=warcprox.Options()): + def __init__(self, r, table="dedup", shards=None, replicas=None, options=warcprox.Options()): self.r = r self.table = table - self.shards = shards - self.replicas = replicas + self.shards = shards or len(r.servers) + self.replicas = replicas or min(3, len(r.servers)) self._ensure_db_table() self.options = options @@ -98,7 +98,8 @@ class RethinkDedupDb: self.r.db_create(self.r.db).run() tables = self.r.table_list().run() if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) + self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", + repr(self.table), repr(self.r.db), self.shards, self.replicas) self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run() def close(self): diff --git a/warcprox/stats.py b/warcprox/stats.py index da394a7..a6e5dbf 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -112,11 +112,11 @@ class StatsDb: class RethinkStatsDb: logger = logging.getLogger("warcprox.stats.RethinkStatsDb") - def __init__(self, r, table="stats", shards=3, replicas=3, options=warcprox.Options()): + def __init__(self, r, table="stats", shards=None, replicas=None, options=warcprox.Options()): self.r = r self.table = table - self.shards = shards - self.replicas = replicas + self.shards = shards or len(r.servers) + self.replicas = replicas or min(3, len(r.servers)) self._ensure_db_table() self.options = options @@ -127,7 +127,8 @@ class RethinkStatsDb: self.r.db_create(self.r.db).run() tables = self.r.table_list().run() if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) + self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", + repr(self.table), repr(self.r.db), self.shards, self.replicas) self.r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run() def close(self): From 28d213fb180cce749c91e2b68fc47969e1efa9d4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Sep 2015 20:57:00 +0000 Subject: [PATCH 054/146] spin up rethinkdb in docker, run tests in there --- tests/Dockerfile | 21 +++++++++++++++++++++ tests/run-tests.sh | 22 ++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tests/Dockerfile create mode 100755 tests/run-tests.sh diff --git a/tests/Dockerfile b/tests/Dockerfile new file mode 100644 index 0000000..f2b040a --- /dev/null +++ b/tests/Dockerfile @@ -0,0 +1,21 @@ +FROM phusion/baseimage +MAINTAINER Noah Levitt + +# see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile + +ENV LANG=C.UTF-8 + +RUN apt-get update && apt-get --auto-remove -y dist-upgrade + +# Add the RethinkDB repository and public key +# "RethinkDB Packaging " http://download.rethinkdb.com/apt/pubkey.gpg +RUN apt-key adv --keyserver pgp.mit.edu --recv-keys 1614552E5765227AEC39EFCFA7E00EF33A8F2399 \ + && echo "deb http://download.rethinkdb.com/apt trusty main" > /etc/apt/sources.list.d/rethinkdb.list \ + && apt-get update && apt-get -y install rethinkdb + +RUN mkdir -vp /etc/service/rethinkdb \ + && echo "#!/bin/sh\nrethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \ + && chmod a+x 
/etc/service/rethinkdb/run + +RUN apt-get -y install python-virtualenv git +RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev diff --git a/tests/run-tests.sh b/tests/run-tests.sh new file mode 100755 index 0000000..50deaec --- /dev/null +++ b/tests/run-tests.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +docker build -t internetarchive/rethinkdb $script_dir || exit 1 + +uid=$(id -u) +user=$(id -un) + +for python in python2.7 python3.4 +do + docker run --rm -i -t --volume="$script_dir/..:/warcprox" internetarchive/rethinkdb /sbin/my_init -- \ + bash -x -c "adduser --gecos=$user --disabled-password --quiet --uid=$uid $user \ + && sudo -u $user bash -x -c 'cd /warcprox \ + && virtualenv -p $python /tmp/venv \ + && source /tmp/venv/bin/activate \ + && pip --log-file /tmp/pip.log install -r requirements.txt . pytest requests \ + && py.test tests \ + && py.test --rethinkdb-servers=localhost tests \ + && py.test --rethinkdb-servers=localhost --rethinkdb-big-table tests'" +done + From f806cd3e4a3ad3d2d6a324986299bc7626e9fd14 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 23 Sep 2015 00:32:39 +0000 Subject: [PATCH 055/146] use Rethinker.dbname to avoid conflict with rethinkdb.db --- warcprox/bigtable.py | 8 ++++---- warcprox/dedup.py | 8 ++++---- warcprox/stats.py | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 3c610e4..ec75360 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -24,12 +24,12 @@ class RethinkCaptures: def _ensure_db_table(self): dbs = self.r.db_list().run() - if not self.r.db in dbs: - self.logger.info("creating rethinkdb database %s", repr(self.r.db)) - self.r.db_create(self.r.db).run() + if not self.r.dbname in dbs: + self.logger.info("creating rethinkdb database %s", repr(self.r.dbname)) + self.r.db_create(self.r.dbname).run() tables = self.r.table_list().run() if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.db)) + self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.dbname)) self.r.table_create(self.table, shards=self.shards, replicas=self.replicas).run() self.r.table(self.table).index_create("abbr_canon_surt_timesamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run() self.r.table(self.table).index_create("sha1_warc_type", [self.r.row["sha1base32"], self.r.row["warc_type"], self.r.row["bucket"]]).run() diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 358af05..942ee9f 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -93,13 +93,13 @@ class RethinkDedupDb: def _ensure_db_table(self): dbs = self.r.db_list().run() - if not self.r.db in dbs: - self.logger.info("creating rethinkdb database %s", repr(self.r.db)) - self.r.db_create(self.r.db).run() + if not self.r.dbname in dbs: + self.logger.info("creating rethinkdb database %s", repr(self.r.dbname)) + self.r.db_create(self.r.dbname).run() tables = self.r.table_list().run() if not self.table in tables: self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", - repr(self.table), repr(self.r.db), self.shards, self.replicas) + repr(self.table), repr(self.r.dbname), self.shards, self.replicas) self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run() def close(self): diff --git a/warcprox/stats.py b/warcprox/stats.py 
index a6e5dbf..b27c021 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -122,13 +122,13 @@ class RethinkStatsDb: def _ensure_db_table(self): dbs = self.r.db_list().run() - if not self.r.db in dbs: - self.logger.info("creating rethinkdb database %s", repr(self.r.db)) - self.r.db_create(self.r.db).run() + if not self.r.dbname in dbs: + self.logger.info("creating rethinkdb database %s", repr(self.r.dbname)) + self.r.db_create(self.r.dbname).run() tables = self.r.table_list().run() if not self.table in tables: self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", - repr(self.table), repr(self.r.db), self.shards, self.replicas) + repr(self.table), repr(self.r.dbname), self.shards, self.replicas) self.r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run() def close(self): From 97a30eb319ba5d61795b85eef6b7c6cbfd243041 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 23 Sep 2015 23:06:33 +0000 Subject: [PATCH 056/146] back to setup.py now that we have devpi --- requirements.txt | 11 ----------- setup.py | 16 +++++++++------- 2 files changed, 9 insertions(+), 18 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index b00387c..0000000 --- a/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -certauth>=1.1.0 -git+https://github.com/internetarchive/warctools.git -kafka-python - -git+https://github.com/nlevitt/surt.git@py3 -git+https://github.com/nlevitt/rethinkstuff.git -. - -# -e /home/nlevitt/workspace/surt -# -e /home/nlevitt/workspace/rethinkstuff -# -e . diff --git a/setup.py b/setup.py index ab42452..75138b4 100755 --- a/setup.py +++ b/setup.py @@ -10,13 +10,8 @@ VERSION_BYTES = b'1.5' def full_version_bytes(): import subprocess, time try: - commit_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%h']) - - t_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%ct']) - t = int(t_bytes.strip().decode('utf-8')) - tm = time.gmtime(t) - timestamp_utc = time.strftime("%Y%m%d%H%M%S", time.gmtime(t)) - return VERSION_BYTES + b'-' + timestamp_utc.encode('utf-8') + b'-' + commit_bytes.strip() + commit_num_bytes = subprocess.check_output(['git', 'rev-list', '--count', 'HEAD']).strip() + return VERSION_BYTES + b'.' 
+ commit_num_bytes except subprocess.CalledProcessError: return VERSION_BYTES @@ -47,6 +42,13 @@ setuptools.setup(name='warcprox', license='GPL', packages=['warcprox'], package_data={'warcprox':['version.txt']}, + install_requires=[ + 'certauth>=1.1.0', + 'warctools', + 'kafka-python', + 'surt', + 'rethinkstuff', + ], tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636 cmdclass = {'test': PyTest}, test_suite='warcprox.tests', From a41c426b0a7221653785b327a5dce0d5e1e94e8c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 24 Sep 2015 00:19:32 +0000 Subject: [PATCH 057/146] giving up on using git revision in version number :( latest issue is when installing a package that calls git to compute a version number, but cwd is some other git project, you get the wrong thing --- setup.py | 18 +----------------- tests/Dockerfile | 1 + tests/run-tests.sh | 5 +++-- warcprox/__init__.py | 11 ++--------- warcprox/main.py | 2 +- warcprox/warc.py | 4 +--- 6 files changed, 9 insertions(+), 32 deletions(-) diff --git a/setup.py b/setup.py index 75138b4..ed8bbb2 100755 --- a/setup.py +++ b/setup.py @@ -5,21 +5,6 @@ from setuptools.command.test import test as TestCommand import sys import setuptools -VERSION_BYTES = b'1.5' - -def full_version_bytes(): - import subprocess, time - try: - commit_num_bytes = subprocess.check_output(['git', 'rev-list', '--count', 'HEAD']).strip() - return VERSION_BYTES + b'.' + commit_num_bytes - except subprocess.CalledProcessError: - return VERSION_BYTES - -version_bytes = full_version_bytes() -with open('warcprox/version.txt', 'wb') as out: - out.write(version_bytes) - out.write(b'\n'); - # special class needs to be added to support the pytest written dump-anydbm tests class PyTest(TestCommand): def finalize_options(self): @@ -33,7 +18,7 @@ class PyTest(TestCommand): sys.exit(errno) setuptools.setup(name='warcprox', - version=version_bytes.decode('utf-8'), + version='1.5.0', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', @@ -41,7 +26,6 @@ setuptools.setup(name='warcprox', long_description=open('README.rst').read(), license='GPL', packages=['warcprox'], - package_data={'warcprox':['version.txt']}, install_requires=[ 'certauth>=1.1.0', 'warctools', diff --git a/tests/Dockerfile b/tests/Dockerfile index f2b040a..4a8b5b8 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -19,3 +19,4 @@ RUN mkdir -vp /etc/service/rethinkdb \ RUN apt-get -y install python-virtualenv git RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev +RUN pip install devpi-client diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 50deaec..c3f5f07 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -10,11 +10,12 @@ user=$(id -un) for python in python2.7 python3.4 do docker run --rm -i -t --volume="$script_dir/..:/warcprox" internetarchive/rethinkdb /sbin/my_init -- \ - bash -x -c "adduser --gecos=$user --disabled-password --quiet --uid=$uid $user \ + bash -x -c " adduser --gecos=$user --disabled-password --quiet --uid=$uid $user \ && sudo -u $user bash -x -c 'cd /warcprox \ + && devpi use --set-cfg http://crawl342.us.archive.org:9000/nlevitt/dev \ && virtualenv -p $python /tmp/venv \ && source /tmp/venv/bin/activate \ - && pip --log-file /tmp/pip.log install -r requirements.txt . pytest requests \ + && pip --log-file /tmp/pip.log install . 
pytest requests \ && py.test tests \ && py.test --rethinkdb-servers=localhost tests \ && py.test --rethinkdb-servers=localhost --rethinkdb-big-table tests'" diff --git a/warcprox/__init__.py b/warcprox/__init__.py index e437074..4c22670 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,15 +1,11 @@ from argparse import Namespace as _Namespace +from pkg_resources import get_distribution as _get_distribution +__version__ = _get_distribution('warcprox').version def digest_str(hash_obj, base32): import base64 return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if base32 else hash_obj.hexdigest().encode('ascii')) -def _read_version_bytes(): - import os - version_txt = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['version.txt']) - with open(version_txt, 'rb') as fin: - return fin.read().strip() - class Options(_Namespace): def __getattr__(self, name): try: @@ -17,9 +13,6 @@ class Options(_Namespace): except AttributeError: return None -version_bytes = _read_version_bytes().strip() -version_str = version_bytes.decode('utf-8') - # XXX linux-specific def gettid(): try: diff --git a/warcprox/main.py b/warcprox/main.py index e647a7e..01acf1f 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -79,7 +79,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', default=None, help='kafka capture feed topic') arg_parser.add_argument('--version', action='version', - version="warcprox {}".format(warcprox.version_str)) + version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') diff --git a/warcprox/warc.py b/warcprox/warc.py index 9391890..eed045c 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -1,5 +1,3 @@ -# vim:set sw=4 et: - from __future__ import absolute_import import logging @@ -135,7 +133,7 @@ class WarcRecordBuilder: headers.append((warctools.WarcRecord.DATE, warc_record_date)) warcinfo_fields = [] - warcinfo_fields.append(b'software: warcprox ' + warcprox.version_bytes) + warcinfo_fields.append(b'software: warcprox ' + warcprox.__version__.encode('latin1')) hostname = socket.gethostname() warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1')) warcinfo_fields.append('ip: {}'.format(socket.gethostbyname(hostname)).encode('latin1')) From 6b3cd9de2ed84fb74f2b1351727837d41d888c91 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 20:59:05 +0000 Subject: [PATCH 058/146] make note of extra packages needed on ubuntu --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index ff6a6b0..c1a0969 100644 --- a/README.rst +++ b/README.rst @@ -17,8 +17,10 @@ Warcprox runs on python 3.4. 
To install latest release run: + :: + # apt-get install libffi-dev libssl-dev python3-gdbm pip install warcprox You can also install the latest bleeding edge code: From 95e611a5d02924cbfea92f34918d3513e617a9b7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 21:02:42 +0000 Subject: [PATCH 059/146] update stats in RethinkDb asynchronously, since profiling shows this to be a bottleneck in WarcWriterThread (which in turn makes it a bottleneck for the whole app) --- setup.py | 14 +++++----- tests/test_warcprox.py | 31 ++++++++++++++++----- warcprox/stats.py | 61 ++++++++++++++++++++++++++++-------------- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/setup.py b/setup.py index ed8bbb2..a26166b 100755 --- a/setup.py +++ b/setup.py @@ -17,6 +17,12 @@ class PyTest(TestCommand): errno = pytest.main(self.test_args) sys.exit(errno) +deps = ['certauth>=1.1.0', 'warctools', 'kafka-python', 'surt', 'rethinkstuff'] +try: + import concurrent.futures +except: + deps.append('futures') + setuptools.setup(name='warcprox', version='1.5.0', description='WARC writing MITM HTTP/S proxy', @@ -26,13 +32,7 @@ setuptools.setup(name='warcprox', long_description=open('README.rst').read(), license='GPL', packages=['warcprox'], - install_requires=[ - 'certauth>=1.1.0', - 'warctools', - 'kafka-python', - 'surt', - 'rethinkstuff', - ], + install_requires=deps, tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636 cmdclass = {'test': PyTest}, test_suite='warcprox.tests', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index badf9ed..1f6c2fd 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -398,8 +398,12 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies) # wait for writer thread to process time.sleep(0.5) - while not warcprox_.warc_writer_thread.idle: + while (not warcprox_.warc_writer_thread.idle + or (warcprox_.proxy.stats_db + and hasattr(warcprox_.proxy.stats_db, "_executor") + and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)): time.sleep(0.5) + time.sleep(0.5) # check in dedup db (no change from prev) dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') @@ -461,8 +465,13 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie # wait for writer thread to process time.sleep(0.5) - while not warcprox_.warc_writer_thread.idle: + while (not warcprox_.warc_writer_thread.idle + or (warcprox_.proxy.stats_db + and hasattr(warcprox_.proxy.stats_db, "_executor") + and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)): time.sleep(0.5) + time.sleep(0.5) + # check in dedup db (no change from prev) dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') @@ -491,8 +500,12 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): # wait for writer thread to process time.sleep(0.5) - while not warcprox_.warc_writer_thread.idle: + while (not warcprox_.warc_writer_thread.idle + or (warcprox_.proxy.stats_db + and hasattr(warcprox_.proxy.stats_db, "_executor") + and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)): time.sleep(0.5) + time.sleep(0.5) response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 @@ -515,8 +528,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, # wait for writer thread to process 
time.sleep(0.5) - while not warcprox_.warc_writer_thread.idle: + while (not warcprox_.warc_writer_thread.idle + or (warcprox_.proxy.stats_db + and hasattr(warcprox_.proxy.stats_db, "_executor") + and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)): time.sleep(0.5) + time.sleep(0.5) # check url1 in dedup db bucket_a dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_a") @@ -541,6 +558,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) + time.sleep(0.5) # check url2 in dedup db bucket_b dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") @@ -568,6 +586,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) + time.sleep(0.5) # close the warc assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"] @@ -575,14 +594,14 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, warc_path = os.path.join(writer.directory, writer._f_finalname) warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer() assert os.path.exists(warc_path) - + # read the warc fh = warctools.ArchiveRecord.open_archive(warc_path) record_iter = fh.read_records(limit=None, offsets=True) try: (offset, record, errors) = next(record_iter) assert record.type == b'warcinfo' - + # url1 bucket_a (offset, record, errors) = next(record_iter) assert record.type == b'response' diff --git a/warcprox/stats.py b/warcprox/stats.py index b27c021..9be1b54 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -14,6 +14,7 @@ import json from hanzo import warctools import random import warcprox +import concurrent.futures def _empty_bucket(bucket): return { @@ -95,7 +96,7 @@ class StatsDb: if b in self.db: bucket_stats = json.loads(self.db[b].decode("utf-8")) else: - bucket_stats = _empty_bucket(b) + bucket_stats = _empty_bucket(b) bucket_stats["total"]["urls"] += 1 bucket_stats["total"]["wire_bytes"] += recorded_url.size @@ -119,6 +120,7 @@ class RethinkStatsDb: self.replicas = replicas or min(3, len(r.servers)) self._ensure_db_table() self.options = options + self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=10) def _ensure_db_table(self): dbs = self.r.db_list().run() @@ -127,12 +129,15 @@ class RethinkStatsDb: self.r.db_create(self.r.dbname).run() tables = self.r.table_list().run() if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", + self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", repr(self.table), repr(self.r.dbname), self.shards, self.replicas) self.r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run() def close(self): - pass + self.logger.info("waiting for ~%s tasks to finish", + self._executor._work_queue.qsize() + (self._executor._max_workers/2)) + self._executor.shutdown(wait=True) + self.logger.info("shut down complete") def sync(self): pass @@ -149,7 +154,32 @@ class RethinkStatsDb: return bucket0_stats[bucket1] return bucket0_stats - def tally(self, recorded_url, records): + def _tally(self, buckets, size, is_revisit): + try: + self.logger.info("starting task self._tally(%s)", (buckets, size, is_revisit)) + for 
bucket in buckets: + bucket_stats = self.value(bucket) or _empty_bucket(bucket) + + bucket_stats["total"]["urls"] += 1 + bucket_stats["total"]["wire_bytes"] += size + + if is_revisit: + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += size + + self.logger.debug("saving %s", bucket_stats) + result = self.r.table(self.table).insert(bucket_stats, conflict="replace").run() + if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: + raise Exception("unexpected result %s saving %s", result, bucket_stats) + + self.logger.info("finished task self._tally(%s)", (buckets, size, is_revisit)) + except: + self.logger.error("unexpected problem tallying stats", exc_info=True) + + def _extract_stats_info(self, recorded_url, records): buckets = ["__all__"] if (recorded_url.warcprox_meta @@ -159,24 +189,15 @@ class RethinkStatsDb: else: buckets.append("__unspecified__") - for bucket in buckets: - bucket_stats = self.value(bucket) or _empty_bucket(bucket) + is_revisit = records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT - bucket_stats["total"]["urls"] += 1 - bucket_stats["total"]["wire_bytes"] += recorded_url.size + return buckets, recorded_url.size, is_revisit - if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT: - bucket_stats["revisit"]["urls"] += 1 - bucket_stats["revisit"]["wire_bytes"] += recorded_url.size - else: - bucket_stats["new"]["urls"] += 1 - bucket_stats["new"]["wire_bytes"] += recorded_url.size - - self.logger.debug("saving %s", bucket_stats) - result = self.r.table(self.table).insert(bucket_stats, conflict="replace").run() - if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: - raise Exception("unexpected result %s saving %s", result, record) + def tally(self, recorded_url, records): + self._tally(*self._extract_stats_info(recorded_url, records)) def notify(self, recorded_url, records): - self.tally(recorded_url, records) + args = self._extract_stats_info(recorded_url, records) + self.logger.info("submitting task self._tally(%s)", args) + self._executor.submit(self._tally, *args) From 2169369dab11415f9910c180b9b4d13a29fe9979 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 21:34:34 +0000 Subject: [PATCH 060/146] working on benchmarking code... so far the benchmarks seem to reveal that warcprox behaves poorly under load (perhaps timeouts are configured too short?)
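Worth spelling out before the diff: the benchmark below runs warcprox with queue_size=1 (a --queue-size option added to main.py in this patch), so each proxied request blocks until the warc writer has drained the previous one, approximating synchronous operation and therefore sustained maximum throughput. A toy illustration of that backpressure effect with a bounded queue, standalone code with made-up timings:

import queue
import threading
import time

q = queue.Queue(maxsize=1)  # cf. warcprox's recorded_url_q with --queue-size=1

def consumer():
    while True:
        item = q.get()
        if item is None:
            return
        time.sleep(0.1)  # pretend to write a warc record

t = threading.Thread(target=consumer)
t.start()
start = time.time()
for i in range(5):
    q.put(i)  # blocks until the consumer has taken the previous item
q.put(None)
t.join()
# elapsed is roughly 0.5s: the producer is paced to the consumer's rate
print("%.2fs elapsed" % (time.time() - start))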
--- benchmarks/requirements.txt | 1 + benchmarks/run-benchmarks.py | 152 +++++++++++++++++++++++++++++++++++ warcprox/main.py | 45 ++++++----- 3 files changed, 180 insertions(+), 18 deletions(-) create mode 100644 benchmarks/requirements.txt create mode 100755 benchmarks/run-benchmarks.py diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt new file mode 100644 index 0000000..ee4ba4f --- /dev/null +++ b/benchmarks/requirements.txt @@ -0,0 +1 @@ +aiohttp diff --git a/benchmarks/run-benchmarks.py b/benchmarks/run-benchmarks.py new file mode 100755 index 0000000..2f1414b --- /dev/null +++ b/benchmarks/run-benchmarks.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python + +import sys +import aiohttp +import aiohttp.server +import asyncio +import ssl +import tempfile +import OpenSSL.crypto +import OpenSSL.SSL +import random +import os +import threading +import time +import logging +import warcprox.main + +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') + +def self_signed_cert(): + key = OpenSSL.crypto.PKey() + key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) + + cert = OpenSSL.crypto.X509() + cert.set_serial_number(random.randint(0, 2 ** 64 - 1)) + cert.get_subject().CN = 'localhost' + + cert.set_version(2) + cert.gmtime_adj_notBefore(0) + cert.gmtime_adj_notAfter(10 * 365 * 24 * 60 * 60) + + cert.set_issuer(cert.get_subject()) + cert.set_pubkey(key) + cert.sign(key, "sha1") + + return key, cert + +class HttpRequestHandler(aiohttp.server.ServerHttpProtocol): + @asyncio.coroutine + def handle_request(self, message, payload): + response = aiohttp.Response( + self.writer, 200, http_version=message.version + ) + n = int(message.path.partition('/')[2]) + response.add_header('Content-Type', 'text/plain') + # response.add_header('Content-Length', '18') + response.send_headers() + for i in range(n): + response.write(b'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n') + yield from response.write_eof() + +def run_servers(): + loop.run_forever() + +def start_servers(): + loop = asyncio.get_event_loop() + http = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8080') + sslcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + key, cert = self_signed_cert() + with tempfile.NamedTemporaryFile(delete=False) as certfile: + certfile.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key)) + certfile.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert)) + sslcontext.load_cert_chain(certfile.name) + os.remove(certfile.name) + https = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8443', ssl=sslcontext) + srv = loop.run_until_complete(http) + srv = loop.run_until_complete(https) + logging.info('serving on http://127.0.0.1:8080 and https://127.0.0.1:8443') + +class AsyncClient(object): + def __init__(self, proxy=None): + self.n_urls = 0 + self.n_bytes = 0 + self.proxy = proxy + if proxy: + self.connector = aiohttp.connector.ProxyConnector(proxy, verify_ssl=False) + else: + self.connector = aiohttp.connector.TCPConnector(verify_ssl=False) + + @asyncio.coroutine + def read_response(self, r, url): + # time.sleep(random.random() * 10) + while True: + chunk = yield from r.content.read(2**16) + self.n_bytes += len(chunk) + if not chunk: + self.n_urls += 1 + logging.info("finished reading from %s", url) + r.close() + break + + 
@asyncio.coroutine + def one_request(self, url): + logging.info("issuing request to %s", url) + r = yield from aiohttp.get(url, connector=self.connector) + logging.info("issued request to %s", url) + yield from self.read_response(r, url) + +def benchmark(client): + try: + start = time.time() + tasks_https = [client.one_request('https://localhost:8443/%s' % int(1.1**i)) for i in range(120)] + asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_https)) + tasks_http = [client.one_request('http://localhost:8080/%s' % int(1.1**i)) for i in range(120)] + asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_http)) + finally: + finish = time.time() + logging.info("proxy=%s: %s urls totaling %s bytes in %s seconds", client.proxy, client.n_urls, client.n_bytes, (finish - start)) + +if __name__ == '__main__': + args = warcprox.main.parse_args() + + start_servers() + + baseline_client = AsyncClient() + logging.info("===== baseline benchmark starting (no proxy) =====") + benchmark(baseline_client) + logging.info("===== baseline benchmark finished =====") + + + # Queue size of 1 makes warcprox behave as though it were synchronous (each + # request blocks until the warc writer starts working on the last request). + # This gives us a better sense of sustained max throughput. The + # asynchronous nature of warcprox helps with bursty traffic, as long as the + # average throughput stays below the sustained max. + with TemporaryDirectory() as tmpdir: + args.queue_size = 1 + args.cacert = os.path.join(tmpdir, "benchmark-warcprox-ca.pem") + args.certs_dir = os.path.join(tmpdir, "benchmark-warcprox-ca") + args.directory = os.path.join(tmpdir, "warcs") + args.gzip = True + args.base32 = True + args.stats_db_file = os.path.join(tmpdir, "stats.db") + args.dedup_db_file = os.path.join(tmpdir, "dedup.db") + + warcprox_controller = warcprox.main.init_controller(args) + warcprox_controller_thread = threading.Thread(target=warcprox_controller.run_until_shutdown) + warcprox_controller_thread.start() + proxy = "http://%s:%s" % (args.address, args.port) + proxied_client = AsyncClient(proxy=proxy) + + logging.info("===== warcprox benchmark starting =====") + benchmark(proxied_client) + logging.info("===== warcprox benchmark finished =====") + + warcprox_controller.stop.set() + warcprox_controller_thread.join() + + asyncio.get_event_loop().stop() + logging.info("finished") + diff --git a/warcprox/main.py b/warcprox/main.py index 01acf1f..8d471de 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -78,6 +78,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default=None, help='kafka broker list for capture feed') arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', default=None, help='kafka capture feed topic') + arg_parser.add_argument('--queue-size', dest='queue_size', default=1000, + help='argparse.SUPPRESS') arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') @@ -85,7 +87,6 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): return arg_parser - def dump_state(signum=None, frame=None): pp = pprint.PrettyPrinter(indent=4) state_strs = [] @@ -97,22 +98,9 @@ def dump_state(signum=None, frame=None): logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))) - -def main(argv=sys.argv): - arg_parser = _build_arg_parser(prog=os.path.basename(argv[0])) - args = 
arg_parser.parse_args(args=argv[1:]) +def init_controller(args): options = warcprox.Options(**vars(args)) - if args.verbose: - loglevel = logging.DEBUG - elif args.quiet: - loglevel = logging.WARNING - else: - loglevel = logging.INFO - - logging.basicConfig(stream=sys.stdout, level=loglevel, - format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') - try: hashlib.new(args.digest_algorithm) except Exception as e: @@ -150,7 +138,7 @@ def main(argv=sys.argv): kafka_capture_feed = warcprox.kafkafeed.CaptureFeed(args.kafka_broker_list, args.kafka_capture_feed_topic) listeners.append(kafka_capture_feed) - recorded_url_q = queue.Queue() + recorded_url_q = queue.Queue(maxsize=args.queue_size) ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir, @@ -181,11 +169,32 @@ def main(argv=sys.argv): signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) signal.signal(signal.SIGQUIT, dump_state) + return controller + +def real_main(args): + controller = init_controller(args) controller.run_until_shutdown() +def parse_args(argv=sys.argv): + arg_parser = _build_arg_parser(prog=os.path.basename(argv[0])) + args = arg_parser.parse_args(args=argv[1:]) + return args + +def main(argv=sys.argv): + args = parse_args(argv) + + if args.verbose: + loglevel = logging.DEBUG + elif args.quiet: + loglevel = logging.WARNING + else: + loglevel = logging.INFO + + logging.basicConfig(stream=sys.stdout, level=loglevel, + format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') + + real_main(args) if __name__ == '__main__': - import gc - gc.set_debug(gc.DEBUG_LEAK) main() From 03c506dade2ad3ca145c91b3562ae3d885a1984c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 21:35:49 +0000 Subject: [PATCH 061/146] stop after first failing test, use py.test -s --- tests/run-tests.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index c3f5f07..89dc3b8 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -7,6 +7,8 @@ docker build -t internetarchive/rethinkdb $script_dir || exit 1 uid=$(id -u) user=$(id -un) +set -e + for python in python2.7 python3.4 do docker run --rm -i -t --volume="$script_dir/..:/warcprox" internetarchive/rethinkdb /sbin/my_init -- \ @@ -16,8 +18,8 @@ do && virtualenv -p $python /tmp/venv \ && source /tmp/venv/bin/activate \ && pip --log-file /tmp/pip.log install . 
pytest requests \ - && py.test tests \ - && py.test --rethinkdb-servers=localhost tests \ - && py.test --rethinkdb-servers=localhost --rethinkdb-big-table tests'" + && py.test -s tests \ + && py.test -s --rethinkdb-servers=localhost tests \ + && py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests'" done From 1b8d83203c7101eafbcd1e36a494e67e4cfc5c81 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 21:37:21 +0000 Subject: [PATCH 062/146] tweaks to memory debugging --- warcprox/controller.py | 110 +++++++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 38 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index 1850857..b431895 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -1,5 +1,3 @@ -# vim: set sw=4 et: - from __future__ import absolute_import import logging @@ -36,6 +34,74 @@ class WarcproxController(object): self.playback_proxy = playback_proxy self.options = options + self._last_rss = None + + def debug_mem(self): + self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize()) + if self.proxy.stats_db and hasattr(self.proxy.stats_db, "_executor"): + self.logger.info("self.proxy.stats_db._executor._work_queue.qsize()=%s", + self.proxy.stats_db._executor._work_queue.qsize()) + with open("/proc/self/status") as f: + for line in f: + fields = line.split() + if len(fields) >= 2: + k, v = fields[0:2] + if k == "VmHWM:": + hwm = int(v) + elif k == "VmRSS:": + rss = int(v) + elif k == "VmData:": + data = int(v) + elif k == "VmStk:": + stk = int(v) + self.logger.info("rss=%s data=%s stack=%s hwm=%s", rss, data, stk, hwm) + self._last_rss = self._last_rss or rss # to set initial value + + if rss - self._last_rss > 1024: + num_unreachable = gc.collect() + all_objects = gc.get_objects() + total_size = 0 + summary = {} + biggest_objects = [None] * 10 + for obj in all_objects: + size = sys.getsizeof(obj) + total_size += size + if not type(obj) in summary: + summary[type(obj)] = {"count":0,"size":0} + summary[type(obj)]["count"] += 1 + summary[type(obj)]["size"] += size + if size > sys.getsizeof(biggest_objects[-1]): + for i in range(len(biggest_objects)): + if size > sys.getsizeof(biggest_objects[i]): + index = i + break + biggest_objects[index+1:] = biggest_objects[index:-1] + biggest_objects[index] = obj + + self.logger.info("%s objects totaling %s bytes", len(all_objects), total_size) + + self.logger.info("=== biggest types ===") + for item in sorted(summary.items(), key=lambda item: item[1]["size"], reverse=True)[:10]: + self.logger.info("%s bytes in %s instances of %s", item[1]["size"], item[1]["count"], item[0]) + + self.logger.info("=== warcprox types ===") + for t in (t for t in summary if str(t).find("warcprox") >= 0): + self.logger.info("%s bytes in %s instances of %s", summary[t]["size"], summary[t]["count"], t) + + for i in range(len(biggest_objects)): + obj = biggest_objects[i] + try: + value = repr(bytes(obj.getbuffer()[:100])) + except: + try: + value = repr(obj)[:100] + except BaseException as e: + value = "<{} getting value>".format(e) + self.logger.info("#%s (%s) (%s bytes) (%s refs) (id=%s): %s", i+1, type(obj), sys.getsizeof(obj), sys.getrefcount(obj), id(obj), value) + self.logger.info("%s unreachable objects totaling %s bytes", len(gc.garbage), sum(sys.getsizeof(x) for x in gc.garbage)) + + self._last_rss = rss + def run_until_shutdown(self): """ Start warcprox and run until shut down. 
Call @@ -51,46 +117,14 @@ class WarcproxController(object): self.stop = threading.Event() + self.debug_mem() + try: - t = time.time() - 30 + t = time.time() - 30 while not self.stop.is_set(): time.sleep(0.5) if time.time() - t > 60: - num_unreachable = gc.collect() - all_objects = gc.get_objects() - total_size = 0 - summary = {} - biggest_objects = [None] * 10 - for obj in all_objects: - size = sys.getsizeof(obj) - total_size += size - if not type(obj) in summary: - summary[type(obj)] = {"count":0,"size":0} - summary[type(obj)]["count"] += 1 - summary[type(obj)]["size"] += size - if size > sys.getsizeof(biggest_objects[-1]): - for i in range(len(biggest_objects)): - if size > sys.getsizeof(biggest_objects[i]): - index = i - break - biggest_objects[index+1:] = biggest_objects[index:-1] - biggest_objects[index] = obj - - self.logger.info("%s objects totaling %s bytes", len(all_objects), total_size) - for item in sorted(summary.items(), key=lambda item: item[1]["size"], reverse=True)[:10]: - self.logger.info("%s bytes in %s instances of %s", item[1]["size"], item[1]["count"], item[0]) - for i in range(len(biggest_objects)): - obj = biggest_objects[i] - try: - value = repr(bytes(obj.getbuffer()[:100])) - except: - try: - value = repr(obj)[:100] - except BaseException as e: - value = "<{} getting value>".format(e) - self.logger.info("#%s (%s) (%s bytes) (%s refs) (id=%s): %s", i+1, type(obj), sys.getsizeof(obj), sys.getrefcount(obj), id(obj), value) - self.logger.info("%s unreachable objects totaling %s bytes", len(gc.garbage), sum(sys.getsizeof(x) for x in gc.garbage)) - + self.debug_mem() t = time.time() except: self.logger.critical("fatal exception, shutting down", exc_info=True) From e0fe06c891e1d64a943a2c930a41a89304bd3541 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 21:39:17 +0000 Subject: [PATCH 063/146] make warcprox finish writing all urls in the queue before shutting down --- warcprox/writerthread.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index a95a68f..76a6c2a 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -1,5 +1,3 @@ -# vim:set sw=4 et: - from __future__ import absolute_import try: @@ -38,9 +36,8 @@ class WarcWriterThread(threading.Thread): def run(self): try: - # XXX warcprox can shut down with urls to archive left in the queue self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid())) - while not self.stop.is_set(): + while True: try: recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) self.idle = None @@ -54,6 +51,8 @@ class WarcWriterThread(threading.Thread): if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: recorded_url.response_recorder.tempfile.close() except queue.Empty: + if self.stop.is_set(): + break self.idle = time.time() self.writer_pool.maybe_idle_rollover() From 6476262f11a08164921a2d570b006a5229bc1943 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 21:48:54 +0000 Subject: [PATCH 064/146] run warc writer thread with profiling enabled, dump results when shutting down --- warcprox/writerthread.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 76a6c2a..07441e7 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -16,6 +16,7 @@ from datetime import datetime import hanzo.httptools from hanzo import warctools import warcprox +import cProfile class WarcWriterThread(threading.Thread): logger = 
logging.getLogger("warcprox.warcproxwriter.WarcWriterThread") @@ -35,6 +36,9 @@ class WarcWriterThread(threading.Thread): self.idle = None def run(self): + cProfile.runctx('self._run()', globals(), locals(), sort='cumulative') + + def _run(self): try: self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid())) while True: From 818bdda68731fff9f52dad72e79ffe032c25df9b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 23:34:34 +0000 Subject: [PATCH 065/146] fix NameError, twiddles --- benchmarks/run-benchmarks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/run-benchmarks.py b/benchmarks/run-benchmarks.py index 2f1414b..73ef96d 100755 --- a/benchmarks/run-benchmarks.py +++ b/benchmarks/run-benchmarks.py @@ -86,23 +86,23 @@ class AsyncClient(object): self.n_bytes += len(chunk) if not chunk: self.n_urls += 1 - logging.info("finished reading from %s", url) + logging.debug("finished reading from %s", url) r.close() break @asyncio.coroutine def one_request(self, url): - logging.info("issuing request to %s", url) + logging.debug("issuing request to %s", url) r = yield from aiohttp.get(url, connector=self.connector) - logging.info("issued request to %s", url) + logging.debug("issued request to %s", url) yield from self.read_response(r, url) def benchmark(client): try: start = time.time() - tasks_https = [client.one_request('https://localhost:8443/%s' % int(1.1**i)) for i in range(120)] + tasks_https = [client.one_request('https://localhost:8443/%s' % int(1.1**i)) for i in range(80)] asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_https)) - tasks_http = [client.one_request('http://localhost:8080/%s' % int(1.1**i)) for i in range(120)] + tasks_http = [client.one_request('http://localhost:8080/%s' % int(1.1**i)) for i in range(80)] asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_http)) finally: finish = time.time() @@ -124,7 +124,7 @@ if __name__ == '__main__': # This gives us a better sense of sustained max throughput. The # asynchronous nature of warcprox helps with bursty traffic, as long as the # average throughput stays below the sustained max. - with TemporaryDirectory() as tmpdir: + with tempfile.TemporaryDirectory() as tmpdir: args.queue_size = 1 args.cacert = os.path.join(tmpdir, "benchmark-warcprox-ca.pem") args.certs_dir = os.path.join(tmpdir, "benchmark-warcprox-ca") From 4930cc2d24548371065c42c3cdbc2fffa1eed18f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 23:35:23 +0000 Subject: [PATCH 066/146] try to avoid conflicts with *.pyc files from outside of the docker tests --- tests/run-tests.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 89dc3b8..069986f 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -18,8 +18,8 @@ do && virtualenv -p $python /tmp/venv \ && source /tmp/venv/bin/activate \ && pip --log-file /tmp/pip.log install . 
pytest requests \ - && py.test -s tests \ - && py.test -s --rethinkdb-servers=localhost tests \ - && py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests'" + && PYTHONDONTWRITEBYTECODE=1 py.test -s tests \ + && PYTHONDONTWRITEBYTECODE=1 py.test -s --rethinkdb-servers=localhost tests \ + && PYTHONDONTWRITEBYTECODE=1 py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests'" done From f1362e4da037f83006998efaafff13549ddde268 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 23:38:15 +0000 Subject: [PATCH 067/146] use only one worker thread for asynchronous rethinkdb stats updates, to fix race condition causing some numbers to be lost --- warcprox/stats.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/warcprox/stats.py b/warcprox/stats.py index 9be1b54..660d4c7 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -120,7 +120,10 @@ class RethinkStatsDb: self.replicas = replicas or min(3, len(r.servers)) self._ensure_db_table() self.options = options - self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=10) + + # only one worker thread to ensure consistency, see + # https://rethinkdb.com/docs/consistency/ + self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) def _ensure_db_table(self): dbs = self.r.db_list().run() @@ -156,7 +159,7 @@ class RethinkStatsDb: def _tally(self, buckets, size, is_revisit): try: - self.logger.info("starting task self._tally(%s)", (buckets, size, is_revisit)) + self.logger.debug("starting task self._tally(%s)", (buckets, size, is_revisit)) for bucket in buckets: bucket_stats = self.value(bucket) or _empty_bucket(bucket) @@ -175,7 +178,7 @@ class RethinkStatsDb: if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: raise Exception("unexpected result %s saving %s", result, record) - self.logger.info("finished task self._tally(%s)", (buckets, size, is_revisit)) + self.logger.debug("finished task self._tally(%s)", (buckets, size, is_revisit)) except: self.logger.error("unexpected problem tallying stats", exc_info=True) @@ -198,6 +201,6 @@ class RethinkStatsDb: def notify(self, recorded_url, records): args = self._extract_stats_info(recorded_url, records) - self.logger.info("submitting task self._tally(%s)", args) + self.logger.debug("submitting task self._tally(%s)", args) self._executor.submit(self._tally, *args) From 3e1566cd6f650653e74fe0c52d182705c1206bea Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 23:49:44 +0000 Subject: [PATCH 068/146] update big captures table asynchronously --- warcprox/bigtable.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index ec75360..e020364 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -1,5 +1,3 @@ -# vim:set sw=4 et: - from __future__ import absolute_import import logging @@ -10,6 +8,7 @@ import base64 import surt import os import hashlib +import concurrent.futures class RethinkCaptures: logger = logging.getLogger("warcprox.bigtables.RethinkCaptures") @@ -22,6 +21,10 @@ class RethinkCaptures: self.options = options self._ensure_db_table() + # only one worker thread to ensure consistency, see + # https://rethinkdb.com/docs/consistency/ + self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) + def _ensure_db_table(self): dbs = self.r.db_list().run() if not self.r.dbname in dbs: @@ -49,7 +52,7 @@ class 
RethinkCaptures:
         self.logger.debug("returning %s for sha1base32=%s bucket=%s", result, sha1base32, bucket)
         return result
 
-    def notify(self, recorded_url, records):
+    def _assemble_entry(self, recorded_url, records):
         if recorded_url.response_recorder:
             if recorded_url.response_recorder.payload_digest.name == "sha1":
                 sha1base32 = base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8")
@@ -87,10 +90,26 @@ class RethinkCaptures:
             "length": records[0].length,
         }
 
-        result = self.r.table(self.table).insert(entry).run()
-        if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]:
-            raise Exception("unexpected result %s saving %s", result, entry)
-        self.logger.debug("big capture table db saved %s", entry)
+        return entry
+
+    def _save_entry(self, entry):
+        try:
+            result = self.r.table(self.table).insert(entry).run()
+            if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]:
+                raise Exception("unexpected result %s saving %s", result, entry)
+            self.logger.debug("big capture table db saved %s", entry)
+        except:
+            self.logger.error("unexpected problem ", exc_info=True)
+
+    def notify(self, recorded_url, records):
+        entry = self._assemble_entry(recorded_url, records)
+        self._executor.submit(self._save_entry, entry)
+
+    def close(self):
+        self.logger.info("waiting for ~%s tasks to finish",
+                self._executor._work_queue.qsize() + (self._executor._max_workers/2))
+        self._executor.shutdown(wait=True)
+        self.logger.info("shut down complete")
 
 class RethinkCapturesDedup:
     logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup")
@@ -114,4 +133,4 @@ class RethinkCapturesDedup:
             return None
 
     def close(self):
-        pass
+        self.captures_db.close()

From fd847f01cd677f506893e9780d954e4f9d0d44cc Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 30 Oct 2015 01:14:17 +0000
Subject: [PATCH 069/146] log error but don't give up if there is >1 record with same digest

---
 warcprox/bigtable.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py
index e020364..07e1923 100644
--- a/warcprox/bigtable.py
+++ b/warcprox/bigtable.py
@@ -43,9 +43,9 @@ class RethinkCaptures:
             sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
         results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
         results = list(results_iter)
-        if len(results) > 1:
-            raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32)
-        elif len(results) == 1:
+        if len(results) > 0:
+            if len(results) > 1:
+                self.logger.error("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
             result = results[0]
         else:
             result = None

From 3363b2ec95b0a6b5ae6817487a1429e970e3aade Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 30 Oct 2015 01:15:03 +0000
Subject: [PATCH 070/146] continue after unexpected error

---
 warcprox/writerthread.py | 48 +++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py
index 07441e7..f0c7d25 100644
--- a/warcprox/writerthread.py
+++ b/warcprox/writerthread.py
@@ -39,31 +39,33 @@ class WarcWriterThread(threading.Thread):
             cProfile.runctx('self._run()', globals(), locals(), sort='cumulative')
 
     def _run(self):
-        try:
-            self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid()))
-            while True:
-                try:
-                    recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
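# (aside, not part of the patch stream) PATCH 063 and PATCH 070 together
# settle on a queue-draining worker pattern: block on the queue with a short
# timeout, treat queue.Empty as the moment to check the stop flag so shutdown
# only happens once the queue is empty, and log-and-continue on unexpected
# errors. Below is a hedged, self-contained sketch of that pattern; the names
# QueueDrainingWorker and handle_item are illustrative, not warcprox API.
import logging
import queue
import threading
import time

class QueueDrainingWorker(threading.Thread):
    def __init__(self, q, handle_item):
        threading.Thread.__init__(self, name='QueueDrainingWorker')
        self.q = q
        self.handle_item = handle_item
        self.stop = threading.Event()

    def run(self):
        while not self.stop.is_set():
            try:
                while True:
                    try:
                        item = self.q.get(block=True, timeout=0.5)
                        self.handle_item(item)
                    except queue.Empty:
                        if self.stop.is_set():
                            return  # queue is drained, safe to shut down
            except Exception:
                logging.critical(
                        'worker will try to continue after unexpected error',
                        exc_info=True)
                time.sleep(0.5)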
-                    self.idle = None
-                    if self.dedup_db:
-                        warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
-                                recorded_url, base32=self.options.base32)
-                    records = self.writer_pool.write_records(recorded_url)
-                    self._final_tasks(recorded_url, records)
+        while not self.stop.is_set():
+            try:
+                self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid()))
+                while True:
+                    try:
+                        recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
+                        self.idle = None
+                        if self.dedup_db:
+                            warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
+                                    recorded_url, base32=self.options.base32)
+                        records = self.writer_pool.write_records(recorded_url)
+                        self._final_tasks(recorded_url, records)
 
-                    # try to release resources in a timely fashion
-                    if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
-                        recorded_url.response_recorder.tempfile.close()
-                except queue.Empty:
-                    if self.stop.is_set():
-                        break
-                    self.idle = time.time()
-                    self.writer_pool.maybe_idle_rollover()
+                        # try to release resources in a timely fashion
+                        if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
+                            recorded_url.response_recorder.tempfile.close()
+                    except queue.Empty:
+                        if self.stop.is_set():
+                            break
+                        self.idle = time.time()
+                        self.writer_pool.maybe_idle_rollover()
 
-            self.logger.info('WarcWriterThread shutting down')
-            self.writer_pool.close_writers()
-        except:
-            self.logger.critical("WarcWriterThread shutting down after unexpected error", exc_info=True)
+                self.logger.info('WarcWriterThread shutting down')
+                self.writer_pool.close_writers()
+            except:
+                self.logger.critical("WarcWriterThread will try to continue after unexpected error", exc_info=True)
+                time.sleep(0.5)
 
     # closest thing we have to heritrix crawl log at the moment
     def _log(self, recorded_url, records):

From ca4c62fc6d91184be6a69a0798e31aeb5d94682f Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 30 Oct 2015 01:15:27 +0000
Subject: [PATCH 071/146] don't load dedup info for empty payload

---
 warcprox/dedup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 942ee9f..17735ed 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -73,7 +73,9 @@ class DedupDb(object):
 
 
 def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
-    if recorded_url.response_recorder and recorded_url.response_recorder.payload_digest:
+    if (recorded_url.response_recorder
+            and recorded_url.response_recorder.payload_digest
+            and recorded_url.response_recorder.payload_size() > 0):
         digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
         if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
             recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"])

From 465cf1ef45da51e44143a8812744825c74cc8b1a Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 30 Oct 2015 23:04:45 +0000
Subject: [PATCH 072/146] ./tests/run-tests.sh is better than tox

---
 tox.ini | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 tox.ini

diff --git a/tox.ini b/tox.ini
deleted file mode 100644
index f5b0c23..0000000
--- a/tox.ini
+++ /dev/null
@@ -1,13 +0,0 @@
-# Tox (http://tox.testrun.org/) is a tool for running tests
-# in multiple virtualenvs. This configuration file will run the
-# test suite on all supported python versions. To use it, "pip install tox"
-# and then run "tox" from this directory.
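# (aside, not part of the patch stream) On the guard added in PATCH 071
# above: every zero-length payload hashes to the same digest, so dedup keyed
# on payload digest would treat unrelated empty responses as revisits of one
# another. A quick check of that rationale:
import hashlib
print(hashlib.sha1(b'').hexdigest())
# da39a3ee5e6b4b0d3255bfef95601890afd80709, the same for every empty payload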
- -[tox] -envlist = py27, py34 - -[testenv] -commands = py.test warcprox -deps = - pytest - requests From 7e731d40bcf04aae3466f4c07c42b6f3f3f8b621 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 30 Oct 2015 23:07:01 +0000 Subject: [PATCH 073/146] try new travis docker-based infrastructure, more versions of python --- .travis.yml | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5b54afb..ad3f8f8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,21 +1,24 @@ -# vim: set sw=4 et: -# -# tox approach stolen from -# https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml -# +# http://docs.travis-ci.com/user/migrating-from-legacy/ +sudo: false language: python -env: - - TOXENV=py27 - - TOXENV=py34 +python: + - 3.5 + - 3.4 + - 2.7 + - nightly + - pypy + - pypy3 -before_install: - - sudo apt-get update - - sudo apt-get -y install python-gdbm python3-gdbm +addons: + apt: + packages: + - python-gdbm + - python3-gdbm before_script: - - pip install tox + - pip install . pytest requests -script: tox +script: py.test warcprox From d7d992731c6af30b8ee354da8fd923246dae3925 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Sat, 31 Oct 2015 01:17:45 +0000 Subject: [PATCH 074/146] register self for service discovery --- warcprox/controller.py | 34 ++++++++++++++++++++++++++++------ warcprox/main.py | 12 ++++++++++-- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index b431895..d8813f6 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -6,11 +6,16 @@ import time import warcprox import sys import gc +import datetime class WarcproxController(object): logger = logging.getLogger("warcprox.controller.WarcproxController") - def __init__(self, proxy=None, warc_writer_thread=None, playback_proxy=None, options=warcprox.Options()): + HEARTBEAT_INTERVAL = 20.0 + + def __init__(self, proxy=None, warc_writer_thread=None, + playback_proxy=None, service_registry=None, + options=warcprox.Options()): """ Create warcprox controller. @@ -32,6 +37,7 @@ class WarcproxController(object): self.warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=self.proxy.recorded_url_q) self.playback_proxy = playback_proxy + self.service_registry = service_registry self.options = options self._last_rss = None @@ -102,6 +108,19 @@ class WarcproxController(object): self._last_rss = rss + def _service_heartbeat(self): + if hasattr(self, 'status_info'): + status_info = self.status_info + else: + status_info = { + 'role': 'warcprox', + 'heartbeat_interval': self.HEARTBEAT_INTERVAL, + } + status_info['load'] = self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100) + + self.status_info = self.service_registry.heartbeat(status_info) + self.logger.debug("status in service registry: %s", self.status_info) + def run_until_shutdown(self): """ Start warcprox and run until shut down. 
Call @@ -117,15 +136,18 @@ class WarcproxController(object): self.stop = threading.Event() - self.debug_mem() + last_mem_dbg = datetime.datetime.utcfromtimestamp(0) try: - t = time.time() - 30 while not self.stop.is_set(): - time.sleep(0.5) - if time.time() - t > 60: + if not hasattr(self, "status_info") or (datetime.datetime.now(datetime.timezone.utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL: + self._service_heartbeat() + + if (datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60: self.debug_mem() - t = time.time() + last_mem_dbg = datetime.datetime.utcnow() + + time.sleep(0.5) except: self.logger.critical("fatal exception, shutting down", exc_info=True) pass diff --git a/warcprox/main.py b/warcprox/main.py index 8d471de..2d6c2e2 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -92,7 +92,10 @@ def dump_state(signum=None, frame=None): state_strs = [] for th in threading.enumerate(): - state_strs.append(str(th)) + try: + state_strs.append(str(th)) + except AssertionError: + state_strs.append("") stack = traceback.format_stack(sys._current_frames()[th.ident]) state_strs.append("".join(stack)) @@ -163,7 +166,12 @@ def init_controller(args): recorded_url_q=recorded_url_q, writer_pool=writer_pool, dedup_db=dedup_db, listeners=listeners, options=options) - controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy, options=options) + if args.rethinkdb_servers: + svcreg = rethinkstuff.ServiceRegistry(r) + + controller = warcprox.controller.WarcproxController(proxy, + warc_writer_thread, playback_proxy, service_registry=svcreg, + options=options) signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) From 4dcaedb5d96e822905c881ac39c42f6c5541aa12 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 18:21:19 +0000 Subject: [PATCH 075/146] py.test the right thing --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ad3f8f8..b31c9fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,5 +20,5 @@ addons: before_script: - pip install . pytest requests -script: py.test warcprox +script: py.test -v -s tests From 2ecd2facd9142a53471e01b59371ffafc7385595 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 22:01:26 +0000 Subject: [PATCH 076/146] surt 0.3b2 is in pypi now, no need for devpi --- setup.py | 2 +- tests/run-tests.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a26166b..ef33be2 100755 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ class PyTest(TestCommand): errno = pytest.main(self.test_args) sys.exit(errno) -deps = ['certauth>=1.1.0', 'warctools', 'kafka-python', 'surt', 'rethinkstuff'] +deps = ['certauth>=1.1.0', 'warctools', 'kafka-python', 'surt==0.3b2', 'rethinkstuff'] try: import concurrent.futures except: diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 069986f..7e67fe6 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -14,7 +14,6 @@ do docker run --rm -i -t --volume="$script_dir/..:/warcprox" internetarchive/rethinkdb /sbin/my_init -- \ bash -x -c " adduser --gecos=$user --disabled-password --quiet --uid=$uid $user \ && sudo -u $user bash -x -c 'cd /warcprox \ - && devpi use --set-cfg http://crawl342.us.archive.org:9000/nlevitt/dev \ && virtualenv -p $python /tmp/venv \ && source /tmp/venv/bin/activate \ && pip --log-file /tmp/pip.log install . 
pytest requests \

From e67c7be5bc65c33168dc6a7b9fd6d90740a72ad5 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 2 Nov 2015 22:04:29 +0000
Subject: [PATCH 077/146] service registry init

---
 tests/test_warcprox.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index 1f6c2fd..2285d8b 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -220,7 +220,24 @@ def stats_db(request, rethinkdb_servers):
     return sdb
 
 @pytest.fixture(scope="module")
-def warcprox_(request, captures_db, dedup_db, stats_db):
+def service_registry(request, rethinkdb_servers):
+    if rethinkdb_servers:
+        servers = rethinkdb_servers.split(",")
+        db = 'warcprox_test_services_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
+        r = rethinkstuff.Rethinker(servers, db)
+
+        def fin():
+            logging.info('dropping rethinkdb database {}'.format(db))
+            result = r.db_drop(db).run()
+            logging.info("result=%s", result)
+        request.addfinalizer(fin)
+
+        return rethinkstuff.ServiceRegistry(r)
+    else:
+        return None
+
+@pytest.fixture(scope="module")
+def warcprox_(request, captures_db, dedup_db, stats_db, service_registry):
     f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True)
     f.close() # delete it, or CertificateAuthority will try to read it
     ca_file = f.name
@@ -249,7 +266,9 @@ def warcprox_(request, captures_db, dedup_db, stats_db):
             recorded_url_q=recorded_url_q, writer_pool=writer_pool,
             dedup_db=dedup_db, listeners=[captures_db or dedup_db, playback_index_db, stats_db])
 
-    warcprox_ = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy, options)
+    warcprox_ = warcprox.controller.WarcproxController(proxy=proxy,
+        warc_writer_thread=warc_writer_thread, playback_proxy=playback_proxy,
+        service_registry=service_registry, options=options)
     logging.info('starting warcprox')
     warcprox_thread = threading.Thread(name='WarcproxThread', target=warcprox_.run_until_shutdown)

From 248d110f810b1082e0d669e82cecc860034caa85 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 2 Nov 2015 22:06:01 +0000
Subject: [PATCH 078/146] add port to service registry, fix bug with service heartbeat

---
 warcprox/controller.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/warcprox/controller.py b/warcprox/controller.py
index d8813f6..b1d0a83 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -115,6 +115,7 @@ class WarcproxController(object):
             status_info = {
                 'role': 'warcprox',
                 'heartbeat_interval': self.HEARTBEAT_INTERVAL,
+                'port': self.options.port,
             }
         status_info['load'] = self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100)
 
@@ -138,9 +139,19 @@ class WarcproxController(object):
 
         last_mem_dbg = datetime.datetime.utcfromtimestamp(0)
 
+        try:
+            utc = datetime.timezone.utc
+        except AttributeError:
+            # python2 :-\
+            class UTC(datetime.tzinfo):
+                def tzname(self, dt): return "UTC+00:00"
+                def dst(self, dt): return datetime.timedelta(0)
+                def utcoffset(self, dt): return datetime.timedelta(0)
+            utc = UTC()
+
         try:
             while not self.stop.is_set():
-                if not hasattr(self, "status_info") or (datetime.datetime.now(datetime.timezone.utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL:
+                if self.service_registry and (not hasattr(self, "status_info") or (datetime.datetime.now(utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
                     self._service_heartbeat()
 
                 if 
(datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60: From 3e2696525b8600c869cd31243943e8a093092198 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 22:12:06 +0000 Subject: [PATCH 079/146] make sure svcreg is set --- warcprox/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/warcprox/main.py b/warcprox/main.py index 2d6c2e2..574c144 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -168,6 +168,8 @@ def init_controller(args): if args.rethinkdb_servers: svcreg = rethinkstuff.ServiceRegistry(r) + else: + svcreg = None controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy, service_registry=svcreg, From 9f84c2027482a78df85501d19530ef2d5c7982bc Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 22:21:43 +0000 Subject: [PATCH 080/146] test with rethinkdb flags too --- .travis.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index b31c9fc..21e990b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,3 @@ -# http://docs.travis-ci.com/user/migrating-from-legacy/ -sudo: false - language: python python: @@ -17,8 +14,15 @@ addons: - python-gdbm - python3-gdbm +before_install: + - sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778 + - docker run -d --publish=28015:28015 rethinkdb + before_script: - pip install . pytest requests -script: py.test -v -s tests +script: + - py.test -v -s tests + - py.test -v -s --rethinkdb-servers=localhost tests tests + - py.test -v -s --rethinkdb-servers=localhost --rethinkdb-big-table tests From 93a2e4ff85c98f87cbe0bd983d42ce2b52faa4e8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 22:26:02 +0000 Subject: [PATCH 081/146] .travis.yml - disable pypy (not working because of cryptography library), require docker service --- .travis.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 21e990b..56470e2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,11 @@ language: python - python: - 3.5 - 3.4 - 2.7 - nightly - - pypy - - pypy3 + # - pypy + # - pypy3 addons: apt: @@ -14,6 +13,9 @@ addons: - python-gdbm - python3-gdbm +services: + - docker + before_install: - sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778 - docker run -d --publish=28015:28015 rethinkdb From afdb6cf557aec3fd2b42f17cdd854b3bbd174c64 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 4 Nov 2015 23:15:29 +0000 Subject: [PATCH 082/146] log status in close() --- warcprox/stats.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/warcprox/stats.py b/warcprox/stats.py index 660d4c7..750ba5d 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -137,16 +137,23 @@ class RethinkStatsDb: self.r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run() def close(self): - self.logger.info("waiting for ~%s tasks to finish", - self._executor._work_queue.qsize() + (self._executor._max_workers/2)) + self._executor.shutdown(wait=False) + last_update = 0 + while True: + time.sleep(0.5) + remaining_estimate = self._executor._work_queue.qsize() + self._executor._max_workers/2 + if remaining_estimate < self._executor._max_workers: + break + if time.time() - last_update >= 30: + self.logger.info("waiting for ~%s tasks to finish", remaining_estimate) + last_update = time.time() self._executor.shutdown(wait=True) - 
self.logger.info("shut down complete") + self.logger.info("all tasks finished") def sync(self): pass def value(self, bucket0="__all__", bucket1=None, bucket2=None): - # XXX use pluck? bucket0_stats = self.r.table(self.table).get(bucket0).run() self.logger.debug('stats db lookup of bucket=%s returned %s', bucket0, bucket0_stats) if bucket0_stats: @@ -157,8 +164,12 @@ class RethinkStatsDb: return bucket0_stats[bucket1] return bucket0_stats + + # >>> r.db("archiveit_brozzler").table("test00").get("foo01").replace(lambda old: r.branch(old.eq(None), {"id":"foo01", "a":{"b":88}}, old.merge({"a":{"b":old["a"]["b"].add(3)}}))).run(conn) + def _tally(self, buckets, size, is_revisit): try: + threading.current_thread.name = 'RethinkStatsDb-futures-thread(tid={})'.format(warcprox.gettid()) self.logger.debug("starting task self._tally(%s)", (buckets, size, is_revisit)) for bucket in buckets: bucket_stats = self.value(bucket) or _empty_bucket(bucket) From 783e730e52e9b927c23066dbbcc670a593555a42 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 Nov 2015 02:23:24 +0000 Subject: [PATCH 083/146] insert captures entries in batch every 0.5 seconds, since rethinkdb updates were falling way behind sometimes --- warcprox/bigtable.py | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 07e1923..9fe0e6e 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -8,9 +8,10 @@ import base64 import surt import os import hashlib -import concurrent.futures +import threading class RethinkCaptures: + """Inserts in batches every 0.5 seconds""" logger = logging.getLogger("warcprox.bigtables.RethinkCaptures") def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()): @@ -21,9 +22,28 @@ class RethinkCaptures: self.options = options self._ensure_db_table() - # only one worker thread to ensure consistency, see - # https://rethinkdb.com/docs/consistency/ - self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) + self._stop = threading.Event() + self._batch_lock = threading.RLock() + with self._batch_lock: + self._batch = [] + self._insert_batch() # starts repeating timer + + def _insert_batch(self): + with self._batch_lock: + if len(self._batch) > 0: + result = self.r.table(self.table).insert(self._batch).run() + if result["inserted"] != len(self._batch) or sorted(result.values()) != [0,0,0,0,0,len(self._batch)]: + raise Exception("unexpected result %s saving batch of %s entries", result, len(self._batch)) + self.logger.info("big capture table db saved %s entries", len(self._batch)) + self.logger.info("saved %s", self._batch) + self._batch = [] + + if not self._stop.is_set(): + self._timer = threading.Timer(0.5, self._insert_batch) + self._timer.name = "RethinkCaptures-batch-insert-timer" + self._timer.start() + else: + self.logger.info("finished") def _ensure_db_table(self): dbs = self.r.db_list().run() @@ -94,6 +114,7 @@ class RethinkCaptures: def _save_entry(self, entry): try: + threading.current_thread.name = 'RethinkCaptures-futures-thread(tid={})'.format(warcprox.gettid()) result = self.r.table(self.table).insert(entry).run() if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]: raise Exception("unexpected result %s saving %s", result, entry) @@ -103,13 +124,13 @@ class RethinkCaptures: def notify(self, recorded_url, records): entry = self._assemble_entry(recorded_url, records) - self._executor.submit(self._save_entry, entry) + with 
self._batch_lock:
+            self._batch.append(entry)
 
     def close(self):
-        self.logger.info("waiting for ~%s tasks to finish",
-                self._executor._work_queue.qsize() + (self._executor._max_workers/2))
-        self._executor.shutdown(wait=True)
-        self.logger.info("shut down complete")
+        self.logger.info("closing rethinkdb captures table")
+        self._stop.set()
+        self._timer.join()
 
 class RethinkCapturesDedup:
     logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup")

From 9af17ba7c339f435548b4a549628a60655260ab5 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 5 Nov 2015 02:23:36 +0000
Subject: [PATCH 084/146] update stats batch every 0.5 seconds, since rethinkdb updates were falling way behind sometimes

---
 warcprox/stats.py | 95 +++++++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 40 deletions(-)

diff --git a/warcprox/stats.py b/warcprox/stats.py
index 750ba5d..44b724b 100644
--- a/warcprox/stats.py
+++ b/warcprox/stats.py
@@ -14,7 +14,8 @@ import json
 from hanzo import warctools
 import random
 import warcprox
-import concurrent.futures
+import threading
+import rethinkdb as r
 
 def _empty_bucket(bucket):
     return {
@@ -111,19 +112,58 @@ class StatsDb:
             self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8")
 
 class RethinkStatsDb:
+    """Updates database in batch every 0.5 seconds"""
     logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
 
-    def __init__(self, r, table="stats", shards=None, replicas=None, options=warcprox.Options()):
-        self.r = r
+    def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()):
+        self.r = rethinker
         self.table = table
-        self.shards = shards or len(r.servers)
-        self.replicas = replicas or min(3, len(r.servers))
+        self.shards = shards or 1 # 1 shard by default because it's probably a small table
+        self.replicas = replicas or min(3, len(self.r.servers))
         self._ensure_db_table()
        self.options = options
 
-        # only one worker thread to ensure consistency, see
-        # https://rethinkdb.com/docs/consistency/
-        self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+        self._stop = threading.Event()
+        self._batch_lock = threading.RLock()
+        with self._batch_lock:
+            self._batch = {}
+
+        self._update_batch() # starts repeating timer
+
+    def _update_batch(self):
+        with self._batch_lock:
+            if len(self._batch) > 0:
+                # XXX can this be done in one query?
+                # r.db("archiveit_brozzler").table("test00").get_all(*["foo01","foo"])...
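# (aside, not part of the patch stream) A hedged sketch of the single-query
# upsert the comments here are working toward: get().replace() is atomic per
# document, r.branch() falls back to inserting the batch when no row exists
# yet, and merge() with .add() folds the batch's deltas into the stored
# counters otherwise. atomic_tally is illustrative, not warcprox API; only
# the "total" counters are shown, and conn is assumed to default to the
# right database.
import rethinkdb as r

def atomic_tally(conn, bucket, delta):
    # delta: a full bucket document, e.g. _empty_bucket(bucket) with this
    # batch's counts already summed into it
    return r.table("stats").get(bucket).replace(
            lambda old: r.branch(
                old.eq(None),
                delta,  # no existing row: insert the batched counts as-is
                old.merge({
                    "total": {
                        "urls": old["total"]["urls"].add(delta["total"]["urls"]),
                        "wire_bytes": old["total"]["wire_bytes"].add(
                            delta["total"]["wire_bytes"]),
                    },
                }))).run(conn)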
+ # >>> r.db("archiveit_brozzler").table("test00").get("foo01").replace(lambda old: r.branch(old.eq(None), {"id":"foo01", "a":{"b":88}}, old.merge({"a":{"b":old["a"]["b"].add(3)}}))).run(conn) + for k in self._batch: + result = self.r.table(self.table).get(k).replace( + lambda old: r.branch(old.eq(None), self._batch[k], old.merge( + { + "total": { + "urls": old["total"]["urls"].add(self._batch[k]["total"]["urls"]), + "wire_bytes": old["total"]["wire_bytes"].add(self._batch[k]["total"]["wire_bytes"]), + }, + "new": { + "urls": old["new"]["urls"].add(self._batch[k]["new"]["urls"]), + "wire_bytes": old["new"]["wire_bytes"].add(self._batch[k]["new"]["wire_bytes"]), + }, + "revisit": { + "urls": old["revisit"]["urls"].add(self._batch[k]["revisit"]["urls"]), + "wire_bytes": old["revisit"]["wire_bytes"].add(self._batch[k]["revisit"]["wire_bytes"]), + }, + } + ))).run() + if not result["inserted"] and not result["replaced"] or sorted(result.values()) != [0,0,0,0,0,1]: + raise Exception("unexpected result %s updating stats %s" % (result, self._batch[k])) + self._batch = {} + + if not self._stop.is_set(): + self._timer = threading.Timer(0.5, self._update_batch) + self._timer.name = "RethinkCaptures-batch-insert-timer" + self._timer.start() + else: + self.logger.info("finished") def _ensure_db_table(self): dbs = self.r.db_list().run() @@ -137,18 +177,9 @@ class RethinkStatsDb: self.r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run() def close(self): - self._executor.shutdown(wait=False) - last_update = 0 - while True: - time.sleep(0.5) - remaining_estimate = self._executor._work_queue.qsize() + self._executor._max_workers/2 - if remaining_estimate < self._executor._max_workers: - break - if time.time() - last_update >= 30: - self.logger.info("waiting for ~%s tasks to finish", remaining_estimate) - last_update = time.time() - self._executor.shutdown(wait=True) - self.logger.info("all tasks finished") + self.logger.info("closing rethinkdb stats table") + self._stop.set() + self._timer.join() def sync(self): pass @@ -164,15 +195,10 @@ class RethinkStatsDb: return bucket0_stats[bucket1] return bucket0_stats - - # >>> r.db("archiveit_brozzler").table("test00").get("foo01").replace(lambda old: r.branch(old.eq(None), {"id":"foo01", "a":{"b":88}}, old.merge({"a":{"b":old["a"]["b"].add(3)}}))).run(conn) - def _tally(self, buckets, size, is_revisit): - try: - threading.current_thread.name = 'RethinkStatsDb-futures-thread(tid={})'.format(warcprox.gettid()) - self.logger.debug("starting task self._tally(%s)", (buckets, size, is_revisit)) + with self._batch_lock: for bucket in buckets: - bucket_stats = self.value(bucket) or _empty_bucket(bucket) + bucket_stats = self._batch.setdefault(bucket, _empty_bucket(bucket)) bucket_stats["total"]["urls"] += 1 bucket_stats["total"]["wire_bytes"] += size @@ -184,15 +210,6 @@ class RethinkStatsDb: bucket_stats["new"]["urls"] += 1 bucket_stats["new"]["wire_bytes"] += size - self.logger.debug("saving %s", bucket_stats) - result = self.r.table(self.table).insert(bucket_stats, conflict="replace").run() - if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: - raise Exception("unexpected result %s saving %s", result, record) - - self.logger.debug("finished task self._tally(%s)", (buckets, size, is_revisit)) - except: - self.logger.error("unexpected problem tallying stats", exc_info=True) - def _extract_stats_info(self, recorded_url, records): buckets = ["__all__"] @@ -208,10 
+225,8 @@ class RethinkStatsDb: return buckets, recorded_url.size, is_revisit def tally(self, recorded_url, records): - self._tally(self._extract_stats_info(recorded_url, records)) + self._tally(*self._extract_stats_info(recorded_url, records)) def notify(self, recorded_url, records): - args = self._extract_stats_info(recorded_url, records) - self.logger.debug("submitting task self._tally(%s)", args) - self._executor.submit(self._tally, *args) + self.tally(recorded_url, records) From 95ef8b80b01bf8bc0291a30c69f33dfbf897db69 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 Nov 2015 02:26:03 +0000 Subject: [PATCH 085/146] make sure load score for service registry is a float; comment out memory debugging call; close dedup db after warc writer thread finishes --- warcprox/controller.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index b1d0a83..63af764 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -117,7 +117,7 @@ class WarcproxController(object): 'heartbeat_interval': self.HEARTBEAT_INTERVAL, 'port': self.options.port, } - status_info['load'] = self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100) + status_info['load'] = 1.0 * self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100) self.status_info = self.service_registry.heartbeat(status_info) self.logger.debug("status in service registry: %s", self.status_info) @@ -154,9 +154,9 @@ class WarcproxController(object): if self.service_registry and (not hasattr(self, "status_info") or (datetime.datetime.now(utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL): self._service_heartbeat() - if (datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60: - self.debug_mem() - last_mem_dbg = datetime.datetime.utcnow() + # if (datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60: + # self.debug_mem() + # last_mem_dbg = datetime.datetime.utcnow() time.sleep(0.5) except: @@ -167,9 +167,6 @@ class WarcproxController(object): self.proxy.shutdown() self.proxy.server_close() - if self.warc_writer_thread.dedup_db is not None: - self.warc_writer_thread.dedup_db.close() - if self.playback_proxy is not None: self.playback_proxy.shutdown() self.playback_proxy.server_close() @@ -178,6 +175,10 @@ class WarcproxController(object): # wait for threads to finish self.warc_writer_thread.join() + + if self.warc_writer_thread.dedup_db is not None: + self.warc_writer_thread.dedup_db.close() + proxy_thread.join() if self.playback_proxy is not None: playback_proxy_thread.join() From a9fc550453ac51edb2f6e333fb538bbac0c8dee2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 Nov 2015 02:26:43 +0000 Subject: [PATCH 086/146] oops, argparse.SUPPRESS isn't supposed to be in quotes --- warcprox/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/main.py b/warcprox/main.py index 574c144..7976753 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -79,7 +79,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', default=None, help='kafka capture feed topic') arg_parser.add_argument('--queue-size', dest='queue_size', default=1000, - help='argparse.SUPPRESS') + help=argparse.SUPPRESS) arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', 
action='store_true') From fcaaa7b09bed13a5bc913e80d3ab03f264ec7be9 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 Nov 2015 02:29:15 +0000 Subject: [PATCH 087/146] include tid in thread name for more threads (linux only) for correlation with top -H --- warcprox/mitmproxy.py | 3 +++ warcprox/writerthread.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 3c3f95b..532c221 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -15,11 +15,14 @@ except ImportError: import socket import logging import ssl +import warcprox +import threading class MitmProxyHandler(http_server.BaseHTTPRequestHandler): logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") def __init__(self, request, client_address, server): + threading.current_thread.name = 'MitmProxyHandler-thread(tid={})'.format(warcprox.gettid()) self.is_connect = False self._headers_buffer = [] http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index f0c7d25..c69e514 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -41,7 +41,7 @@ class WarcWriterThread(threading.Thread): def _run(self): while not self.stop.is_set(): try: - self.setName('WarcWriterThread(tid={})'.format(warcprox.gettid())) + self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid()) while True: try: recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) From 18cc818cf0e23a56c6d27a4a564ab67ff3367c42 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 Nov 2015 02:55:18 +0000 Subject: [PATCH 088/146] more timing tweaks to make sure tests pass, improved logging etc --- tests/test_warcprox.py | 73 +++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 2285d8b..404279d 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -18,6 +18,10 @@ import json import random import rethinkstuff from hanzo import warctools +import warnings +import pprint +import traceback +import signal try: import http.server as http_server @@ -35,6 +39,25 @@ import warcprox logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') +logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) +warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) +warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) + +def dump_state(signum=None, frame=None): + pp = pprint.PrettyPrinter(indent=4) + state_strs = [] + + for th in threading.enumerate(): + try: + state_strs.append(str(th)) + except AssertionError: + state_strs.append("") + stack = traceback.format_stack(sys._current_frames()[th.ident]) + state_strs.append("".join(stack)) + + logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))) + +signal.signal(signal.SIGQUIT, dump_state) class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): def do_GET(self): @@ -147,6 +170,7 @@ def captures_db(request, rethinkdb_servers, rethinkdb_big_table): def fin(): if captures_db: + captures_db.close() logging.info('dropping rethinkdb database {}'.format(db)) result = captures_db.r.db_drop(db).run() logging.info("result=%s", result) @@ -168,6 +192,7 @@ def 
rethink_dedup_db(request, rethinkdb_servers, captures_db):
 
     def fin():
         if rethinkdb_servers:
+            ddb.close()
             if not captures_db:
                 logging.info('dropping rethinkdb database {}'.format(db))
                 result = ddb.r.db_drop(db).run()
@@ -208,6 +233,7 @@ def stats_db(request, rethinkdb_servers):
         sdb = warcprox.stats.StatsDb(stats_db_file)
 
     def fin():
+        sdb.close()
         if rethinkdb_servers:
             logging.info('dropping rethinkdb database {}'.format(db))
             result = sdb.r.db_drop(db).run()
@@ -396,6 +422,12 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies)
     assert response.headers['warcprox-test-header'] == 'e!'
     assert response.content == b'I am the warcprox test payload! ffffffffff!\n'
 
+    # wait for writer thread to process
+    time.sleep(0.5)
+    while not warcprox_.warc_writer_thread.idle:
+        time.sleep(0.5)
+    time.sleep(0.5)
+
     # check in dedup db
     # {u'id': u'', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'}
     dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
@@ -417,10 +449,7 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies)
 
     # wait for writer thread to process
     time.sleep(0.5)
-    while (not warcprox_.warc_writer_thread.idle
-            or (warcprox_.proxy.stats_db
-                and hasattr(warcprox_.proxy.stats_db, "_executor")
-                and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)):
+    while not warcprox_.warc_writer_thread.idle:
         time.sleep(0.5)
     time.sleep(0.5)
 
@@ -463,6 +492,12 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie
     assert response.headers['warcprox-test-header'] == 'g!'
     assert response.content == b'I am the warcprox test payload! hhhhhhhhhh!\n'
 
+    # wait for writer thread to process
+    time.sleep(0.5)
+    while not warcprox_.warc_writer_thread.idle:
+        time.sleep(0.5)
+    time.sleep(0.5)
+
     # check in dedup db
     # {u'id': u'', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'}
     dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
@@ -484,14 +519,10 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie
 
     # wait for writer thread to process
     time.sleep(0.5)
-    while (not warcprox_.warc_writer_thread.idle
-            or (warcprox_.proxy.stats_db
-                and hasattr(warcprox_.proxy.stats_db, "_executor")
-                and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)):
+    while not warcprox_.warc_writer_thread.idle:
         time.sleep(0.5)
     time.sleep(0.5)
 
-
     # check in dedup db (no change from prev)
     dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
     assert dedup_lookup['url'] == url.encode('ascii')
@@ -511,7 +542,18 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
     request_meta = {"stats":{"buckets":["job1"]},"limits":{"job1.total.urls":10}}
     headers = {"Warcprox-Meta": json.dumps(request_meta)}
 
-    for i in range(10):
+    response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'i!'
+    assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n'
+
+    # wait for writer thread to process
+    time.sleep(0.5)
+    while not warcprox_.warc_writer_thread.idle:
+        time.sleep(0.5)
+    time.sleep(0.5)
+
+    for i in range(9):
         response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
         assert response.status_code == 200
         assert response.headers['warcprox-test-header'] == 'i!'
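# (aside, not part of the patch stream) The "wait for writer thread to
# process" stanza this patch pastes before each batch of assertions could be
# captured in one helper; wait_for_writer below is a hedged sketch, not a
# function that exists in this test suite.
import time

def wait_for_writer(warcprox_, settle=0.5):
    time.sleep(settle)
    while not warcprox_.warc_writer_thread.idle:
        time.sleep(settle)
    time.sleep(settle)  # one more beat so listeners (stats, dedup) settle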
@@ -519,10 +561,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): # wait for writer thread to process time.sleep(0.5) - while (not warcprox_.warc_writer_thread.idle - or (warcprox_.proxy.stats_db - and hasattr(warcprox_.proxy.stats_db, "_executor") - and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)): + while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) time.sleep(0.5) @@ -547,10 +586,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, # wait for writer thread to process time.sleep(0.5) - while (not warcprox_.warc_writer_thread.idle - or (warcprox_.proxy.stats_db - and hasattr(warcprox_.proxy.stats_db, "_executor") - and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)): + while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) time.sleep(0.5) @@ -660,7 +696,6 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, finally: fh.close() - if __name__ == '__main__': pytest.main() From f38ce708bfbf64e4712c7da2705f5e87988525ac Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 Nov 2015 02:55:52 +0000 Subject: [PATCH 089/146] set PYTHONDONTWRITEBYTECODE in one place --- tests/run-tests.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 7e67fe6..522110f 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -13,12 +13,12 @@ for python in python2.7 python3.4 do docker run --rm -i -t --volume="$script_dir/..:/warcprox" internetarchive/rethinkdb /sbin/my_init -- \ bash -x -c " adduser --gecos=$user --disabled-password --quiet --uid=$uid $user \ - && sudo -u $user bash -x -c 'cd /warcprox \ + && sudo PYTHONDONTWRITEBYTECODE=1 -u $user bash -x -c 'cd /warcprox \ && virtualenv -p $python /tmp/venv \ && source /tmp/venv/bin/activate \ && pip --log-file /tmp/pip.log install . 
pytest requests \ - && PYTHONDONTWRITEBYTECODE=1 py.test -s tests \ - && PYTHONDONTWRITEBYTECODE=1 py.test -s --rethinkdb-servers=localhost tests \ - && PYTHONDONTWRITEBYTECODE=1 py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests'" + && py.test -s tests \ + && py.test -s --rethinkdb-servers=localhost tests \ + && py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests'" done From 7eb82ab8a2fce5ddcce3e6312cc8fbb268229bb6 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 6 Nov 2015 22:25:57 +0000 Subject: [PATCH 090/146] adding missing import, remove unused method, logging tweaks, avoid exception at shutdown joining unstarted timer thread --- warcprox/bigtable.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 9fe0e6e..4feeb7d 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -9,6 +9,7 @@ import surt import os import hashlib import threading +import datetime class RethinkCaptures: """Inserts in batches every 0.5 seconds""" @@ -34,14 +35,15 @@ class RethinkCaptures: result = self.r.table(self.table).insert(self._batch).run() if result["inserted"] != len(self._batch) or sorted(result.values()) != [0,0,0,0,0,len(self._batch)]: raise Exception("unexpected result %s saving batch of %s entries", result, len(self._batch)) - self.logger.info("big capture table db saved %s entries", len(self._batch)) - self.logger.info("saved %s", self._batch) + self.logger.info("saved %s entries to big capture table db", len(self._batch)) self._batch = [] if not self._stop.is_set(): - self._timer = threading.Timer(0.5, self._insert_batch) - self._timer.name = "RethinkCaptures-batch-insert-timer" - self._timer.start() + t = threading.Timer(0.5, self._insert_batch) + t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat() + t.start() + # ensure self._timer joinable (already started) whenever close() happens to be called + self._timer = t else: self.logger.info("finished") @@ -65,7 +67,7 @@ class RethinkCaptures: results = list(results_iter) if len(results) > 0: if len(results) > 1: - self.logger.error("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket) + self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket) result = results[0] else: result = None @@ -112,16 +114,6 @@ class RethinkCaptures: return entry - def _save_entry(self, entry): - try: - threading.current_thread.name = 'RethinkCaptures-futures-thread(tid={})'.format(warcprox.gettid()) - result = self.r.table(self.table).insert(entry).run() - if result["inserted"] == 1 and sorted(result.values()) != [0,0,0,0,0,1]: - raise Exception("unexpected result %s saving %s", result, entry) - self.logger.debug("big capture table db saved %s", entry) - except: - self.logger.error("unexpected problem ", exc_info=True) - def notify(self, recorded_url, records): entry = self._assemble_entry(recorded_url, records) with self._batch_lock: From e3a5717446f5f59cee27980e9f72adb0892ce444 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 6 Nov 2015 22:30:14 +0000 Subject: [PATCH 091/146] hidden --profile option to enable profiling of warc writer thread and periodic logging of memory usage info; at shutdown, close stats db and unregister from service registry; logging improvements --- .gitignore | 1 + warcprox/controller.py | 17 ++++++++++------- 
warcprox/main.py | 2 ++ warcprox/stats.py | 3 ++- warcprox/warcproxy.py | 9 +++++---- warcprox/writerthread.py | 10 +++++++++- 6 files changed, 29 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 72e3644..1da5ebc 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ warcs build dist .tox +out.* diff --git a/warcprox/controller.py b/warcprox/controller.py index 63af764..3c7dfe1 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -44,9 +44,6 @@ class WarcproxController(object): def debug_mem(self): self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize()) - if self.proxy.stats_db and hasattr(self.proxy.stats_db, "_executor"): - self.logger.info("self.proxy.stats_db._executor._work_queue.qsize()=%s", - self.proxy.stats_db._executor._work_queue.qsize()) with open("/proc/self/status") as f: for line in f: fields = line.split() @@ -118,6 +115,7 @@ class WarcproxController(object): 'port': self.options.port, } status_info['load'] = 1.0 * self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100) + status_info['queue_size'] = self.proxy.recorded_url_q.qsize() self.status_info = self.service_registry.heartbeat(status_info) self.logger.debug("status in service registry: %s", self.status_info) @@ -154,9 +152,9 @@ class WarcproxController(object): if self.service_registry and (not hasattr(self, "status_info") or (datetime.datetime.now(utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL): self._service_heartbeat() - # if (datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60: - # self.debug_mem() - # last_mem_dbg = datetime.datetime.utcnow() + if self.options.profile and (datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60: + self.debug_mem() + last_mem_dbg = datetime.datetime.utcnow() time.sleep(0.5) except: @@ -176,10 +174,15 @@ class WarcproxController(object): # wait for threads to finish self.warc_writer_thread.join() - if self.warc_writer_thread.dedup_db is not None: + if self.proxy.stats_db: + self.proxy.stats_db.close() + if self.warc_writer_thread.dedup_db: self.warc_writer_thread.dedup_db.close() proxy_thread.join() if self.playback_proxy is not None: playback_proxy_thread.join() + if self.service_registry and hasattr(self, "status_info"): + self.service_registry.unregister(self.status_info["id"]) + diff --git a/warcprox/main.py b/warcprox/main.py index 7976753..0854cee 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -80,6 +80,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default=None, help='kafka capture feed topic') arg_parser.add_argument('--queue-size', dest='queue_size', default=1000, help=argparse.SUPPRESS) + arg_parser.add_argument('--profile', action='store_true', default=False, + help=argparse.SUPPRESS) arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') diff --git a/warcprox/stats.py b/warcprox/stats.py index 44b724b..316531d 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -16,6 +16,7 @@ import random import warcprox import threading import rethinkdb as r +import datetime def _empty_bucket(bucket): return { @@ -160,7 +161,7 @@ class RethinkStatsDb: if not self._stop.is_set(): self._timer = threading.Timer(0.5, self._update_batch) - self._timer.name = "RethinkCaptures-batch-insert-timer" + self._timer.name = "RethinkStats-batch-update-timer-%s" % 
datetime.datetime.utcnow().isoformat() self._timer.start() else: self.logger.info("finished") diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 4e19d4f..93107b1 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# vim:set sw=4 et: # """ WARC writing MITM HTTP/S proxy @@ -151,8 +150,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): key, limit = item bucket0, bucket1, bucket2 = key.rsplit(".", 2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) - # self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s", - # warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize()) + self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s", + warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize()) if value and value >= limit: body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8") self.send_response(420, "Reached limit") @@ -369,7 +368,7 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): if recorded_url_q is not None: self.recorded_url_q = recorded_url_q else: - self.recorded_url_q = queue.Queue() + self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000) self.stats_db = stats_db @@ -383,6 +382,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): self.logger.info('WarcProxy shutting down') http_server.HTTPServer.server_close(self) + def handle_error(self, request, client_address): + self.logger.warn("exception processing request %s from %s", request, client_address, exc_info=True) class WarcProxy(socketserver.ThreadingMixIn, SingleThreadedWarcProxy): pass diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index c69e514..25beff8 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -36,7 +36,10 @@ class WarcWriterThread(threading.Thread): self.idle = None def run(self): - cProfile.runctx('self._run()', globals(), locals(), sort='cumulative') + if self.options.profile: + cProfile.runctx('self._run()', globals(), locals(), sort='cumulative') + else: + self._run() def _run(self): while not self.stop.is_set(): @@ -44,6 +47,11 @@ class WarcWriterThread(threading.Thread): self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid()) while True: try: + if self.stop.is_set(): + qsize = self.recorded_url_q.qsize() + if qsize % 50 == 0: + self.logger.info("%s urls left to write", qsize) + recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) self.idle = None if self.dedup_db: From fe4d7a2769f373e8248680b444c3b1ae2a0edd4f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 6 Nov 2015 14:44:52 -0800 Subject: [PATCH 092/146] tid="n/a" if not available --- warcprox/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 4c22670..8afe606 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -22,7 +22,7 @@ def gettid(): tid = libc.syscall(SYS_gettid) return tid except: - logging.warn("gettid failed?", exc_info=True) + return "n/a" import warcprox.controller as controller import warcprox.playback as playback From d8f97ad472c940f0ab9f059132b6e391f8fdef61 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 12 Nov 2015 00:45:36 +0000 Subject: [PATCH 093/146] single threaded warcprox (that doesn't write warcs), useful for debugging --- tests/single-threaded-proxy.py | 80 ++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 
100755 tests/single-threaded-proxy.py diff --git a/tests/single-threaded-proxy.py b/tests/single-threaded-proxy.py new file mode 100755 index 0000000..fd6808e --- /dev/null +++ b/tests/single-threaded-proxy.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +"""Useful for debugging. Does not write warcs.""" + +from __future__ import absolute_import + +import warcprox +import logging +import sys +import argparse +import certauth +import queue +import socket +import os + +class FakeQueue(object): + logger = logging.getLogger("FakeQueue") + def __init__(self, maxsize=0): pass + def join(self): pass + def qsize(self): return 0 + def empty(self): return True + def full(self): return False + def get(self, block=True, timeout=None): raise queue.Empty + def put_nowait(self, item): return self.put(item, block=False) + def get_nowait(self): return self.get(block=False) + def put(self, recorded_url, block=True, timeout=None): + logging.info("{} {} {} {} {} size={} {}".format( + recorded_url.client_ip, recorded_url.status, recorded_url.method, + recorded_url.url.decode("utf-8"), recorded_url.mimetype, + recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8'))) + +def parse_args(): + prog = os.path.basename(sys.argv[0]) + arg_parser = argparse.ArgumentParser(prog=prog, + description='%s - single threaded mitm http/s proxy, for debugging' % prog, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + arg_parser.add_argument('-p', '--port', dest='port', default='8000', + type=int, help='port to listen on') + arg_parser.add_argument('-b', '--address', dest='address', + default='localhost', help='address to listen on') + arg_parser.add_argument('-c', '--cacert', dest='cacert', + default='./{0}-warcprox-ca.pem'.format(socket.gethostname()), + help='CA certificate file; if file does not exist, it will be created') + arg_parser.add_argument('--certs-dir', dest='certs_dir', + default='./{0}-warcprox-ca'.format(socket.gethostname()), + help='where to store and load generated certificates') + arg_parser.add_argument('--version', action='version', + version="warcprox {}".format(warcprox.__version__)) + arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') + arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') + + return arg_parser.parse_args(args=sys.argv[1:]) + +def init_logging(verbose): + if args.verbose: + loglevel = logging.DEBUG + elif args.quiet: + loglevel = logging.WARNING + else: + loglevel = logging.INFO + + logging.basicConfig(stream=sys.stdout, level=loglevel, + format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') + # format='%(asctime)s %(funcName) 21s() %(filename)15s:%(lineno)05d %(message)s') + +def init_proxy(args): + ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] + ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir, + ca_name=ca_name) + options = warcprox.Options(**vars(args)) + proxy = warcprox.warcproxy.SingleThreadedWarcProxy(ca, + recorded_url_q=FakeQueue(), options=options) + return proxy + +if __name__ == "__main__": + args = parse_args() + init_logging(args.verbose) + proxy = init_proxy(args) + + proxy.serve_forever() + From 734b2f5396b97976a2a00812be6496c59a889de5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 12 Nov 2015 00:53:59 +0000 Subject: [PATCH 094/146] limit max number of threads to 500; make sure connection with proxy client has a timeout; log errors from connection 
with proxy client --- warcprox/main.py | 4 +++- warcprox/mitmproxy.py | 10 ++++++---- warcprox/warcproxy.py | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/warcprox/main.py b/warcprox/main.py index 0854cee..bcff5b0 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -78,7 +78,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default=None, help='kafka broker list for capture feed') arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', default=None, help='kafka capture feed topic') - arg_parser.add_argument('--queue-size', dest='queue_size', default=1000, + arg_parser.add_argument('--queue-size', dest='queue_size', default=500, + help=argparse.SUPPRESS) + arg_parser.add_argument('--max-threads', dest='max_threads', default=500, help=argparse.SUPPRESS) arg_parser.add_argument('--profile', action='store_true', default=False, help=argparse.SUPPRESS) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 532c221..3df9f33 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -1,5 +1,3 @@ -# vim:set sw=4 et: - from __future__ import absolute_import try: @@ -17,14 +15,16 @@ import logging import ssl import warcprox import threading +import datetime class MitmProxyHandler(http_server.BaseHTTPRequestHandler): logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") def __init__(self, request, client_address, server): - threading.current_thread.name = 'MitmProxyHandler-thread(tid={})'.format(warcprox.gettid()) + threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) self.is_connect = False self._headers_buffer = [] + request.settimeout(60) # XXX what value should this have? http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server) def _determine_host_port(self): @@ -52,7 +52,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): def _connect_to_host(self): # Connect to destination self._proxy_sock = socket.socket() - self._proxy_sock.settimeout(60) + self._proxy_sock.settimeout(60) # XXX what value should this have? self._proxy_sock.connect((self.hostname, int(self.port))) # Wrap socket if SSL is required @@ -146,4 +146,6 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): if item.startswith('do_'): return self.do_COMMAND + def log_error(self, fmt, *args): + self.logger.warn(fmt, *args) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 93107b1..2b83564 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -35,6 +35,7 @@ from hanzo import warctools from certauth.certauth import CertificateAuthority import warcprox import datetime +import concurrent.futures class ProxyingRecorder(object): """ @@ -294,10 +295,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True) raise - def log_error(self, fmt, *args): - # logging better handled elsewhere? - pass - def log_message(self, fmt, *args): # logging better handled elsewhere? 
pass @@ -385,5 +382,14 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): def handle_error(self, request, client_address): self.logger.warn("exception processing request %s from %s", request, client_address, exc_info=True) -class WarcProxy(socketserver.ThreadingMixIn, SingleThreadedWarcProxy): - pass +class PooledMixIn(socketserver.ThreadingMixIn): + def process_request(self, request, client_address): + if hasattr(self, 'pool') and self.pool: + self.pool.submit(self.process_request_thread, request, client_address) + else: + socketserver.ThreadingMixIn.process_request(self, request, client_address) + +class WarcProxy(PooledMixIn, SingleThreadedWarcProxy): + def __init__(self, *args, **kwargs): + SingleThreadedWarcProxy.__init__(self, *args, **kwargs) + self.pool = concurrent.futures.ThreadPoolExecutor(max_workers=self.options.max_threads or 500) From fb58244c4fa988da185dc7c920379a4e7ce901a2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 12 Nov 2015 00:54:29 +0000 Subject: [PATCH 095/146] update stats in rethinkdb only every 2.0 seconds instead of every 0.5 --- warcprox/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/stats.py b/warcprox/stats.py index 316531d..a6a8298 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -160,7 +160,7 @@ class RethinkStatsDb: self._batch = {} if not self._stop.is_set(): - self._timer = threading.Timer(0.5, self._update_batch) + self._timer = threading.Timer(2.0, self._update_batch) self._timer.name = "RethinkStats-batch-update-timer-%s" % datetime.datetime.utcnow().isoformat() self._timer.start() else: From 00dc9eed84cda7556c9a2719da8e621fbc74384f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 13 Nov 2015 01:17:35 +0000 Subject: [PATCH 096/146] new option --onion-tor-socks-proxy, host:port of tor socks proxy, used only to connect to .onion sites --- .travis.yml | 1 + setup.py | 11 +++++++++-- tests/Dockerfile | 2 ++ tests/single-threaded-proxy.py | 2 ++ tests/test_warcprox.py | 15 ++++++++++++++- warcprox/main.py | 2 ++ warcprox/mitmproxy.py | 14 +++++++++++++- warcprox/warcproxy.py | 10 ++++++++++ 8 files changed, 53 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 56470e2..de744f3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,6 +12,7 @@ addons: packages: - python-gdbm - python3-gdbm + - tor services: - docker diff --git a/setup.py b/setup.py index ef33be2..0245625 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools.command.test import test as TestCommand import sys -import setuptools +import setuptools # special class needs to be added to support the pytest written dump-anydbm tests class PyTest(TestCommand): @@ -17,7 +17,14 @@ class PyTest(TestCommand): errno = pytest.main(self.test_args) sys.exit(errno) -deps = ['certauth>=1.1.0', 'warctools', 'kafka-python', 'surt==0.3b2', 'rethinkstuff'] +deps = [ + 'certauth>=1.1.0', + 'warctools', + 'kafka-python', + 'surt==0.3b2', + 'rethinkstuff', + 'PySocks', +] try: import concurrent.futures except: diff --git a/tests/Dockerfile b/tests/Dockerfile index 4a8b5b8..aa3746f 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -20,3 +20,5 @@ RUN mkdir -vp /etc/service/rethinkdb \ RUN apt-get -y install python-virtualenv git RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev RUN pip install devpi-client +RUN apt-get -y install tor # starts tor socks proxy on port 9050 + diff --git a/tests/single-threaded-proxy.py b/tests/single-threaded-proxy.py index 
fd6808e..69db94c 100755 --- a/tests/single-threaded-proxy.py +++ b/tests/single-threaded-proxy.py @@ -43,6 +43,8 @@ def parse_args(): arg_parser.add_argument('--certs-dir', dest='certs_dir', default='./{0}-warcprox-ca'.format(socket.gethostname()), help='where to store and load generated certificates') + arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', + default=None, help='host:port of tor socks proxy, used only to connect to .onion sites') arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 404279d..c660964 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -272,7 +272,8 @@ def warcprox_(request, captures_db, dedup_db, stats_db, service_registry): recorded_url_q = queue.Queue() - options = warcprox.Options(port=0, playback_port=0) + options = warcprox.Options(port=0, playback_port=0, + onion_tor_socks_proxy='localhost:9050') proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q, stats_db=stats_db, options=options) options.port = proxy.server_port @@ -696,6 +697,18 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, finally: fh.close() +# XXX this test relies on a tor proxy running at localhost:9050 with a working +# connection to the internet, and relies on a third party site (facebook) being +# up and behaving a certain way +def test_tor_onion(archiving_proxies): + response = requests.get('http://www.facebookcorewwwi.onion/', + proxies=archiving_proxies, verify=False, allow_redirects=False) + assert response.status_code == 302 + + response = requests.get('https://www.facebookcorewwwi.onion/', + proxies=archiving_proxies, verify=False, allow_redirects=False) + assert response.status_code == 200 + if __name__ == '__main__': pytest.main() diff --git a/warcprox/main.py b/warcprox/main.py index bcff5b0..3ad92fe 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -84,6 +84,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): help=argparse.SUPPRESS) arg_parser.add_argument('--profile', action='store_true', default=False, help=argparse.SUPPRESS) + arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', + default=None, help='host:port of tor socks proxy, used only to connect to .onion sites') arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 3df9f33..b8f645e 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -16,6 +16,7 @@ import ssl import warcprox import threading import datetime +import socks class MitmProxyHandler(http_server.BaseHTTPRequestHandler): logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") @@ -51,7 +52,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): def _connect_to_host(self): # Connect to destination - self._proxy_sock = socket.socket() + if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'): + self.logger.info("using tor socks proxy at %s:%s to connect to %s", + self.onion_tor_socks_proxy_host, + self.onion_tor_socks_proxy_port or 1080, + self.hostname) + self._proxy_sock = socks.socksocket() + self._proxy_sock.set_proxy(socks.SOCKS5, + 
addr=self.onion_tor_socks_proxy_host, + port=self.onion_tor_socks_proxy_port, rdns=True) + else: + self._proxy_sock = socket.socket() + self._proxy_sock.settimeout(60) # XXX what value should this have? self._proxy_sock.connect((self.hostname, int(self.port))) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2b83564..b46f610 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -350,6 +350,16 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): def __init__(self, ca=None, recorded_url_q=None, stats_db=None, options=warcprox.Options()): server_address = (options.address or 'localhost', options.port if options.port is not None else 8000) + + if options.onion_tor_socks_proxy: + try: + host, port = options.onion_tor_socks_proxy.split(':') + WarcProxyHandler.onion_tor_socks_proxy_host = host + WarcProxyHandler.onion_tor_socks_proxy_port = int(port) + except ValueError: + WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy + WarcProxyHandler.onion_tor_socks_proxy_port = None + http_server.HTTPServer.__init__(self, server_address, WarcProxyHandler, bind_and_activate=True) self.digest_algorithm = options.digest_algorithm or 'sha1' From c9f5b72fd7c1e067f39a0cb6eeaf11822881fa3f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 13 Nov 2015 01:38:32 +0000 Subject: [PATCH 097/146] really run tor in docker container for tests --- tests/Dockerfile | 6 +++++- tests/run-tests.sh | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/Dockerfile b/tests/Dockerfile index aa3746f..975e767 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -20,5 +20,9 @@ RUN mkdir -vp /etc/service/rethinkdb \ RUN apt-get -y install python-virtualenv git RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev RUN pip install devpi-client -RUN apt-get -y install tor # starts tor socks proxy on port 9050 + +RUN apt-get -y install tor +RUN mkdir -vp /etc/service/tor \ + && echo "#!/bin/sh\ntor\n" > /etc/service/tor/run \ + && chmod a+x /etc/service/tor/run diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 522110f..680f549 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -2,7 +2,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -docker build -t internetarchive/rethinkdb $script_dir || exit 1 +docker build -t internetarchive/warcprox-tests $script_dir || exit 1 uid=$(id -u) user=$(id -un) @@ -11,7 +11,7 @@ set -e for python in python2.7 python3.4 do - docker run --rm -i -t --volume="$script_dir/..:/warcprox" internetarchive/rethinkdb /sbin/my_init -- \ + docker run --rm -i -t --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \ bash -x -c " adduser --gecos=$user --disabled-password --quiet --uid=$uid $user \ && sudo PYTHONDONTWRITEBYTECODE=1 -u $user bash -x -c 'cd /warcprox \ && virtualenv -p $python /tmp/venv \ From 4bb7e043d4a36d2158beab5143661edcbe241b43 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 13 Nov 2015 01:49:30 +0000 Subject: [PATCH 098/146] wait longer for stats to be updated in test_limits(), now that rethinkdb stats are pushed only every 2.0 seconds --- tests/test_warcprox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index c660964..0d724c7 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -564,7 +564,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): time.sleep(0.5) while not 
warcprox_.warc_writer_thread.idle: time.sleep(0.5) - time.sleep(0.5) + time.sleep(2.5) response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 From df31068c804e4c854c412144203ed5b1c3acb6fa Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 18 Nov 2015 01:59:32 +0000 Subject: [PATCH 099/146] improve test running script --- tests/run-tests.sh | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 680f549..86e2a7e 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -1,24 +1,29 @@ #!/bin/bash - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -docker build -t internetarchive/warcprox-tests $script_dir || exit 1 - -uid=$(id -u) -user=$(id -un) +# +# Runs tests in a docker container. Also runs a temporary instance of rethinkdb +# inside the container. The tests run with rethinkdb features enabled, against +# that instance of rethinkdb, and also run without rethinkdb features enabled. +# With python 2.7 and 3.4. +# +# 😬 +# set -e +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +docker build -t internetarchive/warcprox-tests $script_dir + for python in python2.7 python3.4 do - docker run --rm -i -t --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \ - bash -x -c " adduser --gecos=$user --disabled-password --quiet --uid=$uid $user \ - && sudo PYTHONDONTWRITEBYTECODE=1 -u $user bash -x -c 'cd /warcprox \ - && virtualenv -p $python /tmp/venv \ - && source /tmp/venv/bin/activate \ - && pip --log-file /tmp/pip.log install . pytest requests \ - && py.test -s tests \ - && py.test -s --rethinkdb-servers=localhost tests \ - && py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests'" + docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \ + bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \ + && (cd /warcprox && git diff) | patch -p1 \ + && virtualenv -p $python /tmp/venv \ + && source /tmp/venv/bin/activate \ + && pip --log-file /tmp/pip.log install . 
pytest requests \ + && py.test -s tests \ + && py.test -s --rethinkdb-servers=localhost tests \ + && py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests" done From 927419645b221d15c9518b0c63ee462743ee0c98 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 18 Nov 2015 02:00:48 +0000 Subject: [PATCH 100/146] use rethinkdb native time type for captures table timestamp --- warcprox/bigtable.py | 10 +++++++--- warcprox/dedup.py | 12 ++++++------ warcprox/writerthread.py | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 4feeb7d..df4ec1a 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -10,6 +10,7 @@ import os import hashlib import threading import datetime +import rethinkstuff class RethinkCaptures: """Inserts in batches every 0.5 seconds""" @@ -97,8 +98,7 @@ class RethinkCaptures: "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), "abbr_canon_surt": canon_surt[:150], "canon_surt": canon_surt, - # "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")), - "timestamp": records[0].date.decode("utf-8"), + "timestamp": recorded_url.timestamp.replace(tzinfo=rethinkstuff.UTC), "url": recorded_url.url.decode("utf-8"), "offset": records[0].offset, "filename": os.path.basename(records[0].warc_filename), @@ -140,7 +140,11 @@ class RethinkCapturesDedup: raw_digest = base64.b16decode(value_str, casefold=True) entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket) if entry: - dedup_info = {"url":entry["url"].encode("utf-8"), "date":entry["timestamp"].encode("utf-8"), "id":entry["warc_id"].encode("utf-8")} + dedup_info = { + "url": entry["url"].encode("utf-8"), + "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"), + "id": entry["warc_id"].encode("utf-8") + } return dedup_info else: return None diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 17735ed..7ac99d8 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -64,8 +64,8 @@ class DedupDb(object): def notify(self, recorded_url, records): if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE and recorded_url.response_recorder.payload_size() > 0): - digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, - self.options.base32) + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + self.options.base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) else: @@ -73,8 +73,8 @@ class DedupDb(object): def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): - if (recorded_url.response_recorder - and recorded_url.response_recorder.payload_digest + if (recorded_url.response_recorder + and recorded_url.response_recorder.payload_digest and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: @@ -100,7 +100,7 @@ class RethinkDedupDb: self.r.db_create(self.r.dbname).run() tables = self.r.table_list().run() if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", + self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", repr(self.table), repr(self.r.dbname), self.shards, self.replicas) 
self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run() @@ -135,7 +135,7 @@ class RethinkDedupDb: def notify(self, recorded_url, records): if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE and recorded_url.response_recorder.payload_size() > 0): - digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.options.base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 25beff8..3f1642c 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -81,7 +81,7 @@ class WarcWriterThread(threading.Thread): payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8") except: payload_digest = "-" - + # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format( recorded_url.client_ip, recorded_url.status, recorded_url.method, From 2cb145430278f5a5197464e8d7f05c06b00ff526 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 21 Jan 2016 02:06:09 +0000 Subject: [PATCH 101/146] s/abbr_canon_surt_timesamp/abbr_canon_surt_timestamp/ --- warcprox/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index df4ec1a..9f2bcae 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -57,7 +57,7 @@ class RethinkCaptures: if not self.table in tables: self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.dbname)) self.r.table_create(self.table, shards=self.shards, replicas=self.replicas).run() - self.r.table(self.table).index_create("abbr_canon_surt_timesamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run() + self.r.table(self.table).index_create("abbr_canon_surt_timestamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run() self.r.table(self.table).index_create("sha1_warc_type", [self.r.row["sha1base32"], self.r.row["warc_type"], self.r.row["bucket"]]).run() def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): From 1e0a3f01353283271ae08ea58ffce38d121d177c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 27 Jan 2016 21:18:02 +0000 Subject: [PATCH 102/146] import dbm only if used --- warcprox/dedup.py | 16 ++++++++-------- warcprox/playback.py | 16 ++++++++-------- warcprox/stats.py | 16 ++++++++-------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 7ac99d8..baa5fc3 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -1,13 +1,5 @@ from __future__ import absolute_import -try: - import dbm.gnu as dbm_gnu -except ImportError: - try: - import gdbm as dbm_gnu - except ImportError: - import anydbm as dbm_gnu - import logging import os import json @@ -19,6 +11,14 @@ class DedupDb(object): logger = logging.getLogger("warcprox.dedup.DedupDb") def __init__(self, dbm_file='./warcprox-dedup.db', options=warcprox.Options()): + try: + import dbm.gnu as dbm_gnu 
+ except ImportError: + try: + import gdbm as dbm_gnu + except ImportError: + import anydbm as dbm_gnu + if os.path.exists(dbm_file): self.logger.info('opening existing deduplication database {}'.format(dbm_file)) else: diff --git a/warcprox/playback.py b/warcprox/playback.py index 30a5cb8..089fc64 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -12,14 +12,6 @@ try: except ImportError: import SocketServer as socketserver -try: - import dbm.gnu as dbm_gnu -except ImportError: - try: - import gdbm as dbm_gnu - except ImportError: - import anydbm as dbm_gnu - import logging import os from hanzo import warctools @@ -203,6 +195,14 @@ class PlaybackIndexDb(object): logger = logging.getLogger("warcprox.playback.PlaybackIndexDb") def __init__(self, dbm_file='./warcprox-playback-index.db'): + try: + import dbm.gnu as dbm_gnu + except ImportError: + try: + import gdbm as dbm_gnu + except ImportError: + import anydbm as dbm_gnu + if os.path.exists(dbm_file): self.logger.info('opening existing playback index database {}'.format(dbm_file)) else: diff --git a/warcprox/stats.py b/warcprox/stats.py index a6a8298..68f7586 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -1,13 +1,5 @@ from __future__ import absolute_import -try: - import dbm.gnu as dbm_gnu -except ImportError: - try: - import gdbm as dbm_gnu - except ImportError: - import anydbm as dbm_gnu - import logging import os import json @@ -42,6 +34,14 @@ class StatsDb: logger = logging.getLogger("warcprox.stats.StatsDb") def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()): + try: + import dbm.gnu as dbm_gnu + except ImportError: + try: + import gdbm as dbm_gnu + except ImportError: + import anydbm as dbm_gnu + if os.path.exists(dbm_file): self.logger.info('opening existing stats database {}'.format(dbm_file)) else: From ee3ee5d621473aba6e5e2bf627e75cd78a383138 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 25 Feb 2016 01:36:36 +0000 Subject: [PATCH 103/146] call this 1.5.0.dev1 for now --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0245625..19d2fe3 100755 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# vim: set sw=4 et: from setuptools.command.test import test as TestCommand import sys @@ -21,7 +20,7 @@ deps = [ 'certauth>=1.1.0', 'warctools', 'kafka-python', - 'surt==0.3b2', + 'surt>=0.3b4', 'rethinkstuff', 'PySocks', ] @@ -31,7 +30,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='1.5.0', + version='1.5.0.dev1', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 89f965d1d350a81379e9a18695cd731da61947e3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 3 Mar 2016 18:58:52 +0000 Subject: [PATCH 104/146] use kafka-python 1.0 recommended api; use kafka capture feed specified in warcprox-meta header, if any --- setup.py | 2 +- warcprox/kafkafeed.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 19d2fe3..8f0c08e 100755 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ class PyTest(TestCommand): deps = [ 'certauth>=1.1.0', 'warctools', - 'kafka-python', + 'kafka-python>=1.0.1', 'surt>=0.3b4', 'rethinkstuff', 'PySocks', diff --git a/warcprox/kafkafeed.py b/warcprox/kafkafeed.py index dd8a0a8..8f8aea1 100644 --- a/warcprox/kafkafeed.py +++ b/warcprox/kafkafeed.py @@ -9,8 +9,8 @@ class CaptureFeed: def __init__(self, broker_list, topic): 
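For reference, the producer API change this hunk continues with below, reduced to a minimal standalone sketch; the broker address and topic name are placeholders:

    import json
    import kafka

    # kafka-python < 1.0 (being removed below):
    #   producer = kafka.SimpleProducer(kafka.KafkaClient('broker1:9092'))
    #   producer.send_messages(b'my-topic', msg)

    # kafka-python >= 1.0: topic is a str, payload is bytes
    producer = kafka.KafkaProducer(bootstrap_servers='broker1:9092')
    msg = json.dumps({'url': 'http://example.com/'},
                     separators=(',', ':')).encode('utf-8')
    producer.send('my-topic', msg)
    producer.flush()  # send() is asynchronous; flush before exiting

One behavioral difference worth noting: KafkaProducer.send() returns immediately and batches messages in the background, so a long-running process gets pipelining for free.
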
self.broker_list = broker_list - self.topic = topic.encode('utf-8') - self._producer = kafka.SimpleProducer(kafka.KafkaClient(broker_list)) + self.topic = topic + self._producer = kafka.KafkaProducer(bootstrap_servers=broker_list) def notify(self, recorded_url, records): if records[0].type not in (b'revisit', b'response'): @@ -37,7 +37,7 @@ class CaptureFeed: 'annotations': 'duplicate:digest' if records[0].type == 'revisit' else '', 'content_length': recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset, 'start_time_plus_duration': '{:%Y%m%d%H%M%S}{:03d}+{}'.format( - recorded_url.timestamp, recorded_url.timestamp.microsecond//1000, + recorded_url.timestamp, recorded_url.timestamp.microsecond//1000, int(recorded_url.duration.total_seconds() * 1000)), # 'hop_path': ? # only used for seed redirects, which are n/a to brozzler (?) # 'via': ? @@ -50,7 +50,9 @@ class CaptureFeed: for (k,v) in recorded_url.warcprox_meta['capture-feed-extra-fields'].items(): d[k] = v - msg = json.dumps(d, separators=(',', ':')).encode('utf-8') - self.logger.debug('feeding kafka %s', msg) - self._producer.send_messages(self.topic, msg) + topic = recorded_url.warcprox_meta.get('capture-feed-topic', self.topic) + + msg = json.dumps(d, separators=(',', ':')).encode('utf-8') + self.logger.debug('feeding kafka topic=%s msg=%s', repr(topic), msg) + self._producer.send(topic, msg) From 46887f7594234bc53cb02e0303da246e5c13937d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 3 Mar 2016 18:59:13 +0000 Subject: [PATCH 105/146] better handle exceptions from listeners --- warcprox/writerthread.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 3f1642c..b3d2d9c 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -92,5 +92,9 @@ class WarcWriterThread(threading.Thread): def _final_tasks(self, recorded_url, records): if self.listeners: for listener in self.listeners: - listener.notify(recorded_url, records) + try: + listener.notify(recorded_url, records) + except: + self.logger.error('%s raised exception', + listener.notify, exc_info=True) self._log(recorded_url, records) From 918fdd3e9b02fc2df9b58eb7d0155eb5daa48148 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 4 Mar 2016 20:59:11 +0000 Subject: [PATCH 106/146] heuristic to set size of thread pool based on open files limit, to hopefully fix problem where warcprox got stuck because it ran out of file handles --- warcprox/main.py | 6 +++--- warcprox/warcproxy.py | 24 +++++++++++++++++++----- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/warcprox/main.py b/warcprox/main.py index 3ad92fe..de7ce89 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -74,13 +74,13 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('--rethinkdb-big-table', dest='rethinkdb_big_table', action='store_true', default=False, help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)') - arg_parser.add_argument('--kafka-broker-list', dest='kafka_broker_list', + arg_parser.add_argument('--kafka-broker-list', dest='kafka_broker_list', default=None, help='kafka broker list for capture feed') - arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', + arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', default=None, 
help='kafka capture feed topic') arg_parser.add_argument('--queue-size', dest='queue_size', default=500, help=argparse.SUPPRESS) - arg_parser.add_argument('--max-threads', dest='max_threads', default=500, + arg_parser.add_argument('--max-threads', dest='max_threads', help=argparse.SUPPRESS) arg_parser.add_argument('--profile', action='store_true', default=False, help=argparse.SUPPRESS) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b46f610..b9ed3ec 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -36,6 +36,7 @@ from certauth.certauth import CertificateAuthority import warcprox import datetime import concurrent.futures +import resource class ProxyingRecorder(object): """ @@ -394,12 +395,25 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): class PooledMixIn(socketserver.ThreadingMixIn): def process_request(self, request, client_address): - if hasattr(self, 'pool') and self.pool: - self.pool.submit(self.process_request_thread, request, client_address) - else: - socketserver.ThreadingMixIn.process_request(self, request, client_address) + self.pool.submit(self.process_request_thread, request, client_address) class WarcProxy(PooledMixIn, SingleThreadedWarcProxy): + logger = logging.getLogger("warcprox.warcproxy.WarcProxy") + def __init__(self, *args, **kwargs): SingleThreadedWarcProxy.__init__(self, *args, **kwargs) - self.pool = concurrent.futures.ThreadPoolExecutor(max_workers=self.options.max_threads or 500) + if self.options.max_threads: + max_threads = self.options.max_threads + self.logger.info("max_threads=%s set by command line option", + max_threads) + else: + # man getrlimit: "RLIMIT_NPROC The maximum number of processes (or, + # more precisely on Linux, threads) that can be created for the + # real user ID of the calling process." 
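The hunk resumes below with the rlimit heuristic itself. As a standalone recombination of that heuristic with the simplified mixin above (PooledTCPServer is an illustrative name, not a warcprox class; on Linux, RLIMIT_NPROC counts threads):

    import concurrent.futures
    import resource
    import socketserver

    class PooledMixIn(socketserver.ThreadingMixIn):
        # handle each connection on a shared, bounded pool instead of
        # spawning an unbounded thread per connection
        def process_request(self, request, client_address):
            self.pool.submit(
                    self.process_request_thread, request, client_address)

    class PooledTCPServer(PooledMixIn, socketserver.TCPServer):
        def __init__(self, *args, **kwargs):
            socketserver.TCPServer.__init__(self, *args, **kwargs)
            nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]  # soft limit
            nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
            self.pool = concurrent.futures.ThreadPoolExecutor(
                    max_workers=min(nofile // 10, nproc // 2))

    # usage: PooledTCPServer(('localhost', 8000), SomeHandler).serve_forever()
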
+ rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0] + rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0] + max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2) + self.logger.info("max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)", + max_threads, rlimit_nproc, rlimit_nofile) + + self.pool = concurrent.futures.ThreadPoolExecutor(max_threads) From 422672408ac4be9db2bf403ea4b5bf3d4f56941a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 4 Mar 2016 21:02:47 +0000 Subject: [PATCH 107/146] fix this error File "/home/nlevitt/workspace/warcprox/warcprox/warcproxy.py", line 256, in _proxy_request return recorded_url UnboundLocalError: local variable 'recorded_url' referenced before assignment --- warcprox/warcproxy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b9ed3ec..9a22545 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -202,6 +202,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): req += self.rfile.read(int(self.headers['Content-Length'])) prox_rec_res = None + recorded_url = None try: self.logger.debug('sending to remote server req=%s', repr(req)) From 2bec9db7dfd99788c3c91bca0545536666f12e88 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 8 Mar 2016 22:52:02 +0000 Subject: [PATCH 108/146] handle old dedup entries missing "warc_id" --- warcprox/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 9f2bcae..7e6670c 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -143,8 +143,9 @@ class RethinkCapturesDedup: dedup_info = { "url": entry["url"].encode("utf-8"), "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"), - "id": entry["warc_id"].encode("utf-8") } + if "warc_id" in entry: + dedup_info["id"] = entry["warc_id"].encode("utf-8") return dedup_info else: return None From 910cd062ee47c636bd78334b3916cf4493182f56 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 8 Mar 2016 22:55:42 +0000 Subject: [PATCH 109/146] bump version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8f0c08e..85aa827 100755 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='1.5.0.dev1', + version='1.5.0.dev2', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 2c91eb03d379cfb96a4ca47721e19109390fa1f9 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Sun, 13 Mar 2016 07:46:33 +0000 Subject: [PATCH 110/146] support new Warcprox-Meta json field captures-table-extra-fields, extra fields to include in the rethinkdb captures table entry --- warcprox/bigtable.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 7e6670c..231d54c 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -72,20 +72,27 @@ class RethinkCaptures: result = results[0] else: result = None - self.logger.debug("returning %s for sha1base32=%s bucket=%s", result, sha1base32, bucket) + self.logger.debug("returning %s for sha1base32=%s bucket=%s", + result, sha1base32, bucket) return result def _assemble_entry(self, recorded_url, records): if recorded_url.response_recorder: if recorded_url.response_recorder.payload_digest.name == "sha1": - sha1base32 = 
base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8") + sha1base32 = base64.b32encode( + recorded_url.response_recorder.payload_digest.digest() + ).decode("utf-8") else: - self.logger.warn("digest type is %s but big capture table is indexed by sha1", recorded_url.response_recorder.payload_digest.name) + self.logger.warn( + "digest type is %s but big capture table is indexed " + "by sha1", + recorded_url.response_recorder.payload_digest.name) else: digest = hashlib.new("sha1", records[0].content[1]) sha1base32 = base64.b32encode(digest.digest()).decode("utf-8") - if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + if (recorded_url.warcprox_meta + and "captures-bucket" in recorded_url.warcprox_meta): bucket = recorded_url.warcprox_meta["captures-bucket"] else: bucket = "__unspecified__" @@ -95,10 +102,12 @@ class RethinkCaptures: entry = { # id only specified for rethinkdb partitioning - "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), + "id": "{} {}".format( + canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), "abbr_canon_surt": canon_surt[:150], "canon_surt": canon_surt, - "timestamp": recorded_url.timestamp.replace(tzinfo=rethinkstuff.UTC), + "timestamp": recorded_url.timestamp.replace( + tzinfo=rethinkstuff.UTC), "url": recorded_url.url.decode("utf-8"), "offset": records[0].offset, "filename": os.path.basename(records[0].warc_filename), @@ -112,6 +121,12 @@ class RethinkCaptures: "length": records[0].length, } + if (recorded_url.warcprox_meta and + "captures-table-extra-fields" in recorded_url.warcprox_meta): + extras = recorded_url.warcprox_meta["captures-table-extra-fields"] + for extra_field in extras: + entry[extra_field] = extras[extra_field] + return entry def notify(self, recorded_url, records): From 42a81d8f8fc854c9a5e13ee1885c95dd1ddc829b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 Mar 2016 06:27:21 +0000 Subject: [PATCH 111/146] fix bug where two warc-payload-digest headers were written to revisit records --- setup.py | 2 +- tests/test_warcprox.py | 9 +++++++++ warcprox/bigtable.py | 2 +- warcprox/warc.py | 5 +++-- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 85aa827..f630e37 100755 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='1.5.0.dev2', + version='1.5.0.dev3', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0d724c7..5dcd4df 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -22,6 +22,7 @@ import warnings import pprint import traceback import signal +from collections import Counter try: import http.server as http_server @@ -662,6 +663,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, (offset, record, errors) = next(record_iter) assert record.type == b'response' assert record.url == url1.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! 
llllllllll!\n' (offset, record, errors) = next(record_iter) assert record.type == b'request' @@ -670,6 +673,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, (offset, record, errors) = next(record_iter) assert record.type == b'response' assert record.url == url2.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n' (offset, record, errors) = next(record_iter) assert record.type == b'request' @@ -678,6 +683,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, (offset, record, errors) = next(record_iter) assert record.type == b'revisit' assert record.url == url2.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n' (offset, record, errors) = next(record_iter) assert record.type == b'request' @@ -686,6 +693,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, (offset, record, errors) = next(record_iter) assert record.type == b'revisit' assert record.url == url1.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n' (offset, record, errors) = next(record_iter) assert record.type == b'request' diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 231d54c..9bb3d6d 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -14,7 +14,7 @@ import rethinkstuff class RethinkCaptures: """Inserts in batches every 0.5 seconds""" - logger = logging.getLogger("warcprox.bigtables.RethinkCaptures") + logger = logging.getLogger("warcprox.bigtable.RethinkCaptures") def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()): self.r = r diff --git a/warcprox/warc.py b/warcprox/warc.py index eed045c..5cba38d 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -114,8 +114,9 @@ class WarcRecordBuilder: digest = hashlib.new(self.digest_algorithm, data) headers.append((warctools.WarcRecord.BLOCK_DIGEST, warcprox.digest_str(digest, self.base32))) - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, - warcprox.digest_str(digest, self.base32))) + if not payload_digest: + headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, + warcprox.digest_str(digest, self.base32))) content_tuple = content_type, data record = warctools.WarcRecord(headers=headers, content=content_tuple) From 6b6c0b3bac56bf834981127b88c3cab245a88040 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 18 Mar 2016 02:06:07 +0000 Subject: [PATCH 112/146] make sure batch insert timer thread survives rethinkdb outages --- warcprox/bigtable.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 9bb3d6d..8aea52c 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -31,22 +31,28 @@ class RethinkCaptures: self._insert_batch() # starts repeating timer def _insert_batch(self): - with self._batch_lock: - if len(self._batch) > 0: 
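The essential shape of this fix, reduced to a self-contained sketch (ResilientBatcher and do_insert are illustrative names; the real hunk follows):

    import logging
    import threading

    class ResilientBatcher:
        logger = logging.getLogger('ResilientBatcher')

        def __init__(self, do_insert):
            self._do_insert = do_insert  # a callable that may raise
            self._stop = threading.Event()
            self._timer = None
            self._tick()

        def _tick(self):
            try:
                self._do_insert()
            except BaseException:
                # the batch stays queued and is retried on the next tick
                self.logger.error('insert failed, will retry', exc_info=True)
            finally:
                # reschedule in finally so one failure never ends the loop
                if not self._stop.is_set():
                    self._timer = threading.Timer(0.5, self._tick)
                    self._timer.start()
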
- result = self.r.table(self.table).insert(self._batch).run() - if result["inserted"] != len(self._batch) or sorted(result.values()) != [0,0,0,0,0,len(self._batch)]: - raise Exception("unexpected result %s saving batch of %s entries", result, len(self._batch)) - self.logger.info("saved %s entries to big capture table db", len(self._batch)) - self._batch = [] - - if not self._stop.is_set(): - t = threading.Timer(0.5, self._insert_batch) - t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat() - t.start() - # ensure self._timer joinable (already started) whenever close() happens to be called - self._timer = t - else: - self.logger.info("finished") + try: + with self._batch_lock: + if len(self._batch) > 0: + result = self.r.table(self.table).insert(self._batch).run() + if result["inserted"] != len(self._batch) or sorted(result.values()) != [0,0,0,0,0,len(self._batch)]: + raise Exception("unexpected result %s saving batch of %s entries", result, len(self._batch)) + self.logger.info("saved %s entries to big capture table db", len(self._batch)) + self._batch = [] + except BaseException as e: + self.logger.error( + "caught exception trying to save %s entries, they will " + "be included in the next batch", len(self._batch), + exc_info=True) + finally: + if not self._stop.is_set(): + t = threading.Timer(0.5, self._insert_batch) + t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat() + t.start() + # ensure self._timer joinable (already started) whenever close() happens to be called + self._timer = t + else: + self.logger.info("finished") def _ensure_db_table(self): dbs = self.r.db_list().run() From 6490583dd0e7d323890631b9868be2ff3498ef34 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 18 Mar 2016 02:07:29 +0000 Subject: [PATCH 113/146] this brozzler branch will be warcprox 2.0, today it's 2.0.dev4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f630e37..66924ae 100755 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='1.5.0.dev3', + version='2.0.dev4', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From ad661dcead0c44ed3c548704bd77bc091c49881e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 6 Apr 2016 19:37:16 -0700 Subject: [PATCH 114/146] expand license info, update usage section, remove crufty todo section --- README.rst | 144 ++++++++++++++++++++++++++++------------------------- 1 file changed, 76 insertions(+), 68 deletions(-) diff --git a/README.rst b/README.rst index c1a0969..1a4bfbb 100644 --- a/README.rst +++ b/README.rst @@ -1,15 +1,11 @@ warcprox - WARC writing MITM HTTP/S proxy ----------------------------------------- -.. image:: https://travis-ci.org/internetarchive/warcprox.png?branch=master +.. image:: https://travis-ci.org/internetarchive/warcprox.png?branch=master :target: https://travis-ci.org/internetarchive/warcprox Based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy -License: because pymiproxy is GPL and warcprox is a derivative work of -pymiproxy, warcprox is also GPL. 
- - Install ~~~~~~~ @@ -47,10 +43,15 @@ Usage usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT] [--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX] [-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME] - [-g DIGEST_ALGORITHM] [--base32] [-j DEDUP_DB_FILE] - [-P PLAYBACK_PORT] - [--playback-index-db-file PLAYBACK_INDEX_DB_FILE] [--version] - [-v] [-q] + [-g DIGEST_ALGORITHM] [--base32] + [--stats-db-file STATS_DB_FILE] [-P PLAYBACK_PORT] + [--playback-index-db-file PLAYBACK_INDEX_DB_FILE] + [-j DEDUP_DB_FILE | --rethinkdb-servers RETHINKDB_SERVERS] + [--rethinkdb-db RETHINKDB_DB] [--rethinkdb-big-table] + [--kafka-broker-list KAFKA_BROKER_LIST] + [--kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC] + [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY] + [--version] [-v] [-q] warcprox - WARC writing MITM HTTP/S proxy @@ -60,84 +61,91 @@ Usage -b ADDRESS, --address ADDRESS address to listen on (default: localhost) -c CACERT, --cacert CACERT - CA certificate file; if file does not exist, it will - be created (default: ./desktop-nlevitt-warcprox- - ca.pem) + CA certificate file; if file does not exist, it + will be created (default: ./MacBook-Pro.local- + warcprox-ca.pem) --certs-dir CERTS_DIR where to store and load generated certificates - (default: ./desktop-nlevitt-warcprox-ca) + (default: ./MacBook-Pro.local-warcprox-ca) -d DIRECTORY, --dir DIRECTORY where to write warcs (default: ./warcs) - -z, --gzip write gzip-compressed warc records (default: False) + -z, --gzip write gzip-compressed warc records (default: + False) -n PREFIX, --prefix PREFIX WARC filename prefix (default: WARCPROX) - -s SIZE, --size SIZE WARC file rollover size threshold in bytes (default: - 1000000000) + -s SIZE, --size SIZE WARC file rollover size threshold in bytes + (default: 1000000000) --rollover-idle-time ROLLOVER_IDLE_TIME - WARC file rollover idle time threshold in seconds (so - that Friday's last open WARC doesn't sit there all - weekend waiting for more data) (default: None) + WARC file rollover idle time threshold in seconds + (so that Friday's last open WARC doesn't sit there + all weekend waiting for more data) (default: None) -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM - digest algorithm, one of sha384, sha512, md5, sha224, - sha256, sha1 (default: sha1) + digest algorithm, one of sha1, sha256, md5, + sha224, sha512, sha384 (default: sha1) --base32 write digests in Base32 instead of hex (default: False) - -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE - persistent deduplication database file; empty string - or /dev/null disables deduplication (default: - ./warcprox-dedup.db) + --stats-db-file STATS_DB_FILE + persistent statistics database file; empty string + or /dev/null disables statistics tracking + (default: ./warcprox-stats.db) -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT - port to listen on for instant playback (default: None) + port to listen on for instant playback (default: + None) --playback-index-db-file PLAYBACK_INDEX_DB_FILE - playback index database file (only used if --playback- - port is specified) (default: ./warcprox-playback- - index.db) + playback index database file (only used if + --playback-port is specified) (default: + ./warcprox-playback-index.db) + -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE + persistent deduplication database file; empty + string or /dev/null disables deduplication + (default: ./warcprox-dedup.db) + --rethinkdb-servers RETHINKDB_SERVERS + rethinkdb servers, used for dedup and stats if + specified; e.g. 
+ db0.foo.org,db0.foo.org:38015,db1.foo.org + (default: None) + --rethinkdb-db RETHINKDB_DB + rethinkdb database name (ignored unless + --rethinkdb-servers is specified) (default: + warcprox) + --rethinkdb-big-table + use a big rethinkdb table called "captures", + instead of a small table called "dedup"; table is + suitable for use as index for playback (ignored + unless --rethinkdb-servers is specified) (default: + False) + --kafka-broker-list KAFKA_BROKER_LIST + kafka broker list for capture feed (default: None) + --kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC + kafka capture feed topic (default: None) + --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY + host:port of tor socks proxy, used only to connect + to .onion sites (default: None) --version show program's version number and exit -v, --verbose -q, --quiet -To do -~~~~~ -* (partly done) integration tests, unit tests -* (done) url-agnostic deduplication -* unchunk and/or ungzip before storing payload, or alter request to - discourage server from chunking/gzipping -* check certs from proxied website, like browser does, and present - browser-like warning if appropriate -* keep statistics, produce reports -* write cdx while crawling? -* performance testing -* (done) base32 sha1 like heritrix? -* configurable timeouts and stuff -* evaluate ipv6 support -* (done) more explicit handling of connection closed exception - during transfer -* dns cache?? the system already does a fine job I'm thinking -* keepalive with remote servers? -* (done) python3 -* special handling for 304 not-modified (write nothing or write revisit - record... and/or modify request so server never responds with 304) -* (done) instant playback on a second proxy port -* special url for downloading ca cert e.g. http(s)://warcprox./ca.pem -* special url for other stuff, some status info or something? -* browser plugin for warcprox mode +License +~~~~~~~ - - accept warcprox CA cert only when in warcprox mode - - separate temporary cookie store, like incognito - - "careful! your activity is being archived" banner - - easy switch between archiving and instant playback proxy port +Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also +GPL. -To not do -^^^^^^^^^ +Copyright (C) 2012 Cygnos Corporation +Copyright (C) 2013-2016 Internet Archive -The features below could also be part of warcprox. But maybe they don't -belong here, since this is a proxy, not a crawler/robot. It can be used -by a human with a browser, or by something automated, i.e. a robot. My -feeling is that it's more appropriate to implement these in the robot. +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. -* politeness, i.e. throttle requests per server -* fetch and obey robots.txt -* alter user-agent, maybe insert something like "warcprox mitm - archiving proxy; +http://archive.org/details/archive.org\_bot" +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
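Looking back at the duplicate-header regression covered in patch 111: the Counter check those tests add is a compact, reusable idiom. A sketch, assuming warctools-style headers exposed as a list of (name, value) tuples:

    from collections import Counter

    def most_repeated_header(headers):
        # headers: [(name, value), ...] as hanzo.warctools records expose them
        name, count = Counter(name for name, _ in headers).most_common(1)[0]
        return name, count

    # in a test, asserting every header appears exactly once:
    #   name, count = most_repeated_header(record.headers)
    #   assert count == 1, '%s appears %d times' % (name, count)
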
From 2c65ff89fa52b0e14ba7a2f6dd09692af1550505 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 6 Apr 2016 19:37:55 -0700 Subject: [PATCH 115/146] add license headers --- benchmarks/run-benchmarks.py | 22 +++++++++++++++++++++- bin/dump-anydbm | 22 ++++++++++++++++++++-- bin/warcprox | 1 - setup.py | 22 +++++++++++++++++++++- tests/Dockerfile | 21 +++++++++++++++++++++ tests/conftest.py | 22 +++++++++++++++++++++- tests/run-tests.sh | 27 +++++++++++++++++++++++---- tests/single-threaded-proxy.py | 22 ++++++++++++++++++++++ tests/test_dump-anydbm.py | 20 ++++++++++++++++++++ tests/test_warcprox.py | 21 ++++++++++++++++++++- warcprox/__init__.py | 25 ++++++++++++++++++++++++- warcprox/bigtable.py | 24 ++++++++++++++++++++++++ warcprox/controller.py | 24 ++++++++++++++++++++++++ warcprox/dedup.py | 21 +++++++++++++++++++++ warcprox/kafkafeed.py | 22 ++++++++++++++++++++++ warcprox/main.py | 22 +++++++++++++++++++++- warcprox/mitmproxy.py | 26 ++++++++++++++++++++++++++ warcprox/playback.py | 22 +++++++++++++++++++++- warcprox/stats.py | 21 +++++++++++++++++++++ warcprox/warc.py | 21 +++++++++++++++++++++ warcprox/warcproxy.py | 26 ++++++++++++++++++++------ warcprox/writer.py | 21 ++++++++++++++++++++- warcprox/writerthread.py | 22 ++++++++++++++++++++++ 23 files changed, 476 insertions(+), 21 deletions(-) diff --git a/benchmarks/run-benchmarks.py b/benchmarks/run-benchmarks.py index 73ef96d..275581b 100755 --- a/benchmarks/run-benchmarks.py +++ b/benchmarks/run-benchmarks.py @@ -1,4 +1,24 @@ #!/usr/bin/env python +# +# run-benchmarks.py - some benchmarking code for warcprox +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# import sys import aiohttp @@ -89,7 +109,7 @@ class AsyncClient(object): logging.debug("finished reading from %s", url) r.close() break - + @asyncio.coroutine def one_request(self, url): logging.debug("issuing request to %s", url) diff --git a/bin/dump-anydbm b/bin/dump-anydbm index 1d1ae4b..b9200bc 100755 --- a/bin/dump-anydbm +++ b/bin/dump-anydbm @@ -1,5 +1,23 @@ #!/usr/bin/env python -# vim:set sw=4 et: +# +# dump-anydbm - dumps contents of dbm file to stdout +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. # """ @@ -14,7 +32,7 @@ try: whichdb = dbm.whichdb except: - import anydbm + import anydbm dbm = anydbm from whichdb import whichdb diff --git a/bin/warcprox b/bin/warcprox index d978c53..d236cf6 100755 --- a/bin/warcprox +++ b/bin/warcprox @@ -1,5 +1,4 @@ #!/usr/bin/env python -# vim: set sw=4 et: from __future__ import absolute_import diff --git a/setup.py b/setup.py index 66924ae..c33494d 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,24 @@ #!/usr/bin/env python +# +# setup.py - setuptools installation config for warcprox +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# from setuptools.command.test import test as TestCommand import sys @@ -30,7 +50,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='2.0.dev4', + version='2.0.dev5', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/Dockerfile b/tests/Dockerfile index 975e767..ab1f01a 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,3 +1,24 @@ +# +# Dockerfile for warcprox tests +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + FROM phusion/baseimage MAINTAINER Noah Levitt diff --git a/tests/conftest.py b/tests/conftest.py index f417fed..27d4141 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,24 @@ -# vim:set sw=4 et: +# +# tests/conftest.py - command line options for warcprox tests +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + import pytest def pytest_addoption(parser): diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 86e2a7e..b28e606 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -1,9 +1,28 @@ #!/bin/bash # -# Runs tests in a docker container. Also runs a temporary instance of rethinkdb -# inside the container. The tests run with rethinkdb features enabled, against -# that instance of rethinkdb, and also run without rethinkdb features enabled. -# With python 2.7 and 3.4. +# tests/run-tests.sh - Runs tests in a docker container. Also runs a temporary +# instance of rethinkdb inside the container. The tests run with rethinkdb +# features enabled, against that instance of rethinkdb, and also run without +# rethinkdb features enabled. With python 2.7 and 3.4. +# +# tests/conftest.py - command line options for warcprox tests +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. # # 😬 # diff --git a/tests/single-threaded-proxy.py b/tests/single-threaded-proxy.py index 69db94c..5954fd1 100755 --- a/tests/single-threaded-proxy.py +++ b/tests/single-threaded-proxy.py @@ -1,4 +1,26 @@ #!/usr/bin/env python +# +# tests/single-threaded-proxy.py - single-threaded recording proxy, useful for +# debugging +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + """Useful for debugging. 
Does not write warcs.""" from __future__ import absolute_import diff --git a/tests/test_dump-anydbm.py b/tests/test_dump-anydbm.py index 6bb600d..1bc6ccc 100644 --- a/tests/test_dump-anydbm.py +++ b/tests/test_dump-anydbm.py @@ -1,4 +1,24 @@ #!/usr/bin/env python +# +# tests/test_dump-anydbm.py - tests for dump-anydbm +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# import pytest import os diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 5dcd4df..6bb99ae 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1,5 +1,24 @@ #!/usr/bin/env python -# vim: set sw=4 et: +# +# tests/test_warcprox.py - automated tests for warcprox +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# import pytest import threading diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 8afe606..394d8b8 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,10 +1,33 @@ +# +# warcprox/__init__.py - warcprox package main file, contains some utility code +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
+# + from argparse import Namespace as _Namespace from pkg_resources import get_distribution as _get_distribution __version__ = _get_distribution('warcprox').version def digest_str(hash_obj, base32): import base64 - return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if base32 else hash_obj.hexdigest().encode('ascii')) + return hash_obj.name.encode('utf-8') + b':' + ( + base64.b32encode(hash_obj.digest()) if base32 + else hash_obj.hexdigest().encode('ascii')) class Options(_Namespace): def __getattr__(self, name): diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 8aea52c..3f8989a 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -1,3 +1,27 @@ +# +# warcprox/bigtable.py - module for "big" RethinkDB table for deduplication; +# the table is "big" in the sense that it is designed to be usable as an index +# for playback software outside of warcprox, and contains information not +# needed merely for deduplication +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + from __future__ import absolute_import import logging diff --git a/warcprox/controller.py b/warcprox/controller.py index 3c7dfe1..a813345 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -1,3 +1,27 @@ +# +# warcprox/controller.py - contains WarcproxController class, responsible for +# starting up and shutting down the various components of warcprox, and for +# sending heartbeats to the service registry if configured to do so; also has +# some memory profiling capabilities +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
+# + from __future__ import absolute_import import logging diff --git a/warcprox/dedup.py b/warcprox/dedup.py index baa5fc3..eb71cb4 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -1,3 +1,24 @@ +# +# warcprox/dedup.py - identical payload digest deduplication +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + from __future__ import absolute_import import logging diff --git a/warcprox/kafkafeed.py b/warcprox/kafkafeed.py index 8f8aea1..e9d2176 100644 --- a/warcprox/kafkafeed.py +++ b/warcprox/kafkafeed.py @@ -1,3 +1,25 @@ +# +# warcprox/kafkafeed.py - support for publishing information about archived +# urls to apache kafka +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + import kafka import datetime import json diff --git a/warcprox/main.py b/warcprox/main.py index de7ce89..6f791d7 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -1,5 +1,25 @@ #!/usr/bin/env python -# vim:set sw=4 et: +# +# warcprox/main.py - entrypoint for warcprox executable, parses command line +# arguments, initializes components, starts controller, handles signals +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
+# from __future__ import absolute_import diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index b8f645e..083f43f 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -1,3 +1,29 @@ +# +# warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http +# CONNECT method by creating a snakeoil certificate for the requested site, +# calling ssl.wrap_socket() on the client connection; connects to remote +# (proxied) host, possibly using tor if host tld is .onion and tor proxy is +# configured +# +# Copyright (C) 2012 Cygnos Corporation +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + from __future__ import absolute_import try: diff --git a/warcprox/playback.py b/warcprox/playback.py index 089fc64..2624175 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -1,4 +1,24 @@ -# vim:set sw=4 et: +# +# warcprox/playback.py - rudimentary support for playback of urls archived by +# warcprox (not much used or maintained) +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# from __future__ import absolute_import diff --git a/warcprox/stats.py b/warcprox/stats.py index 68f7586..8dd3a86 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -1,3 +1,24 @@ +# +# warcprox/stats.py - keeps statistics on what has been proxied, archived +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
+# + from __future__ import absolute_import import logging diff --git a/warcprox/warc.py b/warcprox/warc.py index 5cba38d..fbc2a33 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -1,3 +1,24 @@ +# +# warcprox/warc.py - assembles warc records +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + from __future__ import absolute_import import logging diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 9a22545..c41d8e1 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -1,10 +1,24 @@ -#!/usr/bin/env python # -""" -WARC writing MITM HTTP/S proxy - -See README.rst or https://github.com/internetarchive/warcprox -""" +# warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic, +# enqueue info on the recorded url queue +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# from __future__ import absolute_import diff --git a/warcprox/writer.py b/warcprox/writer.py index 4603c0c..72c292f 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -1,4 +1,23 @@ -# vim:set sw=4 et: +# +# warcprox/writer.py - warc writer, manages and writes records to warc files +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
+# from __future__ import absolute_import diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index b3d2d9c..002b897 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -1,3 +1,25 @@ +# +# warcprox/writerthread.py - warc writer thread, reads from the recorded url +# queue, writes warc records, runs final tasks after warc records are written +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + from __future__ import absolute_import try: From 6f10e2708d5c649673398da9c33d0cce5b1b2b7c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 6 Apr 2016 19:39:28 -0700 Subject: [PATCH 116/146] disable tor test to give travis build a chance to pass tests (waiting on https://github.com/travis-ci/apt-package-whitelist/issues/1753) --- setup.py | 2 +- tests/test_warcprox.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c33494d..197b896 100755 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='2.0.dev5', + version='2.0.dev6', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 6bb99ae..a17d2ea 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -728,7 +728,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way -def test_tor_onion(archiving_proxies): +def _test_tor_onion(archiving_proxies): response = requests.get('http://www.facebookcorewwwi.onion/', proxies=archiving_proxies, verify=False, allow_redirects=False) assert response.status_code == 302 From 0809c78486ad332005584421d1d41ac84fe79f78 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 8 Apr 2016 23:26:20 -0700 Subject: [PATCH 117/146] add Strict-Transport-Security to list of http response headers to swallow, to avoid some problems with HSTS when browsing through warcprox (doesn't solve the case of preloaded HSTS though) --- setup.py | 2 +- warcprox/warcproxy.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 197b896..de82385 100755 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='2.0.dev6', + version='2.0.dev7', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index c41d8e1..869ba5f 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -147,8 +147,10 @@ 
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(self.status, self.reason) for k,v in self.msg.items(): - if k.lower() not in ('connection', 'proxy-connection', 'keep-alive', - 'proxy-authenticate', 'proxy-authorization', 'upgrade'): + if k.lower() not in ( + 'connection', 'proxy-connection', 'keep-alive', + 'proxy-authenticate', 'proxy-authorization', 'upgrade', + 'strict-transport-security'): status_and_headers += '{}: {}\r\n'.format(k, v) status_and_headers += 'Connection: close\r\n\r\n' self.proxy_dest.sendall(status_and_headers.encode('latin1')) From 4fd17be339e68673a0cd2fc3f0c30c2c174713ed Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 10 May 2016 01:11:17 -0700 Subject: [PATCH 118/146] started adding some docstrings, and moved some of the more generally man-in-the-middle recording proxy code from warcproxy.py into mitmproxy.py --- setup.py | 2 +- warcprox/main.py | 55 +++++--- warcprox/mitmproxy.py | 289 ++++++++++++++++++++++++++++++++++++------ warcprox/warcproxy.py | 265 +++++++++----------------------------- 4 files changed, 344 insertions(+), 267 deletions(-) diff --git a/setup.py b/setup.py index de82385..e543c33 100755 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='2.0.dev7', + version='2.0.dev8', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/main.py b/warcprox/main.py index 6f791d7..d8529b5 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -1,25 +1,25 @@ #!/usr/bin/env python -# -# warcprox/main.py - entrypoint for warcprox executable, parses command line -# arguments, initializes components, starts controller, handles signals -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +warcprox/main.py - entrypoint for warcprox executable, parses command line +arguments, initializes components, starts controller, handles signals + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' from __future__ import absolute_import @@ -114,6 +114,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): return arg_parser def dump_state(signum=None, frame=None): + ''' + Signal handler, logs stack traces of active threads. + ''' pp = pprint.PrettyPrinter(indent=4) state_strs = [] @@ -128,6 +131,10 @@ def dump_state(signum=None, frame=None): logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))) def init_controller(args): + ''' + Creates a warcprox.controller.WarcproxController configured according to + the supplied arguments (normally the result of parse_args(sys.argv)). + ''' options = warcprox.Options(**vars(args)) try: @@ -212,11 +219,17 @@ def real_main(args): controller.run_until_shutdown() def parse_args(argv=sys.argv): + ''' + Parses command line arguments with argparse. + ''' arg_parser = _build_arg_parser(prog=os.path.basename(argv[0])) args = arg_parser.parse_args(args=argv[1:]) return args def main(argv=sys.argv): + ''' + Main method, entry point of warcprox command. + ''' args = parse_args(argv) if args.verbose: diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 083f43f..333bad7 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -1,28 +1,28 @@ -# -# warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http -# CONNECT method by creating a snakeoil certificate for the requested site, -# calling ssl.wrap_socket() on the client connection; connects to remote -# (proxied) host, possibly using tor if host tld is .onion and tor proxy is -# configured -# -# Copyright (C) 2012 Cygnos Corporation -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http +CONNECT method by creating a snakeoil certificate for the requested site, +calling ssl.wrap_socket() on the client connection; connects to remote +(proxied) host, possibly using tor if host tld is .onion and tor proxy is +configured + +Copyright (C) 2012 Cygnos Corporation +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' from __future__ import absolute_import @@ -35,7 +35,10 @@ try: import urllib.parse as urllib_parse except ImportError: import urlparse as urllib_parse - +try: + import http.client as http_client +except ImportError: + import httplib as http_client import socket import logging import ssl @@ -43,8 +46,132 @@ import warcprox import threading import datetime import socks +import tempfile +import hashlib + +class ProxyingRecorder(object): + """ + Wraps a socket._fileobject, recording the bytes as they are read, + calculating digests, and sending them on to the proxy client. + """ + + logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder") + + def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None): + self.fp = fp + # "The file has no name, and will cease to exist when it is closed." + self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024) + self.digest_algorithm = digest_algorithm + self.block_digest = hashlib.new(digest_algorithm) + self.payload_offset = None + self.payload_digest = None + self.proxy_client = proxy_client + self._proxy_client_conn_open = True + self.len = 0 + self.url = url + + def payload_starts_now(self): + self.payload_digest = hashlib.new(self.digest_algorithm) + self.payload_offset = self.len + + def _update_payload_digest(self, hunk): + if self.payload_digest: + self.payload_digest.update(hunk) + + def _update(self, hunk): + self._update_payload_digest(hunk) + self.block_digest.update(hunk) + + self.tempfile.write(hunk) + + if self.payload_digest and self._proxy_client_conn_open: + try: + self.proxy_client.sendall(hunk) + except BaseException as e: + self._proxy_client_conn_open = False + self.logger.warn( + '%s sending data to proxy client for url %s', + e, self.url) + self.logger.info( + 'will continue downloading from remote server without ' + 'sending to client %s', self.url) + + self.len += len(hunk) + + def read(self, size=-1): + hunk = self.fp.read(size) + self._update(hunk) + return hunk + + def readinto(self, b): + n = self.fp.readinto(b) + self._update(b[:n]) + return n + + def readline(self, size=-1): + # XXX depends on implementation details of self.fp.readline(), in + # particular that it doesn't call self.fp.read() + hunk = self.fp.readline(size) + self._update(hunk) + return hunk + + def flush(self): + return self.fp.flush() + + def close(self): + return self.fp.close() + + def __len__(self): + return self.len + + def payload_size(self): + if self.payload_offset is not None: + return self.len - self.payload_offset + else: + return 0 + +class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): + ''' + Implementation of HTTPResponse that uses a ProxyingRecorder to read the + response from the remote web server and send it on to the proxy client, + while recording the bytes in transit. 
+ ''' + def __init__( + self, sock, debuglevel=0, method=None, proxy_client=None, + digest_algorithm='sha1', url=None): + http_client.HTTPResponse.__init__( + self, sock, debuglevel=debuglevel, method=method) + self.proxy_client = proxy_client + self.url = url + + # Keep around extra reference to self.fp because HTTPResponse sets + # self.fp=None after it finishes reading, but we still need it + self.recorder = ProxyingRecorder( + self.fp, proxy_client, digest_algorithm, url=url) + self.fp = self.recorder + + def begin(self): + http_client.HTTPResponse.begin(self) # reads status line, headers + + status_and_headers = 'HTTP/1.1 {} {}\r\n'.format( + self.status, self.reason) + for k,v in self.msg.items(): + if k.lower() not in ( + 'connection', 'proxy-connection', 'keep-alive', + 'proxy-authenticate', 'proxy-authorization', 'upgrade', + 'strict-transport-security'): + status_and_headers += '{}: {}\r\n'.format(k, v) + status_and_headers += 'Connection: close\r\n\r\n' + self.proxy_client.sendall(status_and_headers.encode('latin1')) + + self.recorder.payload_starts_now() class MitmProxyHandler(http_server.BaseHTTPRequestHandler): + ''' + An http proxy implementation of BaseHTTPRequestHandler, that acts as a + man-in-the-middle in order to peek at the content of https transactions, + and records the bytes in transit as it proxies them. + ''' logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") def __init__(self, request, client_address, server): @@ -76,22 +203,23 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): ) ) - def _connect_to_host(self): + def _connect_to_remote_server(self): # Connect to destination if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'): self.logger.info("using tor socks proxy at %s:%s to connect to %s", self.onion_tor_socks_proxy_host, self.onion_tor_socks_proxy_port or 1080, self.hostname) - self._proxy_sock = socks.socksocket() - self._proxy_sock.set_proxy(socks.SOCKS5, - addr=self.onion_tor_socks_proxy_host, - port=self.onion_tor_socks_proxy_port, rdns=True) + self._remote_server_sock = socks.socksocket() + self._remote_server_sock.set_proxy( + socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, + port=self.onion_tor_socks_proxy_port, rdns=True) else: - self._proxy_sock = socket.socket() + self._remote_server_sock = socket.socket() - self._proxy_sock.settimeout(60) # XXX what value should this have? - self._proxy_sock.connect((self.hostname, int(self.port))) + # XXX what value should this timeout have? 
+ self._remote_server_sock.settimeout(60) + self._remote_server_sock.connect((self.hostname, int(self.port))) # Wrap socket if SSL is required if self.is_connect: @@ -99,12 +227,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): context = ssl.create_default_context() context.check_hostname = False context.verify_mode = ssl.CERT_NONE - self._proxy_sock = context.wrap_socket(self._proxy_sock, server_hostname=self.hostname) + self._remote_server_sock = context.wrap_socket( + self._remote_server_sock, server_hostname=self.hostname) except AttributeError: try: - self._proxy_sock = ssl.wrap_socket(self._proxy_sock) + self._remote_server_sock = ssl.wrap_socket( + self._remote_server_sock) except ssl.SSLError: - self.logger.warn("failed to establish ssl connection to {}; python ssl library does not support SNI, considering upgrading to python >= 2.7.9 or python 3.4".format(self.hostname)) + self.logger.warn( + "failed to establish ssl connection to %s; python " + "ssl library does not support SNI, considering " + "upgrading to python >= 2.7.9 or python 3.4", + self.hostname) raise def _transition_to_ssl(self): @@ -112,11 +246,25 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): server_side=True, certfile=self.server.ca.cert_for_host(self.hostname)) def do_CONNECT(self): + ''' + Handles a http CONNECT request. + + The CONNECT method is meant to "convert the request connection to a + transparent TCP/IP tunnel, usually to facilitate SSL-encrypted + communication (HTTPS) through an unencrypted HTTP proxy" (Wikipedia). + + do_CONNECT is where the man-in-the-middle logic happens. In do_CONNECT + the proxy transitions the proxy client connection to ssl while + masquerading as the remote web server using a generated certificate. + Meanwhile makes its own separate ssl connection to the remote web + server. Then it calls self.handle_one_request() again to handle the + request intended for the remote server. + ''' self.is_connect = True try: # Connect to destination first self._determine_host_port() - self._connect_to_host() + self._connect_to_remote_server() # If successful, let's do this! self.send_response(200, 'Connection established') @@ -161,7 +309,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): try: # Connect to destination self._determine_host_port() - self._connect_to_host() + self._connect_to_remote_server() assert self.url except Exception as e: self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e)) @@ -178,7 +326,68 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): raise def _proxy_request(self): - raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!') + ''' + Sends the request to the remote server, then uses a ProxyingRecorder to + read the response and send it to the proxy client, while recording the + bytes in transit. Returns a tuple (request, response) where request is + the raw request bytes, and response is a ProxyingRecorder. + ''' + # Build request + req_str = '{} {} {}\r\n'.format( + self.command, self.path, self.request_version) + + # Swallow headers that don't make sense to forward on, i.e. most + # hop-by-hop headers, see + # http://tools.ietf.org/html/rfc2616#section-13.5. 
+ # self.headers is an email.message.Message, which is case-insensitive + # and doesn't throw KeyError in __delitem__ + for key in ( + 'Connection', 'Proxy-Connection', 'Keep-Alive', + 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'): + del self.headers[key] + + # Add headers to the request + # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( + req_str += '\r\n'.join( + '{}: {}'.format(k,v) for (k,v) in self.headers.items()) + + req = req_str.encode('latin1') + b'\r\n\r\n' + + # Append message body if present to the request + if 'Content-Length' in self.headers: + req += self.rfile.read(int(self.headers['Content-Length'])) + + try: + self.logger.debug('sending to remote server req=%s', repr(req)) + + # Send it down the pipe! + self._remote_server_sock.sendall(req) + + prox_rec_res = ProxyingRecordingHTTPResponse( + self._remote_server_sock, proxy_client=self.connection, + digest_algorithm=self.server.digest_algorithm, + url=self.url) + prox_rec_res.begin() + + buf = prox_rec_res.read(8192) + while buf != b'': + buf = prox_rec_res.read(8192) + + self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) + except socket.timeout as e: + self.logger.warn( + "%s proxying %s %s", repr(e), self.command, self.url) + except BaseException as e: + self.logger.error( + "%s proxying %s %s", repr(e), self.command, self.url, + exc_info=True) + finally: + # Let's close off the remote end + if prox_rec_res: + prox_rec_res.close() + self._remote_server_sock.close() + + return req, prox_rec_res def __getattr__(self, item): if item.startswith('do_'): diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 869ba5f..0549179 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -1,24 +1,24 @@ -# -# warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic, -# enqueue info on the recorded url queue -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic, +enqueue info on the recorded url queue + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' from __future__ import absolute_import @@ -34,15 +34,9 @@ try: import queue except ImportError: import Queue as queue -try: - import http.client as http_client -except ImportError: - import httplib as http_client import logging import re -import tempfile import traceback -import hashlib import json import socket from hanzo import warctools @@ -52,112 +46,6 @@ import datetime import concurrent.futures import resource -class ProxyingRecorder(object): - """ - Wraps a socket._fileobject, recording the bytes as they are read, - calculating digests, and sending them on to the proxy client. - """ - - logger = logging.getLogger("warcprox.warcproxy.ProxyingRecorder") - - def __init__(self, fp, proxy_dest, digest_algorithm='sha1', url=None): - self.fp = fp - # "The file has no name, and will cease to exist when it is closed." - self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024) - self.digest_algorithm = digest_algorithm - self.block_digest = hashlib.new(digest_algorithm) - self.payload_offset = None - self.payload_digest = None - self.proxy_dest = proxy_dest - self._proxy_dest_conn_open = True - self._prev_hunk_last_two_bytes = b'' - self.len = 0 - self.url = url - - def payload_starts_now(self): - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_offset = self.len - - def _update_payload_digest(self, hunk): - if self.payload_digest: - self.payload_digest.update(hunk) - - def _update(self, hunk): - self._update_payload_digest(hunk) - self.block_digest.update(hunk) - - self.tempfile.write(hunk) - - if self.payload_digest and self._proxy_dest_conn_open: - try: - self.proxy_dest.sendall(hunk) - except BaseException as e: - self._proxy_dest_conn_open = False - self.logger.warn('{} sending data to proxy client for url {}'.format(e, self.url)) - self.logger.info('will continue downloading from remote server without sending to client {}'.format(self.url)) - - self.len += len(hunk) - - def read(self, size=-1): - hunk = self.fp.read(size) - self._update(hunk) - return hunk - - def readinto(self, b): - n = self.fp.readinto(b) - self._update(b[:n]) - return n - - def readline(self, size=-1): - # XXX depends on implementation details of self.fp.readline(), in - # particular that it doesn't call self.fp.read() - hunk = self.fp.readline(size) - self._update(hunk) - return hunk - - def flush(self): - return self.fp.flush() - - def close(self): - return self.fp.close() - - def __len__(self): - return self.len - - def payload_size(self): - if self.payload_offset is not None: - return self.len - self.payload_offset - else: - return 0 - -class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): - - def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1', url=None): - http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method) - self.proxy_dest = proxy_dest - self.url = url - - # Keep around extra reference to self.fp because HTTPResponse sets - # self.fp=None after it finishes reading, but we still need it - self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm, url=url) - self.fp = self.recorder - - def begin(self): - http_client.HTTPResponse.begin(self) # reads status line, headers - - status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(self.status, self.reason) - for k,v in self.msg.items(): - if k.lower() not in ( - 'connection', 'proxy-connection', 'keep-alive', - 'proxy-authenticate', 'proxy-authorization', 'upgrade', - 'strict-transport-security'): - status_and_headers += '{}: 
{}\r\n'.format(k, v) - status_and_headers += 'Connection: close\r\n\r\n' - self.proxy_dest.sendall(status_and_headers.encode('latin1')) - - self.recorder.payload_starts_now() - - class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # self.server is WarcProxy logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") @@ -187,96 +75,63 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): return False def _proxy_request(self): - # Build request - req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version) - warcprox_meta = None raw_warcprox_meta = self.headers.get('Warcprox-Meta') if raw_warcprox_meta: warcprox_meta = json.loads(raw_warcprox_meta) + del self.headers['Warcprox-Meta'] if self._enforce_limits(warcprox_meta): return - # Swallow headers that don't make sense to forward on, i.e. most - # hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5 - # self.headers is an email.message.Message, which is case-insensitive - # and doesn't throw KeyError in __delitem__ - for key in ('Connection', 'Proxy-Connection', 'Keep-Alive', - 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade', - 'Warcprox-Meta'): - del self.headers[key] + remote_ip = self._remote_server_sock.getpeername()[0] + timestamp = datetime.datetime.utcnow() - # Add headers to the request - # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( - req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items()) + req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request( + self) - req = req_str.encode('latin1') + b'\r\n\r\n' - - # Append message body if present to the request - if 'Content-Length' in self.headers: - req += self.rfile.read(int(self.headers['Content-Length'])) - - prox_rec_res = None - recorded_url = None - try: - self.logger.debug('sending to remote server req=%s', repr(req)) - - # warc-date "shall represent the instant that data capture for record creation began" - timestamp = datetime.datetime.utcnow() - - # Send it down the pipe! - self._proxy_sock.sendall(req) - - # We want HTTPResponse's smarts about http and handling of - # non-compliant servers. But HTTPResponse.read() doesn't return the raw - # bytes read from the server, it unchunks them if they're chunked, and - # might do other stuff. We want to send the raw bytes back to the - # client. So we ignore the values returned by prox_rec_res.read() below. Instead - # the ProxyingRecordingHTTPResponse takes care of sending the raw bytes - # to the proxy client. 
- - # Proxy and record the response - prox_rec_res = ProxyingRecordingHTTPResponse(self._proxy_sock, - proxy_dest=self.connection, - digest_algorithm=self.server.digest_algorithm, - url=self.url) - prox_rec_res.begin() - - remote_ip=self._proxy_sock.getpeername()[0] - - buf = prox_rec_res.read(8192) - while buf != b'': - buf = prox_rec_res.read(8192) - - recorded_url = RecordedUrl(url=self.url, request_data=req, - response_recorder=prox_rec_res.recorder, - remote_ip=remote_ip, warcprox_meta=warcprox_meta, - status=prox_rec_res.status, size=prox_rec_res.recorder.len, - client_ip=self.client_address[0], - content_type=prox_rec_res.getheader("Content-Type"), - method=self.command, timestamp=timestamp, - host=self.hostname, duration=datetime.datetime.utcnow()-timestamp) - self.server.recorded_url_q.put(recorded_url) - - self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) - except socket.timeout as e: - self.logger.warn("%s proxying %s %s", repr(e), self.command, self.url) - except BaseException as e: - self.logger.error("%s proxying %s %s", repr(e), self.command, self.url, exc_info=True) - finally: - # Let's close off the remote end - if prox_rec_res: - prox_rec_res.close() - self._proxy_sock.close() + recorded_url = RecordedUrl( + url=self.url, request_data=req, + response_recorder=prox_rec_res.recorder, remote_ip=remote_ip, + warcprox_meta=warcprox_meta, status=prox_rec_res.status, + size=prox_rec_res.recorder.len, + client_ip=self.client_address[0], + content_type=prox_rec_res.getheader("Content-Type"), + method=self.command, timestamp=timestamp, host=self.hostname, + duration=datetime.datetime.utcnow()-timestamp) + self.server.recorded_url_q.put(recorded_url) return recorded_url # deprecated def do_PUTMETA(self): + ''' + Handles a special warcprox PUTMETA request (deprecated). A PUTMETA + request is equivalent to a WARCPROX_WRITE_RECORD request with + WARC-Type: metadata. + ''' self.do_WARCPROX_WRITE_RECORD(warc_type=warctools.WarcRecord.METADATA) def do_WARCPROX_WRITE_RECORD(self, warc_type=None): + ''' + Handles a request with http method WARCPROX_WRITE_RECORD, a special + type of request which tells warcprox to construct a warc record from + the request more or less verbatim, and write it to a warc. + + To honor the request, this method creates a RecordedUrl queues it for + the WarcWriterThread to process. The warc record headers Content-Type + and WARC-Type are taken from the request headers, as is the payload. 
+ + Example request: + + WARCPROX_WRITE_RECORD screenshot:https://example.com/ HTTP/1.1 + WARC-Type: metadata + Content-Type: image/png + Content-Length: 12345 + Connection: close + + + ''' try: self.url = self.path From d74be60795ed41db7022f50bc2d63ab8feeff37b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 10 May 2016 17:55:18 +0000 Subject: [PATCH 119/146] fix renamed overridden method name in subclass --- warcprox/playback.py | 46 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/warcprox/playback.py b/warcprox/playback.py index 2624175..164ba48 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -1,24 +1,24 @@ -# -# warcprox/playback.py - rudimentary support for playback of urls archived by -# warcprox (not much used or maintained) -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +warcprox/playback.py - rudimentary support for playback of urls archived by +warcprox (not much used or maintained) + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' from __future__ import absolute_import @@ -45,8 +45,8 @@ class PlaybackProxyHandler(MitmProxyHandler): logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler") # @Override - def _connect_to_host(self): - # don't connect to host! + def _connect_to_remote_server(self): + # don't connect to any remote server! 
pass From 4bb35567095dae3837c09738b382b63cfcbcfc52 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 10 May 2016 23:11:47 +0000 Subject: [PATCH 120/146] implement enforcement of Warcprox-Meta header block rules; includes automated tests --- setup.py | 2 +- tests/test_warcprox.py | 159 +++++++++++++++++++++++++++++++++++------ warcprox/__init__.py | 53 ++++++++------ warcprox/main.py | 5 +- warcprox/mitmproxy.py | 32 +++++---- warcprox/warcproxy.py | 146 +++++++++++++++++++++++++++++++++++-- 6 files changed, 331 insertions(+), 66 deletions(-) diff --git a/setup.py b/setup.py index e543c33..6584d18 100755 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='2.0.dev8', + version='2.0.dev9', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index a17d2ea..45933b5 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1,24 +1,24 @@ #!/usr/bin/env python -# -# tests/test_warcprox.py - automated tests for warcprox -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +tests/test_warcprox.py - automated tests for warcprox + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' import pytest import threading @@ -58,7 +58,8 @@ import certauth.certauth import warcprox logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') + format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' + '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) @@ -137,8 +138,8 @@ def cert(request): @pytest.fixture(scope="module") def http_daemon(request): - http_daemon = http_server.HTTPServer(('localhost', 0), - RequestHandlerClass=_TestHttpRequestHandler) + http_daemon = http_server.HTTPServer( + ('localhost', 0), RequestHandlerClass=_TestHttpRequestHandler) logging.info('starting http://{}:{}'.format(http_daemon.server_address[0], http_daemon.server_address[1])) http_daemon_thread = threading.Thread(name='HttpDaemonThread', target=http_daemon.serve_forever) @@ -725,6 +726,118 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, finally: fh.close() +def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): + rules = [ + { + "host": "localhost", + "url_match": "STRING_MATCH", + "value": "bar", + }, + { + "url_match": "SURT_MATCH", + "value": "http://(localhost:%s,)/fuh/" % (http_daemon.server_port), + }, + { + "url_match": "SURT_MATCH", + # this rule won't match because of http scheme, https port + "value": "http://(localhost:%s,)/fuh/" % (https_daemon.server_port), + }, + { + "host": "badhost.com", + }, + ] + request_meta = {"blocks":rules} + headers = {"Warcprox-Meta":json.dumps(request_meta)} + + # blocked by STRING_MATCH rule + url = 'http://localhost:{}/bar'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[0]} + + # not blocked + url = 'http://localhost:{}/m/n'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + + # blocked by SURT_MATCH + url = 'http://localhost:{}/fuh/guh'.format(http_daemon.server_port) + # logging.info("%s => %s", repr(url), repr(warcprox.warcproxy.Url(url).surt)) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[1]} + + # not blocked (no trailing slash) + url = 'http://localhost:{}/fuh'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + # 404 because server set up at the top of this file doesn't handle this url + assert response.status_code == 404 + + # not blocked because surt scheme does not match (differs from heritrix + # behavior where https urls are coerced to http surt form) + url = 
'https://localhost:{}/fuh/guh'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + + # blocked by blanket host block + url = 'http://badhost.com/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # blocked by blanket host block + url = 'https://badhost.com/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # blocked by blanket host block + url = 'http://badhost.com:1234/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # blocked by blanket host block + url = 'http://foo.bar.badhost.com/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # host block also applies to subdomains + url = 'https://foo.bar.badhost.com/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # blocked by blanket host block + url = 'http://foo.bar.badhost.com:1234/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 394d8b8..1eeb9a4 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,23 +1,23 @@ -# -# warcprox/__init__.py - warcprox package main file, contains some utility code -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +""" +warcprox/__init__.py - warcprox package main file, contains some utility code + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +""" from argparse import Namespace as _Namespace from pkg_resources import get_distribution as _get_distribution @@ -47,6 +47,19 @@ def gettid(): except: return "n/a" +class RequestBlockedByRule(Exception): + """ + An exception raised when a request should be blocked to respect a + Warcprox-Meta rule. + """ + def __init__(self, msg): + self.msg = msg + def __str__(self): + return "%s: %s" % (self.__class__.__name__, self.msg) + +# logging level more fine-grained than logging.DEBUG==10 +TRACE = 5 + import warcprox.controller as controller import warcprox.playback as playback import warcprox.dedup as dedup diff --git a/warcprox/main.py b/warcprox/main.py index d8529b5..00d2d85 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -109,6 +109,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') + arg_parser.add_argument('--trace', dest='trace', action='store_true') arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') return arg_parser @@ -232,7 +233,9 @@ def main(argv=sys.argv): ''' args = parse_args(argv) - if args.verbose: + if args.trace: + loglevel = warcprox.TRACE + elif args.verbose: loglevel = logging.DEBUG elif args.quiet: loglevel = logging.WARNING diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 333bad7..3950e4e 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -241,6 +241,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.hostname) raise + return self._remote_server_sock + def _transition_to_ssl(self): self.request = self.connection = ssl.wrap_socket(self.connection, server_side=True, certfile=self.server.ca.cert_for_host(self.hostname)) @@ -262,9 +264,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): ''' self.is_connect = True try: - # Connect to destination first self._determine_host_port() - self._connect_to_remote_server() # If successful, let's do this! 
self.send_response(200, 'Connection established') @@ -305,19 +305,23 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): return result def do_COMMAND(self): - if not self.is_connect: - try: - # Connect to destination - self._determine_host_port() - self._connect_to_remote_server() - assert self.url - except Exception as e: - self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e)) - self.send_error(500, str(e)) - return - else: - # if self.is_connect we already connected in do_CONNECT + if self.is_connect: self.url = self._construct_tunneled_url() + else: + self._determine_host_port() + assert self.url + + try: + # Connect to destination + self._connect_to_remote_server() + except warcprox.RequestBlockedByRule as e: + # limit enforcers have already sent the appropriate response + self.logger.info("%s: %s", repr(self.requestline), e) + return + except Exception as e: + self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e)) + self.send_error(500, str(e)) + return try: self._proxy_request() diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 0549179..d342774 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -45,18 +45,135 @@ import warcprox import datetime import concurrent.futures import resource +import ipaddress +import surt + +class Url: + def __init__(self, url): + self.url = url + self._surt = None + self._host = None + + @property + def surt(self): + if not self._surt: + hurl = surt.handyurl.parse(self.url) + surt.GoogleURLCanonicalizer.canonicalize(hurl) + hurl.query = None + hurl.hash = None + self._surt = hurl.getURLString(surt=True, trailing_comma=True) + return self._surt + + @property + def host(self): + if not self._host: + self._host = surt.handyurl.parse(self.url).host + return self._host + + def matches_ip_or_domain(self, ip_or_domain): + """Returns true if + - ip_or_domain is an ip address and self.host is the same ip address + - ip_or_domain is a domain and self.host is the same domain + - ip_or_domain is a domain and self.host is a subdomain of it + """ + if ip_or_domain == self.host: + return True + + # if either ip_or_domain or self.host are ip addresses, and they're not + # identical (previous check), not a match + try: + ipaddress.ip_address(ip_or_domain) + return False + except: + pass + try: + ipaddress.ip_address(self.host) + return False + except: + pass + + # if we get here, we're looking at two hostnames + # XXX do we need to handle case of one punycoded idn, other not? + domain_parts = ip_or_domain.split(".") + host_parts = self.host.split(".") + + return host_parts[-len(domain_parts):] == domain_parts class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # self.server is WarcProxy logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") + # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but + # there's no obvious common dependency where this code should go... 
TBD
+    def _scope_rule_applies(self, rule):
+        u = Url(self.url)
+
+        if "host" in rule and not u.matches_ip_or_domain(rule["host"]):
+            return False
+        if "url_match" in rule:
+            if rule["url_match"] == "STRING_MATCH":
+                return u.url.find(rule["value"]) >= 0
+            elif rule["url_match"] == "REGEX_MATCH":
+                try:
+                    return re.fullmatch(rule["value"], u.url)
+                except Exception as e:
+                    self.logger.warn(
+                            "caught exception matching against regex %s: %s",
+                            rule["value"], e)
+                    return False
+            elif rule["url_match"] == "SURT_MATCH":
+                return u.surt.startswith(rule["value"])
+            else:
+                self.logger.warn("invalid rule.url_match=%s", rule["url_match"])
+                return False
+        else:
+            if "host" in rule:
+                # we already know that it matches from earlier check
+                return True
+            else:
+                self.logger.warn("unable to make sense of scope rule %s", rule)
+                return False
+
+    def _enforce_blocks(self, warcprox_meta):
+        """
+        Sends a 403 response and raises warcprox.RequestBlockedByRule if the
+        url is blocked by a rule in warcprox_meta.
+        """
+        if warcprox_meta and "blocks" in warcprox_meta:
+            for rule in warcprox_meta["blocks"]:
+                if self._scope_rule_applies(rule):
+                    body = ("request rejected by warcprox: blocked by "
+                            "rule found in Warcprox-Meta header: %s"
+                            % rule).encode("utf-8")
+                    self.send_response(403, "Forbidden")
+                    self.send_header("Content-Type", "text/plain;charset=utf-8")
+                    self.send_header("Connection", "close")
+                    self.send_header("Content-Length", len(body))
+                    response_meta = {"blocked-by-rule":rule}
+                    self.send_header(
+                            "Warcprox-Meta",
+                            json.dumps(response_meta, separators=(",",":")))
+                    self.end_headers()
+                    if self.command != "HEAD":
+                        self.wfile.write(body)
+                    self.connection.close()
+                    raise warcprox.RequestBlockedByRule(
+                            "%s 403 %s %s -- blocked by rule in Warcprox-Meta "
+                            "request header %s" % (
+                                self.client_address[0], self.command,
+                                self.url, rule))
+
     def _enforce_limits(self, warcprox_meta):
+        """
+        Sends a 420 response and raises warcprox.RequestBlockedByRule if a
+        limit specified in warcprox_meta is reached.
+        """
         if warcprox_meta and "limits" in warcprox_meta:
             for item in warcprox_meta["limits"].items():
                 key, limit = item
                 bucket0, bucket1, bucket2 = key.rsplit(".", 2)
                 value = self.server.stats_db.value(bucket0, bucket1, bucket2)
-                self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s", 
+                self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s",
                     warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize())
                 if value and value >= limit:
                     body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8")
@@ -70,20 +187,35 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
             if self.command != "HEAD":
                 self.wfile.write(body)
                 self.connection.close()
-            self.logger.info("%s 420 %s %s -- reached limit %s=%s", self.client_address[0], self.command, self.url, key, limit)
-            return True
-        return False
+                    raise warcprox.RequestBlockedByRule(
+                            "%s 420 %s %s -- reached limit %s=%s" % (
+                                self.client_address[0], self.command,
+                                self.url, key, limit))
+
+    def _connect_to_remote_server(self):
+        '''
+        Wraps MitmProxyHandler._connect_to_remote_server, first enforcing
+        limits and block rules in the Warcprox-Meta request header, if any.
+        Raises warcprox.RequestBlockedByRule if a rule has been enforced.
+        Otherwise calls MitmProxyHandler._connect_to_remote_server, which
+        initializes self._remote_server_sock.
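To make the "host" rule semantics above concrete (an illustrative aside, not part of the patch): a blanket host rule matches the host itself and any subdomain, via the trailing-label comparison in Url.matches_ip_or_domain. A reduced sketch, ignoring the ip-address special cases the real method handles with the ipaddress module:

    def host_matches(host, ip_or_domain):
        # same trailing-label comparison as Url.matches_ip_or_domain,
        # minus the ip address checks
        if ip_or_domain == host:
            return True
        domain_parts = ip_or_domain.split('.')
        host_parts = host.split('.')
        return host_parts[-len(domain_parts):] == domain_parts

    assert host_matches('badhost.com', 'badhost.com')
    assert host_matches('foo.bar.badhost.com', 'badhost.com')  # subdomain
    assert not host_matches('otherhost.com', 'badhost.com')
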
+ ''' + if 'Warcprox-Meta' in self.headers: + warcprox_meta = json.loads(self.headers['Warcprox-Meta']) + self._enforce_limits(warcprox_meta) + self._enforce_blocks(warcprox_meta) + return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self) def _proxy_request(self): warcprox_meta = None raw_warcprox_meta = self.headers.get('Warcprox-Meta') + self.logger.log( + warcprox.TRACE, 'request for %s Warcprox-Meta header: %s', + self.url, repr(raw_warcprox_meta)) if raw_warcprox_meta: warcprox_meta = json.loads(raw_warcprox_meta) del self.headers['Warcprox-Meta'] - if self._enforce_limits(warcprox_meta): - return - remote_ip = self._remote_server_sock.getpeername()[0] timestamp = datetime.datetime.utcnow() From d48e2c462dbe9e0897e14576b4f5c4d09cbadd7f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 16 Jun 2016 00:04:59 +0000 Subject: [PATCH 121/146] add a start() method to the two classes that save data to rethinkdb periodically in batches, instead of starting the timer in __init__ --- tests/test_warcprox.py | 2 ++ warcprox/bigtable.py | 18 ++++++++++++++++-- warcprox/controller.py | 10 ++++++++-- warcprox/dedup.py | 13 +++++++++++++ warcprox/stats.py | 33 ++++++++++++++++++++++++++------- 5 files changed, 65 insertions(+), 11 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 45933b5..db97674 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -188,6 +188,7 @@ def captures_db(request, rethinkdb_servers, rethinkdb_big_table): db = 'warcprox_test_captures_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) r = rethinkstuff.Rethinker(servers, db) captures_db = warcprox.bigtable.RethinkCaptures(r) + captures_db.start() def fin(): if captures_db: @@ -247,6 +248,7 @@ def stats_db(request, rethinkdb_servers): db = 'warcprox_test_stats_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) r = rethinkstuff.Rethinker(servers, db) sdb = warcprox.stats.RethinkStatsDb(r) + sdb.start() else: f = tempfile.NamedTemporaryFile(prefix='warcprox-test-stats-', suffix='.db', delete=False) f.close() diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 3f8989a..66b84f0 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -52,7 +52,11 @@ class RethinkCaptures: self._batch_lock = threading.RLock() with self._batch_lock: self._batch = [] - self._insert_batch() # starts repeating timer + self._timer = None + + def start(self): + """Starts batch insert repeating timer""" + self._insert_batch() def _insert_batch(self): try: @@ -165,9 +169,13 @@ class RethinkCaptures: self._batch.append(entry) def close(self): + self.stop() + + def stop(self): self.logger.info("closing rethinkdb captures table") self._stop.set() - self._timer.join() + if self._timer: + self._timer.join() class RethinkCapturesDedup: logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") @@ -195,5 +203,11 @@ class RethinkCapturesDedup: else: return None + def start(self): + self.captures_db.start() + + def stop(self): + self.captures_db.stop() + def close(self): self.captures_db.close() diff --git a/warcprox/controller.py b/warcprox/controller.py index a813345..760a1e8 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -149,8 +149,14 @@ class WarcproxController(object): Start warcprox and run until shut down. Call warcprox_controller.stop.set() to initiate graceful shutdown. 
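The start()/stop() methods above all manage the same mechanism: a threading.Timer that re-arms itself after each batch flush, joined at shutdown so a final flush completes before the process exits. In isolation (an editorial sketch with invented names, not warcprox's actual classes) the pattern looks like:

    import threading

    class BatchFlusher:
        def __init__(self, flush, interval=2.0):
            self._flush = flush          # callable that writes out the batch
            self._interval = interval
            self._stop = threading.Event()
            self._timer = None

        def start(self):
            self._run()  # like RethinkCaptures.start() calling _insert_batch()

        def _run(self):
            self._flush()
            if not self._stop.is_set():  # re-arm until stop() is called
                self._timer = threading.Timer(self._interval, self._run)
                self._timer.start()

        def stop(self):
            self._stop.set()
            if self._timer:
                self._timer.join()  # pending timer fires one last flush
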
""" - proxy_thread = threading.Thread(target=self.proxy.serve_forever, name='ProxyThread') + if self.proxy.stats_db: + self.proxy.stats_db.start() + proxy_thread = threading.Thread( + target=self.proxy.serve_forever, name='ProxyThread') proxy_thread.start() + + if self.warc_writer_thread.dedup_db: + self.warc_writer_thread.dedup_db.start() self.warc_writer_thread.start() if self.playback_proxy is not None: @@ -199,7 +205,7 @@ class WarcproxController(object): self.warc_writer_thread.join() if self.proxy.stats_db: - self.proxy.stats_db.close() + self.proxy.stats_db.stop() if self.warc_writer_thread.dedup_db: self.warc_writer_thread.dedup_db.close() diff --git a/warcprox/dedup.py b/warcprox/dedup.py index eb71cb4..c5080d3 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -48,6 +48,12 @@ class DedupDb(object): self.db = dbm_gnu.open(dbm_file, 'c') self.options = options + def start(self): + pass + + def stop(self): + self.close() + def close(self): self.db.close() @@ -125,6 +131,13 @@ class RethinkDedupDb: repr(self.table), repr(self.r.dbname), self.shards, self.replicas) self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run() + + def start(self): + pass + + def stop(self): + pass + def close(self): pass diff --git a/warcprox/stats.py b/warcprox/stats.py index 8dd3a86..7bf3fbc 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -71,6 +71,13 @@ class StatsDb: self.db = dbm_gnu.open(dbm_file, 'c') self.options = options + def start(self): + # method only exists to match RethinkStatsDb + pass + + def stop(self): + self.close() + def close(self): self.db.close() @@ -134,7 +141,7 @@ class StatsDb: self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8") class RethinkStatsDb: - """Updates database in batch every 0.5 seconds""" + """Updates database in batch every 2.0 seconds""" logger = logging.getLogger("warcprox.stats.RethinkStatsDb") def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()): @@ -149,7 +156,10 @@ class RethinkStatsDb: self._batch_lock = threading.RLock() with self._batch_lock: self._batch = {} + self._timer = None + def start(self): + """Starts batch update repeating timer.""" self._update_batch() # starts repeating timer def _update_batch(self): @@ -190,18 +200,27 @@ class RethinkStatsDb: def _ensure_db_table(self): dbs = self.r.db_list().run() if not self.r.dbname in dbs: - self.logger.info("creating rethinkdb database %s", repr(self.r.dbname)) + self.logger.info( + "creating rethinkdb database %s", repr(self.r.dbname)) self.r.db_create(self.r.dbname).run() tables = self.r.table_list().run() if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", - repr(self.table), repr(self.r.dbname), self.shards, self.replicas) - self.r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run() + self.logger.info( + "creating rethinkdb table %s in database %s shards=%s " + "replicas=%s", repr(self.table), repr(self.r.dbname), + self.shards, self.replicas) + self.r.table_create( + self.table, primary_key="bucket", shards=self.shards, + replicas=self.replicas).run() def close(self): - self.logger.info("closing rethinkdb stats table") + self.stop() + + def stop(self): + self.logger.info("stopping rethinkdb stats table batch updates") self._stop.set() - self._timer.join() + if self._timer: + self._timer.join() def sync(self): pass From 2fe0c2f25b12c6e521afa93b9a0d41d2f46022ee Mon 
Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 24 Jun 2016 20:04:27 -0500
Subject: [PATCH 122/146] support for tallying substats of a configured bucket by host, and enforcing host limits using those stats, with tests

---
 setup.py               |   3 +-
 tests/test_warcprox.py | 143 +++++++++++++++++++++++++++++++-
 warcprox/stats.py      | 181 ++++++++++++++++++++++++-----------------
 warcprox/warcproxy.py  |  30 +++++--
 4 files changed, 272 insertions(+), 85 deletions(-)

diff --git a/setup.py b/setup.py
index 6584d18..5dcfc75 100755
--- a/setup.py
+++ b/setup.py
@@ -50,7 +50,7 @@ except:
     deps.append('futures')
 
 setuptools.setup(name='warcprox',
-        version='2.0.dev9',
+        version='2.0.dev10',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
@@ -70,6 +70,7 @@ setuptools.setup(name='warcprox',
         'License :: OSI Approved :: GNU General Public License (GPL)',
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
         'Topic :: Internet :: Proxy Servers',
         'Topic :: Internet :: WWW/HTTP',
         'Topic :: Software Development :: Libraries :: Python Modules',
diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index db97674..281d1f9 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -57,7 +57,8 @@ import certauth.certauth
 
 import warcprox
 
-logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+logging.basicConfig(stream=sys.stdout, level=warcprox.TRACE,
     format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
     '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
 logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
@@ -563,7 +564,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie
 
 def test_limits(http_daemon, warcprox_, archiving_proxies):
     url = 'http://localhost:{}/i/j'.format(http_daemon.server_port)
-    request_meta = {"stats":{"buckets":["job1"]},"limits":{"job1.total.urls":10}}
+    request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket.total.urls":10}}
     headers = {"Warcprox-Meta": json.dumps(request_meta)}
 
     response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
@@ -592,10 +593,10 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
     response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 420
     assert response.reason == "Reached limit"
-    expected_response_meta = {'reached-limit': {'job1.total.urls': 10}, 'stats': {'job1': {'bucket': 'job1', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}}
+    expected_response_meta = {'reached-limit': {'test_limits_bucket.total.urls': 10}, 'stats': {'test_limits_bucket': {'bucket': 'test_limits_bucket', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached limit job1.total.urls=10\n"
+    assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket.total.urls=10\n"
 
 def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies):
     url1 =
'http://localhost:{}/k/l'.format(http_daemon.server_port) @@ -839,6 +840,140 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} +def test_host_doc_limit( + http_daemon, https_daemon, warcprox_, archiving_proxies): + request_meta = { + "stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]}, + "limits": {"test_host_doc_limit_bucket:localhost.total.urls":10}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + + url = 'http://localhost:{}/o/p'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # same host but different scheme and port -- host limit still applies + url = 'https://localhost:{}/q/r'.format(https_daemon.server_port) + for i in range(9): + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'q!' + assert response.content == b'I am the warcprox test payload! rrrrrrrrrr!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # back to http, and this is the 11th request + url = 'http://localhost:{}/u/v'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 420 + assert response.reason == "Reached limit" + expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'total': {'urls': 10, 'wire_bytes': 1350}, 'revisit': {'urls': 8, 'wire_bytes': 1080}, 'bucket': 'test_host_doc_limit_bucket:localhost', 'new': {'urls': 2, 'wire_bytes': 270}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n" + + # https also blocked + url = 'https://localhost:{}/w/x'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 420 + assert response.reason == "Reached limit" + expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'total': {'urls': 10, 'wire_bytes': 1350}, 'revisit': {'urls': 8, 'wire_bytes': 1080}, 'bucket': 'test_host_doc_limit_bucket:localhost', 'new': {'urls': 2, 'wire_bytes': 270}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n" + +def 
test_host_data_limit( + http_daemon, https_daemon, warcprox_, archiving_proxies): + request_meta = { + "stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]}, + # response is 135 bytes, so 3rd novel url should be disallowed + "limits": {"test_host_data_limit_bucket:localhost.new.wire_bytes":200}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + + url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'y!' + assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # duplicate, does not count toward limit + url = 'https://localhost:{}/y/z'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'y!' + assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # novel, pushes stats over the limit + url = 'https://localhost:{}/z/~'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'z!' + assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # blocked because we're over the limit now + url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 420 + assert response.reason == "Reached limit" + expected_response_meta = {'reached-limit': {'test_host_data_limit_bucket:localhost.new.wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached limit test_host_data_limit_bucket:localhost.new.wire_bytes=200\n" + + # https also blocked + url = 'https://localhost:{}/w/x'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 420 + assert response.reason == "Reached limit" + expected_response_meta = {'reached-limit': {'test_host_data_limit_bucket:localhost.new.wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert 
response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached limit test_host_data_limit_bucket:localhost.new.wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being diff --git a/warcprox/stats.py b/warcprox/stats.py index 7bf3fbc..edbb131 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -1,23 +1,23 @@ -# -# warcprox/stats.py - keeps statistics on what has been proxied, archived -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +warcprox/stats.py - keeps statistics on what has been archived + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' from __future__ import absolute_import @@ -30,6 +30,7 @@ import warcprox import threading import rethinkdb as r import datetime +import surt def _empty_bucket(bucket): return { @@ -37,17 +38,14 @@ def _empty_bucket(bucket): "total": { "urls": 0, "wire_bytes": 0, - # "warc_bytes": 0, }, "new": { "urls": 0, "wire_bytes": 0, - # "warc_bytes": 0, }, "revisit": { "urls": 0, "wire_bytes": 0, - # "warc_bytes": 0, }, } @@ -109,17 +107,51 @@ class StatsDb: def notify(self, recorded_url, records): self.tally(recorded_url, records) - def tally(self, recorded_url, records): - buckets = ["__all__"] + def buckets(self, recorded_url): + ''' + Unravels bucket definitions in Warcprox-Meta header. Each bucket + definition can either be a string, which signifies the name of the + bucket, or a dict. If a dict it is expected to have at least an item + with key 'bucket' whose value is the name of the bucket. The other + currently recognized item is 'tally-host-stats', which if true, + instructs warcprox to additionally tally substats of the given bucket + by host. Host stats are stored in the stats table under the key + '{parent-bucket}:{host}'. 
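A rough walk-through of that unraveling (an editorial aside, not from the patch; 'job1', 'job2', and the url are invented):

    import surt

    warcprox_meta = {'stats': {'buckets': [
        'job1',                                        # string form
        {'bucket': 'job2', 'tally-host-stats': True},  # dict form
    ]}}
    url = 'http://example.com/foo'

    buckets = ['__all__']
    for bucket in warcprox_meta['stats']['buckets']:
        if isinstance(bucket, dict):
            buckets.append(bucket['bucket'])
            if bucket.get('tally-host-stats'):
                buckets.append('%s:%s' % (
                    bucket['bucket'], surt.handyurl.parse(url).host))
        else:
            buckets.append(bucket)

    # buckets == ['__all__', 'job1', 'job2', 'job2:example.com']
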
+ Example Warcprox-Meta header (a real one will likely have other + sections besides 'stats'): + + Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-host-stats':true}]}} + ''' + buckets = ["__all__"] if (recorded_url.warcprox_meta and "stats" in recorded_url.warcprox_meta and "buckets" in recorded_url.warcprox_meta["stats"]): - buckets.extend(recorded_url.warcprox_meta["stats"]["buckets"]) + for bucket in recorded_url.warcprox_meta["stats"]["buckets"]: + if isinstance(bucket, dict): + if not 'bucket' in bucket: + self.logger.warn( + 'ignoring invalid stats bucket in ' + 'warcprox-meta header %s', bucket) + continue + buckets.append(bucket['bucket']) + # XXX maybe host has been computed elsewhere and can be + # cached somewhere, but maybe the performance gain would be + # negligible + if bucket.get('tally-host-stats'): + buckets.append('%s:%s' % ( + bucket['bucket'], + surt.handyurl.parse(recorded_url.url.decode( + 'utf-8')).host)) + else: + buckets.append(bucket) else: buckets.append("__unspecified__") - for bucket in buckets: + return buckets + + def tally(self, recorded_url, records): + for bucket in self.buckets(recorded_url): # Gdbm wants str/bytes keys in python2, str/unicode keys in python3. # This ugliness deals with keys that arrive as unicode in py2. b = bucket.encode("utf-8") if bucket and not isinstance(bucket, str) else bucket @@ -140,7 +172,7 @@ class StatsDb: self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8") -class RethinkStatsDb: +class RethinkStatsDb(StatsDb): """Updates database in batch every 2.0 seconds""" logger = logging.getLogger("warcprox.stats.RethinkStatsDb") @@ -162,37 +194,47 @@ class RethinkStatsDb: """Starts batch update repeating timer.""" self._update_batch() # starts repeating timer + def _bucket_batch_update_reql(bucket): + return self.r.table(self.table).get(bucket).replace( + lambda old: r.branch( + old.eq(None), self._batch[bucket], old.merge({ + "total": { + "urls": old["total"]["urls"].add( + self._batch[bucket]["total"]["urls"]), + "wire_bytes": old["total"]["wire_bytes"].add( + self._batch[bucket]["total"]["wire_bytes"]), + }, + "new": { + "urls": old["new"]["urls"].add( + self._batch[bucket]["new"]["urls"]), + "wire_bytes": old["new"]["wire_bytes"].add( + self._batch[bucket]["new"]["wire_bytes"]), + }, + "revisit": { + "urls": old["revisit"]["urls"].add( + self._batch[bucket]["revisit"]["urls"]), + "wire_bytes": old["revisit"]["wire_bytes"].add( + self._batch[bucket]["revisit"]["wire_bytes"]), + }, + }))) + def _update_batch(self): with self._batch_lock: if len(self._batch) > 0: - # XXX can this be done in one query? - # r.db("archiveit_brozzler").table("test00").get_all(*["foo01","foo"])... 
- # >>> r.db("archiveit_brozzler").table("test00").get("foo01").replace(lambda old: r.branch(old.eq(None), {"id":"foo01", "a":{"b":88}}, old.merge({"a":{"b":old["a"]["b"].add(3)}}))).run(conn) - for k in self._batch: - result = self.r.table(self.table).get(k).replace( - lambda old: r.branch(old.eq(None), self._batch[k], old.merge( - { - "total": { - "urls": old["total"]["urls"].add(self._batch[k]["total"]["urls"]), - "wire_bytes": old["total"]["wire_bytes"].add(self._batch[k]["total"]["wire_bytes"]), - }, - "new": { - "urls": old["new"]["urls"].add(self._batch[k]["new"]["urls"]), - "wire_bytes": old["new"]["wire_bytes"].add(self._batch[k]["new"]["wire_bytes"]), - }, - "revisit": { - "urls": old["revisit"]["urls"].add(self._batch[k]["revisit"]["urls"]), - "wire_bytes": old["revisit"]["wire_bytes"].add(self._batch[k]["revisit"]["wire_bytes"]), - }, - } - ))).run() - if not result["inserted"] and not result["replaced"] or sorted(result.values()) != [0,0,0,0,0,1]: - raise Exception("unexpected result %s updating stats %s" % (result, self._batch[k])) + # XXX can all the buckets be done in one query? + for bucket in self._batch: + result = self._bucket_batch_update_reql(bucket).run() + if (not result["inserted"] and not result["replaced"] + or sorted(result.values()) != [0,0,0,0,0,1]): + raise Exception( + "unexpected result %s updating stats %s" % ( + result, self._batch[bucket])) self._batch = {} if not self._stop.is_set(): self._timer = threading.Timer(2.0, self._update_batch) - self._timer.name = "RethinkStats-batch-update-timer-%s" % datetime.datetime.utcnow().isoformat() + self._timer.name = "RethinkStats-batch-update-timer-%s" % ( + datetime.datetime.utcnow().isoformat()) self._timer.start() else: self.logger.info("finished") @@ -227,7 +269,9 @@ class RethinkStatsDb: def value(self, bucket0="__all__", bucket1=None, bucket2=None): bucket0_stats = self.r.table(self.table).get(bucket0).run() - self.logger.debug('stats db lookup of bucket=%s returned %s', bucket0, bucket0_stats) + self.logger.debug( + 'stats db lookup of bucket=%s returned %s', + bucket0, bucket0_stats) if bucket0_stats: if bucket1: if bucket2: @@ -236,37 +280,24 @@ class RethinkStatsDb: return bucket0_stats[bucket1] return bucket0_stats - def _tally(self, buckets, size, is_revisit): + def tally(self, recorded_url, records): + buckets = self.buckets(recorded_url) + is_revisit = records[0].get_header( + warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT with self._batch_lock: for bucket in buckets: - bucket_stats = self._batch.setdefault(bucket, _empty_bucket(bucket)) + bucket_stats = self._batch.setdefault( + bucket, _empty_bucket(bucket)) bucket_stats["total"]["urls"] += 1 - bucket_stats["total"]["wire_bytes"] += size + bucket_stats["total"]["wire_bytes"] += recorded_url.size if is_revisit: bucket_stats["revisit"]["urls"] += 1 - bucket_stats["revisit"]["wire_bytes"] += size + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size else: bucket_stats["new"]["urls"] += 1 - bucket_stats["new"]["wire_bytes"] += size - - def _extract_stats_info(self, recorded_url, records): - buckets = ["__all__"] - - if (recorded_url.warcprox_meta - and "stats" in recorded_url.warcprox_meta - and "buckets" in recorded_url.warcprox_meta["stats"]): - buckets.extend(recorded_url.warcprox_meta["stats"]["buckets"]) - else: - buckets.append("__unspecified__") - - is_revisit = records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT - - return buckets, recorded_url.size, is_revisit - - def tally(self, recorded_url, 
records): - self._tally(*self._extract_stats_info(recorded_url, records)) + bucket_stats["new"]["wire_bytes"] += recorded_url.size def notify(self, recorded_url, records): self.tally(recorded_url, records) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index d342774..5ffe83d 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -100,6 +100,18 @@ class Url: return host_parts[-len(domain_parts):] == domain_parts class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): + ''' + XXX add more information. + + Among other things, this class enforces limits specified in the + Warcprox-Meta request header. If a limit is deemed to have been reached, no + request will be made to the remote destination server. This implementation + detail has implications worth noting. For example, if a limit applies to + "new" (not deduplicated) bytes, and the limit has already been reached, no + request will be made, even if it would have resulted in duplicate content, + which would not count toward the limit. To reiterate, this is because the + limit enforcer does not know that the content would be deduplicated. + ''' # self.server is WarcProxy logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") @@ -173,16 +185,24 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): key, limit = item bucket0, bucket1, bucket2 = key.rsplit(".", 2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) - self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s", - warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize()) + self.logger.debug( + "warcprox_meta['limits']=%s stats['%s']=%s " + "recorded_url_q.qsize()=%s", warcprox_meta['limits'], + key, value, self.server.recorded_url_q.qsize()) if value and value >= limit: - body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8") + body = ("request rejected by warcprox: reached limit " + "%s=%s\n" % (key, limit)).encode("utf-8") self.send_response(420, "Reached limit") self.send_header("Content-Type", "text/plain;charset=utf-8") self.send_header("Connection", "close") self.send_header("Content-Length", len(body)) - response_meta = {"reached-limit":{key:limit}, "stats":{bucket0:self.server.stats_db.value(bucket0)}} - self.send_header("Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) + response_meta = { + "reached-limit": {key:limit}, + "stats": {bucket0:self.server.stats_db.value(bucket0)} + } + self.send_header( + "Warcprox-Meta", + json.dumps(response_meta, separators=(",",":"))) self.end_headers() if self.command != "HEAD": self.wfile.write(body) From fabd732b7f2e4466703bdcdfa8298566811001ee Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 24 Jun 2016 21:58:37 -0500 Subject: [PATCH 123/146] couple of fixes for host limits --- tests/test_warcprox.py | 50 ++++++++++++++++++++++++++++-------------- warcprox/controller.py | 4 +++- warcprox/stats.py | 2 +- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 281d1f9..ebca589 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -57,8 +57,8 @@ import certauth.certauth import warcprox -# logging.basicConfig(stream=sys.stdout, level=logging.INFO, -logging.basicConfig(stream=sys.stdout, level=warcprox.TRACE, +logging.basicConfig( + stream=sys.stdout, level=logging.INFO, # level=warcprox.TRACE, format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) 
%(message)s') logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) @@ -194,9 +194,9 @@ def captures_db(request, rethinkdb_servers, rethinkdb_big_table): def fin(): if captures_db: captures_db.close() - logging.info('dropping rethinkdb database {}'.format(db)) - result = captures_db.r.db_drop(db).run() - logging.info("result=%s", result) + # logging.info('dropping rethinkdb database {}'.format(db)) + # result = captures_db.r.db_drop(db).run() + # logging.info("result=%s", result) request.addfinalizer(fin) return captures_db @@ -862,40 +862,55 @@ def test_host_doc_limit( time.sleep(0.5) # same host but different scheme and port -- host limit still applies - url = 'https://localhost:{}/q/r'.format(https_daemon.server_port) - for i in range(9): + url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) + for i in range(8): response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 200 - assert response.headers['warcprox-test-header'] == 'q!' - assert response.content == b'I am the warcprox test payload! rrrrrrrrrr!\n' + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' # wait for writer thread to process time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # wait for writer thread to process time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) # back to http, and this is the 11th request - url = 'http://localhost:{}/u/v'.format(http_daemon.server_port) + url = 'http://localhost:{}/o/p'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'total': {'urls': 10, 'wire_bytes': 1350}, 'revisit': {'urls': 8, 'wire_bytes': 1080}, 'bucket': 'test_host_doc_limit_bucket:localhost', 'new': {'urls': 2, 'wire_bytes': 270}}}} + expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n" # https also blocked - url = 'https://localhost:{}/w/x'.format(https_daemon.server_port) + url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 420 assert 
response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'total': {'urls': 10, 'wire_bytes': 1350}, 'revisit': {'urls': 8, 'wire_bytes': 1080}, 'bucket': 'test_host_doc_limit_bucket:localhost', 'new': {'urls': 2, 'wire_bytes': 270}}}} + expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n" @@ -920,7 +935,8 @@ def test_host_data_limit( time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) - time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) # duplicate, does not count toward limit url = 'https://localhost:{}/y/z'.format(https_daemon.server_port) @@ -935,7 +951,8 @@ def test_host_data_limit( time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) - time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) # novel, pushes stats over the limit url = 'https://localhost:{}/z/~'.format(https_daemon.server_port) @@ -950,7 +967,8 @@ def test_host_data_limit( time.sleep(0.5) while not warcprox_.warc_writer_thread.idle: time.sleep(0.5) - time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) # blocked because we're over the limit now url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) diff --git a/warcprox/controller.py b/warcprox/controller.py index 760a1e8..540371a 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -142,7 +142,9 @@ class WarcproxController(object): status_info['queue_size'] = self.proxy.recorded_url_q.qsize() self.status_info = self.service_registry.heartbeat(status_info) - self.logger.debug("status in service registry: %s", self.status_info) + self.logger.log( + warcprox.TRACE, "status in service registry: %s", + self.status_info) def run_until_shutdown(self): """ diff --git a/warcprox/stats.py b/warcprox/stats.py index edbb131..8d5b324 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -194,7 +194,7 @@ class RethinkStatsDb(StatsDb): """Starts batch update repeating timer.""" self._update_batch() # starts repeating timer - def _bucket_batch_update_reql(bucket): + def _bucket_batch_update_reql(self, bucket): return self.r.table(self.table).get(bucket).replace( lambda old: r.branch( old.eq(None), self._batch[bucket], old.merge({ From 6410e4c8c779a881d104c9a4176c78db2bf5cc75 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 27 Jun 2016 14:18:21 -0500 Subject: [PATCH 124/146] reorganize WarcproxController.run_until_shutdown, moving parts of it into new start() and shutdown() methods, for easier integration into a separate python program --- setup.py | 49 +++++++++++++------------- warcprox/controller.py | 78 ++++++++++++++++++++++++------------------ warcprox/main.py | 9 ++--- 3 files changed, 74 insertions(+), 62 deletions(-) diff --git a/setup.py b/setup.py index 5dcfc75..e8e43e8 
100755 --- a/setup.py +++ b/setup.py @@ -1,37 +1,36 @@ #!/usr/bin/env python -# -# setup.py - setuptools installation config for warcprox -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +setup.py - setuptools installation configuration for warcprox + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' -from setuptools.command.test import test as TestCommand import sys import setuptools # special class needs to be added to support the pytest written dump-anydbm tests -class PyTest(TestCommand): +class PyTest(setuptools.command.test.TestCommand): def finalize_options(self): - TestCommand.finalize_options(self) + setuptools.command.test.TestCommand.finalize_options(self) self.test_args = [] self.test_suite = True def run_tests(self): - #import here, cause outside the eggs aren't loaded + # import here, because outside the eggs aren't loaded import pytest errno = pytest.main(self.test_args) sys.exit(errno) @@ -50,7 +49,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='2.0.dev10', + version='2.0.dev11', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/controller.py b/warcprox/controller.py index 540371a..b56ad50 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -146,27 +146,60 @@ class WarcproxController(object): warcprox.TRACE, "status in service registry: %s", self.status_info) - def run_until_shutdown(self): - """ - Start warcprox and run until shut down. Call - warcprox_controller.stop.set() to initiate graceful shutdown. 
- """ + def start(self): + # XXX check if already started if self.proxy.stats_db: self.proxy.stats_db.start() - proxy_thread = threading.Thread( + self.proxy_thread = threading.Thread( target=self.proxy.serve_forever, name='ProxyThread') - proxy_thread.start() + self.proxy_thread.start() if self.warc_writer_thread.dedup_db: self.warc_writer_thread.dedup_db.start() self.warc_writer_thread.start() if self.playback_proxy is not None: - playback_proxy_thread = threading.Thread(target=self.playback_proxy.serve_forever, name='PlaybackProxyThread') + self.playback_proxy_thread = threading.Thread( + target=self.playback_proxy.serve_forever, + name='PlaybackProxyThread') playback_proxy_thread.start() self.stop = threading.Event() + def shutdown(self): + # XXX check if already shut down + self.warc_writer_thread.stop.set() + self.proxy.shutdown() + self.proxy.server_close() + + if self.playback_proxy is not None: + self.playback_proxy.shutdown() + self.playback_proxy.server_close() + if self.playback_proxy.playback_index_db is not None: + self.playback_proxy.playback_index_db.close() + + # wait for threads to finish + self.warc_writer_thread.join() + + if self.proxy.stats_db: + self.proxy.stats_db.stop() + if self.warc_writer_thread.dedup_db: + self.warc_writer_thread.dedup_db.close() + + self.proxy_thread.join() + if self.playback_proxy is not None: + self.playback_proxy_thread.join() + + if self.service_registry and hasattr(self, "status_info"): + self.service_registry.unregister(self.status_info["id"]) + + def run_until_shutdown(self): + """ + Start warcprox and run until shut down. Call + warcprox_controller.stop.set() to initiate graceful shutdown. + """ + self.start() + last_mem_dbg = datetime.datetime.utcfromtimestamp(0) try: @@ -190,31 +223,10 @@ class WarcproxController(object): time.sleep(0.5) except: - self.logger.critical("fatal exception, shutting down", exc_info=True) + self.logger.critical( + "shutting down in response to fatal exception", + exc_info=True) pass finally: - self.warc_writer_thread.stop.set() - self.proxy.shutdown() - self.proxy.server_close() - - if self.playback_proxy is not None: - self.playback_proxy.shutdown() - self.playback_proxy.server_close() - if self.playback_proxy.playback_index_db is not None: - self.playback_proxy.playback_index_db.close() - - # wait for threads to finish - self.warc_writer_thread.join() - - if self.proxy.stats_db: - self.proxy.stats_db.stop() - if self.warc_writer_thread.dedup_db: - self.warc_writer_thread.dedup_db.close() - - proxy_thread.join() - if self.playback_proxy is not None: - playback_proxy_thread.join() - - if self.service_registry and hasattr(self, "status_info"): - self.service_registry.unregister(self.status_info["id"]) + self.shutdown() diff --git a/warcprox/main.py b/warcprox/main.py index 00d2d85..b203c1c 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -209,14 +209,15 @@ def init_controller(args): warc_writer_thread, playback_proxy, service_registry=svcreg, options=options) - signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) - signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) - signal.signal(signal.SIGQUIT, dump_state) - return controller def real_main(args): controller = init_controller(args) + + signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) + signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) + signal.signal(signal.SIGQUIT, dump_state) + controller.run_until_shutdown() def parse_args(argv=sys.argv): From 84767af0f609fc5058b2c1d04d764186c97cf88a Mon Sep 
17 00:00:00 2001 From: Noah Levitt Date: Mon, 27 Jun 2016 14:36:06 -0500 Subject: [PATCH 125/146] check if already started/stopped in WarcproxController.{start,shutdown}, fix bugs --- setup.py | 7 ++- warcprox/controller.py | 137 +++++++++++++++++++++++------------------ 2 files changed, 81 insertions(+), 63 deletions(-) diff --git a/setup.py b/setup.py index e8e43e8..571521e 100755 --- a/setup.py +++ b/setup.py @@ -22,11 +22,12 @@ USA. import sys import setuptools +import setuptools.command.test # special class needs to be added to support the pytest written dump-anydbm tests -class PyTest(setuptools.command.test.TestCommand): +class PyTest(setuptools.command.test.test): def finalize_options(self): - setuptools.command.test.TestCommand.finalize_options(self) + setuptools.command.test.test.finalize_options(self) self.test_args = [] self.test_suite = True def run_tests(self): @@ -49,7 +50,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='2.0.dev11', + version='2.0.dev12', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/controller.py b/warcprox/controller.py index b56ad50..9796e71 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -1,26 +1,26 @@ -# -# warcprox/controller.py - contains WarcproxController class, responsible for -# starting up and shutting down the various components of warcprox, and for -# sending heartbeats to the service registry if configured to do so; also has -# some memory profiling capabilities -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +warcprox/controller.py - contains WarcproxController class, responsible for +starting up and shutting down the various components of warcprox, and for +sending heartbeats to the service registry if configured to do so; also has +some memory profiling capabilities + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' from __future__ import absolute_import @@ -60,12 +60,17 @@ class WarcproxController(object): else: self.warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=self.proxy.recorded_url_q) + self.proxy_thread = None + self.playback_proxy_thread = None self.playback_proxy = playback_proxy self.service_registry = service_registry self.options = options self._last_rss = None + self.stop = threading.Event() + self._start_stop_lock = threading.Lock() + def debug_mem(self): self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize()) with open("/proc/self/status") as f: @@ -147,51 +152,57 @@ class WarcproxController(object): self.status_info) def start(self): - # XXX check if already started - if self.proxy.stats_db: - self.proxy.stats_db.start() - self.proxy_thread = threading.Thread( - target=self.proxy.serve_forever, name='ProxyThread') - self.proxy_thread.start() + with self._start_stop_lock: + if self.proxy_thread and self.proxy_thread.is_alive(): + self.logger.info('warcprox is already running') + return - if self.warc_writer_thread.dedup_db: - self.warc_writer_thread.dedup_db.start() - self.warc_writer_thread.start() + if self.proxy.stats_db: + self.proxy.stats_db.start() + self.proxy_thread = threading.Thread( + target=self.proxy.serve_forever, name='ProxyThread') + self.proxy_thread.start() - if self.playback_proxy is not None: - self.playback_proxy_thread = threading.Thread( - target=self.playback_proxy.serve_forever, - name='PlaybackProxyThread') - playback_proxy_thread.start() + if self.warc_writer_thread.dedup_db: + self.warc_writer_thread.dedup_db.start() + self.warc_writer_thread.start() - self.stop = threading.Event() + if self.playback_proxy is not None: + self.playback_proxy_thread = threading.Thread( + target=self.playback_proxy.serve_forever, + name='PlaybackProxyThread') + self.playback_proxy_thread.start() def shutdown(self): - # XXX check if already shut down - self.warc_writer_thread.stop.set() - self.proxy.shutdown() - self.proxy.server_close() + with self._start_stop_lock: + if not self.proxy_thread or not self.proxy_thread.is_alive(): + self.logger.info('warcprox is not running') + return - if self.playback_proxy is not None: - self.playback_proxy.shutdown() - self.playback_proxy.server_close() - if self.playback_proxy.playback_index_db is not None: - self.playback_proxy.playback_index_db.close() + self.warc_writer_thread.stop.set() + self.proxy.shutdown() + self.proxy.server_close() - # wait for threads to finish - self.warc_writer_thread.join() + if self.playback_proxy is not None: + self.playback_proxy.shutdown() + self.playback_proxy.server_close() + if self.playback_proxy.playback_index_db is not None: + self.playback_proxy.playback_index_db.close() - if self.proxy.stats_db: - self.proxy.stats_db.stop() - if self.warc_writer_thread.dedup_db: - self.warc_writer_thread.dedup_db.close() + # wait for threads to finish + self.warc_writer_thread.join() - self.proxy_thread.join() - if self.playback_proxy is not None: - self.playback_proxy_thread.join() + if self.proxy.stats_db: + self.proxy.stats_db.stop() + if self.warc_writer_thread.dedup_db: + self.warc_writer_thread.dedup_db.close() - if self.service_registry and hasattr(self, "status_info"): - self.service_registry.unregister(self.status_info["id"]) + self.proxy_thread.join() + if self.playback_proxy is not None: + self.playback_proxy_thread.join() + + if self.service_registry and hasattr(self, "status_info"): + 
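To see the shape of the guard this patch introduces in isolation, here is a minimal, hedged sketch of the same idempotent start/shutdown pattern; DemoController and its worker loop are hypothetical stand-ins, not warcprox's real classes:

import threading

class DemoController:
    def __init__(self):
        self.stop = threading.Event()
        self._start_stop_lock = threading.Lock()
        self._thread = None

    def _run(self):
        # stand-in for real work; wakes periodically to check for shutdown
        while not self.stop.is_set():
            self.stop.wait(0.5)

    def start(self):
        with self._start_stop_lock:
            if self._thread and self._thread.is_alive():
                return  # already running; calling start() again is a no-op
            self.stop.clear()
            self._thread = threading.Thread(target=self._run, name='DemoThread')
            self._thread.start()

    def shutdown(self):
        with self._start_stop_lock:
            if not self._thread or not self._thread.is_alive():
                return  # not running; calling shutdown() again is a no-op
            self.stop.set()
            self._thread.join()

controller = DemoController()
controller.start()
controller.start()     # harmless second call
controller.shutdown()
controller.shutdown()  # likewise harmless

The lock serializes concurrent start()/shutdown() callers, and the liveness check makes both methods safe to call any number of times, which is what embedding warcprox in a separate python program needs.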
self.service_registry.unregister(self.status_info["id"]) def run_until_shutdown(self): """ @@ -214,10 +225,16 @@ class WarcproxController(object): try: while not self.stop.is_set(): - if self.service_registry and (not hasattr(self, "status_info") or (datetime.datetime.now(utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL): + if self.service_registry and ( + not hasattr(self, "status_info") or ( + datetime.datetime.now(utc) + - self.status_info["last_heartbeat"] + ).total_seconds() > self.HEARTBEAT_INTERVAL): self._service_heartbeat() - if self.options.profile and (datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60: + if self.options.profile and ( + datetime.datetime.utcnow() - last_mem_dbg + ).total_seconds() > 60: self.debug_mem() last_mem_dbg = datetime.datetime.utcnow() From 9df2ce0fbeb7d3163bcd08937e6f21f7c1e86149 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 27 Jun 2016 14:46:42 -0500 Subject: [PATCH 126/146] convert command-line executables to entry_points console_scripts, best practice according to Python Packaging Authority (eases testing, etc) --- bin/warcprox | 7 ---- setup.py | 12 ++++-- bin/dump-anydbm => warcprox/dump_anydbm.py | 45 +++++++++++----------- 3 files changed, 32 insertions(+), 32 deletions(-) delete mode 100755 bin/warcprox rename bin/dump-anydbm => warcprox/dump_anydbm.py (67%) diff --git a/bin/warcprox b/bin/warcprox deleted file mode 100755 index d236cf6..0000000 --- a/bin/warcprox +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env python - -from __future__ import absolute_import - -import warcprox.main - -warcprox.main.main() diff --git a/setup.py b/setup.py index 571521e..9b760f5 100755 --- a/setup.py +++ b/setup.py @@ -49,8 +49,9 @@ try: except: deps.append('futures') -setuptools.setup(name='warcprox', - version='2.0.dev12', +setuptools.setup( + name='warcprox', + version='2.0.dev13', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', @@ -62,7 +63,12 @@ setuptools.setup(name='warcprox', tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636 cmdclass = {'test': PyTest}, test_suite='warcprox.tests', - scripts=['bin/dump-anydbm', 'bin/warcprox'], + entry_points={ + 'console_scripts': [ + 'warcprox=warprox.main:main', + 'dump-anydbm=warcprox.dump_anydbm:main', + ], + }, zip_safe=False, classifiers=[ 'Development Status :: 5 - Production/Stable', diff --git a/bin/dump-anydbm b/warcprox/dump_anydbm.py similarity index 67% rename from bin/dump-anydbm rename to warcprox/dump_anydbm.py index b9200bc..6de00c6 100755 --- a/bin/dump-anydbm +++ b/warcprox/dump_anydbm.py @@ -1,30 +1,28 @@ #!/usr/bin/env python -# -# dump-anydbm - dumps contents of dbm file to stdout -# -# Copyright (C) 2013-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
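For readers unfamiliar with the mechanism this patch adopts, here is a generic, hedged sketch of a setup.py using console_scripts entry points; the package and module names ('mytool', 'mytool.cli') are made up for illustration and are not warcprox's:

import setuptools

setuptools.setup(
    name='mytool',
    version='0.1',
    packages=['mytool'],
    entry_points={
        'console_scripts': [
            # installs an executable named 'mytool' on $PATH that
            # imports mytool.cli and calls its main() function
            'mytool=mytool.cli:main',
        ],
    },
)

At install time pip generates the wrapper executable, so the command is importable, testable code rather than a loose script under bin/.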
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +dump-anydbm - dumps contents of dbm file to stdout -""" Dump contents of database to stdout. Database can be any file that the anydbm module can read. Included with warcprox because it's useful for inspecting a deduplication database or a playback index database, but it is a generic tool. -""" + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' try: import dbm @@ -40,6 +38,9 @@ import sys import os.path if __name__ == "__main__": + main() + +def main(): if len(sys.argv) != 2: sys.stderr.write("usage: {} DBM_FILE\n".format(sys.argv[0])) exit(1) From 320df0565ec92b732c7ea0b254c2e6592cc4ba8c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 27 Jun 2016 16:07:20 -0500 Subject: [PATCH 127/146] support "soft limits" which result in a different response code (430) than regular (hard) limits (which result in a 420) --- setup.py | 2 +- tests/test_warcprox.py | 46 +++++++++++++------------- warcprox/mitmproxy.py | 2 +- warcprox/warcproxy.py | 75 +++++++++++++++++++++++++----------------- 4 files changed, 69 insertions(+), 56 deletions(-) diff --git a/setup.py b/setup.py index 9b760f5..6fc4875 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev13', + version='2.0.dev14', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index ebca589..8793f80 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -564,7 +564,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie def test_limits(http_daemon, warcprox_, archiving_proxies): url = 'http://localhost:{}/i/j'.format(http_daemon.server_port) - request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket.total.urls":10}} + request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}} headers = {"Warcprox-Meta": json.dumps(request_meta)} response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) @@ -593,10 +593,10 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_limits_bucket.total.urls': 10}, 'stats': {'test_limits_bucket': {'bucket': 'test_limits_bucket', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}} + 
expected_response_meta = {'reached-limit': {'test_limits_bucket/total/urls': 10}, 'stats': {'test_limits_bucket': {'bucket': 'test_limits_bucket', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket.total.urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies): url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port) @@ -840,11 +840,11 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} -def test_host_doc_limit( +def test_host_doc_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { "stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]}, - "limits": {"test_host_doc_limit_bucket:localhost.total.urls":10}, + "soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} @@ -896,31 +896,31 @@ def test_host_doc_limit( url = 'http://localhost:{}/o/p'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) - assert response.status_code == 420 - assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" # https also blocked url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) - assert response.status_code == 420 - assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': 
{'wire_bytes': 1350, 'urls': 10}}}} + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" -def test_host_data_limit( +def test_host_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { "stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]}, # response is 135 bytes, so 3rd novel url should be disallowed - "limits": {"test_host_data_limit_bucket:localhost.new.wire_bytes":200}, + "soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} @@ -974,24 +974,24 @@ def test_host_data_limit( url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) - assert response.status_code == 420 - assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_data_limit_bucket:localhost.new.wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_host_data_limit_bucket:localhost.new.wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n" # https also blocked url = 'https://localhost:{}/w/x'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) - assert response.status_code == 420 - assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_data_limit_bucket:localhost.new.wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = 
{'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_host_data_limit_bucket:localhost.new.wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 3950e4e..c6c75b9 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -319,7 +319,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.logger.info("%s: %s", repr(self.requestline), e) return except Exception as e: - self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e)) + self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e), exc_info=True) self.send_error(500, str(e)) return diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5ffe83d..882fec9 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -175,42 +175,55 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], self.command, self.url, rule)) + def _enforce_limit(self, limit_key, limit_value, soft=False): + bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + value = self.server.stats_db.value(bucket0, bucket1, bucket2) + if value and value >= limit_value: + body = ("request rejected by warcprox: reached %s %s=%s\n" % ( + "soft limit" if soft else "limit", limit_key, + limit_value)).encode("utf-8") + if soft: + self.send_response(430, "Reached soft limit") + else: + self.send_response(420, "Reached limit") + self.send_header("Content-Type", "text/plain;charset=utf-8") + self.send_header("Connection", "close") + self.send_header("Content-Length", len(body)) + response_meta = { + "stats": {bucket0:self.server.stats_db.value(bucket0)} + } + if soft: + response_meta["reached-soft-limit"] = {limit_key:limit_value} + else: + response_meta["reached-limit"] = {limit_key:limit_value} + self.send_header( + "Warcprox-Meta", + json.dumps(response_meta, separators=(",",":"))) + self.end_headers() + if self.command != "HEAD": + self.wfile.write(body) + self.connection.close() + raise warcprox.RequestBlockedByRule( + "%s %s %s %s -- reached %s %s=%s" % ( + self.client_address[0], 430 if soft else 420, + self.command, self.url, + "soft limit" if soft else "limit", + limit_key, limit_value)) + def _enforce_limits(self, warcprox_meta): """ - Sends a 420 response and raises warcprox.RequestBlockedByRule if a - limit specified in warcprox_meta is reached. + Sends a 420 (hard limit) or 430 (soft limit) response and raises + warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is + reached. 
""" if warcprox_meta and "limits" in warcprox_meta: for item in warcprox_meta["limits"].items(): - key, limit = item - bucket0, bucket1, bucket2 = key.rsplit(".", 2) - value = self.server.stats_db.value(bucket0, bucket1, bucket2) - self.logger.debug( - "warcprox_meta['limits']=%s stats['%s']=%s " - "recorded_url_q.qsize()=%s", warcprox_meta['limits'], - key, value, self.server.recorded_url_q.qsize()) - if value and value >= limit: - body = ("request rejected by warcprox: reached limit " - "%s=%s\n" % (key, limit)).encode("utf-8") - self.send_response(420, "Reached limit") - self.send_header("Content-Type", "text/plain;charset=utf-8") - self.send_header("Connection", "close") - self.send_header("Content-Length", len(body)) - response_meta = { - "reached-limit": {key:limit}, - "stats": {bucket0:self.server.stats_db.value(bucket0)} - } - self.send_header( - "Warcprox-Meta", - json.dumps(response_meta, separators=(",",":"))) - self.end_headers() - if self.command != "HEAD": - self.wfile.write(body) - self.connection.close() - raise warcprox.RequestBlockedByRule( - "%s 420 %s %s -- reached limit %s=%s" % ( - self.client_address[0], self.command, - self.url, key, limit)) + limit_key, limit_value = item + self._enforce_limit(limit_key, limit_value, soft=False) + if warcprox_meta and "soft-limits" in warcprox_meta: + for item in warcprox_meta["soft-limits"].items(): + limit_key, limit_value = item + self._enforce_limit(limit_key, limit_value, soft=True) def _connect_to_remote_server(self): ''' From 04c21408d799335153124de2456c58621e6bcc9d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 27 Jun 2016 23:13:00 +0000 Subject: [PATCH 128/146] fix typo --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 6fc4875..3339930 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev14', + version='2.0.dev15', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', @@ -65,7 +65,7 @@ setuptools.setup( test_suite='warcprox.tests', entry_points={ 'console_scripts': [ - 'warcprox=warprox.main:main', + 'warcprox=warcprox.main:main', 'dump-anydbm=warcprox.dump_anydbm:main', ], }, From 04c4b63f03de82d518262a97aec25ae0949a4ab8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 28 Jun 2016 15:35:02 -0500 Subject: [PATCH 129/146] renaming scope rule "host" to "domain" to make it a less confusing, since rules apply to subdomains as well --- setup.py | 2 +- tests/test_warcprox.py | 28 ++++++++++++++-------------- warcprox/warcproxy.py | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/setup.py b/setup.py index 3339930..e6b35e4 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev15', + version='2.0.dev16', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 8793f80..e6c17a0 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -732,7 +732,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): rules = [ { - "host": "localhost", + "domain": "localhost", "url_match": "STRING_MATCH", "value": "bar", }, @@ -746,7 +746,7 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): 
"value": "http://(localhost:%s,)/fuh/" % (https_daemon.server_port), }, { - "host": "badhost.com", + "domain": "bad.domain.com", }, ] request_meta = {"blocks":rules} @@ -790,16 +790,16 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): verify=False) assert response.status_code == 200 - # blocked by blanket host block - url = 'http://badhost.com/' + # blocked by blanket domain block + url = 'http://bad.domain.com/' response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 403 assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} - # blocked by blanket host block - url = 'https://badhost.com/' + # blocked by blanket domain block + url = 'https://bad.domain.com/' response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -807,24 +807,24 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} - # blocked by blanket host block - url = 'http://badhost.com:1234/' + # blocked by blanket domain block + url = 'http://bad.domain.com:1234/' response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 403 assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} - # blocked by blanket host block - url = 'http://foo.bar.badhost.com/' + # blocked by blanket domain block + url = 'http://foo.bar.bad.domain.com/' response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 403 assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} - # host block also applies to subdomains - url = 'https://foo.bar.badhost.com/' + # domain block also applies to subdomains + url = 'https://foo.bar.bad.domain.com/' response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -832,8 +832,8 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} - # blocked by blanket host block - url = 'http://foo.bar.badhost.com:1234/' + # blocked by blanket domain block + url = 'http://foo.bar.bad.domain.com:1234/' response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 403 diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 882fec9..9966a14 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -120,7 +120,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): def _scope_rule_applies(self, rule): u = Url(self.url) - if "host" in rule and not u.matches_ip_or_domain(rule["host"]): + if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]): return False if "url_match" in rule: if 
rule["url_match"] == "STRING_MATCH": @@ -139,7 +139,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.logger.warn("invalid rule.url_match=%s", rule.url_match) return False else: - if "host" in rule: + if "domain" in rule: # we already know that it matches from earlier check return True else: From 2c8b1940900bcc2c3dfc0b41c44e20539d0bc980 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 28 Jun 2016 15:53:29 -0500 Subject: [PATCH 130/146] really only apply host limits to the host --- setup.py | 2 +- tests/test_warcprox.py | 45 ++++++++++++++++++++++++++++++++++++++++++ warcprox/warcproxy.py | 8 ++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e6b35e4..d23fcea 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev16', + version='2.0.dev17', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index e6c17a0..edc44ec 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -861,6 +861,22 @@ def test_host_doc_soft_limit( time.sleep(0.5) time.sleep(0.5) + # make sure stats from different host don't count + url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port) + for i in range(10): + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + # same host but different scheme and port -- host limit still applies url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) for i in range(8): @@ -903,6 +919,15 @@ def test_host_doc_soft_limit( assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + # make sure limit doesn't get applied to a different host + url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' + # https also blocked url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( @@ -915,6 +940,18 @@ def test_host_doc_soft_limit( assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + # same host, different capitalization still blocked + url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + def test_host_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { @@ -970,6 +1007,14 @@ def test_host_data_soft_limit( # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) + # make sure limit doesn't get applied to a different host + url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'z!' + assert response.content == b'I am the warcprox test payload! 
~~~~~~~~~~!\n' + # blocked because we're over the limit now url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 9966a14..38e39ae 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -177,6 +177,14 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): def _enforce_limit(self, limit_key, limit_value, soft=False): bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + + # if limit_key looks like 'job1:foo.com/total/urls' then we only want + # to apply this rule if the requested url is on host foo.com + bucket0_fields = bucket0.split(':') + if len(bucket0_fields) == 2: + if self.hostname.lower() != bucket0_fields[1].lower(): + return # else host matches, go ahead and enforce the limit + value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and value >= limit_value: body = ("request rejected by warcprox: reached %s %s=%s\n" % ( From c9e403585ba4bcd76ddcb67e586ad83e9065fa26 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 29 Jun 2016 14:56:14 -0500 Subject: [PATCH 131/146] switching from host limits to domain limits, which apply in aggregate to the host and subdomains --- setup.py | 2 +- tests/test_warcprox.py | 115 +++++++++++++++++++++++++++++------------ warcprox/__init__.py | 66 +++++++++++++++++++++++ warcprox/stats.py | 25 +++++---- warcprox/warcproxy.py | 58 ++------------------- 5 files changed, 164 insertions(+), 102 deletions(-) diff --git a/setup.py b/setup.py index d23fcea..d3428f6 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev17', + version='2.0.dev18', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index edc44ec..76901d1 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -42,6 +42,7 @@ import pprint import traceback import signal from collections import Counter +import socket try: import http.server as http_server @@ -65,6 +66,33 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) +# monkey patch dns lookup so we can test domain inheritance on localhost +orig_getaddrinfo = socket.getaddrinfo +orig_gethostbyname = socket.gethostbyname +orig_socket_connect = socket.socket.connect + +def _getaddrinfo(host, port, family=0, type=0, proto=0, flags=0): + if host.endswith('.localhost'): + return orig_getaddrinfo('localhost', port, family, type, proto, flags) + else: + return orig_getaddrinfo(host, port, family, type, proto, flags) + +def _gethostbyname(host): + if host.endswith('.localhost'): + return orig_gethostbyname('localhost') + else: + return orig_gethostbyname(host) + +def _socket_connect(self, address): + if address[0].endswith('.localhost'): + return orig_socket_connect(self, ('localhost', address[1])) + else: + return orig_socket_connect(self, address) + +socket.gethostbyname = _gethostbyname +socket.getaddrinfo = _getaddrinfo +socket.socket.connect = _socket_connect + def dump_state(signum=None, frame=None): pp = pprint.PrettyPrinter(indent=4) state_strs = [] @@ -373,6 +401,13 @@ def test_httpds_no_proxy(http_daemon, https_daemon): assert response.headers['warcprox-test-header'] == 'c!' 
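The socket monkey-patching added above makes any name ending in '.localhost' resolve like plain 'localhost' for the duration of the test run. A self-contained sketch of the same trick, using only the standard library (the fake_getaddrinfo name here is illustrative, not from the patch):

import socket

orig_getaddrinfo = socket.getaddrinfo

def fake_getaddrinfo(host, port, family=0, type=0, proto=0, flags=0):
    # route every subdomain of .localhost to plain localhost
    if isinstance(host, str) and host.endswith('.localhost'):
        host = 'localhost'
    return orig_getaddrinfo(host, port, family, type, proto, flags)

socket.getaddrinfo = fake_getaddrinfo

# 'foo.bar.localhost' now resolves exactly like 'localhost'
print(socket.getaddrinfo('foo.bar.localhost', 80)[0][4][0])

This is what lets the domain-limit tests exercise subdomain inheritance without touching /etc/hosts or running a DNS server.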
assert response.content == b'I am the warcprox test payload! dddddddddd!\n' + # ensure monkey-patched dns resolution is working + url = 'https://foo.bar.localhost:{}/c/d'.format(https_daemon.server_port) + response = requests.get(url, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! dddddddddd!\n' + def _poll_playback_until(playback_proxies, url, status, timeout_sec): start = time.time() # check playback (warc writing is asynchronous, give it up to 10 sec) @@ -840,15 +875,16 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} -def test_host_doc_soft_limit( +def test_domain_doc_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { - "stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]}, - "soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10}, + "stats": {"buckets": [{"bucket":"test_domain_doc_limit_bucket","tally-domains":["foo.localhost"]}]}, + "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} - url = 'http://localhost:{}/o/p'.format(http_daemon.server_port) + # (1) + url = 'http://foo.localhost:{}/o/p'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -861,8 +897,8 @@ def test_host_doc_soft_limit( time.sleep(0.5) time.sleep(0.5) - # make sure stats from different host don't count - url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port) + # make sure stats from different domain don't count + url = 'http://bar.localhost:{}/o/p'.format(http_daemon.server_port) for i in range(10): response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) @@ -877,9 +913,19 @@ def test_host_doc_soft_limit( # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) - # same host but different scheme and port -- host limit still applies - url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) - for i in range(8): + # (2) same host but different scheme and port: domain limit applies + # + url = 'https://foo.localhost:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' + + # (3-9) different subdomain: host limit applies + url = 'https://baz.foo.localhost:{}/o/p'.format(https_daemon.server_port) + for i in range(7): response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -894,6 +940,7 @@ def test_host_doc_soft_limit( # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) + # (10) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -908,19 +955,19 @@ def test_host_doc_soft_limit( # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) - # back to http, and this is the 11th request - url = 'http://localhost:{}/o/p'.format(http_daemon.server_port) + # (11) back to http, and this is the 11th request + url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" - # make sure limit doesn't get applied to a different host - url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port) + # make sure limit doesn't get applied to a different domain + url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -929,39 +976,39 @@ def test_host_doc_soft_limit( assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' # https also blocked - url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) + url = 'https://zuh.foo.localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" # same host, different capitalization still blocked - url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port) + url = 'https://HEHEHE.fOO.lOcALhoST:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" -def test_host_data_soft_limit( +def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { - "stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]}, + "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]}, # response is 135 bytes, so 3rd novel url should be disallowed - "soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200}, + "soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200}, } headers = 
{"Warcprox-Meta": json.dumps(request_meta)} - url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) + url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -976,7 +1023,7 @@ def test_host_data_soft_limit( time.sleep(2.0) # duplicate, does not count toward limit - url = 'https://localhost:{}/y/z'.format(https_daemon.server_port) + url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -992,7 +1039,7 @@ def test_host_data_soft_limit( time.sleep(2.0) # novel, pushes stats over the limit - url = 'https://localhost:{}/z/~'.format(https_daemon.server_port) + url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -1008,7 +1055,7 @@ def test_host_data_soft_limit( time.sleep(2.0) # make sure limit doesn't get applied to a different host - url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port) + url = 'http://baz.localhost:{}/z/~'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -1016,27 +1063,27 @@ def test_host_data_soft_limit( assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n' # blocked because we're over the limit now - url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) + url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n" # https also blocked - url = 'https://localhost:{}/w/x'.format(https_daemon.server_port) + url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 
'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 1eeb9a4..89d8e4e 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -57,6 +57,72 @@ class RequestBlockedByRule(Exception): def __str__(self): return "%s: %s" % (self.__class__.__name__, self.msg) +class Url: + ''' + Utility class + ''' + def __init__(self, url): + self.url = url + self._surt = None + self._host = None + + @property + def surt(self): + if not self._surt: + import surt + hurl = surt.handyurl.parse(self.url) + surt.GoogleURLCanonicalizer.canonicalize(hurl) + hurl.query = None + hurl.hash = None + self._surt = hurl.getURLString(surt=True, trailing_comma=True) + return self._surt + + @property + def host(self): + if not self._host: + import surt + self._host = surt.handyurl.parse(self.url).host + return self._host + + def matches_ip_or_domain(self, ip_or_domain): + return host_matches_ip_or_domain(self.host, ip_or_domain) + +def normalize_host(host): + # normalize host (punycode and lowercase) + return host.encode('idna').decode('ascii').lower() + +def host_matches_ip_or_domain(host, ip_or_domain): + ''' + Returns true if + - ip_or_domain is an ip address and host is the same ip address + - ip_or_domain is a domain and host is the same domain + - ip_or_domain is a domain and host is a subdomain of it + ''' + _host = normalize_host(host) + _ip_or_domain = normalize_host(ip_or_domain) + + if _ip_or_domain == _host: + return True + + # if either _ip_or_domain or host are ip addresses, and they're not + # identical (previous check), not a match + try: + ipaddress.ip_address(_ip_or_domain) + return False + except: + pass + try: + ipaddress.ip_address(_host) + return False + except: + pass + + # if we get here, we're looking at two hostnames + domain_parts = _ip_or_domain.split(".") + host_parts = _host.split(".") + + return host_parts[-len(domain_parts):] == domain_parts + # logging level more fine-grained than logging.DEBUG==10 TRACE = 5 diff --git a/warcprox/stats.py b/warcprox/stats.py index 8d5b324..9fd892d 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -113,15 +113,15 @@ class StatsDb: definition can either be a string, which signifies the name of the bucket, or a dict. If a dict it is expected to have at least an item with key 'bucket' whose value is the name of the bucket. 
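For reference, the domain-matching helpers this patch adds to warcprox/__init__.py condense to the following self-contained sketch. The function bodies mirror the patch, except that the bare except clauses are narrowed to ValueError and a few illustrative asserts are added:

    import ipaddress

    def normalize_host(host):
        # punycode-encode and lowercase, as warcprox.normalize_host() does
        return host.encode('idna').decode('ascii').lower()

    def host_matches_ip_or_domain(host, ip_or_domain):
        _host = normalize_host(host)
        _ip_or_domain = normalize_host(ip_or_domain)
        if _ip_or_domain == _host:
            return True
        # an ip address only ever matches itself; if either side parses as
        # an ip and they differ, it is not a match
        for candidate in (_ip_or_domain, _host):
            try:
                ipaddress.ip_address(candidate)
                return False
            except ValueError:
                pass
        # two hostnames: match if host equals the domain or is a subdomain
        domain_parts = _ip_or_domain.split('.')
        return _host.split('.')[-len(domain_parts):] == domain_parts

    assert host_matches_ip_or_domain('baz.Foo.localhost', 'foo.LOCALhost')
    assert host_matches_ip_or_domain('192.168.10.20', '192.168.10.20')
    assert not host_matches_ip_or_domain('foo.localhost.example', 'foo.localhost')

The suffix comparison on dot-separated labels is what lets a domain entry cover every subdomain while an ip address entry stays exact; the idn tests later in this series exercise the same normalization (e.g. ♛zz.localhost punycoding to xn--zz-xzx.localhost).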
The other - currently recognized item is 'tally-host-stats', which if true, - instructs warcprox to additionally tally substats of the given bucket - by host. Host stats are stored in the stats table under the key - '{parent-bucket}:{host}'. + currently recognized item is 'tally-domains', which if supplied should + be a list of domains. This instructs warcprox to additionally tally + substats of the given bucket by domain. Host stats are stored in the + stats table under the key '{parent-bucket}:{domain(normalized)}'. Example Warcprox-Meta header (a real one will likely have other sections besides 'stats'): - Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-host-stats':true}]}} + Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20'}]}} ''' buckets = ["__all__"] if (recorded_url.warcprox_meta @@ -135,14 +135,13 @@ class StatsDb: 'warcprox-meta header %s', bucket) continue buckets.append(bucket['bucket']) - # XXX maybe host has been computed elsewhere and can be - # cached somewhere, but maybe the performance gain would be - # negligible - if bucket.get('tally-host-stats'): - buckets.append('%s:%s' % ( - bucket['bucket'], - surt.handyurl.parse(recorded_url.url.decode( - 'utf-8')).host)) + if bucket.get('tally-domains'): + url = warcprox.Url(recorded_url.url.decode('utf-8')) + for domain in bucket['tally-domains']: + if url.matches_ip_or_domain(domain): + buckets.append('%s:%s' % ( + bucket['bucket'], + warcprox.normalize_host(domain))) else: buckets.append(bucket) else: diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 38e39ae..ab1a5b7 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -48,57 +48,6 @@ import resource import ipaddress import surt -class Url: - def __init__(self, url): - self.url = url - self._surt = None - self._host = None - - @property - def surt(self): - if not self._surt: - hurl = surt.handyurl.parse(self.url) - surt.GoogleURLCanonicalizer.canonicalize(hurl) - hurl.query = None - hurl.hash = None - self._surt = hurl.getURLString(surt=True, trailing_comma=True) - return self._surt - - @property - def host(self): - if not self._host: - self._host = surt.handyurl.parse(self.url).host - return self._host - - def matches_ip_or_domain(self, ip_or_domain): - """Returns true if - - ip_or_domain is an ip address and self.host is the same ip address - - ip_or_domain is a domain and self.host is the same domain - - ip_or_domain is a domain and self.host is a subdomain of it - """ - if ip_or_domain == self.host: - return True - - # if either ip_or_domain or self.host are ip addresses, and they're not - # identical (previous check), not a match - try: - ipaddress.ip_address(ip_or_domain) - return False - except: - pass - try: - ipaddress.ip_address(self.host) - return False - except: - pass - - # if we get here, we're looking at two hostnames - # XXX do we need to handle case of one punycoded idn, other not? - domain_parts = ip_or_domain.split(".") - host_parts = self.host.split(".") - - return host_parts[-len(domain_parts):] == domain_parts - class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' XXX add more information. @@ -118,7 +67,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but # there's no obvious common dependency where this code should go... 
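To make the tally-domains behavior concrete, here is a rough sketch of the bucket expansion, assuming warcprox is importable so the helpers added above are available; the patch's warning about malformed bucket entries is omitted. Note in passing that the example header in the docstring above drops a closing ']' after the tally-domains list; the dict below shows the intended shape:

    import warcprox

    def stats_bucket_keys(warcprox_meta, url_host):
        # expand the 'stats' section of a parsed Warcprox-Meta header into
        # the list of bucket keys a capture from url_host tallies under
        buckets = ['__all__']
        for bucket in warcprox_meta.get('stats', {}).get('buckets', []):
            if isinstance(bucket, dict):
                buckets.append(bucket['bucket'])
                for domain in bucket.get('tally-domains', []):
                    if warcprox.host_matches_ip_or_domain(url_host, domain):
                        # substats key is '{parent-bucket}:{normalized domain}'
                        buckets.append('%s:%s' % (
                            bucket['bucket'], warcprox.normalize_host(domain)))
            else:
                buckets.append(bucket)
        return buckets

    meta = {'stats': {'buckets': [
        'bucket1', {'bucket': 'bucket2', 'tally-domains': ['foo.bar.com']}]}}
    assert stats_bucket_keys(meta, 'a.b.foo.bar.com') == [
        '__all__', 'bucket1', 'bucket2', 'bucket2:foo.bar.com']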
TBD def _scope_rule_applies(self, rule): - u = Url(self.url) + u = warcprox.Url(self.url) if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]): return False @@ -179,10 +128,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) # if limit_key looks like 'job1:foo.com/total/urls' then we only want - # to apply this rule if the requested url is on host foo.com + # to apply this rule if the requested url is within domain bucket0_fields = bucket0.split(':') if len(bucket0_fields) == 2: - if self.hostname.lower() != bucket0_fields[1].lower(): + if not warcprox.host_matches_ip_or_domain( + self.hostname.lower(), bucket0_fields[1].lower()): return # else host matches, go ahead and enforce the limit value = self.server.stats_db.value(bucket0, bucket1, bucket2) From a59871e17b71f043d9739cebe43e9c968fe4a557 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 29 Jun 2016 15:54:40 -0500 Subject: [PATCH 132/146] idn support, at least for domain limits (getting a segfault in tests on mac however, let's see what happens on travis-ci) --- setup.py | 2 +- tests/test_warcprox.py | 24 +++++++++++++----------- warcprox/__init__.py | 4 +++- warcprox/mitmproxy.py | 19 ++++++++----------- warcprox/warcproxy.py | 17 ++++++++++++----- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index d3428f6..215caf4 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev18', + version='2.0.dev19', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 76901d1..8e021e3 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# vim: set fileencoding=utf-8: ''' tests/test_warcprox.py - automated tests for warcprox @@ -1001,14 +1002,15 @@ def test_domain_doc_soft_limit( def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): + # using idn request_meta = { - "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]}, + "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['🎵zZ.LOCALhost']}]}, # response is 135 bytes, so 3rd novel url should be disallowed - "soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200}, + "soft-limits": {"test_domain_data_limit_bucket:🎵ZZ.localhost/new/wire_bytes":200}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} - url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port) + url = 'http://🎵Zz.localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -1023,7 +1025,7 @@ def test_domain_data_soft_limit( time.sleep(2.0) # duplicate, does not count toward limit - url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port) + url = 'https://baz.🎵zz.localhost:{}/y/z'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -1039,7 +1041,7 @@ def test_domain_data_soft_limit( time.sleep(2.0) # novel, pushes stats over the limit - url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port) + url = 'https://muh.XN--Zz-B862a.locALHOst:{}/z/~'.format(https_daemon.server_port) response = requests.get( 
url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -1063,27 +1065,27 @@ def test_domain_data_soft_limit( assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n' # blocked because we're over the limit now - url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port) + url = 'http://lOl.wHut.🎵ZZ.lOcALHOst:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" # https also blocked - url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port) + url = 'https://xn--zz-b862ah.loCAlhost:{}/w/x'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 
89d8e4e..45b38b2 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -121,7 +121,9 @@ def host_matches_ip_or_domain(host, ip_or_domain): domain_parts = _ip_or_domain.split(".") host_parts = _host.split(".") - return host_parts[-len(domain_parts):] == domain_parts + result = host_parts[-len(domain_parts):] == domain_parts + return result + # logging level more fine-grained than logging.DEBUG==10 TRACE = 5 diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index c6c75b9..85960ec 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -184,24 +184,21 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): def _determine_host_port(self): # Get hostname and port to connect to if self.is_connect: - self.hostname, self.port = self.path.split(':') + host, self.port = self.path.split(':') else: self.url = self.path u = urllib_parse.urlparse(self.url) if u.scheme != 'http': - raise Exception('unable to parse request "{}" as a proxy request'.format(self.requestline)) - self.hostname = u.hostname + raise Exception( + 'unable to parse request %s as a proxy request' % ( + repr(self.requestline))) + host = u.hostname self.port = u.port or 80 self.path = urllib_parse.urlunparse( urllib_parse.ParseResult( - scheme='', - netloc='', - params=u.params, - path=u.path or '/', - query=u.query, - fragment=u.fragment - ) - ) + scheme='', netloc='', params=u.params, path=u.path or '/', + query=u.query, fragment=u.fragment)) + self.hostname = warcprox.normalize_host(host) def _connect_to_remote_server(self): # Connect to destination diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index ab1a5b7..0dc736e 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -126,19 +126,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): def _enforce_limit(self, limit_key, limit_value, soft=False): bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + _limit_key = limit_key # if limit_key looks like 'job1:foo.com/total/urls' then we only want # to apply this rule if the requested url is within domain bucket0_fields = bucket0.split(':') if len(bucket0_fields) == 2: + self.logger.info( + 'checking %s:%s', repr(limit_key), repr(limit_value)) if not warcprox.host_matches_ip_or_domain( - self.hostname.lower(), bucket0_fields[1].lower()): + self.hostname, bucket0_fields[1]): return # else host matches, go ahead and enforce the limit + bucket0 = '%s:%s' % ( + bucket0_fields[0], + warcprox.normalize_host(bucket0_fields[1])) + _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and value >= limit_value: body = ("request rejected by warcprox: reached %s %s=%s\n" % ( - "soft limit" if soft else "limit", limit_key, + "soft limit" if soft else "limit", _limit_key, limit_value)).encode("utf-8") if soft: self.send_response(430, "Reached soft limit") @@ -151,9 +158,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): "stats": {bucket0:self.server.stats_db.value(bucket0)} } if soft: - response_meta["reached-soft-limit"] = {limit_key:limit_value} + response_meta["reached-soft-limit"] = {_limit_key:limit_value} else: - response_meta["reached-limit"] = {limit_key:limit_value} + response_meta["reached-limit"] = {_limit_key:limit_value} self.send_header( "Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) @@ -166,7 +173,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], 430 if soft else 420, self.command, self.url, "soft limit" if soft else 
"limit", - limit_key, limit_value)) + _limit_key, limit_value)) def _enforce_limits(self, warcprox_meta): """ From 33775d360a3489a44d2e7bac7e65c094d09c226d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 29 Jun 2016 16:47:54 -0500 Subject: [PATCH 133/146] comment out segfaulting test --- setup.py | 2 +- tests/test_warcprox.py | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index 215caf4..a41c2a2 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev19', + version='2.0.dev20', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 8e021e3..af8ae69 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1075,17 +1075,21 @@ def test_domain_data_soft_limit( assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" - # https also blocked - url = 'https://xn--zz-b862ah.loCAlhost:{}/w/x'.format(https_daemon.server_port) - response = requests.get( - url, proxies=archiving_proxies, headers=headers, stream=True, - verify=False) - assert response.status_code == 430 - assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}} - assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta - assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" + # XXX this check is resulting in a segfault on mac and linux, from ssl I + # think, probably because of the dns resolution monkey-patching + # https://travis-ci.org/internetarchive/warcprox/builds/141187342 + # + ### # https also blocked + ### url = 'https://xn--zz-b862ah.loCAlhost:{}/w/x'.format(https_daemon.server_port) + ### response = requests.get( + ### url, proxies=archiving_proxies, headers=headers, stream=True, + ### verify=False) + ### assert response.status_code == 430 + ### assert response.reason == "Reached soft limit" + ### expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}} + ### assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + ### assert response.headers["content-type"] == "text/plain;charset=utf-8" + ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site 
(facebook) being From 46c24833ffab5e746cd3ef01a84f0dca6edb91ec Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 29 Jun 2016 17:16:50 -0500 Subject: [PATCH 134/146] emoji idn fails with python 2.7, so test with a BMP unicode character --- setup.py | 2 +- tests/test_warcprox.py | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index a41c2a2..cb26402 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev20', + version='2.0.dev21', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index af8ae69..10f214a 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1004,13 +1004,13 @@ def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): # using idn request_meta = { - "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['🎵zZ.LOCALhost']}]}, + "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['♛zZ.LOCALhost']}]}, # response is 135 bytes, so 3rd novel url should be disallowed - "soft-limits": {"test_domain_data_limit_bucket:🎵ZZ.localhost/new/wire_bytes":200}, + "soft-limits": {"test_domain_data_limit_bucket:♛ZZ.localhost/new/wire_bytes":200}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} - url = 'http://🎵Zz.localhost:{}/y/z'.format(http_daemon.server_port) + url = 'http://♛Zz.localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -1025,7 +1025,7 @@ def test_domain_data_soft_limit( time.sleep(2.0) # duplicate, does not count toward limit - url = 'https://baz.🎵zz.localhost:{}/y/z'.format(https_daemon.server_port) + url = 'https://baz.♛zz.localhost:{}/y/z'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -1041,7 +1041,7 @@ def test_domain_data_soft_limit( time.sleep(2.0) # novel, pushes stats over the limit - url = 'https://muh.XN--Zz-B862a.locALHOst:{}/z/~'.format(https_daemon.server_port) + url = 'https://muh.XN--Zz-xZX.locALHOst:{}/z/~'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -1065,31 +1065,31 @@ def test_domain_data_soft_limit( assert response.content == b'I am the warcprox test payload! 
~~~~~~~~~~!\n' # blocked because we're over the limit now - url = 'http://lOl.wHut.🎵ZZ.lOcALHOst:{}/y/z'.format(http_daemon.server_port) + url = 'http://lOl.wHut.♛ZZ.lOcALHOst:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-xzx.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-xzx.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-xzx.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-xzx.localhost/new/wire_bytes=200\n" # XXX this check is resulting in a segfault on mac and linux, from ssl I # think, probably because of the dns resolution monkey-patching # https://travis-ci.org/internetarchive/warcprox/builds/141187342 # ### # https also blocked - ### url = 'https://xn--zz-b862ah.loCAlhost:{}/w/x'.format(https_daemon.server_port) + ### url = 'https://xn--zz-xzxh.loCAlhost:{}/w/x'.format(https_daemon.server_port) ### response = requests.get( ### url, proxies=archiving_proxies, headers=headers, stream=True, ### verify=False) ### assert response.status_code == 430 ### assert response.reason == "Reached soft limit" - ### expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}} + ### expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-xzx.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-xzx.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-xzx.localhost'}}} ### assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta ### assert response.headers["content-type"] == "text/plain;charset=utf-8" - ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" + ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-xzx.localhost/new/wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection 
to the internet, and relies on a third party site (facebook) being From b82d82b5f111a5758f8143b5d0b63e13954ca2c7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 30 Jun 2016 15:24:40 -0500 Subject: [PATCH 135/146] command line utility warcprox-ensure-rethinkdb-tables, creates rethinkdb tables if they don't already exist... warcprox normally creates them on demand at startup, but if multiple instances are starting up at the same time, you can end up with duplicate broken tables, so it's a good idea to use this utility when spinning up a cluster --- setup.py | 4 +++- warcprox/main.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cb26402..0062bac 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev21', + version='2.0.dev22', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', @@ -66,6 +66,8 @@ setuptools.setup( entry_points={ 'console_scripts': [ 'warcprox=warcprox.main:main', + ('warcprox-ensure-rethinkdb-tables=' + 'warcprox.main:ensure_rethinkdb_tables'), 'dump-anydbm=warcprox.dump_anydbm:main', ], }, diff --git a/warcprox/main.py b/warcprox/main.py index b203c1c..c4c3006 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -248,6 +248,49 @@ def main(argv=sys.argv): real_main(args) +def ensure_rethinkdb_tables(): + ''' + Creates rethinkdb tables if they don't already exist. Warcprox normally + creates the tables it needs on demand at startup, but if multiple instances + are starting up at the same time, you can end up with duplicate broken + tables. So it's a good idea to use this utility at an early step when + spinning up a cluster. + ''' + arg_parser = argparse.ArgumentParser( + prog=os.path.basename(sys.argv[0]), + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + arg_parser.add_argument( + '--rethinkdb-servers', dest='rethinkdb_servers', default='localhost', + help='rethinkdb servers e.g. 
db0.foo.org,db0.foo.org:38015,db1.foo.org') + arg_parser.add_argument( + '--rethinkdb-db', dest='rethinkdb_db', default='warcprox', + help='rethinkdb database name') + arg_parser.add_argument( + '-q', '--quiet', dest='log_level', + action='store_const', default=logging.INFO, const=logging.WARN) + arg_parser.add_argument( + '-v', '--verbose', dest='log_level', + action='store_const', default=logging.INFO, const=logging.DEBUG) + args = arg_parser.parse_args(args=sys.argv[1:]) + + logging.basicConfig( + stream=sys.stdout, level=args.log_level, + format=( + '%(asctime)s %(levelname)s %(name)s.%(funcName)s' + '(%(filename)s:%(lineno)d) %(message)s')) + + r = rethinkstuff.Rethinker( + args.rethinkdb_servers.split(','), args.rethinkdb_db) + + # services table + rethinkstuff.ServiceRegistry(r) + + # stats table + warcprox.stats.RethinkStatsDb(r) + + # captures table + warcprox.bigtable.RethinkCaptures(r) + if __name__ == '__main__': main() From 5eed7061b1386e90fc3e7f39de83d4162cee074b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 5 Jul 2016 11:51:56 -0500 Subject: [PATCH 136/146] do not require --kafka-capture-feed-topic to make the kafka capture feed work (it can be configured per job or per site) --- setup.py | 2 +- warcprox/kafkafeed.py | 50 ++++++++++++++++++++++--------------------- warcprox/main.py | 15 +++++++------ warcprox/warcproxy.py | 2 -- 4 files changed, 35 insertions(+), 34 deletions(-) diff --git a/setup.py b/setup.py index 0062bac..d785cbc 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev22', + version='2.0.dev23', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/kafkafeed.py b/warcprox/kafkafeed.py index e9d2176..8b2dbf9 100644 --- a/warcprox/kafkafeed.py +++ b/warcprox/kafkafeed.py @@ -1,24 +1,24 @@ -# -# warcprox/kafkafeed.py - support for publishing information about archived -# urls to apache kafka -# -# Copyright (C) 2015-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +''' +warcprox/kafkafeed.py - support for publishing information about archived +urls to apache kafka + +Copyright (C) 2015-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
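The race that warcprox-ensure-rethinkdb-tables works around comes down to check-then-create not being atomic across processes: two instances can both see a table as missing and both try to create it. A minimal sketch of the idempotent-creation idea using the plain rethinkdb driver follows; the real utility goes through rethinkstuff and the warcprox stats/bigtable classes instead, and the server address here is illustrative:

    import rethinkdb as r

    def ensure_table(conn, db, table):
        # still racy if two processes run it at once, hence the advice to
        # run the utility once, before spinning up the cluster
        if db not in r.db_list().run(conn):
            r.db_create(db).run(conn)
        if table not in r.db(db).table_list().run(conn):
            r.db(db).table_create(table).run(conn)

    conn = r.connect('localhost', 28015)
    for table in ('services', 'stats', 'captures'):
        ensure_table(conn, 'warcprox', table)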
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' import kafka import datetime @@ -29,7 +29,7 @@ from hanzo import warctools class CaptureFeed: logger = logging.getLogger('warcprox.kafkafeed.CaptureFeed') - def __init__(self, broker_list, topic): + def __init__(self, broker_list, topic=None): self.broker_list = broker_list self.topic = topic self._producer = kafka.KafkaProducer(bootstrap_servers=broker_list) @@ -38,6 +38,10 @@ class CaptureFeed: if records[0].type not in (b'revisit', b'response'): return + topic = recorded_url.warcprox_meta.get('capture-feed-topic', self.topic) + if not topic: + return + try: payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode('utf-8') except: @@ -72,8 +76,6 @@ class CaptureFeed: for (k,v) in recorded_url.warcprox_meta['capture-feed-extra-fields'].items(): d[k] = v - topic = recorded_url.warcprox_meta.get('capture-feed-topic', self.topic) - msg = json.dumps(d, separators=(',', ':')).encode('utf-8') self.logger.debug('feeding kafka topic=%s msg=%s', repr(topic), msg) self._producer.send(topic, msg) diff --git a/warcprox/main.py b/warcprox/main.py index c4c3006..18a316e 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -34,7 +34,6 @@ import hashlib import argparse import os import socket -import pprint import traceback import signal import threading @@ -118,18 +117,19 @@ def dump_state(signum=None, frame=None): ''' Signal handler, logs stack traces of active threads. ''' - pp = pprint.PrettyPrinter(indent=4) state_strs = [] for th in threading.enumerate(): try: state_strs.append(str(th)) except AssertionError: - state_strs.append("") + state_strs.append('') stack = traceback.format_stack(sys._current_frames()[th.ident]) - state_strs.append("".join(stack)) + state_strs.append(''.join(stack)) - logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))) + logging.warn( + 'dumping state (caught signal %s)\n%s', + signum, '\n'.join(state_strs)) def init_controller(args): ''' @@ -171,8 +171,9 @@ def init_controller(args): stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options) listeners.append(stats_db) - if args.kafka_broker_list and args.kafka_capture_feed_topic: - kafka_capture_feed = warcprox.kafkafeed.CaptureFeed(args.kafka_broker_list, args.kafka_capture_feed_topic) + if args.kafka_broker_list: + kafka_capture_feed = warcprox.kafkafeed.CaptureFeed( + args.kafka_broker_list, args.kafka_capture_feed_topic) listeners.append(kafka_capture_feed) recorded_url_q = queue.Queue(maxsize=args.queue_size) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 0dc736e..4bf693f 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -132,8 +132,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # to apply this rule if the requested url is within domain bucket0_fields = bucket0.split(':') if len(bucket0_fields) == 2: - self.logger.info( - 'checking %s:%s', repr(limit_key), repr(limit_value)) if not warcprox.host_matches_ip_or_domain( self.hostname, bucket0_fields[1]): return # else host matches, go ahead and enforce the limit From 00f48d6566cf84790dbbd092be275dab2149501b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 5 Jul 2016 18:45:17 -0500 Subject: [PATCH 137/146] less verbose logging about updating big captures table --- setup.py | 2 +- warcprox/bigtable.py | 57 
++++++++++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index d785cbc..704c598 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev23', + version='2.0.dev24', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 66b84f0..a1ac377 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -1,26 +1,26 @@ -# -# warcprox/bigtable.py - module for "big" RethinkDB table for deduplication; -# the table is "big" in the sense that it is designed to be usable as an index -# for playback software outside of warcprox, and contains information not -# needed merely for deduplication -# -# Copyright (C) 2015-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +""" +warcprox/bigtable.py - module for "big" RethinkDB table for deduplication; +the table is "big" in the sense that it is designed to be usable as an index +for playback software outside of warcprox, and contains information not +needed merely for deduplication + +Copyright (C) 2015-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
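An aside on the dump_state() handler tidied up in the previous patch: stripped to its essentials, it just logs a stack trace for every live thread, which makes it a handy diagnostic for the kind of deadlocks and hangs several later patches deal with. A standalone sketch, wired to SIGQUIT here purely for illustration, with the AssertionError guard around str(th) dropped for brevity:

    import logging
    import signal
    import sys
    import threading
    import traceback

    def dump_state(signum=None, frame=None):
        # collect a stack trace for each active thread, then log the lot
        state_strs = []
        for th in threading.enumerate():
            state_strs.append(str(th))
            stack = traceback.format_stack(sys._current_frames()[th.ident])
            state_strs.append(''.join(stack))
        logging.warning(
            'dumping state (caught signal %s)\n%s',
            signum, '\n'.join(state_strs))

    signal.signal(signal.SIGQUIT, dump_state)   # then: kill -QUIT <pid>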
+""" from __future__ import absolute_import @@ -63,9 +63,14 @@ class RethinkCaptures: with self._batch_lock: if len(self._batch) > 0: result = self.r.table(self.table).insert(self._batch).run() - if result["inserted"] != len(self._batch) or sorted(result.values()) != [0,0,0,0,0,len(self._batch)]: - raise Exception("unexpected result %s saving batch of %s entries", result, len(self._batch)) - self.logger.info("saved %s entries to big capture table db", len(self._batch)) + if result["inserted"] != len(self._batch) or sorted( + result.values()) != [0,0,0,0,0,len(self._batch)]: + raise Exception( + "unexpected result %s saving batch of %s " + "entries", result, len(self._batch)) + self.logger.debug( + "saved %s entries to big capture table db", + len(self._batch)) self._batch = [] except BaseException as e: self.logger.error( From a5d6d634d8e7a9a9fa9f95ea267966624b3dcf7b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 11 Jul 2016 11:23:53 -0500 Subject: [PATCH 138/146] enable pypy and pypy3 travis-ci tests, but allow failures --- .travis.yml | 41 +++++++++++++++++++++++------------------ setup.py | 2 +- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/.travis.yml b/.travis.yml index de744f3..7bac95a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,31 +1,36 @@ language: python python: - - 3.5 - - 3.4 - - 2.7 - - nightly - # - pypy - # - pypy3 +- 3.5 +- 3.4 +- 2.7 +- nightly +- pypy +- pypy3 + +matrix: + allow_failures: + - python: pypy + - python: pypy3 addons: - apt: - packages: - - python-gdbm - - python3-gdbm - - tor + apt: + packages: + - python-gdbm + - python3-gdbm + - tor services: - - docker +- docker before_install: - - sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778 - - docker run -d --publish=28015:28015 rethinkdb +- sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778 +- docker run -d --publish=28015:28015 rethinkdb before_script: - - pip install . pytest requests +- pip install . 
pytest requests script: - - py.test -v -s tests - - py.test -v -s --rethinkdb-servers=localhost tests tests - - py.test -v -s --rethinkdb-servers=localhost --rethinkdb-big-table tests +- py.test -v -s tests +- py.test -v -s --rethinkdb-servers=localhost tests tests +- py.test -v -s --rethinkdb-servers=localhost --rethinkdb-big-table tests diff --git a/setup.py b/setup.py index 704c598..151684d 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev24', + version='2.0.dev25', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From fdd6086d65a7da17744632e08839d5f93cc9add7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 21 Jul 2016 19:09:35 -0500 Subject: [PATCH 139/146] version 2.0b1 for upload to pypi --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 151684d..94d7750 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev25', + version='2.0b1', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 1ddebbc50e05b07de22c5765a3ddf96ece1c63e1 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 21 Jul 2016 19:12:46 -0500 Subject: [PATCH 140/146] bump up to next dev version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 94d7750..844d8ca 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0b1', + version='2.0b2.dev26', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 504af2fb0f2f45633adda8f8a80edf480915cdff Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 7 Sep 2016 13:01:11 -0700 Subject: [PATCH 141/146] try to avoid ever blocking when sending messages to kafka --- README.rst | 1 - setup.py | 2 +- warcprox/kafkafeed.py | 4 +++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 1a4bfbb..388bcf5 100644 --- a/README.rst +++ b/README.rst @@ -13,7 +13,6 @@ Warcprox runs on python 3.4. 
To install latest release run: - :: # apt-get install libffi-dev libssl-dev python3-gdbm diff --git a/setup.py b/setup.py index 844d8ca..147cb35 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0b2.dev26', + version='2.0b2.dev27', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/kafkafeed.py b/warcprox/kafkafeed.py index 8b2dbf9..683e925 100644 --- a/warcprox/kafkafeed.py +++ b/warcprox/kafkafeed.py @@ -32,7 +32,9 @@ class CaptureFeed: def __init__(self, broker_list, topic=None): self.broker_list = broker_list self.topic = topic - self._producer = kafka.KafkaProducer(bootstrap_servers=broker_list) + # acks=0 to avoid ever blocking + self._producer = kafka.KafkaProducer( + bootstrap_servers=broker_list, acks=0) def notify(self, recorded_url, records): if records[0].type not in (b'revisit', b'response'): From 5d44859ba8dfe84cd97cbcdf4a86b97a6c07aa4b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 7 Sep 2016 13:43:01 -0700 Subject: [PATCH 142/146] keep trying to connect to kafka and don't let connection failure interfere with other warcprox operations --- setup.py | 2 +- warcprox/kafkafeed.py | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 147cb35..4eeb13d 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0b2.dev27', + version='2.0b2.dev28', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/kafkafeed.py b/warcprox/kafkafeed.py index 683e925..e17d2c4 100644 --- a/warcprox/kafkafeed.py +++ b/warcprox/kafkafeed.py @@ -32,9 +32,24 @@ class CaptureFeed: def __init__(self, broker_list, topic=None): self.broker_list = broker_list self.topic = topic - # acks=0 to avoid ever blocking - self._producer = kafka.KafkaProducer( - bootstrap_servers=broker_list, acks=0) + self.__producer = None + self._connection_exception = None + + def _producer(self): + if not self.__producer: + try: + # acks=0 to avoid ever blocking + self.__producer = kafka.KafkaProducer( + bootstrap_servers=self.broker_list, acks=0) + if self._connection_exception: + logging.info('connected to kafka successfully!') + self._connection_exception = None + except Exception as e: + if not self._connection_exception: + self._connection_exception = e + logging.error('problem connecting to kafka', exc_info=True) + + return self.__producer def notify(self, recorded_url, records): if records[0].type not in (b'revisit', b'response'): @@ -80,5 +95,7 @@ class CaptureFeed: msg = json.dumps(d, separators=(',', ':')).encode('utf-8') self.logger.debug('feeding kafka topic=%s msg=%s', repr(topic), msg) - self._producer.send(topic, msg) + p = self._producer() + if p: + p.send(topic, msg) From 6000237c47540016a79071ea2d678a59d89e848a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 23 Sep 2016 15:54:31 +0100 Subject: [PATCH 143/146] workaround for nasty python/ssl deadlock that has been affecting warcprox, same issue as https://github.com/pyca/cryptography/issues/2911 --- setup.py | 2 +- warcprox/main.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4eeb13d..6eca329 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0b2.dev28', + version='2.0b2.dev29', 
description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/main.py b/warcprox/main.py index 18a316e..a127016 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -41,6 +41,7 @@ import certauth.certauth import warcprox import re import rethinkstuff +import cryptography.hazmat.backends.openssl def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser = argparse.ArgumentParser(prog=prog, @@ -213,6 +214,9 @@ def init_controller(args): return controller def real_main(args): + # see https://github.com/pyca/cryptography/issues/2911 + cryptography.hazmat.backends.openssl.backend.activate_builtin_random() + controller = init_controller(args) signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) From 314be33707ce047716fcbd7453346c10f10e0640 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 19 Oct 2016 13:43:44 -0700 Subject: [PATCH 144/146] new test that reveals connection hang on https urls missing a content-length http response header (not chunked and server leaves connection open) -- reported by Alex Osborne --- setup.py | 2 +- tests/test_warcprox.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6eca329..e29a4f1 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0b2.dev29', + version='2.0b2.dev30', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 10f214a..6d4b986 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -123,6 +123,11 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): + special_header + b'\r\n' + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n' + b'\r\n') + elif self.path == '/missing-content-length': + headers = (b'HTTP/1.1 200 OK\r\n' + + b'Content-Type: text/plain\r\n' + + b'\r\n') + payload = b'This response is missing a Content-Length http header.' 
else: payload = b'404 Not Found\n' headers = (b'HTTP/1.1 404 Not Found\r\n' @@ -1103,6 +1108,43 @@ def _test_tor_onion(archiving_proxies): proxies=archiving_proxies, verify=False, allow_redirects=False) assert response.status_code == 200 +def test_missing_content_length(archiving_proxies, http_daemon, https_daemon): + # double-check that our test http server is responding as expected + url = 'http://localhost:%s/missing-content-length' % ( + http_daemon.server_port) + response = requests.get(url, verify=False, timeout=10) + assert response.content == ( + b'This response is missing a Content-Length http header.') + assert not 'content-length' in response.headers + + # double-check that our test https server is responding as expected + url = 'https://localhost:%s/missing-content-length' % ( + https_daemon.server_port) + response = requests.get(url, verify=False, timeout=10) + assert response.content == ( + b'This response is missing a Content-Length http header.') + assert not 'content-length' in response.headers + + # now check that the proxy doesn't hang (http) + url = 'http://localhost:%s/missing-content-length' % ( + http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + assert response.content == ( + b'This response is missing a Content-Length http header.') + assert not 'content-length' in response.headers + + # now check that the proxy doesn't hang (https) + url = 'https://localhost:%s/missing-content-length' % ( + https_daemon.server_port) + # before fixing the issue this tests for, this would fail by raising + # requests.exceptions.ConnectionError: ... Read timed out + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + assert response.content == ( + b'This response is missing a Content-Length http header.') + assert not 'content-length' in response.headers + if __name__ == '__main__': pytest.main() From 15eeaebde5cccf70d95f61294795314193dfed12 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 19 Oct 2016 13:45:46 -0700 Subject: [PATCH 145/146] fix for connection hang on https urls missing a content-length http response header --- setup.py | 2 +- warcprox/warcproxy.py | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index e29a4f1..31481ea 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0b2.dev30', + version='2.0b2.dev31', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 4bf693f..61cccf7 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -349,8 +349,12 @@ class RecordedUrl: class SingleThreadedWarcProxy(http_server.HTTPServer): logger = logging.getLogger("warcprox.warcproxy.WarcProxy") - def __init__(self, ca=None, recorded_url_q=None, stats_db=None, options=warcprox.Options()): - server_address = (options.address or 'localhost', options.port if options.port is not None else 8000) + def __init__( + self, ca=None, recorded_url_q=None, stats_db=None, + options=warcprox.Options()): + server_address = ( + options.address or 'localhost', + options.port if options.port is not None else 8000) if options.onion_tor_socks_proxy: try: @@ -361,7 +365,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy 
WarcProxyHandler.onion_tor_socks_proxy_port = None - http_server.HTTPServer.__init__(self, server_address, WarcProxyHandler, bind_and_activate=True) + http_server.HTTPServer.__init__( + self, server_address, WarcProxyHandler, bind_and_activate=True) self.digest_algorithm = options.digest_algorithm or 'sha1' @@ -393,9 +398,35 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): def handle_error(self, request, client_address): self.logger.warn("exception processing request %s from %s", request, client_address, exc_info=True) + def finish_request(self, request, client_address): + ''' + We override socketserver.BaseServer.finish_request to get at + WarcProxyHandler's self.request. A normal socket server's self.request + is set to `request` and never changes, but in our case, it may be + replaced with an SSL socket. The caller of this method, e.g. + PooledMixIn.process_request_thread, needs to get a hold of that socket + so it can close it. + ''' + req_handler = WarcProxyHandler(request, client_address, self) + return req_handler.request + class PooledMixIn(socketserver.ThreadingMixIn): def process_request(self, request, client_address): self.pool.submit(self.process_request_thread, request, client_address) + def process_request_thread(self, request, client_address): + ''' + This an almost verbatim copy/paste of + socketserver.ThreadingMixIn.process_request_thread. + The only difference is that it expects self.finish_request to return + a request. See the comment on SingleThreadedWarcProxy.finish_request + above. + ''' + try: + request = self.finish_request(request, client_address) + self.shutdown_request(request) + except: + self.handle_error(request, client_address) + self.shutdown_request(request) class WarcProxy(PooledMixIn, SingleThreadedWarcProxy): logger = logging.getLogger("warcprox.warcproxy.WarcProxy") From 719380e6124f5f9db49fe1cc13fea891611d1e69 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 19 Oct 2016 15:32:58 -0700 Subject: [PATCH 146/146] refactor some general mitm proxy stuff into mitmproxy.py --- setup.py | 2 +- tests/single-threaded-proxy.py | 42 +++++++++---------- warcprox/mitmproxy.py | 70 +++++++++++++++++++++++++++++++ warcprox/warcproxy.py | 77 ++++++++++------------------------ 4 files changed, 112 insertions(+), 79 deletions(-) diff --git a/setup.py b/setup.py index 31481ea..9e87a09 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0b2.dev31', + version='2.0b2.dev32', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/single-threaded-proxy.py b/tests/single-threaded-proxy.py index 5954fd1..dd5e709 100755 --- a/tests/single-threaded-proxy.py +++ b/tests/single-threaded-proxy.py @@ -1,27 +1,25 @@ #!/usr/bin/env python -# -# tests/single-threaded-proxy.py - single-threaded recording proxy, useful for -# debugging -# -# Copyright (C) 2015-2016 Internet Archive -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
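The crux of the preceding hang fix: after a CONNECT, the handler replaces its self.request with an ssl-wrapped socket, but socketserver's shutdown path still holds the original plain socket, so the wrapped one is never closed and a response with no Content-Length (and no chunking) leaves the client reading forever. The override pattern, reduced to a generic socketserver sketch with an illustrative class name:

    import socketserver

    class ProxyingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
        def finish_request(self, request, client_address):
            # hand back the handler's self.request, which the handler may
            # have swapped for an ssl socket, so callers shut down the
            # socket actually in use
            handler = self.RequestHandlerClass(request, client_address, self)
            return handler.request

        def process_request_thread(self, request, client_address):
            # as socketserver.ThreadingMixIn.process_request_thread, except
            # that `request` is rebound to finish_request()'s return value
            # (and the bare except is narrowed)
            try:
                request = self.finish_request(request, client_address)
                self.shutdown_request(request)
            except Exception:
                self.handle_error(request, client_address)
                self.shutdown_request(request)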
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, -# USA. -# +""" +tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for +debugging, does not write warcs -"""Useful for debugging. Does not write warcs.""" +Copyright (C) 2015-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +""" from __future__ import absolute_import diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 85960ec..6f48d30 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -48,6 +48,12 @@ import datetime import socks import tempfile import hashlib +try: + import socketserver +except ImportError: + import SocketServer as socketserver +import resource +import concurrent.futures class ProxyingRecorder(object): """ @@ -397,3 +403,67 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): def log_error(self, fmt, *args): self.logger.warn(fmt, *args) +class PooledMixIn(socketserver.ThreadingMixIn): + logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn") + def __init__(self, max_threads=None): + ''' + If max_threads is not supplied, calculates a reasonable value based + on system resource limits. + ''' + if not max_threads: + # man getrlimit: "RLIMIT_NPROC The maximum number of processes (or, + # more precisely on Linux, threads) that can be created for the + # real user ID of the calling process." + rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0] + rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0] + max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2) + self.logger.info( + "max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)", + max_threads, rlimit_nproc, rlimit_nofile) + self.pool = concurrent.futures.ThreadPoolExecutor(max_threads) + + def process_request(self, request, client_address): + self.pool.submit(self.process_request_thread, request, client_address) + +class MitmProxy(http_server.HTTPServer): + def finish_request(self, request, client_address): + ''' + We override socketserver.BaseServer.finish_request to get at + MitmProxyHandler's self.request. A normal socket server's self.request + is set to `request` and never changes, but in our case, it may be + replaced with an SSL socket. The caller of this method (e.g. + self.process_request or PooledMitmProxy.process_request_thread) needs + to get a hold of that socket so it can close it. + ''' + req_handler = self.RequestHandlerClass(request, client_address, self) + return req_handler.request + + def process_request(self, request, client_address): + ''' + This an almost verbatim copy/paste of + socketserver.BaseServer.process_request. + The only difference is that it expects self.finish_request to return + the request (i.e. the socket). 
+        to self.shutdown_request. See the comment on self.finish_request for
+        the rationale.
+        '''
+        request = self.finish_request(request, client_address)
+        self.shutdown_request(request)
+
+class PooledMitmProxy(PooledMixIn, MitmProxy):
+    def process_request_thread(self, request, client_address):
+        '''
+        This is an almost verbatim copy/paste of
+        socketserver.ThreadingMixIn.process_request_thread.
+        The only difference is that it expects self.finish_request to return
+        the request (i.e. the socket). This new value of request is passed on
+        to self.shutdown_request. See the comment on MitmProxy.finish_request
+        for the rationale.
+        '''
+        try:
+            request = self.finish_request(request, client_address)
+            self.shutdown_request(request)
+        except:
+            self.handle_error(request, client_address)
+            self.shutdown_request(request)
+
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index 61cccf7..f9e07c3 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -43,8 +43,6 @@ from hanzo import warctools
 from certauth.certauth import CertificateAuthority
 import warcprox
 import datetime
-import concurrent.futures
-import resource
 import ipaddress
 import surt
@@ -387,64 +385,31 @@ class SingleThreadedWarcProxy(http_server.HTTPServer):
 
         self.options = options
 
+class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
+    logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
+
+    def __init__(
+            self, ca=None, recorded_url_q=None, stats_db=None,
+            options=warcprox.Options()):
+        if options.max_threads:
+            self.logger.info(
+                    "max_threads=%s set by command line option",
+                    options.max_threads)
+        warcprox.mitmproxy.PooledMitmProxy.__init__(self, options.max_threads)
+        SingleThreadedWarcProxy.__init__(
+                self, ca, recorded_url_q, stats_db, options)
+
     def server_activate(self):
         http_server.HTTPServer.server_activate(self)
-        self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
+        self.logger.info(
+                'listening on %s:%s', self.server_address[0],
+                self.server_address[1])
 
     def server_close(self):
-        self.logger.info('WarcProxy shutting down')
+        self.logger.info('shutting down')
         http_server.HTTPServer.server_close(self)
 
     def handle_error(self, request, client_address):
-        self.logger.warn("exception processing request %s from %s", request, client_address, exc_info=True)
-
-    def finish_request(self, request, client_address):
-        '''
-        We override socketserver.BaseServer.finish_request to get at
-        WarcProxyHandler's self.request. A normal socket server's self.request
-        is set to `request` and never changes, but in our case, it may be
-        replaced with an SSL socket. The caller of this method, e.g.
-        PooledMixIn.process_request_thread, needs to get a hold of that socket
-        so it can close it.
-        '''
-        req_handler = WarcProxyHandler(request, client_address, self)
-        return req_handler.request
-
-class PooledMixIn(socketserver.ThreadingMixIn):
-    def process_request(self, request, client_address):
-        self.pool.submit(self.process_request_thread, request, client_address)
-    def process_request_thread(self, request, client_address):
-        '''
-        This is an almost verbatim copy/paste of
-        socketserver.ThreadingMixIn.process_request_thread.
-        The only difference is that it expects self.finish_request to return
-        a request. See the comment on SingleThreadedWarcProxy.finish_request
-        above.
-        '''
-        try:
-            request = self.finish_request(request, client_address)
-            self.shutdown_request(request)
-        except:
-            self.handle_error(request, client_address)
-            self.shutdown_request(request)
-
-class WarcProxy(PooledMixIn, SingleThreadedWarcProxy):
-    logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
-
-    def __init__(self, *args, **kwargs):
-        SingleThreadedWarcProxy.__init__(self, *args, **kwargs)
-        if self.options.max_threads:
-            max_threads = self.options.max_threads
-            self.logger.info("max_threads=%s set by command line option",
-                    max_threads)
-        else:
-            # man getrlimit: "RLIMIT_NPROC The maximum number of processes (or,
-            # more precisely on Linux, threads) that can be created for the
-            # real user ID of the calling process."
-            rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
-            rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
-            max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
-            self.logger.info("max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
-                    max_threads, rlimit_nproc, rlimit_nofile)
-
-        self.pool = concurrent.futures.ThreadPoolExecutor(max_threads)
+        self.logger.warn(
+                "exception processing request %s from %s", request,
+                client_address, exc_info=True)
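
A note on the finish_request() override that patch 146 consolidates into MitmProxy: socketserver.BaseServer.finish_request normally discards the handler, so the server only ever sees the socket it originally accept()ed; returning req_handler.request instead lets the caller close whichever socket the handler actually ended the request with (an SSL-wrapped one, after a CONNECT). The pattern works outside warcprox too; the following is a minimal standalone sketch, where EchoHandler and EchoServer are hypothetical names for illustration, not warcprox classes.

import socketserver

class EchoHandler(socketserver.StreamRequestHandler):
    def handle(self):
        # A real MITM handler may replace self.request with an SSL socket
        # at this point; this sketch just echoes one line back.
        self.wfile.write(self.rfile.readline())

class EchoServer(socketserver.TCPServer):
    def finish_request(self, request, client_address):
        # Unlike socketserver.BaseServer.finish_request, return the
        # handler's socket, which may no longer be `request`.
        req_handler = self.RequestHandlerClass(request, client_address, self)
        return req_handler.request

    def process_request(self, request, client_address):
        # Shut down whatever socket the handler ended up with, not
        # necessarily the one that was originally accept()ed.
        request = self.finish_request(request, client_address)
        self.shutdown_request(request)

if __name__ == '__main__':
    EchoServer(('127.0.0.1', 8008), EchoHandler).serve_forever()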
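Similarly, PooledMixIn.__init__ sizes its ThreadPoolExecutor from the process and file-descriptor rlimits, as min(rlimit_nofile // 10, rlimit_nproc // 2). A minimal sketch of that heuristic, assuming Linux/Unix (the resource module is unavailable on Windows); the RLIM_INFINITY guard and the fallback value are assumptions added for the example, not warcprox behavior:

import resource
import concurrent.futures

def reasonable_max_threads(fallback=100):
    # Soft limits; getrlimit() returns a (soft, hard) tuple.
    rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
    rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
    if resource.RLIM_INFINITY in (rlimit_nproc, rlimit_nofile):
        # An unlimited rlimit would make the min() below meaningless.
        return fallback
    # Leave headroom: each proxied request can hold several file
    # descriptors open (client socket, remote socket, spooled tempfile).
    return min(rlimit_nofile // 10, rlimit_nproc // 2)

pool = concurrent.futures.ThreadPoolExecutor(
        max_workers=reasonable_max_threads())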