diff --git a/setup.py b/setup.py index d9c6587..a665f26 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.1b1.dev46', + version='2.1b1.dev47', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 82f80fe..8294679 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -44,6 +44,7 @@ import traceback import signal from collections import Counter import socket +import urllib try: import http.server as http_server @@ -59,6 +60,23 @@ import certauth.certauth import warcprox +import http.client +orig_send = http.client.HTTPConnection.send +def _send(self, data): + if isinstance(data, bytes): + logging.info('sending data (bytes): ') + logging.root.handlers[0].stream.buffer.write(data) + logging.root.handlers[0].stream.buffer.write(b'\n') + elif isinstance(data, str): + logging.info('sending data (str): ') + logging.root.handlers[0].stream.write(data) + logging.root.handlers[0].stream.write('\n') + else: + logging.info('sending data from %s', repr(data)) + orig_send(self, data) +### uncomment this to see raw requests going over the wire +# http.client.HTTPConnection.send = _send + logging.basicConfig( stream=sys.stdout, level=logging.INFO, # level=warcprox.TRACE, format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' @@ -1153,7 +1171,8 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon): assert not 'content-length' in response.headers def test_method_filter( - https_daemon, http_daemon, archiving_proxies, playback_proxies): + https_daemon, http_daemon, archiving_proxies, playback_proxies, + warcprox_): # we've configured warcprox with method_filters=['GET','POST'] so HEAD # requests should not be archived @@ -1168,6 +1187,30 @@ def test_method_filter( assert response.status_code == 404 assert response.content == b'404 Not in Archive\n' + # WARCPROX_WRITE_RECORD is exempt from method filter + headers = { + 'Content-Type': 'text/plain', + 'WARC-Type': 'metadata', + 'Host': 'N/A' + } + url = 'http://fakeurl/' + payload = b'I am the WARCPROX_WRITE_RECORD payload' + request = urllib.request.Request( + url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload) + + # XXX setting request.type="http" is a hack to stop urllib from trying + # to tunnel if url is https + request.type = 'http' + request.set_proxy('localhost:%s' % warcprox_.proxy.server_port, 'http') + + with urllib.request.urlopen(request) as response: + assert response.getcode() == 204 + + response = _poll_playback_until( + playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.content == payload + def test_dedup_ok_flag( https_daemon, http_daemon, warcprox_, archiving_proxies, rethinkdb_big_table): diff --git a/warcprox/playback.py b/warcprox/playback.py index 164ba48..c244843 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -182,7 +182,13 @@ class PlaybackProxyHandler(MitmProxyHandler): return self._send_headers_and_refd_payload(record.content[1], refers_to, refers_to_target_uri, refers_to_date) else: - raise Exception('unknown warc record type {}'.format(warc_type)) + # send it back raw, whatever it is + headers_buf = bytearray() + headers_buf.extend(b'HTTP/1.0 200 OK\r\n') + headers_buf.extend(b'content-length: ' + record.get_header(b'content-length') + b'\r\n') + headers_buf.extend(b'content-type: ' + record.get_header(b'content-type') + b'\r\n') + headers_buf.extend(b'\r\n') + return self._send_response(headers_buf, record.content_file) finally: fh.close() diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index a255717..00238d4 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -64,8 +64,12 @@ class WarcWriterThread(threading.Thread): else: self._run() + _ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'} def _filter_accepts(self, recorded_url): - return not self.method_filter or recorded_url.method.upper() in self.method_filter + if not self.method_filter: + return True + meth = recorded_url.method.upper() + return meth in self._ALWAYS_ACCEPT or meth in self.method_filter def _run(self): while not self.stop.is_set():