WARCPROX_WRITE_RECORD is exempt from method filter

This commit is contained in:
Noah Levitt 2017-02-01 15:30:22 -08:00
parent f5498e1822
commit ddb60876a3
4 changed files with 57 additions and 4 deletions

View File

@ -51,7 +51,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.1b1.dev46',
version='2.1b1.dev47',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -44,6 +44,7 @@ import traceback
import signal
from collections import Counter
import socket
import urllib
try:
import http.server as http_server
@ -59,6 +60,23 @@ import certauth.certauth
import warcprox
import http.client
orig_send = http.client.HTTPConnection.send
def _send(self, data):
    """Debugging stand-in for ``http.client.HTTPConnection.send``.

    Echoes the outgoing payload to the stream of the root logger's first
    handler, then hands the data to the original ``send`` implementation
    saved in ``orig_send``.
    """
    if isinstance(data, bytes):
        logging.info('sending data (bytes): ')
        # bytes have to go through the underlying binary buffer of the
        # handler's text stream
        binary_out = logging.root.handlers[0].stream.buffer
        binary_out.write(data)
        binary_out.write(b'\n')
    elif isinstance(data, str):
        logging.info('sending data (str): ')
        text_out = logging.root.handlers[0].stream
        text_out.write(data)
        text_out.write('\n')
    else:
        # neither bytes nor str: only log a repr of the object
        logging.info('sending data from %s', repr(data))
    orig_send(self, data)
### uncomment this to see raw requests going over the wire
# http.client.HTTPConnection.send = _send
logging.basicConfig(
stream=sys.stdout, level=logging.INFO, # level=warcprox.TRACE,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
@ -1153,7 +1171,8 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon):
assert not 'content-length' in response.headers
def test_method_filter(
https_daemon, http_daemon, archiving_proxies, playback_proxies):
https_daemon, http_daemon, archiving_proxies, playback_proxies,
warcprox_):
# we've configured warcprox with method_filters=['GET','POST'] so HEAD
# requests should not be archived
@ -1168,6 +1187,30 @@ def test_method_filter(
assert response.status_code == 404
assert response.content == b'404 Not in Archive\n'
# WARCPROX_WRITE_RECORD is exempt from method filter
headers = {
'Content-Type': 'text/plain',
'WARC-Type': 'metadata',
'Host': 'N/A'
}
url = 'http://fakeurl/'
payload = b'I am the WARCPROX_WRITE_RECORD payload'
request = urllib.request.Request(
url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload)
# XXX setting request.type="http" is a hack to stop urllib from trying
# to tunnel if url is https
request.type = 'http'
request.set_proxy('localhost:%s' % warcprox_.proxy.server_port, 'http')
with urllib.request.urlopen(request) as response:
assert response.getcode() == 204
response = _poll_playback_until(
playback_proxies, url, status=200, timeout_sec=10)
assert response.status_code == 200
assert response.content == payload
def test_dedup_ok_flag(
https_daemon, http_daemon, warcprox_, archiving_proxies,
rethinkdb_big_table):

View File

@ -182,7 +182,13 @@ class PlaybackProxyHandler(MitmProxyHandler):
return self._send_headers_and_refd_payload(record.content[1], refers_to, refers_to_target_uri, refers_to_date)
else:
raise Exception('unknown warc record type {}'.format(warc_type))
# send it back raw, whatever it is
headers_buf = bytearray()
headers_buf.extend(b'HTTP/1.0 200 OK\r\n')
headers_buf.extend(b'content-length: ' + record.get_header(b'content-length') + b'\r\n')
headers_buf.extend(b'content-type: ' + record.get_header(b'content-type') + b'\r\n')
headers_buf.extend(b'\r\n')
return self._send_response(headers_buf, record.content_file)
finally:
fh.close()

View File

@ -64,8 +64,12 @@ class WarcWriterThread(threading.Thread):
else:
self._run()
# Request methods that are accepted even when a method filter is configured.
_ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'}

def _filter_accepts(self, recorded_url):
    """Return True if `recorded_url` passes the configured method filter.

    With no method filter configured (``self.method_filter`` is falsy)
    everything is accepted. Otherwise the upper-cased request method must
    be listed either in ``self._ALWAYS_ACCEPT`` or in
    ``self.method_filter``.
    """
    if not self.method_filter:
        return True
    # compare case-insensitively by normalizing the method to upper case
    meth = recorded_url.method.upper()
    return meth in self._ALWAYS_ACCEPT or meth in self.method_filter
def _run(self):
while not self.stop.is_set():