mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
WARCPROX_WRITE_RECORD is exempt from method filter
This commit is contained in:
parent
f5498e1822
commit
ddb60876a3
2
setup.py
2
setup.py
@@ -51,7 +51,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.1b1.dev46',
|
version='2.1b1.dev47',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@@ -44,6 +44,7 @@ import traceback
|
|||||||
import signal
|
import signal
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import socket
|
import socket
|
||||||
|
import urllib
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import http.server as http_server
|
import http.server as http_server
|
||||||
@@ -59,6 +60,23 @@ import certauth.certauth
|
|||||||
|
|
||||||
import warcprox
|
import warcprox
|
||||||
|
|
||||||
|
import http.client
|
||||||
|
orig_send = http.client.HTTPConnection.send
|
||||||
|
def _send(self, data):
|
||||||
|
if isinstance(data, bytes):
|
||||||
|
logging.info('sending data (bytes): ')
|
||||||
|
logging.root.handlers[0].stream.buffer.write(data)
|
||||||
|
logging.root.handlers[0].stream.buffer.write(b'\n')
|
||||||
|
elif isinstance(data, str):
|
||||||
|
logging.info('sending data (str): ')
|
||||||
|
logging.root.handlers[0].stream.write(data)
|
||||||
|
logging.root.handlers[0].stream.write('\n')
|
||||||
|
else:
|
||||||
|
logging.info('sending data from %s', repr(data))
|
||||||
|
orig_send(self, data)
|
||||||
|
### uncomment this to see raw requests going over the wire
|
||||||
|
# http.client.HTTPConnection.send = _send
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
stream=sys.stdout, level=logging.INFO, # level=warcprox.TRACE,
|
stream=sys.stdout, level=logging.INFO, # level=warcprox.TRACE,
|
||||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
||||||
@@ -1153,7 +1171,8 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon):
|
|||||||
assert not 'content-length' in response.headers
|
assert not 'content-length' in response.headers
|
||||||
|
|
||||||
def test_method_filter(
|
def test_method_filter(
|
||||||
https_daemon, http_daemon, archiving_proxies, playback_proxies):
|
https_daemon, http_daemon, archiving_proxies, playback_proxies,
|
||||||
|
warcprox_):
|
||||||
# we've configured warcprox with method_filters=['GET','POST'] so HEAD
|
# we've configured warcprox with method_filters=['GET','POST'] so HEAD
|
||||||
# requests should not be archived
|
# requests should not be archived
|
||||||
|
|
||||||
@@ -1168,6 +1187,30 @@ def test_method_filter(
|
|||||||
assert response.status_code == 404
|
assert response.status_code == 404
|
||||||
assert response.content == b'404 Not in Archive\n'
|
assert response.content == b'404 Not in Archive\n'
|
||||||
|
|
||||||
|
# WARCPROX_WRITE_RECORD is exempt from method filter
|
||||||
|
headers = {
|
||||||
|
'Content-Type': 'text/plain',
|
||||||
|
'WARC-Type': 'metadata',
|
||||||
|
'Host': 'N/A'
|
||||||
|
}
|
||||||
|
url = 'http://fakeurl/'
|
||||||
|
payload = b'I am the WARCPROX_WRITE_RECORD payload'
|
||||||
|
request = urllib.request.Request(
|
||||||
|
url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload)
|
||||||
|
|
||||||
|
# XXX setting request.type="http" is a hack to stop urllib from trying
|
||||||
|
# to tunnel if url is https
|
||||||
|
request.type = 'http'
|
||||||
|
request.set_proxy('localhost:%s' % warcprox_.proxy.server_port, 'http')
|
||||||
|
|
||||||
|
with urllib.request.urlopen(request) as response:
|
||||||
|
assert response.getcode() == 204
|
||||||
|
|
||||||
|
response = _poll_playback_until(
|
||||||
|
playback_proxies, url, status=200, timeout_sec=10)
|
||||||
|
assert response.status_code == 200
|
||||||
|
assert response.content == payload
|
||||||
|
|
||||||
def test_dedup_ok_flag(
|
def test_dedup_ok_flag(
|
||||||
https_daemon, http_daemon, warcprox_, archiving_proxies,
|
https_daemon, http_daemon, warcprox_, archiving_proxies,
|
||||||
rethinkdb_big_table):
|
rethinkdb_big_table):
|
||||||
|
@@ -182,7 +182,13 @@ class PlaybackProxyHandler(MitmProxyHandler):
|
|||||||
return self._send_headers_and_refd_payload(record.content[1], refers_to, refers_to_target_uri, refers_to_date)
|
return self._send_headers_and_refd_payload(record.content[1], refers_to, refers_to_target_uri, refers_to_date)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise Exception('unknown warc record type {}'.format(warc_type))
|
# send it back raw, whatever it is
|
||||||
|
headers_buf = bytearray()
|
||||||
|
headers_buf.extend(b'HTTP/1.0 200 OK\r\n')
|
||||||
|
headers_buf.extend(b'content-length: ' + record.get_header(b'content-length') + b'\r\n')
|
||||||
|
headers_buf.extend(b'content-type: ' + record.get_header(b'content-type') + b'\r\n')
|
||||||
|
headers_buf.extend(b'\r\n')
|
||||||
|
return self._send_response(headers_buf, record.content_file)
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
fh.close()
|
fh.close()
|
||||||
|
@@ -64,8 +64,12 @@ class WarcWriterThread(threading.Thread):
|
|||||||
else:
|
else:
|
||||||
self._run()
|
self._run()
|
||||||
|
|
||||||
|
_ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'}
|
||||||
def _filter_accepts(self, recorded_url):
|
def _filter_accepts(self, recorded_url):
|
||||||
return not self.method_filter or recorded_url.method.upper() in self.method_filter
|
if not self.method_filter:
|
||||||
|
return True
|
||||||
|
meth = recorded_url.method.upper()
|
||||||
|
return meth in self._ALWAYS_ACCEPT or meth in self.method_filter
|
||||||
|
|
||||||
def _run(self):
|
def _run(self):
|
||||||
while not self.stop.is_set():
|
while not self.stop.is_set():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user