Started adding some docstrings, and moved some of the more general man-in-the-middle recording proxy code from warcproxy.py into mitmproxy.py

This commit is contained in:
Noah Levitt 2016-05-10 01:11:17 -07:00
parent 0809c78486
commit 4fd17be339
4 changed files with 344 additions and 267 deletions

View File

@ -50,7 +50,7 @@ except:
deps.append('futures') deps.append('futures')
setuptools.setup(name='warcprox', setuptools.setup(name='warcprox',
version='2.0.dev7', version='2.0.dev8',
description='WARC writing MITM HTTP/S proxy', description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox', url='https://github.com/internetarchive/warcprox',
author='Noah Levitt', author='Noah Levitt',

View File

@ -1,25 +1,25 @@
#!/usr/bin/env python #!/usr/bin/env python
# '''
# warcprox/main.py - entrypoint for warcprox executable, parses command line warcprox/main.py - entrypoint for warcprox executable, parses command line
# arguments, initializes components, starts controller, handles signals arguments, initializes components, starts controller, handles signals
#
# Copyright (C) 2013-2016 Internet Archive Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2 as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version. of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details. GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA. USA.
# '''
from __future__ import absolute_import from __future__ import absolute_import
@ -114,6 +114,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
return arg_parser return arg_parser
def dump_state(signum=None, frame=None): def dump_state(signum=None, frame=None):
'''
Signal handler, logs stack traces of active threads.
'''
pp = pprint.PrettyPrinter(indent=4) pp = pprint.PrettyPrinter(indent=4)
state_strs = [] state_strs = []
@ -128,6 +131,10 @@ def dump_state(signum=None, frame=None):
logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))) logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))
def init_controller(args): def init_controller(args):
'''
Creates a warcprox.controller.WarcproxController configured according to
the supplied arguments (normally the result of parse_args(sys.argv)).
'''
options = warcprox.Options(**vars(args)) options = warcprox.Options(**vars(args))
try: try:
@ -212,11 +219,17 @@ def real_main(args):
controller.run_until_shutdown() controller.run_until_shutdown()
def parse_args(argv=sys.argv):
    '''
    Parses command line arguments with argparse.

    argv[0] is used (basename only) as the program name shown by the
    parser; the remaining elements are the arguments that get parsed.
    Returns the namespace produced by the parser.
    '''
    prog_name = os.path.basename(argv[0])
    return _build_arg_parser(prog=prog_name).parse_args(args=argv[1:])
def main(argv=sys.argv): def main(argv=sys.argv):
'''
Main method, entry point of warcprox command.
'''
args = parse_args(argv) args = parse_args(argv)
if args.verbose: if args.verbose:

View File

@ -1,28 +1,28 @@
# '''
# warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http
# CONNECT method by creating a snakeoil certificate for the requested site, CONNECT method by creating a snakeoil certificate for the requested site,
# calling ssl.wrap_socket() on the client connection; connects to remote calling ssl.wrap_socket() on the client connection; connects to remote
# (proxied) host, possibly using tor if host tld is .onion and tor proxy is (proxied) host, possibly using tor if host tld is .onion and tor proxy is
# configured configured
#
# Copyright (C) 2012 Cygnos Corporation Copyright (C) 2012 Cygnos Corporation
# Copyright (C) 2013-2016 Internet Archive Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2 as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version. of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details. GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA. USA.
# '''
from __future__ import absolute_import from __future__ import absolute_import
@ -35,7 +35,10 @@ try:
import urllib.parse as urllib_parse import urllib.parse as urllib_parse
except ImportError: except ImportError:
import urlparse as urllib_parse import urlparse as urllib_parse
try:
import http.client as http_client
except ImportError:
import httplib as http_client
import socket import socket
import logging import logging
import ssl import ssl
@ -43,8 +46,132 @@ import warcprox
import threading import threading
import datetime import datetime
import socks import socks
import tempfile
import hashlib
class ProxyingRecorder(object):
    """
    Wraps a socket._fileobject, recording the bytes as they are read,
    calculating digests, and sending them on to the proxy client.

    Every hunk read through this object is written to a spooled temporary
    file and fed to the block digest. After payload_starts_now() has been
    called, hunks are additionally fed to the payload digest and forwarded
    to the proxy client.
    """

    logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")

    def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None):
        """
        fp is the file-like object to read from, proxy_client is an object
        with a sendall() method that receives the payload bytes,
        digest_algorithm is a name accepted by hashlib.new(), and url is
        only used in log messages.
        """
        self.fp = fp
        # "The file has no name, and will cease to exist when it is closed."
        self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
        self.digest_algorithm = digest_algorithm
        self.block_digest = hashlib.new(digest_algorithm)
        # both stay None until payload_starts_now() is called
        self.payload_offset = None
        self.payload_digest = None
        self.proxy_client = proxy_client
        # flipped to False after the first failed send to the proxy client
        self._proxy_client_conn_open = True
        # total number of bytes recorded so far
        self.len = 0
        self.url = url

    def payload_starts_now(self):
        """
        Marks the current position as the start of the payload: bytes read
        from here on count toward the payload digest and are forwarded to
        the proxy client.
        """
        self.payload_digest = hashlib.new(self.digest_algorithm)
        self.payload_offset = self.len

    def _update_payload_digest(self, hunk):
        """Feeds hunk to the payload digest, if the payload has started."""
        if self.payload_digest:
            self.payload_digest.update(hunk)

    def _update(self, hunk):
        """
        Records hunk (digests, temp file, running length) and, once the
        payload has started, forwards it to the proxy client.
        """
        self._update_payload_digest(hunk)
        self.block_digest.update(hunk)

        self.tempfile.write(hunk)

        if self.payload_digest and self._proxy_client_conn_open:
            try:
                self.proxy_client.sendall(hunk)
            except BaseException as e:
                # Best effort: a failed send must not abort the download,
                # we just stop trying to talk to the client.
                self._proxy_client_conn_open = False
                # Logger.warn() is deprecated, warning() is the supported
                # spelling
                self.logger.warning(
                        '%s sending data to proxy client for url %s',
                        e, self.url)
                self.logger.info(
                        'will continue downloading from remote server without '
                        'sending to client %s', self.url)

        self.len += len(hunk)

    def read(self, size=-1):
        """Reads from the wrapped file, recording what was read."""
        hunk = self.fp.read(size)
        self._update(hunk)
        return hunk

    def readinto(self, b):
        """Reads into buffer b, recording the n bytes actually read."""
        n = self.fp.readinto(b)
        self._update(b[:n])
        return n

    def readline(self, size=-1):
        """Reads one line from the wrapped file, recording it."""
        # XXX depends on implementation details of self.fp.readline(), in
        # particular that it doesn't call self.fp.read()
        hunk = self.fp.readline(size)
        self._update(hunk)
        return hunk

    def flush(self):
        """Flushes the wrapped file."""
        return self.fp.flush()

    def close(self):
        """Closes the wrapped file; the temporary file is left open."""
        return self.fp.close()

    def __len__(self):
        """Total number of bytes recorded so far."""
        return self.len

    def payload_size(self):
        """
        Number of bytes recorded since payload_starts_now() was called, or
        0 if it has not been called.
        """
        if self.payload_offset is not None:
            return self.len - self.payload_offset
        else:
            return 0
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
    '''
    HTTPResponse subclass whose underlying file object is replaced by a
    ProxyingRecorder, so that everything read from the remote web server is
    recorded and passed along to the proxy client.
    '''

    def __init__(
            self, sock, debuglevel=0, method=None, proxy_client=None,
            digest_algorithm='sha1', url=None):
        http_client.HTTPResponse.__init__(
                self, sock, debuglevel=debuglevel, method=method)
        self.proxy_client = proxy_client
        self.url = url

        # HTTPResponse resets self.fp to None when it is done reading, so
        # the recorder is also kept under an attribute of its own.
        recorder = ProxyingRecorder(
                self.fp, proxy_client, digest_algorithm, url=url)
        self.recorder = recorder
        self.fp = recorder

    def begin(self):
        '''
        Reads the status line and headers, sends a filtered copy of them to
        the proxy client, then marks the start of the payload.
        '''
        http_client.HTTPResponse.begin(self)

        # headers that are not passed on to the proxy client
        dropped = (
                'connection', 'proxy-connection', 'keep-alive',
                'proxy-authenticate', 'proxy-authorization', 'upgrade',
                'strict-transport-security')

        pieces = ['HTTP/1.1 {} {}\r\n'.format(self.status, self.reason)]
        for name, value in self.msg.items():
            if name.lower() in dropped:
                continue
            pieces.append('{}: {}\r\n'.format(name, value))
        pieces.append('Connection: close\r\n\r\n')

        self.proxy_client.sendall(''.join(pieces).encode('latin1'))

        self.recorder.payload_starts_now()
class MitmProxyHandler(http_server.BaseHTTPRequestHandler): class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'''
An http proxy implementation of BaseHTTPRequestHandler, that acts as a
man-in-the-middle in order to peek at the content of https transactions,
and records the bytes in transit as it proxies them.
'''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
def __init__(self, request, client_address, server): def __init__(self, request, client_address, server):
@ -76,22 +203,23 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
) )
) )
def _connect_to_host(self): def _connect_to_remote_server(self):
# Connect to destination # Connect to destination
if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'): if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
self.logger.info("using tor socks proxy at %s:%s to connect to %s", self.logger.info("using tor socks proxy at %s:%s to connect to %s",
self.onion_tor_socks_proxy_host, self.onion_tor_socks_proxy_host,
self.onion_tor_socks_proxy_port or 1080, self.onion_tor_socks_proxy_port or 1080,
self.hostname) self.hostname)
self._proxy_sock = socks.socksocket() self._remote_server_sock = socks.socksocket()
self._proxy_sock.set_proxy(socks.SOCKS5, self._remote_server_sock.set_proxy(
addr=self.onion_tor_socks_proxy_host, socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
port=self.onion_tor_socks_proxy_port, rdns=True) port=self.onion_tor_socks_proxy_port, rdns=True)
else: else:
self._proxy_sock = socket.socket() self._remote_server_sock = socket.socket()
self._proxy_sock.settimeout(60) # XXX what value should this have? # XXX what value should this timeout have?
self._proxy_sock.connect((self.hostname, int(self.port))) self._remote_server_sock.settimeout(60)
self._remote_server_sock.connect((self.hostname, int(self.port)))
# Wrap socket if SSL is required # Wrap socket if SSL is required
if self.is_connect: if self.is_connect:
@ -99,12 +227,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
context = ssl.create_default_context() context = ssl.create_default_context()
context.check_hostname = False context.check_hostname = False
context.verify_mode = ssl.CERT_NONE context.verify_mode = ssl.CERT_NONE
self._proxy_sock = context.wrap_socket(self._proxy_sock, server_hostname=self.hostname) self._remote_server_sock = context.wrap_socket(
self._remote_server_sock, server_hostname=self.hostname)
except AttributeError: except AttributeError:
try: try:
self._proxy_sock = ssl.wrap_socket(self._proxy_sock) self._remote_server_sock = ssl.wrap_socket(
self._remote_server_sock)
except ssl.SSLError: except ssl.SSLError:
self.logger.warn("failed to establish ssl connection to {}; python ssl library does not support SNI, considering upgrading to python >= 2.7.9 or python 3.4".format(self.hostname)) self.logger.warn(
"failed to establish ssl connection to %s; python "
"ssl library does not support SNI, considering "
"upgrading to python >= 2.7.9 or python 3.4",
self.hostname)
raise raise
def _transition_to_ssl(self): def _transition_to_ssl(self):
@ -112,11 +246,25 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
server_side=True, certfile=self.server.ca.cert_for_host(self.hostname)) server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))
def do_CONNECT(self): def do_CONNECT(self):
'''
Handles a http CONNECT request.
The CONNECT method is meant to "convert the request connection to a
transparent TCP/IP tunnel, usually to facilitate SSL-encrypted
communication (HTTPS) through an unencrypted HTTP proxy" (Wikipedia).
do_CONNECT is where the man-in-the-middle logic happens. In do_CONNECT
the proxy transitions the proxy client connection to ssl while
masquerading as the remote web server using a generated certificate.
Meanwhile, it makes its own separate ssl connection to the remote web
server. Then it calls self.handle_one_request() again to handle the
request intended for the remote server.
'''
self.is_connect = True self.is_connect = True
try: try:
# Connect to destination first # Connect to destination first
self._determine_host_port() self._determine_host_port()
self._connect_to_host() self._connect_to_remote_server()
# If successful, let's do this! # If successful, let's do this!
self.send_response(200, 'Connection established') self.send_response(200, 'Connection established')
@ -161,7 +309,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
try: try:
# Connect to destination # Connect to destination
self._determine_host_port() self._determine_host_port()
self._connect_to_host() self._connect_to_remote_server()
assert self.url assert self.url
except Exception as e: except Exception as e:
self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e)) self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e))
@ -178,7 +326,68 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
raise raise
def _proxy_request(self): def _proxy_request(self):
raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!') '''
Sends the request to the remote server, then uses a ProxyingRecorder to
read the response and send it to the proxy client, while recording the
bytes in transit. Returns a tuple (request, response) where request is
the raw request bytes, and response is a ProxyingRecorder.
'''
# Build request
req_str = '{} {} {}\r\n'.format(
self.command, self.path, self.request_version)
# Swallow headers that don't make sense to forward on, i.e. most
# hop-by-hop headers, see
# http://tools.ietf.org/html/rfc2616#section-13.5.
# self.headers is an email.message.Message, which is case-insensitive
# and doesn't throw KeyError in __delitem__
for key in (
'Connection', 'Proxy-Connection', 'Keep-Alive',
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
del self.headers[key]
# Add headers to the request
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
req_str += '\r\n'.join(
'{}: {}'.format(k,v) for (k,v) in self.headers.items())
req = req_str.encode('latin1') + b'\r\n\r\n'
# Append message body if present to the request
if 'Content-Length' in self.headers:
req += self.rfile.read(int(self.headers['Content-Length']))
try:
self.logger.debug('sending to remote server req=%s', repr(req))
# Send it down the pipe!
self._remote_server_sock.sendall(req)
prox_rec_res = ProxyingRecordingHTTPResponse(
self._remote_server_sock, proxy_client=self.connection,
digest_algorithm=self.server.digest_algorithm,
url=self.url)
prox_rec_res.begin()
buf = prox_rec_res.read(8192)
while buf != b'':
buf = prox_rec_res.read(8192)
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
except socket.timeout as e:
self.logger.warn(
"%s proxying %s %s", repr(e), self.command, self.url)
except BaseException as e:
self.logger.error(
"%s proxying %s %s", repr(e), self.command, self.url,
exc_info=True)
finally:
# Let's close off the remote end
if prox_rec_res:
prox_rec_res.close()
self._remote_server_sock.close()
return req, prox_rec_res
def __getattr__(self, item): def __getattr__(self, item):
if item.startswith('do_'): if item.startswith('do_'):

View File

@ -1,24 +1,24 @@
# '''
# warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic, warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic,
# enqueue info on the recorded url queue enqueue info on the recorded url queue
#
# Copyright (C) 2013-2016 Internet Archive Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2 as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version. of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details. GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA. USA.
# '''
from __future__ import absolute_import from __future__ import absolute_import
@ -34,15 +34,9 @@ try:
import queue import queue
except ImportError: except ImportError:
import Queue as queue import Queue as queue
try:
import http.client as http_client
except ImportError:
import httplib as http_client
import logging import logging
import re import re
import tempfile
import traceback import traceback
import hashlib
import json import json
import socket import socket
from hanzo import warctools from hanzo import warctools
@ -52,112 +46,6 @@ import datetime
import concurrent.futures import concurrent.futures
import resource import resource
class ProxyingRecorder(object):
    """
    Wraps a socket._fileobject, recording the bytes as they are read,
    calculating digests, and sending them on to the proxy client.

    Every hunk read is written to a spooled temporary file and fed to the
    block digest; after payload_starts_now() hunks are also fed to the
    payload digest and forwarded to proxy_dest.
    """

    logger = logging.getLogger("warcprox.warcproxy.ProxyingRecorder")

    def __init__(self, fp, proxy_dest, digest_algorithm='sha1', url=None):
        """
        fp is the file-like object to read from, proxy_dest is an object
        with a sendall() method that receives the payload bytes,
        digest_algorithm is a name accepted by hashlib.new(), and url is
        only used in log messages.
        """
        self.fp = fp
        # "The file has no name, and will cease to exist when it is closed."
        self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
        self.digest_algorithm = digest_algorithm
        self.block_digest = hashlib.new(digest_algorithm)
        # both stay None until payload_starts_now() is called
        self.payload_offset = None
        self.payload_digest = None
        self.proxy_dest = proxy_dest
        # flipped to False after the first failed send to proxy_dest
        self._proxy_dest_conn_open = True
        # not read anywhere in this class
        self._prev_hunk_last_two_bytes = b''
        # total number of bytes recorded so far
        self.len = 0
        self.url = url

    def payload_starts_now(self):
        """Marks the current position as the start of the payload."""
        self.payload_digest = hashlib.new(self.digest_algorithm)
        self.payload_offset = self.len

    def _update_payload_digest(self, hunk):
        """Feeds hunk to the payload digest, if the payload has started."""
        if self.payload_digest:
            self.payload_digest.update(hunk)

    def _update(self, hunk):
        """
        Records hunk (digests, temp file, running length) and, once the
        payload has started, forwards it to proxy_dest.
        """
        self._update_payload_digest(hunk)
        self.block_digest.update(hunk)

        self.tempfile.write(hunk)

        if self.payload_digest and self._proxy_dest_conn_open:
            try:
                self.proxy_dest.sendall(hunk)
            except BaseException as e:
                # Best effort: a failed send must not abort the download.
                self._proxy_dest_conn_open = False
                # Logger.warn() is deprecated; the %-style arguments let
                # logging do the formatting only when the record is emitted
                self.logger.warning(
                        '%s sending data to proxy client for url %s',
                        e, self.url)
                self.logger.info(
                        'will continue downloading from remote server without '
                        'sending to client %s', self.url)

        self.len += len(hunk)

    def read(self, size=-1):
        """Reads from the wrapped file, recording what was read."""
        hunk = self.fp.read(size)
        self._update(hunk)
        return hunk

    def readinto(self, b):
        """Reads into buffer b, recording the n bytes actually read."""
        n = self.fp.readinto(b)
        self._update(b[:n])
        return n

    def readline(self, size=-1):
        """Reads one line from the wrapped file, recording it."""
        # XXX depends on implementation details of self.fp.readline(), in
        # particular that it doesn't call self.fp.read()
        hunk = self.fp.readline(size)
        self._update(hunk)
        return hunk

    def flush(self):
        """Flushes the wrapped file."""
        return self.fp.flush()

    def close(self):
        """Closes the wrapped file; the temporary file is left open."""
        return self.fp.close()

    def __len__(self):
        """Total number of bytes recorded so far."""
        return self.len

    def payload_size(self):
        """
        Number of bytes recorded since payload_starts_now() was called, or
        0 if it has not been called.
        """
        if self.payload_offset is not None:
            return self.len - self.payload_offset
        else:
            return 0
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
    '''
    HTTPResponse subclass that reads through a ProxyingRecorder, so the
    bytes coming from the remote server are recorded and sent on to
    proxy_dest.
    '''

    def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1', url=None):
        http_client.HTTPResponse.__init__(
                self, sock, debuglevel=debuglevel, method=method)
        self.proxy_dest = proxy_dest
        self.url = url

        # HTTPResponse resets self.fp to None when it is done reading, so
        # the recorder is also kept under an attribute of its own.
        recorder = ProxyingRecorder(
                self.fp, proxy_dest, digest_algorithm, url=url)
        self.recorder = recorder
        self.fp = recorder

    def begin(self):
        '''
        Reads the status line and headers, sends a filtered copy of them to
        proxy_dest, then marks the start of the payload.
        '''
        http_client.HTTPResponse.begin(self)

        # headers that are not passed on to the proxy client
        skipped = (
                'connection', 'proxy-connection', 'keep-alive',
                'proxy-authenticate', 'proxy-authorization', 'upgrade',
                'strict-transport-security')

        head = 'HTTP/1.1 {} {}\r\n'.format(self.status, self.reason)
        head += ''.join(
                '{}: {}\r\n'.format(k, v) for k, v in self.msg.items()
                if k.lower() not in skipped)
        head += 'Connection: close\r\n\r\n'

        self.proxy_dest.sendall(head.encode('latin1'))

        self.recorder.payload_starts_now()
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
# self.server is WarcProxy # self.server is WarcProxy
logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
@ -187,96 +75,63 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
return False return False
def _proxy_request(self): def _proxy_request(self):
# Build request
req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version)
warcprox_meta = None warcprox_meta = None
raw_warcprox_meta = self.headers.get('Warcprox-Meta') raw_warcprox_meta = self.headers.get('Warcprox-Meta')
if raw_warcprox_meta: if raw_warcprox_meta:
warcprox_meta = json.loads(raw_warcprox_meta) warcprox_meta = json.loads(raw_warcprox_meta)
del self.headers['Warcprox-Meta']
if self._enforce_limits(warcprox_meta): if self._enforce_limits(warcprox_meta):
return return
# Swallow headers that don't make sense to forward on, i.e. most remote_ip = self._remote_server_sock.getpeername()[0]
# hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5 timestamp = datetime.datetime.utcnow()
# self.headers is an email.message.Message, which is case-insensitive
# and doesn't throw KeyError in __delitem__
for key in ('Connection', 'Proxy-Connection', 'Keep-Alive',
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade',
'Warcprox-Meta'):
del self.headers[key]
# Add headers to the request req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :( self)
req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items())
req = req_str.encode('latin1') + b'\r\n\r\n' recorded_url = RecordedUrl(
url=self.url, request_data=req,
# Append message body if present to the request response_recorder=prox_rec_res.recorder, remote_ip=remote_ip,
if 'Content-Length' in self.headers: warcprox_meta=warcprox_meta, status=prox_rec_res.status,
req += self.rfile.read(int(self.headers['Content-Length'])) size=prox_rec_res.recorder.len,
client_ip=self.client_address[0],
prox_rec_res = None content_type=prox_rec_res.getheader("Content-Type"),
recorded_url = None method=self.command, timestamp=timestamp, host=self.hostname,
try: duration=datetime.datetime.utcnow()-timestamp)
self.logger.debug('sending to remote server req=%s', repr(req)) self.server.recorded_url_q.put(recorded_url)
# warc-date "shall represent the instant that data capture for record creation began"
timestamp = datetime.datetime.utcnow()
# Send it down the pipe!
self._proxy_sock.sendall(req)
# We want HTTPResponse's smarts about http and handling of
# non-compliant servers. But HTTPResponse.read() doesn't return the raw
# bytes read from the server, it unchunks them if they're chunked, and
# might do other stuff. We want to send the raw bytes back to the
# client. So we ignore the values returned by prox_rec_res.read() below. Instead
# the ProxyingRecordingHTTPResponse takes care of sending the raw bytes
# to the proxy client.
# Proxy and record the response
prox_rec_res = ProxyingRecordingHTTPResponse(self._proxy_sock,
proxy_dest=self.connection,
digest_algorithm=self.server.digest_algorithm,
url=self.url)
prox_rec_res.begin()
remote_ip=self._proxy_sock.getpeername()[0]
buf = prox_rec_res.read(8192)
while buf != b'':
buf = prox_rec_res.read(8192)
recorded_url = RecordedUrl(url=self.url, request_data=req,
response_recorder=prox_rec_res.recorder,
remote_ip=remote_ip, warcprox_meta=warcprox_meta,
status=prox_rec_res.status, size=prox_rec_res.recorder.len,
client_ip=self.client_address[0],
content_type=prox_rec_res.getheader("Content-Type"),
method=self.command, timestamp=timestamp,
host=self.hostname, duration=datetime.datetime.utcnow()-timestamp)
self.server.recorded_url_q.put(recorded_url)
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
except socket.timeout as e:
self.logger.warn("%s proxying %s %s", repr(e), self.command, self.url)
except BaseException as e:
self.logger.error("%s proxying %s %s", repr(e), self.command, self.url, exc_info=True)
finally:
# Let's close off the remote end
if prox_rec_res:
prox_rec_res.close()
self._proxy_sock.close()
return recorded_url return recorded_url
# deprecated # deprecated
def do_PUTMETA(self):
    '''
    Handles the deprecated warcprox PUTMETA request by delegating to
    do_WARCPROX_WRITE_RECORD with the warc type set to metadata, i.e. it
    is treated like a WARCPROX_WRITE_RECORD request with
    WARC-Type: metadata.
    '''
    metadata_type = warctools.WarcRecord.METADATA
    self.do_WARCPROX_WRITE_RECORD(warc_type=metadata_type)
def do_WARCPROX_WRITE_RECORD(self, warc_type=None): def do_WARCPROX_WRITE_RECORD(self, warc_type=None):
'''
Handles a request with http method WARCPROX_WRITE_RECORD, a special
type of request which tells warcprox to construct a warc record from
the request more or less verbatim, and write it to a warc.
To honor the request, this method creates a RecordedUrl and queues it for
the WarcWriterThread to process. The warc record headers Content-Type
and WARC-Type are taken from the request headers, as is the payload.
Example request:
WARCPROX_WRITE_RECORD screenshot:https://example.com/ HTTP/1.1
WARC-Type: metadata
Content-Type: image/png
Content-Length: 12345
Connection: close
<png image data>
'''
try: try:
self.url = self.path self.url = self.path