mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Started adding some docstrings, and moved some of the more general man-in-the-middle recording proxy code from warcproxy.py into mitmproxy.py
This commit is contained in:
parent
0809c78486
commit
4fd17be339
2
setup.py
2
setup.py
@ -50,7 +50,7 @@ except:
|
||||
deps.append('futures')
|
||||
|
||||
setuptools.setup(name='warcprox',
|
||||
version='2.0.dev7',
|
||||
version='2.0.dev8',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@ -1,25 +1,25 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# warcprox/main.py - entrypoint for warcprox executable, parses command line
|
||||
# arguments, initializes components, starts controller, handles signals
|
||||
#
|
||||
# Copyright (C) 2013-2016 Internet Archive
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
# USA.
|
||||
#
|
||||
'''
|
||||
warcprox/main.py - entrypoint for warcprox executable, parses command line
|
||||
arguments, initializes components, starts controller, handles signals
|
||||
|
||||
Copyright (C) 2013-2016 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
USA.
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@ -114,6 +114,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||
return arg_parser
|
||||
|
||||
def dump_state(signum=None, frame=None):
|
||||
'''
|
||||
Signal handler, logs stack traces of active threads.
|
||||
'''
|
||||
pp = pprint.PrettyPrinter(indent=4)
|
||||
state_strs = []
|
||||
|
||||
@ -128,6 +131,10 @@ def dump_state(signum=None, frame=None):
|
||||
logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))
|
||||
|
||||
def init_controller(args):
|
||||
'''
|
||||
Creates a warcprox.controller.WarcproxController configured according to
|
||||
the supplied arguments (normally the result of parse_args(sys.argv)).
|
||||
'''
|
||||
options = warcprox.Options(**vars(args))
|
||||
|
||||
try:
|
||||
@ -212,11 +219,17 @@ def real_main(args):
|
||||
controller.run_until_shutdown()
|
||||
|
||||
def parse_args(argv=sys.argv):
    '''
    Parses a warcprox command line with argparse.

    argv[0] is treated as the program name (only its basename is handed to
    the parser); everything after it is parsed as the arguments. Returns the
    result of the parser's parse_args() call.
    '''
    prog_name = os.path.basename(argv[0])
    return _build_arg_parser(prog=prog_name).parse_args(args=argv[1:])
|
||||
|
||||
def main(argv=sys.argv):
|
||||
'''
|
||||
Main method, entry point of warcprox command.
|
||||
'''
|
||||
args = parse_args(argv)
|
||||
|
||||
if args.verbose:
|
||||
|
@ -1,28 +1,28 @@
|
||||
#
|
||||
# warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http
|
||||
# CONNECT method by creating a snakeoil certificate for the requested site,
|
||||
# calling ssl.wrap_socket() on the client connection; connects to remote
|
||||
# (proxied) host, possibly using tor if host tld is .onion and tor proxy is
|
||||
# configured
|
||||
#
|
||||
# Copyright (C) 2012 Cygnos Corporation
|
||||
# Copyright (C) 2013-2016 Internet Archive
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
# USA.
|
||||
#
|
||||
'''
|
||||
warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http
|
||||
CONNECT method by creating a snakeoil certificate for the requested site,
|
||||
calling ssl.wrap_socket() on the client connection; connects to remote
|
||||
(proxied) host, possibly using tor if host tld is .onion and tor proxy is
|
||||
configured
|
||||
|
||||
Copyright (C) 2012 Cygnos Corporation
|
||||
Copyright (C) 2013-2016 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
USA.
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@ -35,7 +35,10 @@ try:
|
||||
import urllib.parse as urllib_parse
|
||||
except ImportError:
|
||||
import urlparse as urllib_parse
|
||||
|
||||
try:
|
||||
import http.client as http_client
|
||||
except ImportError:
|
||||
import httplib as http_client
|
||||
import socket
|
||||
import logging
|
||||
import ssl
|
||||
@ -43,8 +46,132 @@ import warcprox
|
||||
import threading
|
||||
import datetime
|
||||
import socks
|
||||
import tempfile
|
||||
import hashlib
|
||||
|
||||
class ProxyingRecorder(object):
    """
    Wraps a socket._fileobject, recording the bytes as they are read,
    calculating digests, and sending them on to the proxy client.

    Every hunk read through read(), readinto() or readline() is written to
    self.tempfile and fed to self.block_digest. Once payload_starts_now() has
    been called, hunks are also fed to self.payload_digest and forwarded to
    the proxy client with sendall(). If forwarding fails, the failure is
    logged and forwarding stops, but reading and recording carry on.
    """

    logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")

    def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None):
        """
        fp: file-like object to read from (needs read/readinto/readline/
            flush/close, as used below)
        proxy_client: object with a sendall() method that receives the
            payload bytes
        digest_algorithm: any algorithm name accepted by hashlib.new()
        url: only used in log messages
        """
        self.fp = fp
        # "The file has no name, and will cease to exist when it is closed."
        # Kept in memory until it grows past 512 KiB.
        self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
        self.digest_algorithm = digest_algorithm
        self.block_digest = hashlib.new(digest_algorithm)
        # Both stay None until payload_starts_now() is called.
        self.payload_offset = None
        self.payload_digest = None
        self.proxy_client = proxy_client
        self._proxy_client_conn_open = True
        self.len = 0
        self.url = url

    def payload_starts_now(self):
        """
        Marks the current position as the start of the payload: starts a
        fresh payload digest and remembers the offset. Bytes read after this
        call are forwarded to the proxy client.
        """
        self.payload_digest = hashlib.new(self.digest_algorithm)
        self.payload_offset = self.len

    def _update_payload_digest(self, hunk):
        # No-op until payload_starts_now() has created the digest.
        if self.payload_digest is not None:
            self.payload_digest.update(hunk)

    def _update(self, hunk):
        """
        Records one hunk of bytes: updates the digests, appends to the
        tempfile, forwards to the proxy client (payload bytes only, and only
        while the client connection is believed to be open), and bumps
        self.len.
        """
        self._update_payload_digest(hunk)
        self.block_digest.update(hunk)

        self.tempfile.write(hunk)

        if self.payload_digest is not None and self._proxy_client_conn_open:
            try:
                self.proxy_client.sendall(hunk)
            except Exception as e:
                # Only ordinary errors (socket/ssl failures and the like) are
                # treated as "client went away". KeyboardInterrupt and
                # SystemExit are not Exception subclasses and propagate.
                self._proxy_client_conn_open = False
                self.logger.warning(
                        '%s sending data to proxy client for url %s',
                        e, self.url)
                self.logger.info(
                        'will continue downloading from remote server without '
                        'sending to client %s', self.url)

        self.len += len(hunk)

    def read(self, size=-1):
        """Reads from the wrapped file, records the bytes, returns them."""
        hunk = self.fp.read(size)
        self._update(hunk)
        return hunk

    def readinto(self, b):
        """
        Reads into buffer b from the wrapped file, records the bytes that
        were read, returns their count.
        """
        n = self.fp.readinto(b)
        self._update(b[:n])
        return n

    def readline(self, size=-1):
        """Reads one line from the wrapped file, records and returns it."""
        # XXX depends on implementation details of self.fp.readline(), in
        # particular that it doesn't call self.fp.read()
        hunk = self.fp.readline(size)
        self._update(hunk)
        return hunk

    def flush(self):
        return self.fp.flush()

    def close(self):
        # Closes only the wrapped file; self.tempfile is left open.
        return self.fp.close()

    def __len__(self):
        """Total number of bytes recorded so far."""
        return self.len

    def payload_size(self):
        """
        Number of bytes recorded since payload_starts_now() was called, or 0
        if it has not been called.
        """
        if self.payload_offset is not None:
            return self.len - self.payload_offset
        else:
            return 0
|
||||
|
||||
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
    '''
    HTTPResponse subclass that reads the remote server's response through a
    ProxyingRecorder, so that the raw bytes are recorded and relayed to the
    proxy client as they are read.
    '''
    def __init__(
            self, sock, debuglevel=0, method=None, proxy_client=None,
            digest_algorithm='sha1', url=None):
        http_client.HTTPResponse.__init__(
                self, sock, debuglevel=debuglevel, method=method)
        self.proxy_client = proxy_client
        self.url = url

        # HTTPResponse sets self.fp=None after it finishes reading, but the
        # recorder is still needed afterwards, so hold a second reference.
        recorder = ProxyingRecorder(
                self.fp, proxy_client, digest_algorithm, url=url)
        self.recorder = recorder
        self.fp = recorder

    def begin(self):
        '''
        Reads the status line and headers, sends a rewritten status line and
        header block to the proxy client, then tells the recorder that the
        payload starts here.
        '''
        http_client.HTTPResponse.begin(self) # reads status line, headers

        # headers that are not passed along to the proxy client
        withheld = (
                'connection', 'proxy-connection', 'keep-alive',
                'proxy-authenticate', 'proxy-authorization', 'upgrade',
                'strict-transport-security')

        pieces = ['HTTP/1.1 {} {}\r\n'.format(self.status, self.reason)]
        for name, value in self.msg.items():
            if name.lower() in withheld:
                continue
            pieces.append('{}: {}\r\n'.format(name, value))
        pieces.append('Connection: close\r\n\r\n')

        head = ''.join(pieces)
        self.proxy_client.sendall(head.encode('latin1'))

        self.recorder.payload_starts_now()
|
||||
|
||||
class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
'''
|
||||
An http proxy implementation of BaseHTTPRequestHandler, that acts as a
|
||||
man-in-the-middle in order to peek at the content of https transactions,
|
||||
and records the bytes in transit as it proxies them.
|
||||
'''
|
||||
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
|
||||
|
||||
def __init__(self, request, client_address, server):
|
||||
@ -76,22 +203,23 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
)
|
||||
)
|
||||
|
||||
def _connect_to_host(self):
|
||||
def _connect_to_remote_server(self):
|
||||
# Connect to destination
|
||||
if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
|
||||
self.logger.info("using tor socks proxy at %s:%s to connect to %s",
|
||||
self.onion_tor_socks_proxy_host,
|
||||
self.onion_tor_socks_proxy_port or 1080,
|
||||
self.hostname)
|
||||
self._proxy_sock = socks.socksocket()
|
||||
self._proxy_sock.set_proxy(socks.SOCKS5,
|
||||
addr=self.onion_tor_socks_proxy_host,
|
||||
port=self.onion_tor_socks_proxy_port, rdns=True)
|
||||
self._remote_server_sock = socks.socksocket()
|
||||
self._remote_server_sock.set_proxy(
|
||||
socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
|
||||
port=self.onion_tor_socks_proxy_port, rdns=True)
|
||||
else:
|
||||
self._proxy_sock = socket.socket()
|
||||
self._remote_server_sock = socket.socket()
|
||||
|
||||
self._proxy_sock.settimeout(60) # XXX what value should this have?
|
||||
self._proxy_sock.connect((self.hostname, int(self.port)))
|
||||
# XXX what value should this timeout have?
|
||||
self._remote_server_sock.settimeout(60)
|
||||
self._remote_server_sock.connect((self.hostname, int(self.port)))
|
||||
|
||||
# Wrap socket if SSL is required
|
||||
if self.is_connect:
|
||||
@ -99,12 +227,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
context = ssl.create_default_context()
|
||||
context.check_hostname = False
|
||||
context.verify_mode = ssl.CERT_NONE
|
||||
self._proxy_sock = context.wrap_socket(self._proxy_sock, server_hostname=self.hostname)
|
||||
self._remote_server_sock = context.wrap_socket(
|
||||
self._remote_server_sock, server_hostname=self.hostname)
|
||||
except AttributeError:
|
||||
try:
|
||||
self._proxy_sock = ssl.wrap_socket(self._proxy_sock)
|
||||
self._remote_server_sock = ssl.wrap_socket(
|
||||
self._remote_server_sock)
|
||||
except ssl.SSLError:
|
||||
self.logger.warn("failed to establish ssl connection to {}; python ssl library does not support SNI, considering upgrading to python >= 2.7.9 or python 3.4".format(self.hostname))
|
||||
self.logger.warn(
|
||||
"failed to establish ssl connection to %s; python "
|
||||
"ssl library does not support SNI, considering "
|
||||
"upgrading to python >= 2.7.9 or python 3.4",
|
||||
self.hostname)
|
||||
raise
|
||||
|
||||
def _transition_to_ssl(self):
|
||||
@ -112,11 +246,25 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))
|
||||
|
||||
def do_CONNECT(self):
|
||||
'''
|
||||
Handles a http CONNECT request.
|
||||
|
||||
The CONNECT method is meant to "convert the request connection to a
|
||||
transparent TCP/IP tunnel, usually to facilitate SSL-encrypted
|
||||
communication (HTTPS) through an unencrypted HTTP proxy" (Wikipedia).
|
||||
|
||||
do_CONNECT is where the man-in-the-middle logic happens. In do_CONNECT
|
||||
the proxy transitions the proxy client connection to ssl while
|
||||
masquerading as the remote web server using a generated certificate.
|
||||
Meanwhile makes its own separate ssl connection to the remote web
|
||||
server. Then it calls self.handle_one_request() again to handle the
|
||||
request intended for the remote server.
|
||||
'''
|
||||
self.is_connect = True
|
||||
try:
|
||||
# Connect to destination first
|
||||
self._determine_host_port()
|
||||
self._connect_to_host()
|
||||
self._connect_to_remote_server()
|
||||
|
||||
# If successful, let's do this!
|
||||
self.send_response(200, 'Connection established')
|
||||
@ -161,7 +309,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
try:
|
||||
# Connect to destination
|
||||
self._determine_host_port()
|
||||
self._connect_to_host()
|
||||
self._connect_to_remote_server()
|
||||
assert self.url
|
||||
except Exception as e:
|
||||
self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e))
|
||||
@ -178,7 +326,68 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
raise
|
||||
|
||||
def _proxy_request(self):
|
||||
raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
|
||||
'''
|
||||
Sends the request to the remote server, then uses a ProxyingRecorder to
|
||||
read the response and send it to the proxy client, while recording the
|
||||
bytes in transit. Returns a tuple (request, response) where request is
|
||||
the raw request bytes, and response is a ProxyingRecorder.
|
||||
'''
|
||||
# Build request
|
||||
req_str = '{} {} {}\r\n'.format(
|
||||
self.command, self.path, self.request_version)
|
||||
|
||||
# Swallow headers that don't make sense to forward on, i.e. most
|
||||
# hop-by-hop headers, see
|
||||
# http://tools.ietf.org/html/rfc2616#section-13.5.
|
||||
# self.headers is an email.message.Message, which is case-insensitive
|
||||
# and doesn't throw KeyError in __delitem__
|
||||
for key in (
|
||||
'Connection', 'Proxy-Connection', 'Keep-Alive',
|
||||
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
|
||||
del self.headers[key]
|
||||
|
||||
# Add headers to the request
|
||||
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
|
||||
req_str += '\r\n'.join(
|
||||
'{}: {}'.format(k,v) for (k,v) in self.headers.items())
|
||||
|
||||
req = req_str.encode('latin1') + b'\r\n\r\n'
|
||||
|
||||
# Append message body if present to the request
|
||||
if 'Content-Length' in self.headers:
|
||||
req += self.rfile.read(int(self.headers['Content-Length']))
|
||||
|
||||
try:
|
||||
self.logger.debug('sending to remote server req=%s', repr(req))
|
||||
|
||||
# Send it down the pipe!
|
||||
self._remote_server_sock.sendall(req)
|
||||
|
||||
prox_rec_res = ProxyingRecordingHTTPResponse(
|
||||
self._remote_server_sock, proxy_client=self.connection,
|
||||
digest_algorithm=self.server.digest_algorithm,
|
||||
url=self.url)
|
||||
prox_rec_res.begin()
|
||||
|
||||
buf = prox_rec_res.read(8192)
|
||||
while buf != b'':
|
||||
buf = prox_rec_res.read(8192)
|
||||
|
||||
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
||||
except socket.timeout as e:
|
||||
self.logger.warn(
|
||||
"%s proxying %s %s", repr(e), self.command, self.url)
|
||||
except BaseException as e:
|
||||
self.logger.error(
|
||||
"%s proxying %s %s", repr(e), self.command, self.url,
|
||||
exc_info=True)
|
||||
finally:
|
||||
# Let's close off the remote end
|
||||
if prox_rec_res:
|
||||
prox_rec_res.close()
|
||||
self._remote_server_sock.close()
|
||||
|
||||
return req, prox_rec_res
|
||||
|
||||
def __getattr__(self, item):
|
||||
if item.startswith('do_'):
|
||||
|
@ -1,24 +1,24 @@
|
||||
#
|
||||
# warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic,
|
||||
# enqueue info on the recorded url queue
|
||||
#
|
||||
# Copyright (C) 2013-2016 Internet Archive
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
# USA.
|
||||
#
|
||||
'''
|
||||
warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic,
|
||||
enqueue info on the recorded url queue
|
||||
|
||||
Copyright (C) 2013-2016 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
USA.
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@ -34,15 +34,9 @@ try:
|
||||
import queue
|
||||
except ImportError:
|
||||
import Queue as queue
|
||||
try:
|
||||
import http.client as http_client
|
||||
except ImportError:
|
||||
import httplib as http_client
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import traceback
|
||||
import hashlib
|
||||
import json
|
||||
import socket
|
||||
from hanzo import warctools
|
||||
@ -52,112 +46,6 @@ import datetime
|
||||
import concurrent.futures
|
||||
import resource
|
||||
|
||||
class ProxyingRecorder(object):
|
||||
"""
|
||||
Wraps a socket._fileobject, recording the bytes as they are read,
|
||||
calculating digests, and sending them on to the proxy client.
|
||||
"""
|
||||
|
||||
logger = logging.getLogger("warcprox.warcproxy.ProxyingRecorder")
|
||||
|
||||
def __init__(self, fp, proxy_dest, digest_algorithm='sha1', url=None):
|
||||
self.fp = fp
|
||||
# "The file has no name, and will cease to exist when it is closed."
|
||||
self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
|
||||
self.digest_algorithm = digest_algorithm
|
||||
self.block_digest = hashlib.new(digest_algorithm)
|
||||
self.payload_offset = None
|
||||
self.payload_digest = None
|
||||
self.proxy_dest = proxy_dest
|
||||
self._proxy_dest_conn_open = True
|
||||
self._prev_hunk_last_two_bytes = b''
|
||||
self.len = 0
|
||||
self.url = url
|
||||
|
||||
def payload_starts_now(self):
|
||||
self.payload_digest = hashlib.new(self.digest_algorithm)
|
||||
self.payload_offset = self.len
|
||||
|
||||
def _update_payload_digest(self, hunk):
|
||||
if self.payload_digest:
|
||||
self.payload_digest.update(hunk)
|
||||
|
||||
def _update(self, hunk):
|
||||
self._update_payload_digest(hunk)
|
||||
self.block_digest.update(hunk)
|
||||
|
||||
self.tempfile.write(hunk)
|
||||
|
||||
if self.payload_digest and self._proxy_dest_conn_open:
|
||||
try:
|
||||
self.proxy_dest.sendall(hunk)
|
||||
except BaseException as e:
|
||||
self._proxy_dest_conn_open = False
|
||||
self.logger.warn('{} sending data to proxy client for url {}'.format(e, self.url))
|
||||
self.logger.info('will continue downloading from remote server without sending to client {}'.format(self.url))
|
||||
|
||||
self.len += len(hunk)
|
||||
|
||||
def read(self, size=-1):
|
||||
hunk = self.fp.read(size)
|
||||
self._update(hunk)
|
||||
return hunk
|
||||
|
||||
def readinto(self, b):
|
||||
n = self.fp.readinto(b)
|
||||
self._update(b[:n])
|
||||
return n
|
||||
|
||||
def readline(self, size=-1):
|
||||
# XXX depends on implementation details of self.fp.readline(), in
|
||||
# particular that it doesn't call self.fp.read()
|
||||
hunk = self.fp.readline(size)
|
||||
self._update(hunk)
|
||||
return hunk
|
||||
|
||||
def flush(self):
|
||||
return self.fp.flush()
|
||||
|
||||
def close(self):
|
||||
return self.fp.close()
|
||||
|
||||
def __len__(self):
|
||||
return self.len
|
||||
|
||||
def payload_size(self):
|
||||
if self.payload_offset is not None:
|
||||
return self.len - self.payload_offset
|
||||
else:
|
||||
return 0
|
||||
|
||||
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
|
||||
|
||||
def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1', url=None):
|
||||
http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method)
|
||||
self.proxy_dest = proxy_dest
|
||||
self.url = url
|
||||
|
||||
# Keep around extra reference to self.fp because HTTPResponse sets
|
||||
# self.fp=None after it finishes reading, but we still need it
|
||||
self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm, url=url)
|
||||
self.fp = self.recorder
|
||||
|
||||
def begin(self):
|
||||
http_client.HTTPResponse.begin(self) # reads status line, headers
|
||||
|
||||
status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(self.status, self.reason)
|
||||
for k,v in self.msg.items():
|
||||
if k.lower() not in (
|
||||
'connection', 'proxy-connection', 'keep-alive',
|
||||
'proxy-authenticate', 'proxy-authorization', 'upgrade',
|
||||
'strict-transport-security'):
|
||||
status_and_headers += '{}: {}\r\n'.format(k, v)
|
||||
status_and_headers += 'Connection: close\r\n\r\n'
|
||||
self.proxy_dest.sendall(status_and_headers.encode('latin1'))
|
||||
|
||||
self.recorder.payload_starts_now()
|
||||
|
||||
|
||||
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
# self.server is WarcProxy
|
||||
logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
|
||||
@ -187,96 +75,63 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
return False
|
||||
|
||||
def _proxy_request(self):
|
||||
# Build request
|
||||
req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version)
|
||||
|
||||
warcprox_meta = None
|
||||
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
|
||||
if raw_warcprox_meta:
|
||||
warcprox_meta = json.loads(raw_warcprox_meta)
|
||||
del self.headers['Warcprox-Meta']
|
||||
|
||||
if self._enforce_limits(warcprox_meta):
|
||||
return
|
||||
|
||||
# Swallow headers that don't make sense to forward on, i.e. most
|
||||
# hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5
|
||||
# self.headers is an email.message.Message, which is case-insensitive
|
||||
# and doesn't throw KeyError in __delitem__
|
||||
for key in ('Connection', 'Proxy-Connection', 'Keep-Alive',
|
||||
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade',
|
||||
'Warcprox-Meta'):
|
||||
del self.headers[key]
|
||||
remote_ip = self._remote_server_sock.getpeername()[0]
|
||||
timestamp = datetime.datetime.utcnow()
|
||||
|
||||
# Add headers to the request
|
||||
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
|
||||
req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items())
|
||||
req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
|
||||
self)
|
||||
|
||||
req = req_str.encode('latin1') + b'\r\n\r\n'
|
||||
|
||||
# Append message body if present to the request
|
||||
if 'Content-Length' in self.headers:
|
||||
req += self.rfile.read(int(self.headers['Content-Length']))
|
||||
|
||||
prox_rec_res = None
|
||||
recorded_url = None
|
||||
try:
|
||||
self.logger.debug('sending to remote server req=%s', repr(req))
|
||||
|
||||
# warc-date "shall represent the instant that data capture for record creation began"
|
||||
timestamp = datetime.datetime.utcnow()
|
||||
|
||||
# Send it down the pipe!
|
||||
self._proxy_sock.sendall(req)
|
||||
|
||||
# We want HTTPResponse's smarts about http and handling of
|
||||
# non-compliant servers. But HTTPResponse.read() doesn't return the raw
|
||||
# bytes read from the server, it unchunks them if they're chunked, and
|
||||
# might do other stuff. We want to send the raw bytes back to the
|
||||
# client. So we ignore the values returned by prox_rec_res.read() below. Instead
|
||||
# the ProxyingRecordingHTTPResponse takes care of sending the raw bytes
|
||||
# to the proxy client.
|
||||
|
||||
# Proxy and record the response
|
||||
prox_rec_res = ProxyingRecordingHTTPResponse(self._proxy_sock,
|
||||
proxy_dest=self.connection,
|
||||
digest_algorithm=self.server.digest_algorithm,
|
||||
url=self.url)
|
||||
prox_rec_res.begin()
|
||||
|
||||
remote_ip=self._proxy_sock.getpeername()[0]
|
||||
|
||||
buf = prox_rec_res.read(8192)
|
||||
while buf != b'':
|
||||
buf = prox_rec_res.read(8192)
|
||||
|
||||
recorded_url = RecordedUrl(url=self.url, request_data=req,
|
||||
response_recorder=prox_rec_res.recorder,
|
||||
remote_ip=remote_ip, warcprox_meta=warcprox_meta,
|
||||
status=prox_rec_res.status, size=prox_rec_res.recorder.len,
|
||||
client_ip=self.client_address[0],
|
||||
content_type=prox_rec_res.getheader("Content-Type"),
|
||||
method=self.command, timestamp=timestamp,
|
||||
host=self.hostname, duration=datetime.datetime.utcnow()-timestamp)
|
||||
self.server.recorded_url_q.put(recorded_url)
|
||||
|
||||
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
||||
except socket.timeout as e:
|
||||
self.logger.warn("%s proxying %s %s", repr(e), self.command, self.url)
|
||||
except BaseException as e:
|
||||
self.logger.error("%s proxying %s %s", repr(e), self.command, self.url, exc_info=True)
|
||||
finally:
|
||||
# Let's close off the remote end
|
||||
if prox_rec_res:
|
||||
prox_rec_res.close()
|
||||
self._proxy_sock.close()
|
||||
recorded_url = RecordedUrl(
|
||||
url=self.url, request_data=req,
|
||||
response_recorder=prox_rec_res.recorder, remote_ip=remote_ip,
|
||||
warcprox_meta=warcprox_meta, status=prox_rec_res.status,
|
||||
size=prox_rec_res.recorder.len,
|
||||
client_ip=self.client_address[0],
|
||||
content_type=prox_rec_res.getheader("Content-Type"),
|
||||
method=self.command, timestamp=timestamp, host=self.hostname,
|
||||
duration=datetime.datetime.utcnow()-timestamp)
|
||||
self.server.recorded_url_q.put(recorded_url)
|
||||
|
||||
return recorded_url
|
||||
|
||||
# deprecated
|
||||
def do_PUTMETA(self):
    '''
    Deprecated handler for the special warcprox PUTMETA request.

    Behaves like a WARCPROX_WRITE_RECORD request whose WARC-Type is
    metadata: it simply forwards to do_WARCPROX_WRITE_RECORD() with the
    warc type fixed to warctools.WarcRecord.METADATA.
    '''
    metadata_type = warctools.WarcRecord.METADATA
    self.do_WARCPROX_WRITE_RECORD(warc_type=metadata_type)
|
||||
|
||||
def do_WARCPROX_WRITE_RECORD(self, warc_type=None):
|
||||
'''
|
||||
Handles a request with http method WARCPROX_WRITE_RECORD, a special
|
||||
type of request which tells warcprox to construct a warc record from
|
||||
the request more or less verbatim, and write it to a warc.
|
||||
|
||||
To honor the request, this method creates a RecordedUrl and queues it for
|
||||
the WarcWriterThread to process. The warc record headers Content-Type
|
||||
and WARC-Type are taken from the request headers, as is the payload.
|
||||
|
||||
Example request:
|
||||
|
||||
WARCPROX_WRITE_RECORD screenshot:https://example.com/ HTTP/1.1
|
||||
WARC-Type: metadata
|
||||
Content-Type: image/png
|
||||
Content-Length: 12345
|
||||
Connection: close
|
||||
|
||||
<png image data>
|
||||
'''
|
||||
try:
|
||||
self.url = self.path
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user