Merge branch 'do_not_archive' into qa

Barbara Miller 2018-02-20 16:01:55 -08:00
commit f202f12bc5
7 changed files with 52 additions and 6 deletions

setup.py

@@ -40,7 +40,7 @@ except:
setuptools.setup(
name='warcprox',
-version='2.4b2.dev149',
+version='2.4b2.dev150',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

tests/test_warcprox.py

@@ -181,6 +181,12 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
+ b'Content-Type: text/plain\r\n'
+ b'\r\n')
payload = b'This response is missing a Content-Length http header.'
+elif self.path == '/300k-content':
+payload = b'0123456789' * 30000
+headers = (b'HTTP/1.1 200 OK\r\n'
++ b'Content-Type: text/plain\r\n'
++ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
++ b'\r\n')
elif self.path.startswith('/test_payload_digest-'):
content_body = (
b'Hello. How are you. I am the test_payload_digest '
@@ -365,7 +371,8 @@ def warcprox_(request):
'--playback-port=0',
'--onion-tor-socks-proxy=localhost:9050',
'--crawl-log-dir=crawl-logs',
-'--socket-timeout=4']
+'--socket-timeout=4',
+'--max-resource-size=200000']
if request.config.getoption('--rethinkdb-dedup-url'):
argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
# test these here only
@@ -1211,6 +1218,23 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
+def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
+"""We try to load a 300k response but we use --max-resource-size=200000 in
+`warcprox_` so it will be truncated. We expect it to limit the result as
+soon as it passes the 200000 limit. As warcprox read() chunk size is 65536,
+the expected result size is 65536*4=262144.
+"""
+urls_before = warcprox_.proxy.running_stats.urls
+url = 'http://localhost:%s/300k-content' % http_daemon.server_port
+response = requests.get(
+url, proxies=archiving_proxies, verify=False, timeout=10)
+assert len(response.content) == 262144
+# wait for processing of this url to finish so that it doesn't interfere
+# with subsequent tests
+wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
def test_method_filter(
warcprox_, https_daemon, http_daemon, archiving_proxies,
playback_proxies):
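
For reference, the 262144 in the new test's docstring follows directly from the chunked read: warcprox pulls the body in 65536-byte pieces and stops at the first chunk boundary past the 200000-byte limit. A minimal sketch of that arithmetic (plain Python, not part of the commit):

    CHUNK = 65536    # warcprox read() chunk size cited in the docstring
    LIMIT = 200000   # --max-resource-size passed by the warcprox_ fixture

    # smallest multiple of CHUNK strictly greater than LIMIT
    expected = (LIMIT // CHUNK + 1) * CHUNK
    assert expected == 65536 * 4 == 262144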

warcprox/main.py

@@ -166,6 +166,9 @@ def _build_arg_parser(prog='warcprox'):
arg_parser.add_argument(
'--socket-timeout', dest='socket_timeout', type=float,
default=None, help=argparse.SUPPRESS)
+arg_parser.add_argument(
+'--max-resource-size', dest='max_resource_size', type=int,
+default=None, help='maximum resource size limit in bytes')
arg_parser.add_argument(
'--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
'if specified, write crawl log files in the specified '
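
The new flag is a plain argparse option: an int defaulting to None, so no limit is enforced unless the operator asks for one. A standalone sketch of how it parses (a bare ArgumentParser here, not warcprox's full _build_arg_parser):

    import argparse

    parser = argparse.ArgumentParser(prog='warcprox')
    parser.add_argument(
            '--max-resource-size', dest='max_resource_size', type=int,
            default=None, help='maximum resource size limit in bytes')

    args = parser.parse_args(['--max-resource-size=200000'])
    print(args.max_resource_size)   # 200000; stays None when the flag is omitted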

warcprox/mitmproxy.py

@@ -161,6 +161,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
self.fp = self.recorder
self.payload_digest = None
+self.truncated = None
def begin(self, extra_response_headers={}):
http_client.HTTPResponse.begin(self) # reads status line, headers
@@ -207,6 +208,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
_socket_timeout = 60
+_max_resource_size = None
def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
@@ -431,6 +433,13 @@
buf = prox_rec_res.read(65536)
while buf != b'':
buf = prox_rec_res.read(65536)
+if self._max_resource_size:
+if prox_rec_res.recorder.len > self._max_resource_size:
+prox_rec_res.truncated = b'length'
+self.logger.error(
+'Max resource size %d bytes exceeded for URL %s',
+self._max_resource_size, self.url)
+break
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
finally:
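
This hunk is the enforcement point: the proxy keeps relaying 65536-byte chunks and bails out as soon as the recorder's running length passes _max_resource_size, marking the response as truncated by length. A self-contained sketch of the pattern (toy code, not the real ProxyingRecordingHTTPResponse):

    import io

    CHUNK = 65536

    def read_with_limit(fp, max_resource_size):
        """Read fp in CHUNK-sized pieces; stop once the total passes the limit."""
        total, truncated = 0, None
        buf = fp.read(CHUNK)
        while buf != b'':
            total += len(buf)
            if max_resource_size and total > max_resource_size:
                truncated = b'length'   # mirrors prox_rec_res.truncated = b'length'
                break
            buf = fp.read(CHUNK)
        return total, truncated

    body = io.BytesIO(b'0123456789' * 30000)   # the 300,000-byte test payload
    print(read_with_limit(body, 200000))       # (262144, b'length')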

warcprox/version.txt (new file)

@@ -0,0 +1 @@
+1.4-20160105052702-f79e744

warcprox/warc.py

@@ -67,7 +67,8 @@ class WarcRecordBuilder:
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip,
payload_digest=warcprox.digest_str(
-recorded_url.payload_digest, self.base32))
+recorded_url.payload_digest, self.base32),
+truncated=recorded_url.truncated)
def build_warc_records(self, recorded_url):
"""Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
@@ -91,7 +92,7 @@ class WarcRecordBuilder:
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
profile=None, refers_to=None, refers_to_target_uri=None,
-refers_to_date=None, payload_digest=None):
+refers_to_date=None, payload_digest=None, truncated=None):
if warc_date is None:
warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
@@ -120,6 +121,9 @@
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
if payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
+# truncated value may be 'length' or 'time'
+if truncated is not None:
+headers.append((b'WARC-Truncated', truncated))
if recorder is not None:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
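
With this change a record cut short by the size limit carries a WARC-Truncated header next to the usual ones. A rough sketch of the resulting header list, using literal byte strings where build_warc_record uses the warctools.WarcRecord constants (the digest value is made up for illustration):

    def sketch_headers(payload_digest=None, truncated=None):
        headers = [(b'WARC-Type', b'response')]
        if payload_digest is not None:
            headers.append((b'WARC-Payload-Digest', payload_digest))
        # per the comment above, truncated is b'length' or b'time' when set
        if truncated is not None:
            headers.append((b'WARC-Truncated', truncated))
        return headers

    print(sketch_headers(payload_digest=b'sha1:...', truncated=b'length'))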

warcprox/warcproxy.py

@@ -221,7 +221,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
timestamp=timestamp, host=self.hostname,
duration=datetime.datetime.utcnow()-timestamp,
referer=self.headers.get('referer'),
-payload_digest=prox_rec_res.payload_digest)
+payload_digest=prox_rec_res.payload_digest,
+truncated=prox_rec_res.truncated)
self.server.recorded_url_q.put(recorded_url)
return recorded_url
@@ -330,7 +331,8 @@ class RecordedUrl:
warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None,
timestamp=None, host=None, duration=None, referer=None,
-payload_digest=None, warc_records=None, do_not_archive=False):
+payload_digest=None, truncated=None, warc_records=None,
+do_not_archive=False):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
@@ -369,6 +371,7 @@
self.duration = duration
self.referer = referer
self.payload_digest = payload_digest
+self.truncated = truncated
self.warc_records = warc_records
self.do_not_archive = do_not_archive
@@ -400,6 +403,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
if options.socket_timeout:
WarcProxyHandler._socket_timeout = options.socket_timeout
+if options.max_resource_size:
+WarcProxyHandler._max_resource_size = options.max_resource_size
http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True)
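
As with --socket-timeout just above it, the option is applied by assigning to a class attribute on the handler, so every per-connection WarcProxyHandler instance created by the HTTPServer sees the same limit without extra plumbing. A toy illustration of that pattern (stand-in classes, not warcprox's):

    class ToyHandler:
        _socket_timeout = 60
        _max_resource_size = None   # class-level default shared by all instances

    class ToyOptions:
        socket_timeout = None
        max_resource_size = 200000

    options = ToyOptions()
    if options.socket_timeout:
        ToyHandler._socket_timeout = options.socket_timeout
    if options.max_resource_size:
        ToyHandler._max_resource_size = options.max_resource_size

    print(ToyHandler()._max_resource_size)   # 200000: instances read the class value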