From 7aed867c90873f5aa41a5c3841a757f9080bd690 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Aug 2017 11:30:52 -0700 Subject: [PATCH] disallow slash and backslash in warc-prefix --- tests/test_warcprox.py | 13 ++++++++ warcprox/crawl_log.py | 68 ++++++++++++++++++++++++++++++++++++++++++ warcprox/warcproxy.py | 21 ++++++++++--- 3 files changed, 98 insertions(+), 4 deletions(-) create mode 100644 warcprox/crawl_log.py diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index dd80a86..fb908d9 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1429,6 +1429,19 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback elif record.rec_type == 'request': assert record.http_headers.get_header('via') == '1.1 warcprox' +def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies): + url = 'http://localhost:%s/b/b' % http_daemon.server_port + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})} + response = requests.get(url, proxies=archiving_proxies, headers=headers) + assert response.status_code == 500 + assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix' + + url = 'http://localhost:%s/b/c' % http_daemon.server_port + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})} + response = requests.get(url, proxies=archiving_proxies, headers=headers) + assert response.status_code == 500 + assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix' + if __name__ == '__main__': pytest.main() diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py new file mode 100644 index 0000000..4c04563 --- /dev/null +++ b/warcprox/crawl_log.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +''' +warcprox/crawl_log.py - heritrix-style crawl logger + +Copyright (C) 2017 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under 
the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' +import logging +import datetime +import json + +class CrawlLogger(object): + def __init__(self, dir_): + self.dir = dir_ + + def notify(self, recorded_url, records): + # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"} + now = datetime.datetime.utcnow() + extra_info = { + 'contentSize': recorded_url.size, + 'warcFilename': records[0].warc_filename, + 'warcFileOffset': records[0].offset, + } + fields = [ + '{:%Y-%m-%dT%H:%M:%S}.{:03d}'.format(now, now.microsecond//1000), + '% 5s' % recorded_url.status, + '% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset), + recorded_url.url, + '-', # hop path + recorded_url.referer or '-', + recorded_url.mimetype, + '-', + '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format( + recorded_url.timestamp, recorded_url.microsecond//1000, + recorded_url.duration.microseconds//1000), + warcprox.digest_str( + recorded_url.response_recorder.payload_digest, True), + recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'), + 'duplicate:digest' if records[0].type == b'revisit' else 
'0', + json.dumps(extra_info, separators=(',',':')), + ] + for i in range(len(fields)): + # `fields` is a mix of `bytes` and `unicode`, make them all `bytes` + try: + fields[i] = fields[i].encode('utf-8') + except: + pass + line = b' '.join(fields) + + if 'warc-prefix' in recorded_url.warcprox_meta: + filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix'] + os.path.join( + self.dir, ) + diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b6c96d6..6cbc9e4 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -156,16 +156,29 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): limit_key, limit_value = item self._enforce_limit(limit_key, limit_value, soft=True) + def _security_check(self, warcprox_meta): + ''' + Raises an exception if `warcprox_meta` specifies a 'warc-prefix' and + the 'warc-prefix' contains a slash or backslash. + ''' + if warcprox_meta and 'warc-prefix' in warcprox_meta and ( + '/' in warcprox_meta['warc-prefix'] + or '\\' in warcprox_meta['warc-prefix']): + raise Exception( + "request rejected by warcprox: slash and backslash are not " + "permitted in warc-prefix") + def _connect_to_remote_server(self): ''' - Wraps MitmProxyHandler._connect_to_remote_server, first enforcing + Wraps `MitmProxyHandler._connect_to_remote_server`, first enforcing limits and block rules in the Warcprox-Meta request header, if any. - Raises warcprox.RequestBlockedByRule if a rule has been enforced. - Otherwise calls MitmProxyHandler._connect_to_remote_server, which - initializes self._remote_server_sock. + Raises `warcprox.RequestBlockedByRule` if a rule has been enforced. + Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which + initializes `self._remote_server_sock`. 
''' if 'Warcprox-Meta' in self.headers: warcprox_meta = json.loads(self.headers['Warcprox-Meta']) + self._security_check(warcprox_meta) self._enforce_limits(warcprox_meta) self._enforce_blocks(warcprox_meta) return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)