disallow slash and backslash in warc-prefix

This commit is contained in:
Noah Levitt 2017-08-07 11:30:52 -07:00
parent 0cf283f058
commit 7aed867c90
3 changed files with 98 additions and 4 deletions

View File

@ -1429,6 +1429,19 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
elif record.rec_type == 'request':
assert record.http_headers.get_header('via') == '1.1 warcprox'
def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
    '''
    Requests whose Warcprox-Meta 'warc-prefix' contains a slash or a
    backslash must be rejected by the proxy with a 500 and an explanatory
    reason phrase.
    '''
    rejection_reason = 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'

    # (url template, offending warc-prefix): one forward-slash case and one
    # backslash case, each fetched from a distinct url
    cases = (
        ('http://localhost:%s/b/b', "../../../../etc/a"),
        ('http://localhost:%s/b/c', "..\\..\\..\\derp\\monkey"),
    )

    for url_template, bad_prefix in cases:
        url = url_template % http_daemon.server_port
        headers = {"Warcprox-Meta": json.dumps({"warc-prefix": bad_prefix})}
        response = requests.get(url, proxies=archiving_proxies, headers=headers)
        assert response.status_code == 500
        assert response.reason == rejection_reason
# Hand control to pytest when this test module is executed directly as a
# script.
if __name__ == '__main__':
    pytest.main()

68
warcprox/crawl_log.py Normal file
View File

@ -0,0 +1,68 @@
#!/usr/bin/env python
'''
warcprox/crawl_log.py - heritrix-style crawl logger
Copyright (C) 2017 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
import datetime
import json
import logging
import os

import warcprox
class CrawlLogger(object):
    '''
    Appends one heritrix-style crawl log line per recorded url to a log file
    inside a configured directory.
    '''
    def __init__(self, dir_):
        '''
        Args:
            dir_: directory in which the crawl log files are written
        '''
        self.dir = dir_

    @staticmethod
    def _to_bytes(field):
        '''
        Returns `field` as `bytes`: `bytes` values are passed through
        untouched, text values are encoded as utf-8.
        '''
        if isinstance(field, bytes):
            return field
        return field.encode('utf-8')

    def notify(self, recorded_url, records):
        '''
        Formats a crawl log line for `recorded_url` and appends it to the
        crawl log.

        The line goes to `<warc-prefix>.log` in `self.dir` when the request's
        warcprox_meta specifies a 'warc-prefix', and to `crawl.log` otherwise.

        Args:
            recorded_url: the captured url; its status, url, referer,
                mimetype, timestamp, duration, warcprox_meta and
                response_recorder attributes are read
            records: warc records written for this url; only the first one
                is consulted (type, warc_filename, offset)
        '''
        # example of the target format:
        # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
        now = datetime.datetime.utcnow()
        response_recorder = recorded_url.response_recorder
        extra_info = {
            'contentSize': recorded_url.size,
            'warcFilename': records[0].warc_filename,
            'warcFileOffset': records[0].offset,
        }
        fields = [
            # utc log time with millisecond precision, 'Z'-terminated as in
            # the example above
            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
            '% 5s' % recorded_url.status,
            # payload length: everything recorded after the payload offset
            '% 10s' % (
                response_recorder.len - response_recorder.payload_offset),
            recorded_url.url,
            '-', # hop path
            recorded_url.referer or '-',
            # a missing mimetype would otherwise break the join below
            recorded_url.mimetype or '-',
            '-',
            # fetch start time (to the millisecond) + fetch duration; the
            # millisecond part is taken from the same timestamp that is
            # being formatted
            # NOTE(review): `duration.microseconds` is presumably a
            # timedelta attribute, i.e. only the sub-second part of the
            # duration -- confirm whether whole seconds should be included
            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                recorded_url.timestamp,
                recorded_url.timestamp.microsecond//1000,
                recorded_url.duration.microseconds//1000),
            warcprox.digest_str(response_recorder.payload_digest, True),
            recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
            'duplicate:digest' if records[0].type == b'revisit' else '0',
            json.dumps(extra_info, separators=(',',':')),
        ]
        # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
        line = b' '.join(self._to_bytes(field) for field in fields) + b'\n'

        # NOTE(review): the prefix is used verbatim in the file name; this
        # assumes it has already been vetted for path separators -- confirm
        if 'warc-prefix' in recorded_url.warcprox_meta:
            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
        else:
            filename = 'crawl.log'

        # append in binary mode, the line is already encoded
        with open(os.path.join(self.dir, filename), 'ab') as f:
            f.write(line)

View File

@ -156,16 +156,29 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=True)
def _security_check(self, warcprox_meta):
'''
Sends a 400 if `warcprox_meta` specifies a 'warc-prefix' and the
'warc-prefix' contains a slash or backslash.
'''
if warcprox_meta and 'warc-prefix' in warcprox_meta and (
'/' in warcprox_meta['warc-prefix']
or '\\' in warcprox_meta['warc-prefix']):
raise Exception(
"request rejected by warcprox: slash and backslash are not "
"permitted in warc-prefix")
def _connect_to_remote_server(self):
    '''
    Wraps `MitmProxyHandler._connect_to_remote_server`, first running the
    security check on, and enforcing the limits and block rules in, the
    Warcprox-Meta request header, if any.

    Raises whatever `self._security_check` raises if the header is rejected.
    Raises `warcprox.RequestBlockedByRule` if a rule has been enforced.
    Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which
    initializes `self._remote_server_sock`.
    '''
    if 'Warcprox-Meta' in self.headers:
        # the header value is a json document
        warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
        # the security check runs before any limit or block rule is
        # evaluated
        self._security_check(warcprox_meta)
        self._enforce_limits(warcprox_meta)
        self._enforce_blocks(warcprox_meta)
    return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)