mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
disallow slash and backslash in warc-prefix
This commit is contained in:
parent
0cf283f058
commit
7aed867c90
@ -1429,6 +1429,19 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
|
|||||||
elif record.rec_type == 'request':
|
elif record.rec_type == 'request':
|
||||||
assert record.http_headers.get_header('via') == '1.1 warcprox'
|
assert record.http_headers.get_header('via') == '1.1 warcprox'
|
||||||
|
|
||||||
|
def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
    '''
    Checks that requests whose Warcprox-Meta 'warc-prefix' contains a slash
    or a backslash are answered with a 500 carrying the rejection message.
    '''
    rejection = 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
    # (url path, offending warc-prefix): one forward-slash case and one
    # backslash case
    cases = (
        ('/b/b', '../../../../etc/a'),
        ('/b/c', '..\\..\\..\\derp\\monkey'),
    )
    for path, prefix in cases:
        url = 'http://localhost:%s%s' % (http_daemon.server_port, path)
        headers = {"Warcprox-Meta": json.dumps({"warc-prefix": prefix})}
        response = requests.get(url, proxies=archiving_proxies, headers=headers)
        assert response.status_code == 500
        assert response.reason == rejection
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pytest.main()
|
pytest.main()
|
||||||
|
|
||||||
|
68
warcprox/crawl_log.py
Normal file
68
warcprox/crawl_log.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
'''
|
||||||
|
warcprox/crawl_log.py - heritrix-style crawl logger
|
||||||
|
|
||||||
|
Copyright (C) 2017 Internet Archive
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU General Public License
|
||||||
|
as published by the Free Software Foundation; either version 2
|
||||||
|
of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||||
|
USA.
|
||||||
|
'''
|
||||||
|
import logging
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
|
||||||
|
class CrawlLogger(object):
    '''
    Heritrix-style crawl logger: appends one log line per archived url to a
    log file inside a configured directory.
    '''

    def __init__(self, dir_):
        '''
        Args:
            dir_: directory in which the crawl log files are written
        '''
        self.dir = dir_

    def notify(self, recorded_url, records):
        '''
        Formats a crawl log line for `recorded_url` and appends it to the
        crawl log.

        The line goes to `<warc-prefix>.log` inside `self.dir` when the
        request's warcprox_meta specifies a 'warc-prefix', and to `crawl.log`
        otherwise.

        Args:
            recorded_url: the archived url; this method reads its `size`,
                `status`, `response_recorder` (`len`, `payload_offset`,
                `payload_digest`), `url`, `referer`, `mimetype`, `timestamp`,
                `duration` and `warcprox_meta` attributes
            records: the warc records written for the url; `records[0]`
                supplies `warc_filename`, `offset` and `type`
        '''
        # Imported locally: this method needs both names and the module-level
        # imports do not provide them.
        import os
        import warcprox

        # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
        now = datetime.datetime.utcnow()
        extra_info = {
            'contentSize': recorded_url.size,
            'warcFilename': records[0].warc_filename,
            'warcFileOffset': records[0].offset,
        }
        recorder = recorded_url.response_recorder
        fetch_start = recorded_url.timestamp
        fields = [
            # NOTE(review): the sample line above ends this timestamp with
            # 'Z'; this format does not emit it -- confirm which is wanted.
            '{:%Y-%m-%dT%H:%M:%S}.{:03d}'.format(now, now.microsecond//1000),
            '% 5s' % recorded_url.status,
            '% 10s' % (recorder.len - recorder.payload_offset),
            recorded_url.url,
            '-', # hop path
            recorded_url.referer or '-',
            # same placeholder as for a missing referer, so that a missing
            # mimetype cannot break the join below
            recorded_url.mimetype or '-',
            '-',
            # milliseconds are taken from the fetch timestamp itself, in
            # parallel with the first field
            # NOTE(review): `duration.microseconds` looks like the
            # sub-second component only -- confirm for fetches over 1s.
            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                fetch_start, fetch_start.microsecond//1000,
                recorded_url.duration.microseconds//1000),
            warcprox.digest_str(recorder.payload_digest, True),
            recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
            'duplicate:digest' if records[0].type == b'revisit' else '0',
            json.dumps(extra_info, separators=(',',':')),
        ]
        # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
        encoded = [
            field if isinstance(field, bytes) else field.encode('utf-8')
            for field in fields]
        line = b' '.join(encoded) + b'\n'

        # NOTE(review): 'warc-prefix' comes from the request's Warcprox-Meta;
        # this assumes path separators in it are rejected before the url
        # gets here -- confirm.
        if 'warc-prefix' in recorded_url.warcprox_meta:
            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
        else:
            filename = 'crawl.log'
        crawl_log_path = os.path.join(self.dir, filename)
        # binary append: `line` is bytes, and earlier entries must be kept
        with open(crawl_log_path, 'ab') as f:
            f.write(line)
|
||||||
|
|
@ -156,16 +156,29 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
limit_key, limit_value = item
|
limit_key, limit_value = item
|
||||||
self._enforce_limit(limit_key, limit_value, soft=True)
|
self._enforce_limit(limit_key, limit_value, soft=True)
|
||||||
|
|
||||||
|
def _security_check(self, warcprox_meta):
|
||||||
|
'''
|
||||||
|
Sends a 400 if `warcprox_meta` specifies a 'warc-prefix' and the
|
||||||
|
'warc-prefix' contains a slash or backslash.
|
||||||
|
'''
|
||||||
|
if warcprox_meta and 'warc-prefix' in warcprox_meta and (
|
||||||
|
'/' in warcprox_meta['warc-prefix']
|
||||||
|
or '\\' in warcprox_meta['warc-prefix']):
|
||||||
|
raise Exception(
|
||||||
|
"request rejected by warcprox: slash and backslash are not "
|
||||||
|
"permitted in warc-prefix")
|
||||||
|
|
||||||
def _connect_to_remote_server(self):
    '''
    Wraps `MitmProxyHandler._connect_to_remote_server`.

    If the request has a Warcprox-Meta header, it is parsed as json and
    run through the security check, then the limits, then the block rules,
    before connecting. Raises `warcprox.RequestBlockedByRule` if a rule has
    been enforced. Otherwise calls
    `MitmProxyHandler._connect_to_remote_server`, which initializes
    `self._remote_server_sock`.
    '''
    if 'Warcprox-Meta' in self.headers:
        meta = json.loads(self.headers['Warcprox-Meta'])
        # order matters: the security check runs before any limit or
        # block rule is evaluated
        for check in (
                self._security_check, self._enforce_limits,
                self._enforce_blocks):
            check(meta)
    base_handler = warcprox.mitmproxy.MitmProxyHandler
    return base_handler._connect_to_remote_server(self)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user