mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
disallow slash and backslash in warc-prefix
This commit is contained in:
parent
0cf283f058
commit
7aed867c90
@ -1429,6 +1429,19 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
|
||||
elif record.rec_type == 'request':
|
||||
assert record.http_headers.get_header('via') == '1.1 warcprox'
|
||||
|
||||
def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
    '''
    Checks that warcprox refuses a request whose Warcprox-Meta 'warc-prefix'
    contains a slash or a backslash: the proxied request is expected to come
    back with status 500 and the rejection message as the reason phrase.
    '''
    expected_reason = 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'

    # (url path, offending warc-prefix): one case with forward slashes, one
    # with backslashes
    cases = (
        ('/b/b', "../../../../etc/a"),
        ('/b/c', "..\\..\\..\\derp\\monkey"),
    )

    for path, bad_prefix in cases:
        url = 'http://localhost:%s%s' % (http_daemon.server_port, path)
        headers = {"Warcprox-Meta": json.dumps({"warc-prefix": bad_prefix})}
        response = requests.get(
                url, proxies=archiving_proxies, headers=headers)
        assert response.status_code == 500
        assert response.reason == expected_reason
|
||||
|
||||
if __name__ == '__main__':
    # lets this test module be executed directly as a script
    pytest.main()
|
||||
|
||||
|
68
warcprox/crawl_log.py
Normal file
68
warcprox/crawl_log.py
Normal file
@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
warcprox/crawl_log.py - heritrix-style crawl logger
|
||||
|
||||
Copyright (C) 2017 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
USA.
|
||||
'''
|
||||
import datetime
import json
import logging
import os

import warcprox
|
||||
|
||||
class CrawlLogger(object):
    '''
    Heritrix-style crawl logger.

    `notify()` formats one crawl log line for a recorded url and appends it
    to a log file inside `self.dir`.
    '''

    def __init__(self, dir_):
        '''
        `dir_` is the directory the crawl log files are written to. It is
        only stored here; nothing is created or opened until `notify()`.
        '''
        self.dir = dir_

    def notify(self, recorded_url, records):
        '''
        Appends a crawl log line describing `recorded_url` to the log file.

        `recorded_url` supplies the url, status, size, referer, mimetype,
        timing, payload digest and parsed Warcprox-Meta (`warcprox_meta`).
        `records` is the sequence of warc records written for the url; only
        the first one is consulted (warc filename, offset, and whether it is
        a revisit). If `records` is empty the warc filename and offset are
        left out of the trailing json blob.

        The line goes to `<warc-prefix>.log` if Warcprox-Meta has a
        'warc-prefix', otherwise to `crawl.log`.
        '''
        # example of the line format being imitated:
        # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
        now = datetime.datetime.utcnow()

        extra_info = {'contentSize': recorded_url.size}
        if records:
            extra_info['warcFilename'] = records[0].warc_filename
            extra_info['warcFileOffset'] = records[0].offset
        is_revisit = bool(records) and records[0].type == b'revisit'

        recorder = recorded_url.response_recorder
        fields = [
            # `now` comes from utcnow(), hence the literal Z as in the example
            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(
                now, now.microsecond // 1000),
            '% 5s' % recorded_url.status,
            '% 10s' % (recorder.len - recorder.payload_offset),
            recorded_url.url,
            '-',  # hop path
            recorded_url.referer or '-',
            # same placeholder as referer, so a missing mimetype cannot
            # break the join below
            recorded_url.mimetype or '-',
            '-',
            # fetch start time with milliseconds, then duration in ms
            # NOTE(review): milliseconds are taken from `timestamp` itself
            # (it is formatted as a datetime here); the earlier code read
            # `recorded_url.microsecond` -- confirm no such attribute is
            # meant to exist
            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                recorded_url.timestamp,
                recorded_url.timestamp.microsecond // 1000,
                recorded_url.duration.microseconds // 1000),
            warcprox.digest_str(recorder.payload_digest, True),
            recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
            'duplicate:digest' if is_revisit else '0',
            json.dumps(extra_info, separators=(',', ':')),
        ]

        # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
        line = b' '.join(
            field if isinstance(field, bytes) else field.encode('utf-8')
            for field in fields) + b'\n'

        # warc-prefix comes from the client's Warcprox-Meta header; the
        # proxy handler rejects prefixes containing slash or backslash, so
        # the file name is expected to stay inside `self.dir`
        prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
        crawl_log_path = os.path.join(self.dir, '%s.log' % prefix)

        with open(crawl_log_path, 'ab') as f:
            f.write(line)
|
||||
|
@ -156,16 +156,29 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
limit_key, limit_value = item
|
||||
self._enforce_limit(limit_key, limit_value, soft=True)
|
||||
|
||||
def _security_check(self, warcprox_meta):
    '''
    Rejects the request if `warcprox_meta` specifies a 'warc-prefix' that
    is not a string or that contains a slash or backslash.

    `warcprox_meta` is the parsed Warcprox-Meta request header (or a falsy
    value if there is none). Returns None when there is nothing to object
    to.

    Raises a plain `Exception` whose message starts with "request rejected
    by warcprox:". The accompanying test expects the client to receive
    status 500 with that message as the reason phrase.
    '''
    if not warcprox_meta or 'warc-prefix' not in warcprox_meta:
        return

    warc_prefix = warcprox_meta['warc-prefix']

    # A non-string value such as ["../x"] would slip past the `in` checks
    # below ('/' in ['../x'] is False) yet still render to a name containing
    # slashes once formatted into a filename, so refuse it outright.
    # `type(u'')` also covers `unicode` should this run on python 2.
    if not isinstance(warc_prefix, (str, type(u''))):
        raise Exception(
                "request rejected by warcprox: warc-prefix must be a string")

    if '/' in warc_prefix or '\\' in warc_prefix:
        raise Exception(
                "request rejected by warcprox: slash and backslash are not "
                "permitted in warc-prefix")
|
||||
|
||||
def _connect_to_remote_server(self):
    '''
    Wraps `MitmProxyHandler._connect_to_remote_server`, first running the
    warc-prefix security check and enforcing limits and block rules in the
    Warcprox-Meta request header, if any.

    Raises `warcprox.RequestBlockedByRule` if a rule has been enforced.
    Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which
    initializes `self._remote_server_sock`.
    '''
    if 'Warcprox-Meta' in self.headers:
        warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
        # security check goes first so a request with a bad warc-prefix is
        # rejected before limits and blocks are evaluated
        self._security_check(warcprox_meta)
        self._enforce_limits(warcprox_meta)
        self._enforce_blocks(warcprox_meta)
    return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
|
||||
|
Loading…
x
Reference in New Issue
Block a user