Merge branch 'master' into trough-dedup

* master:
  not gonna bother figuring out why pypy regex is not matching https://travis-ci.org/internetarchive/warcprox/jobs/299864258#L615
  fix failing test just committed, which involves running "listeners" for all urls, including those not archived; make adjustments accordingly
  make test_crawl_log expect HEAD request to be logged
  fix crawl log handling of WARCPROX_WRITE_RECORD request
  modify test_crawl_log to expect crawl log to honor --base32 setting and add tests of WARCPROX_WRITE_RECORD request and HEAD request (not written to warc)
  bump dev version number
  add --crawl-log-dir option to fix failing test
  create crawl log dir at startup if it doesn't exist
  make test pass with py27
  fix crawl log test to avoid any dedup collisions
  fix crawl log test
  heritrix-style crawl log support
  disallow slash and backslash in warc-prefix
  can't see any reason to split the main() like this (anymore?)
  add missing dependency warcio to tests_require
commit b2adb778ee
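
The crawl log lines this merge introduces mimic heritrix's format: thirteen space-separated fields, the last one a compact JSON blob. The following parsing sketch is not part of the warcprox source; the example line is the one quoted in warcprox/crawl_log.py below, and the field meanings follow the assertions in test_crawl_log.

    import json

    line = (b'2017-08-03T21:45:24.496Z   200       2189 '
            b'https://autismcouncil.wisconsin.gov/robots.txt P '
            b'https://autismcouncil.wisconsin.gov/ text/plain #001 '
            b'20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX '
            b'https://autismcouncil.wisconsin.gov/ duplicate:digest '
            b'{"warcFileOffset":942,"contentSize":2495,"warcFilename":'
            b'"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}')

    fields = line.split()
    assert len(fields) == 13
    status, url = fields[1], fields[3]            # HTTP status, fetched url
    mimetype, annotation = fields[6], fields[11]  # annotation is 'duplicate:digest' or '-'
    extra_info = json.loads(fields[12].decode('utf-8'))
    assert extra_info['contentSize'] == 2495
    print(url.decode(), status.decode(), annotation.decode(), extra_info['warcFilename'])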

.travis.yml
@@ -13,6 +13,7 @@ python:
 
 matrix:
   allow_failures:
     - python: pypy
     - python: pypy3
     - python: nightly
+    - python: 3.7-dev

setup.py
@@ -52,7 +52,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.2.1b2.dev107',
+        version='2.2.1b2.dev112',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

tests/test_warcprox.py
@@ -254,7 +254,8 @@ def warcprox_(request):
             '--method-filter=POST',
             '--port=0',
             '--playback-port=0',
-            '--onion-tor-socks-proxy=localhost:9050']
+            '--onion-tor-socks-proxy=localhost:9050',
+            '--crawl-log-dir=crawl-logs']
     if request.config.getoption('--rethinkdb-dedup-url'):
         argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
         # test these here only

@@ -1339,6 +1340,200 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback_proxies):
         elif record.rec_type == 'request':
             assert record.http_headers.get_header('via') == '1.1 warcprox'
 
+def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
+    url = 'http://localhost:%s/b/b' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 500
+    assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
+
+    url = 'http://localhost:%s/b/c' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 500
+    assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
+
+def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
+    try:
+        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+    except:
+        pass
+
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    url = 'http://localhost:%s/b/bb' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}),
+        "Referer": "http://example.com/referer",
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    # tests will fail in year 3000 :)
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
+    assert crawl_log[24:31] == b'   200 '
+    assert crawl_log[31:42] == b'        54 '
+    fields = crawl_log.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/aa')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:69d51a46e44a04e8110da0c91897cece979fa70f'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    crawl_log_1 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
+    assert crawl_log_1[24:31] == b'   200 '
+    assert crawl_log_1[31:42] == b'        54 '
+    fields = crawl_log_1.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/bb')
+    assert fields[4] == b'-'
+    assert fields[5] == b'http://example.com/referer'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:9aae6acb797c75ca8eb5dded9be2127cc61b3fbb'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    # should be deduplicated
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({
+        "warc-prefix": "test_crawl_log_2",
+        "metadata": {"seed": "http://example.com/seed"}})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_2 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read()
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
+    assert crawl_log_2[24:31] == b'   200 '
+    assert crawl_log_2[31:42] == b'        54 '
+    fields = crawl_log_2.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/aa')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:69d51a46e44a04e8110da0c91897cece979fa70f'
+    assert fields[10] == b'http://example.com/seed'
+    assert fields[11] == b'duplicate:digest'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    # a request that is not saved to a warc (because of --method-filter)
+    # currently not logged at all (XXX maybe it should be)
+    url = 'http://localhost:%s/b/cc' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_3'})}
+    response = requests.head(url, proxies=archiving_proxies, headers=headers)
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_3 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log'), 'rb').read()
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3)
+    assert crawl_log_3[24:31] == b'   200 '
+    assert crawl_log_3[31:42] == b'         0 '
+    fields = crawl_log_3.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/cc')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert extra_info == {'contentSize': 91}
+
+    # WARCPROX_WRITE_RECORD
+    url = 'http://fakeurl/'
+    payload = b'I am the WARCPROX_WRITE_RECORD payload'
+    headers = {
+        'Content-Type': 'text/plain',
+        'WARC-Type': 'metadata',
+        'Host': 'N/A',
+        'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_4'}),
+    }
+    response = requests.request(
+            method='WARCPROX_WRITE_RECORD', url=url, data=payload,
+            headers=headers, proxies=archiving_proxies)
+    assert response.status_code == 204
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_4 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log'), 'rb').read()
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4)
+    assert crawl_log_4[24:31] == b'   204 '
+    assert crawl_log_4[31:42] == b'        38 '
+    fields = crawl_log_4.split()
+    assert len(fields) == 13
+    assert fields[3] == b'http://fakeurl/'
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:bb56497c17d2684f5eca4af9df908c78ba74ca1c'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 38
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     url = 'http://localhost:%s/b/g' % http_daemon.server_port
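
One way to run just the tests touched here (a sketch: it assumes pytest and the test dependencies, including the newly added warcio, are installed, and that the fixtures can start their proxies on this machine):

    import pytest

    pytest.main([
            'tests/test_warcprox.py', '-v',
            '-k', 'crawl_log or slash_in_warc_prefix'])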

warcprox/__init__.py
@@ -114,3 +114,4 @@ import warcprox.warc as warc
 import warcprox.writerthread as writerthread
 import warcprox.stats as stats
 import warcprox.bigtable as bigtable
+import warcprox.crawl_log as crawl_log

warcprox/bigtable.py
@@ -201,9 +201,10 @@ class RethinkCaptures:
         return entry
 
     def notify(self, recorded_url, records):
-        entry = self._assemble_entry(recorded_url, records)
-        with self._batch_lock:
-            self._batch.append(entry)
+        if records:
+            entry = self._assemble_entry(recorded_url, records)
+            with self._batch_lock:
+                self._batch.append(entry)
 
     def close(self):
         self.stop()
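
The "if records:" guards added here (and in playback.py, dedup.py, and stats.py below) follow from the change described in the commit message: listeners now run for every proxied URL, and records is empty when nothing was written to a WARC, for example a request dropped by --method-filter. A minimal illustrative listener, not part of warcprox, honoring that convention:

    class CountingListener(object):
        def __init__(self):
            self.archived = 0
            self.skipped = 0

        def notify(self, recorded_url, records):
            if records:           # something was written to a WARC
                self.archived += 1
            else:                 # nothing was archived for this url
                self.skipped += 1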

warcprox/crawl_log.py (new file)
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+'''
+warcprox/crawl_log.py - heritrix-style crawl logger
+
+Copyright (C) 2017 Internet Archive
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+USA.
+'''
+import logging
+import datetime
+import json
+import os
+import warcprox
+
+class CrawlLogger(object):
+    def __init__(self, dir_, options=warcprox.Options()):
+        self.dir = dir_
+        self.options = options
+        if not os.path.exists(self.dir):
+            logging.info('creating directory %r', self.dir)
+            os.mkdir(self.dir)
+
+    def notify(self, recorded_url, records):
+        # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
+        now = datetime.datetime.utcnow()
+        extra_info = {'contentSize': recorded_url.size,}
+        if records:
+            extra_info['warcFilename'] = records[0].warc_filename
+            extra_info['warcFileOffset'] = records[0].offset
+        if recorded_url.response_recorder:
+            content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
+            payload_digest = warcprox.digest_str(
+                    recorded_url.response_recorder.payload_digest,
+                    self.options.base32)
+        else:
+            # WARCPROX_WRITE_RECORD request
+            content_length = len(recorded_url.request_data)
+            payload_digest = records[0].get_header(b'WARC-Payload-Digest')
+        fields = [
+            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
+            '% 5s' % recorded_url.status,
+            '% 10s' % content_length,
+            recorded_url.url,
+            '-', # hop path
+            recorded_url.referer or '-',
+            recorded_url.mimetype or '-',
+            '-',
+            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
+                recorded_url.timestamp,
+                recorded_url.timestamp.microsecond//1000,
+                recorded_url.duration.microseconds//1000),
+            payload_digest,
+            recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
+            'duplicate:digest' if records and records[0].type == b'revisit' else '-',
+            json.dumps(extra_info, separators=(',',':')),
+        ]
+        for i in range(len(fields)):
+            # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
+            try:
+                fields[i] = fields[i].encode('utf-8')
+            except:
+                pass
+        line = b' '.join(fields) + b'\n'
+
+        if 'warc-prefix' in recorded_url.warcprox_meta:
+            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
+        else:
+            filename = 'crawl.log'
+
+        crawl_log_path = os.path.join(self.dir, filename)
+        with open(crawl_log_path, 'ab') as f:
+            f.write(line)
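
A rough usage sketch for the new CrawlLogger, assuming this branch of warcprox is importable. The SimpleNamespace objects are hypothetical stand-ins for the RecordedUrl and WARC-record objects warcprox builds internally; only the attributes notify() reads are provided.

    import datetime
    import hashlib
    import tempfile
    import types

    import warcprox
    import warcprox.crawl_log

    recorder = types.SimpleNamespace(          # mimics the response recorder
            len=200, payload_offset=55,
            payload_digest=hashlib.sha1(b'response payload'))
    record = types.SimpleNamespace(            # mimics the written WARC record
            warc_filename='warcprox-example-00000.warc.gz', offset=512,
            type=b'response')
    recorded_url = types.SimpleNamespace(
            url=b'http://example.com/', status=200, size=200,
            response_recorder=recorder, referer=None, mimetype='text/plain',
            timestamp=datetime.datetime.utcnow(),
            duration=datetime.timedelta(milliseconds=42),
            warcprox_meta={'warc-prefix': 'example'})

    options = warcprox.Options()
    options.base32 = True
    logger = warcprox.crawl_log.CrawlLogger(tempfile.mkdtemp(), options=options)
    logger.notify(recorded_url, [record])  # appends one line to <dir>/example.log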

warcprox/dedup.py
@@ -96,7 +96,7 @@ class DedupDb(object):
         return result
 
     def notify(self, recorded_url, records):
-        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
+        if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
             digest_key = warcprox.digest_str(
                     recorded_url.response_recorder.payload_digest,
@@ -174,7 +174,7 @@ class RethinkDedupDb:
         return result
 
     def notify(self, recorded_url, records):
-        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
+        if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
             digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
                 self.options.base32)

warcprox/main.py
@@ -152,6 +152,11 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
             default=None, help=(
                 'host:port of tor socks proxy, used only to connect to '
                 '.onion sites'))
+    arg_parser.add_argument(
+            '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
+                'if specified, write crawl log files in the specified '
+                'directory; one crawl log is written per warc filename '
+                'prefix; crawl log format mimics heritrix'))
     arg_parser.add_argument(
             '--plugin', metavar='PLUGIN_CLASS', dest='plugins',
             action='append', help=(
@@ -248,6 +253,10 @@ def init_controller(args):
     playback_index_db = None
     playback_proxy = None
 
+    if args.crawl_log_dir:
+        listeners.append(warcprox.crawl_log.CrawlLogger(
+            args.crawl_log_dir, options=options))
+
     for qualname in args.plugins or []:
         try:
             (module_name, class_name) = qualname.rsplit('.', 1)
@@ -285,22 +294,6 @@ def init_controller(args):
 
     return controller
 
-def real_main(args):
-    # see https://github.com/pyca/cryptography/issues/2911
-    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
-
-    controller = init_controller(args)
-
-    signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
-    signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
-    try:
-        signal.signal(signal.SIGQUIT, dump_state)
-    except AttributeError:
-        # SIGQUIT does not exist on some platforms (windows)
-        pass
-
-    controller.run_until_shutdown()
-
 def parse_args(argv=sys.argv):
     '''
     Parses command line arguments with argparse.
@@ -329,7 +322,20 @@ def main(argv=sys.argv):
             '%(asctime)s %(process)d %(levelname)s %(threadName)s '
             '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
 
-    real_main(args)
+    # see https://github.com/pyca/cryptography/issues/2911
+    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
+
+    controller = init_controller(args)
+
+    signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
+    signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
+    try:
+        signal.signal(signal.SIGQUIT, dump_state)
+    except AttributeError:
+        # SIGQUIT does not exist on some platforms (windows)
+        pass
+
+    controller.run_until_shutdown()
 
 def ensure_rethinkdb_tables():
     '''
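
A quick sanity check, illustrative only, that the new flag lands on args.crawl_log_dir. It relies on the internal _build_arg_parser helper shown above, which is not a supported API, and assumes no other arguments are required.

    import warcprox.main

    parser = warcprox.main._build_arg_parser()
    args = parser.parse_args(['--crawl-log-dir=./crawl-logs'])
    assert args.crawl_log_dir == './crawl-logs'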

warcprox/playback.py
@@ -259,7 +259,8 @@ class PlaybackIndexDb(object):
         pass
 
     def notify(self, recorded_url, records):
-        self.save(records[0].warc_filename, records, records[0].offset)
+        if records:
+            self.save(records[0].warc_filename, records, records[0].offset)
 
     def save(self, warcfile, recordset, offset):
         response_record = recordset[0]

warcprox/stats.py
@@ -171,12 +171,13 @@ class StatsDb:
                 bucket_stats["total"]["urls"] += 1
                 bucket_stats["total"]["wire_bytes"] += recorded_url.size
 
-                if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT:
-                    bucket_stats["revisit"]["urls"] += 1
-                    bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
-                else:
-                    bucket_stats["new"]["urls"] += 1
-                    bucket_stats["new"]["wire_bytes"] += recorded_url.size
+                if records:
+                    if records[0].type == b'revisit':
+                        bucket_stats["revisit"]["urls"] += 1
+                        bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
+                    else:
+                        bucket_stats["new"]["urls"] += 1
+                        bucket_stats["new"]["wire_bytes"] += recorded_url.size
 
                 json_value = json.dumps(bucket_stats, separators=(',',':'))
                 conn.execute(
@@ -306,8 +307,7 @@ class RethinkStatsDb(StatsDb):
 
     def tally(self, recorded_url, records):
         buckets = self.buckets(recorded_url)
-        is_revisit = records[0].get_header(
-                warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT
+        is_revisit = records[0].type == b'revisit'
        with self._batch_lock:
             for bucket in buckets:
                 bucket_stats = self._batch.setdefault(

warcprox/warcproxy.py
@@ -153,16 +153,29 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 limit_key, limit_value = item
                 self._enforce_limit(limit_key, limit_value, soft=True)
 
+    def _security_check(self, warcprox_meta):
+        '''
+        Sends a 400 if `warcprox_meta` specifies a 'warc-prefix' and the
+        'warc-prefix' contains a slash or backslash.
+        '''
+        if warcprox_meta and 'warc-prefix' in warcprox_meta and (
+                '/' in warcprox_meta['warc-prefix']
+                or '\\' in warcprox_meta['warc-prefix']):
+            raise Exception(
+                    "request rejected by warcprox: slash and backslash are not "
+                    "permitted in warc-prefix")
+
     def _connect_to_remote_server(self):
         '''
-        Wraps MitmProxyHandler._connect_to_remote_server, first enforcing
+        Wraps `MitmProxyHandler._connect_to_remote_server`, first enforcing
         limits and block rules in the Warcprox-Meta request header, if any.
-        Raises warcprox.RequestBlockedByRule if a rule has been enforced.
-        Otherwise calls MitmProxyHandler._connect_to_remote_server, which
-        initializes self._remote_server_sock.
+        Raises `warcprox.RequestBlockedByRule` if a rule has been enforced.
+        Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which
+        initializes `self._remote_server_sock`.
         '''
         if 'Warcprox-Meta' in self.headers:
             warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
+            self._security_check(warcprox_meta)
             self._enforce_limits(warcprox_meta)
             self._enforce_blocks(warcprox_meta)
         return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
@@ -204,7 +217,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 client_ip=self.client_address[0],
                 content_type=content_type, method=self.command,
                 timestamp=timestamp, host=self.hostname,
-                duration=datetime.datetime.utcnow()-timestamp)
+                duration=datetime.datetime.utcnow()-timestamp,
+                referer=self.headers.get('referer'))
         self.server.recorded_url_q.put(recorded_url)
 
         return recorded_url
@@ -279,16 +293,19 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         if raw_warcprox_meta:
             warcprox_meta = json.loads(raw_warcprox_meta)
 
-        rec_custom = RecordedUrl(url=self.url,
-                request_data=request_data,
-                response_recorder=None,
-                remote_ip=b'',
-                warcprox_meta=warcprox_meta,
-                content_type=self.headers['Content-Type'],
-                custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
-                status=204, size=len(request_data),
-                client_ip=self.client_address[0],
-                method=self.command, timestamp=timestamp)
+        rec_custom = RecordedUrl(
+                url=self.url,
+                request_data=request_data,
+                response_recorder=None,
+                remote_ip=b'',
+                warcprox_meta=warcprox_meta,
+                content_type=self.headers['Content-Type'],
+                custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
+                status=204, size=len(request_data),
+                client_ip=self.client_address[0],
+                method=self.command,
+                timestamp=timestamp,
+                duration=datetime.datetime.utcnow()-timestamp)
 
         self.server.recorded_url_q.put(rec_custom)
         self.send_response(204, 'OK')
@@ -311,7 +328,7 @@ class RecordedUrl:
     def __init__(self, url, request_data, response_recorder, remote_ip,
             warcprox_meta=None, content_type=None, custom_type=None,
             status=None, size=None, client_ip=None, method=None,
-            timestamp=None, host=None, duration=None):
+            timestamp=None, host=None, duration=None, referer=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -348,6 +365,7 @@ class RecordedUrl:
         self.timestamp = timestamp
         self.host = host
         self.duration = duration
+        self.referer = referer
 
 # inherit from object so that multiple inheritance from this class works
 # properly in python 2

warcprox/writerthread.py
@@ -82,13 +82,15 @@ class WarcWriterThread(threading.Thread):
                     self.logger.info("%s urls left to write", qsize)
 
                 recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
+                records = []
                 self.idle = None
                 if self._filter_accepts(recorded_url):
                     if self.dedup_db:
                         warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
                                 recorded_url, base32=self.options.base32)
                     records = self.writer_pool.write_records(recorded_url)
-                    self._final_tasks(recorded_url, records)
+
+                self._final_tasks(recorded_url, records)
 
                 # try to release resources in a timely fashion
                 if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
@@ -134,11 +136,15 @@ class WarcWriterThread(threading.Thread):
            payload_digest = "-"
 
        # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
-        self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
-            recorded_url.client_ip, recorded_url.status, recorded_url.method,
-            recorded_url.url.decode("utf-8"), recorded_url.mimetype,
-            recorded_url.size, payload_digest, records[0].type.decode("utf-8"),
-            records[0].warc_filename, records[0].offset))
+        type_ = records[0].type.decode("utf-8") if records else '-'
+        filename = records[0].warc_filename if records else '-'
+        offset = records[0].offset if records else '-'
+        self.logger.info(
+                "%s %s %s %s %s size=%s %s %s %s offset=%s",
+                recorded_url.client_ip, recorded_url.status,
+                recorded_url.method, recorded_url.url.decode("utf-8"),
+                recorded_url.mimetype, recorded_url.size, payload_digest,
+                type_, filename, offset)
 
     def _final_tasks(self, recorded_url, records):
         if self.listeners: