mirror of https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00

heritrix-style crawl log support

parent 7aed867c90
commit ecb07fc9cd
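For orientation: a warcprox crawl log line mimics Heritrix's crawl.log format, thirteen space-separated fields ending in a compact JSON blob. An illustrative line (values invented; field shapes taken from the tests and CrawlLogger code below):

    2017-03-27T21:46:38.123Z   200         44 http://localhost:8000/b/d - - text/plain - 20170327214638123+337 sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM - - {"contentSize":135,"warcFilename":"warcprox-test.warc.gz","warcFileOffset":1425}

Reading left to right, per Heritrix's crawl.log conventions: log timestamp, HTTP status, payload size, URL, hop path, referer, mimetype, worker thread, fetch timestamp plus duration, payload digest, seed, annotations, extra info as JSON. The dashes mark fields warcprox does not track, such as hop path.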
@@ -374,13 +374,17 @@ def warcprox_(request, captures_db, dedup_db, stats_db, service_registry):
 
     options.method_filter = ['GET','POST']
 
+    options.crawl_log_dir = tempfile.mkdtemp(
+            prefix='warcprox-test-', suffix='-crawl-log')
+    crawl_logger = warcprox.crawl_log.CrawlLogger(options.crawl_log_dir)
+
     writer_pool = warcprox.writer.WarcWriterPool(options)
     warc_writer_threads = [
             warcprox.writerthread.WarcWriterThread(
                 recorded_url_q=recorded_url_q, writer_pool=writer_pool,
                 dedup_db=dedup_db, listeners=[
-                    captures_db or dedup_db, playback_index_db, stats_db],
-                options=options)
+                    captures_db or dedup_db, playback_index_db, stats_db,
+                    crawl_logger], options=options)
             for i in range(int(proxy.max_threads ** 0.5))]
 
     warcprox_ = warcprox.controller.WarcproxController(
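The fixture now passes a CrawlLogger instance alongside the other listeners (the dedup db, stats db, and so on). Anything in that list just needs the notify hook that warcprox calls after a URL's WARC records are written. A minimal sketch of a custom listener in the same shape (hypothetical class, not part of this commit):

    class CountingListener(object):
        '''Hypothetical listener: implements the same notify() hook as CrawlLogger.'''
        def __init__(self):
            self.count = 0

        def notify(self, recorded_url, records):
            # called once per archived URL, after its WARC records are written
            self.count += 1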
@@ -396,7 +400,8 @@ def warcprox_(request, captures_db, dedup_db, stats_db, service_registry):
     logging.info('stopping warcprox')
     warcprox_.stop.set()
     warcprox_thread.join()
-    for f in (ca_file, ca_dir, options.directory, playback_index_db_file):
+    for f in (ca_file, ca_dir, options.directory, playback_index_db_file,
+            options.crawl_log_dir):
         if os.path.isdir(f):
             logging.info('deleting directory {}'.format(f))
             shutil.rmtree(f)
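Teardown now sweeps options.crawl_log_dir along with the other temp paths. Because the fixture created it with tempfile.mkdtemp, it always exists and is a directory, so it takes the shutil.rmtree branch. A standalone illustration of that lifecycle (not warcprox code):

    import os, shutil, tempfile

    d = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-crawl-log')
    assert os.path.isdir(d)
    shutil.rmtree(d)              # what the fixture teardown does for directories
    assert not os.path.exists(d)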
@@ -1442,6 +1447,101 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
     assert response.status_code == 500
     assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
 
+def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
+    try:
+        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+    except:
+        pass
+
+    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    url = 'http://localhost:%s/b/e' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}),
+        "Referer": "http://example.com/referer",
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    # should be deduplicated
+    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({
+        "warc-prefix": "test_crawl_log_2",
+        "metadata": {"seed": "http://example.com/seed"}})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log = open(os.path.join(
+            warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    crawl_log_1 = open(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    crawl_log_2 = open(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read()
+
+    # tests will fail in year 3000 :)
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
+    assert crawl_log[24:31] == b'   200 '
+    assert crawl_log[31:42] == b'        44 '
+    fields = crawl_log.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/d')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
+    assert extra_info['contentSize'] == 135
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
+    assert crawl_log_1[24:31] == b'   200 '
+    assert crawl_log_1[31:42] == b'        44 '
+    fields = crawl_log_1.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/e')
+    assert fields[4] == b'-'
+    assert fields[5] == b'http://example.com/referer'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:DJURQDWPRKWTNMHDA6YS2KN2RLTWQ4JJ'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
+    assert extra_info['contentSize'] == 135
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
+    assert crawl_log_2[24:31] == b'   200 '
+    assert crawl_log_2[31:42] == b'        44 '
+    fields = crawl_log_2.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/d')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
+    assert fields[10] == b'http://example.com/seed'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
+    assert extra_info['contentSize'] == 135
+
 if __name__ == '__main__':
     pytest.main()
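The three assertion blocks above pin down the same 13-field layout each time; only the URL, referer (field index 5), and seed (index 10) vary between requests. As a reading aid, a hedged sketch naming each position (the helper is hypothetical; names follow Heritrix crawl.log conventions):

    # Hypothetical reading aid, not part of the commit. Splitting on
    # whitespace is safe because the trailing JSON is written with
    # separators=(',',':') and therefore contains no spaces.
    CRAWL_LOG_FIELDS = [
        'timestamp',    # e.g. 2017-03-27T21:46:38.123Z
        'status',       # e.g. b'200'
        'size',         # payload bytes, e.g. b'44'
        'url',
        'hop_path',     # always b'-' (warcprox does not track hops)
        'referer',
        'mimetype',
        'thread',       # always b'-'
        'fetch_time',   # 17 digits, then '+' and the duration in ms
        'digest',       # e.g. b'sha1:...'
        'seed',         # from warcprox-meta metadata, else b'-'
        'annotations',  # b'duplicate:digest' for revisits, else b'-'
        'extra_info',   # JSON: contentSize, warcFilename, warcFileOffset
    ]

    def parse_crawl_log_line(line):
        return dict(zip(CRAWL_LOG_FIELDS, line.split()))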
@@ -114,3 +114,4 @@ import warcprox.warc as warc
 import warcprox.writerthread as writerthread
 import warcprox.stats as stats
 import warcprox.bigtable as bigtable
+import warcprox.crawl_log as crawl_log
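Importing the submodule in warcprox/__init__.py is what lets the warcprox.crawl_log.CrawlLogger references in the test fixture and in main.py resolve without their own import statements. The pattern in miniature (hypothetical package and module names):

    # mypkg/__init__.py
    import mypkg.crawl_log as crawl_log    # binds the attribute mypkg.crawl_log

    # caller.py
    import mypkg
    logger = mypkg.crawl_log.CrawlLogger('/tmp/logs')   # no direct submodule import needed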
@@ -22,6 +22,8 @@ USA.
 import logging
+import datetime
 import json
+import os
 import warcprox
 
 class CrawlLogger(object):
     def __init__(self, dir_):
@@ -36,33 +38,38 @@ class CrawlLogger(object):
             'warcFileOffset': records[0].offset,
         }
         fields = [
-            '{:%Y-%m-%dT%H:%M:%S}.{:03d}'.format(now, now.microsecond//1000),
+            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
             '% 5s' % recorded_url.status,
             '% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset),
             recorded_url.url,
             '-', # hop path
+            recorded_url.referer or '-',
-            recorded_url.mimetype,
+            recorded_url.mimetype or '-',
             '-',
             '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
-                recorded_url.timestamp, recorded_url.microsecond//1000,
+                recorded_url.timestamp,
+                recorded_url.timestamp.microsecond//1000,
                 recorded_url.duration.microseconds//1000),
             warcprox.digest_str(
                 recorded_url.response_recorder.payload_digest, True),
+            recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
-            'duplicate:digest' if records[0].type == b'revisit' else '0',
+            'duplicate:digest' if records[0].type == b'revisit' else '-',
             json.dumps(extra_info, separators=(',',':')),
         ]
         for i in range(len(fields)):
-            # `fields` is a mix of `bytes` and `unicode`, make them all `bytes
+            # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
             try:
                 fields[i] = fields[i].encode('utf-8')
             except:
                 pass
-        line = b' '.join(fields)
+        line = b' '.join(fields) + b'\n'
+
+        if 'warc-prefix' in recorded_url.warcprox_meta:
+            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
-            os.path.join(
-                    self.dir, )
+        else:
+            filename = 'crawl.log'
+
+        crawl_log_path = os.path.join(self.dir, filename)
         with open(crawl_log_path, 'ab') as f:
             f.write(line)
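Two of the fixes above are worth spelling out: the first field gains a 'Z' suffix to mark UTC, and the broken recorded_url.microsecond becomes recorded_url.timestamp.microsecond (RecordedUrl has no microsecond attribute; its timestamp datetime does). A standalone illustration of both format strings (not warcprox code):

    import datetime

    now = datetime.datetime.utcnow()
    # field 1: UTC time with millisecond precision, 24 characters wide
    print('{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000))
    # e.g. 2017-03-27T21:46:38.123Z

    # field 9: 17-digit fetch timestamp, then '+' and the duration in ms,
    # matching the test's regex br'^\d{17}[+]\d{3}'
    duration = datetime.timedelta(milliseconds=337)
    print('{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
            now, now.microsecond//1000, duration.microseconds//1000))
    # e.g. 20170327214638123+337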
@@ -128,6 +128,11 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
             default=None, help=(
                 'host:port of tor socks proxy, used only to connect to '
                 '.onion sites'))
+    arg_parser.add_argument(
+            '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
+                'if specified, write crawl log files in the specified '
+                'directory; one crawl log is written per warc filename '
+                'prefix; crawl log format mimics heritrix'))
     arg_parser.add_argument(
             '--plugin', metavar='PLUGIN_CLASS', dest='plugins',
             action='append', help=(
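A hedged usage sketch for the new flag (port and paths are illustrative; --port and --dir are warcprox's existing options):

    warcprox --port 8000 --dir ./warcs --crawl-log-dir ./crawl-logs

With that, requests proxied through port 8000 are archived to ./warcs and logged to ./crawl-logs/crawl.log, or to ./crawl-logs/<warc-prefix>.log when the client sends a warc-prefix in its Warcprox-Meta header.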
@@ -228,6 +233,9 @@ def init_controller(args):
     playback_index_db = None
     playback_proxy = None
 
+    if args.crawl_log_dir:
+        listeners.append(warcprox.crawl_log.CrawlLogger(args.crawl_log_dir))
+
     for qualname in args.plugins or []:
         try:
             (module_name, class_name) = qualname.rsplit('.', 1)
@@ -207,7 +207,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 client_ip=self.client_address[0],
                 content_type=prox_rec_res.getheader("Content-Type"),
                 method=self.command, timestamp=timestamp, host=self.hostname,
-                duration=datetime.datetime.utcnow()-timestamp)
+                duration=datetime.datetime.utcnow()-timestamp,
+                referer=self.headers.get('referer'))
         self.server.recorded_url_q.put(recorded_url)
 
         return recorded_url
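The handler captures the client's Referer header at request time. self.headers on a BaseHTTPRequestHandler-style handler is an email.message-style mapping, so the lowercase 'referer' lookup matches a 'Referer:' header regardless of case. A standalone illustration (not warcprox code):

    from email.message import Message

    headers = Message()
    headers['Referer'] = 'http://example.com/referer'
    # header lookup is case-insensitive, so the lowercase key works
    assert headers.get('referer') == 'http://example.com/referer'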
@@ -314,7 +315,7 @@ class RecordedUrl:
     def __init__(self, url, request_data, response_recorder, remote_ip,
             warcprox_meta=None, content_type=None, custom_type=None,
             status=None, size=None, client_ip=None, method=None,
-            timestamp=None, host=None, duration=None):
+            timestamp=None, host=None, duration=None, referer=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -351,6 +352,7 @@ class RecordedUrl:
         self.timestamp = timestamp
         self.host = host
         self.duration = duration
+        self.referer = referer
 
 # inherit from object so that multiple inheritance from this class works
 # properly in python 2