Moving more variables from RecordedUrl to RequestedUrl

This commit is contained in:
Adam Miller 2020-01-04 01:41:28 +00:00
parent e88a88f247
commit 4ceebe1fa9
6 changed files with 35 additions and 31 deletions

View File

@ -223,7 +223,7 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
self.name = listener.__class__.__name__
def _process_url(self, recorded_url):
return self.listener.notify(recorded_url, recorded_url.warc_records if hasattr(recorded_url, "warc_records") else None)
return self.listener.notify(recorded_url, recorded_url.warc_records)
def start(self):
if hasattr(self.listener, 'start'):

View File

@ -42,13 +42,13 @@ class CrawlLogger(object):
# 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
now = datetime.datetime.utcnow()
status = self.get_artificial_status(recorded_url)
extra_info = {'contentSize': recorded_url.size,} if hasattr(recorded_url, "size") and recorded_url.size > 0 else {}
extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
if records:
extra_info['warcFilename'] = records[0].warc_filename
extra_info['warcFileOffset'] = records[0].offset
if recorded_url.method != 'GET':
extra_info['method'] = recorded_url.method
if hasattr(recorded_url, "response_recorder") and recorded_url.response_recorder:
if recorded_url.response_recorder:
content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
payload_digest = warcprox.digest_str(
recorded_url.payload_digest,
@ -67,7 +67,7 @@ class CrawlLogger(object):
recorded_url.url,
'-', # hop path
recorded_url.referer or '-',
recorded_url.mimetype if hasattr(recorded_url, "mimetype") and recorded_url.mimetype is not None else '-',
recorded_url.mimetype if recorded_url.mimetype is not None else '-',
'-',
'{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
recorded_url.timestamp,

View File

@ -65,7 +65,7 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
self.dedup_db = dedup_db
def _process_url(self, recorded_url):
if not hasattr(recorded_url, 'response_recorder'):
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
return
if (recorded_url.response_recorder
and recorded_url.payload_digest

View File

@ -162,7 +162,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
def _tally_batch(self, batch):
batch_buckets = {}
for recorded_url in batch:
if not hasattr(recorded_url, 'response_recorder'):
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
continue
for bucket in self.buckets(recorded_url):
bucket_stats = batch_buckets.get(bucket)
@ -299,7 +299,7 @@ class RunningStats:
(self.first_snap_time - 120 + i * 10, 0, 0))
def notify(self, recorded_url, records):
if not hasattr(recorded_url, 'response_recorder'):
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
return
with self._lock:
self.urls += 1

View File

@ -395,9 +395,12 @@ RE_MIMETYPE = re.compile(r'[;\s]')
class RequestedUrl:
logger = logging.getLogger("warcprox.warcproxy.RequestedUrl")
def __init__(self, url, request_data, warcprox_meta=None, status=None,
client_ip=None, method=None, timestamp=None, host=None, duration=None,
referer=None, do_not_archive=True):
def __init__(self, url, request_data, response_recorder=None, remote_ip=None,
warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None,
timestamp=None, host=None, duration=None, referer=None,
payload_digest=None, truncated=None, warc_records=None,
do_not_archive=False):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
@ -406,6 +409,7 @@ class RequestedUrl:
self.url = url
self.request_data = request_data
self.response_recorder = response_recorder
if warcprox_meta:
if 'captures-bucket' in warcprox_meta:
@ -422,13 +426,25 @@ class RequestedUrl:
else:
self.warcprox_meta = {}
self.content_type = content_type
self.mimetype = content_type
if self.mimetype:
# chop off subtype, and ensure there's no whitespace
self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
self.custom_type = custom_type
self.status = status
self.size = size
self.client_ip = client_ip
self.method = method
self.timestamp = timestamp
self.host = host
self.duration = duration
self.referer = referer
self.payload_digest = payload_digest
self.truncated = truncated
self.warc_records = warc_records
self.do_not_archive = do_not_archive
class FailedUrl(RequestedUrl):
@ -438,9 +454,9 @@ class FailedUrl(RequestedUrl):
client_ip=None, method=None, timestamp=None, host=None, duration=None,
referer=None, do_not_archive=True, exception=None):
super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status,
super().__init__(url, request_data, response_recorder=None, warcprox_meta=warcprox_meta, content_type=None, custom_type=None, status=status, size=None,
client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration,
referer=referer, do_not_archive=do_not_archive)
referer=referer, payload_digest=None, truncated=None, warc_records=None, do_not_archive=do_not_archive)
self.exception = exception
@ -454,30 +470,18 @@ class RecordedUrl(RequestedUrl):
payload_digest=None, truncated=None, warc_records=None,
do_not_archive=False):
super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status,
client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration,
referer=referer, do_not_archive=do_not_archive)
super().__init__(url, request_data, response_recorder=response_recorder,
warcprox_meta=warcprox_meta, content_type=content_type,
custom_type=custom_type, status=status, size=size, client_ip=client_ip,
method=method, timestamp=timestamp, host=host, duration=duration,
referer=referer, payload_digest=payload_digest, truncated=truncated,
warc_records=warc_records, do_not_archive=do_not_archive)
if type(remote_ip) is not bytes:
self.remote_ip = remote_ip.encode('ascii')
else:
self.remote_ip = remote_ip
self.content_type = content_type
self.mimetype = content_type
if self.mimetype:
# chop off subtype, and ensure there's no whitespace
self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
self.custom_type = custom_type
self.size = size
self.response_recorder = response_recorder
self.custom_type = custom_type
self.payload_digest = payload_digest
self.truncated = truncated
self.warc_records = warc_records
def is_text(self):
"""Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types
Alternative method: try to decode('ascii') first N bytes to make sure

View File

@ -72,7 +72,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
self.close_prefix_reqs.put(prefix)
def _process_url(self, recorded_url):
if not hasattr(recorded_url, 'response_recorder'):
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
return
try:
records = []