Moving more variables from RecordedUrl to RequestedUrl

Adam Miller 2020-01-04 01:41:28 +00:00
parent e88a88f247
commit 4ceebe1fa9
6 changed files with 35 additions and 31 deletions
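
The pattern behind these changes: attributes that previously lived only on RecordedUrl move up into the RequestedUrl base class with None defaults, so every fetch outcome (including FailedUrl) exposes the same interface, and the postfetch processors can swap duck-typed hasattr() probes for explicit isinstance() checks. A minimal standalone sketch of the before and after (simplified stand-ins, not warcprox's actual class bodies):

    # Standalone sketch of the refactoring pattern; the class bodies are
    # simplified stand-ins for warcprox's RequestedUrl/FailedUrl/RecordedUrl.
    class RequestedUrl:
        def __init__(self, url, response_recorder=None, warc_records=None, size=None):
            # the base class now owns the shared attribute set, defaulting
            # to None, so every subclass instance has the same interface
            self.url = url
            self.response_recorder = response_recorder
            self.warc_records = warc_records
            self.size = size

    class FailedUrl(RequestedUrl):
        def __init__(self, url, exception=None):
            super().__init__(url)  # moved attributes stay None for failures
            self.exception = exception

    class RecordedUrl(RequestedUrl):
        pass

    # before: processors probed for attributes with hasattr()
    def process_url_old(recorded_url):
        if not hasattr(recorded_url, 'response_recorder'):
            return  # assumed to be a failed fetch
        ...

    # after: the attribute always exists; failures are filtered by type
    def process_url_new(recorded_url):
        if isinstance(recorded_url, FailedUrl):
            return
        ...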

@@ -223,7 +223,7 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
         self.name = listener.__class__.__name__

     def _process_url(self, recorded_url):
-        return self.listener.notify(recorded_url, recorded_url.warc_records if hasattr(recorded_url, "warc_records") else None)
+        return self.listener.notify(recorded_url, recorded_url.warc_records)

     def start(self):
         if hasattr(self.listener, 'start'):
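
With warc_records now defined on every RequestedUrl (None until records are written), the notify() call above no longer needs the hasattr() fallback. A hypothetical listener compatible with this call, for illustration only (not from the repo):

    # ListenerPostfetchProcessor calls notify(recorded_url, recorded_url.warc_records)
    class RecordCountListener:
        def __init__(self):
            self.record_count = 0

        def notify(self, recorded_url, records):
            # records is None for fetches that produced no WARC records
            # (e.g. a FailedUrl), otherwise the list of written records
            if records:
                self.record_count += len(records)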

@@ -42,13 +42,13 @@ class CrawlLogger(object):
         # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
         now = datetime.datetime.utcnow()
         status = self.get_artificial_status(recorded_url)
-        extra_info = {'contentSize': recorded_url.size,} if hasattr(recorded_url, "size") and recorded_url.size > 0 else {}
+        extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
         if records:
             extra_info['warcFilename'] = records[0].warc_filename
             extra_info['warcFileOffset'] = records[0].offset
         if recorded_url.method != 'GET':
             extra_info['method'] = recorded_url.method
-        if hasattr(recorded_url, "response_recorder") and recorded_url.response_recorder:
+        if recorded_url.response_recorder:
             content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
             payload_digest = warcprox.digest_str(
                     recorded_url.payload_digest,

@@ -67,7 +67,7 @@ class CrawlLogger(object):
                 recorded_url.url,
                 '-', # hop path
                 recorded_url.referer or '-',
-                recorded_url.mimetype if hasattr(recorded_url, "mimetype") and recorded_url.mimetype is not None else '-',
+                recorded_url.mimetype if recorded_url.mimetype is not None else '-',
                 '-',
                 '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                     recorded_url.timestamp,
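
The `recorded_url.size is not None` guard replaces hasattr() because size now always exists but defaults to None on a FailedUrl. A sketch (not the module's code) of the extra_info construction this hunk produces, assuming only the fields visible above:

    import json

    def build_extra_info(recorded_url, records):
        # size may be None (failed fetch) or 0; only positive sizes are logged
        extra_info = ({'contentSize': recorded_url.size}
                      if recorded_url.size is not None and recorded_url.size > 0
                      else {})
        if records:
            extra_info['warcFilename'] = records[0].warc_filename
            extra_info['warcFileOffset'] = records[0].offset
        if recorded_url.method != 'GET':
            extra_info['method'] = recorded_url.method
        # serialized into the trailing JSON field of the crawl log line,
        # e.g. {"warcFileOffset":942,"contentSize":2495,...}
        return json.dumps(extra_info, separators=(',', ':'))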

@@ -65,7 +65,7 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
         self.dedup_db = dedup_db

     def _process_url(self, recorded_url):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
             return
         if (recorded_url.response_recorder
                 and recorded_url.payload_digest
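
The hasattr() test stops being a usable filter once response_recorder exists on every instance; the isinstance() check names the case being excluded instead. The same substitution appears in the stats and WARC-writer hunks below. A self-contained illustration of why:

    class RequestedUrl:
        def __init__(self, url, response_recorder=None):
            self.url = url
            self.response_recorder = response_recorder  # None for failed fetches

    class FailedUrl(RequestedUrl):
        pass

    failed = FailedUrl('http://example.com/')

    # old guard: never fires now, because the attribute always exists
    print(not hasattr(failed, 'response_recorder'))  # False -> FailedUrl slips through
    # new guard: filters on what the object is, not what it happens to have
    print(isinstance(failed, FailedUrl))             # True -> correctly skipped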

@@ -162,7 +162,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
     def _tally_batch(self, batch):
         batch_buckets = {}
         for recorded_url in batch:
-            if not hasattr(recorded_url, 'response_recorder'):
+            if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
                 continue
             for bucket in self.buckets(recorded_url):
                 bucket_stats = batch_buckets.get(bucket)

@@ -299,7 +299,7 @@ class RunningStats:
                 (self.first_snap_time - 120 + i * 10, 0, 0))

     def notify(self, recorded_url, records):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
             return
         with self._lock:
             self.urls += 1

@@ -395,9 +395,12 @@ RE_MIMETYPE = re.compile(r'[;\s]')

 class RequestedUrl:
     logger = logging.getLogger("warcprox.warcproxy.RequestedUrl")

-    def __init__(self, url, request_data, warcprox_meta=None, status=None,
-            client_ip=None, method=None, timestamp=None, host=None, duration=None,
-            referer=None, do_not_archive=True):
+    def __init__(self, url, request_data, response_recorder=None, remote_ip=None,
+            warcprox_meta=None, content_type=None, custom_type=None,
+            status=None, size=None, client_ip=None, method=None,
+            timestamp=None, host=None, duration=None, referer=None,
+            payload_digest=None, truncated=None, warc_records=None,
+            do_not_archive=False):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:

@@ -406,6 +409,7 @@ class RequestedUrl:
         self.url = url
         self.request_data = request_data
+        self.response_recorder = response_recorder

         if warcprox_meta:
             if 'captures-bucket' in warcprox_meta:

@@ -422,13 +426,25 @@ class RequestedUrl:
         else:
             self.warcprox_meta = {}

+        self.content_type = content_type
+
+        self.mimetype = content_type
+        if self.mimetype:
+            # chop off subtype, and ensure there's no whitespace
+            self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
+
+        self.custom_type = custom_type
         self.status = status
+        self.size = size
         self.client_ip = client_ip
         self.method = method
         self.timestamp = timestamp
         self.host = host
         self.duration = duration
         self.referer = referer
+        self.payload_digest = payload_digest
+        self.truncated = truncated
+        self.warc_records = warc_records
         self.do_not_archive = do_not_archive

 class FailedUrl(RequestedUrl):

@@ -438,9 +454,9 @@ class FailedUrl(RequestedUrl):
             client_ip=None, method=None, timestamp=None, host=None, duration=None,
             referer=None, do_not_archive=True, exception=None):
-        super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status,
+        super().__init__(url, request_data, response_recorder=None, warcprox_meta=warcprox_meta, content_type=None, custom_type=None, status=status, size=None,
             client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration,
-            referer=referer, do_not_archive=do_not_archive)
+            referer=referer, payload_digest=None, truncated=None, warc_records=None, do_not_archive=do_not_archive)
         self.exception = exception

@@ -454,30 +470,18 @@ class RecordedUrl(RequestedUrl):
             payload_digest=None, truncated=None, warc_records=None,
             do_not_archive=False):
-        super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status,
-            client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration,
-            referer=referer, do_not_archive=do_not_archive)
+        super().__init__(url, request_data, response_recorder=response_recorder,
+                warcprox_meta=warcprox_meta, content_type=content_type,
+                custom_type=custom_type, status=status, size=size, client_ip=client_ip,
+                method=method, timestamp=timestamp, host=host, duration=duration,
+                referer=referer, payload_digest=payload_digest, truncated=truncated,
+                warc_records=warc_records, do_not_archive=do_not_archive)

         if type(remote_ip) is not bytes:
             self.remote_ip = remote_ip.encode('ascii')
         else:
             self.remote_ip = remote_ip
-        self.content_type = content_type
-        self.mimetype = content_type
-        if self.mimetype:
-            # chop off subtype, and ensure there's no whitespace
-            self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
-        self.custom_type = custom_type
-        self.size = size
-        self.response_recorder = response_recorder
-        self.custom_type = custom_type
-        self.payload_digest = payload_digest
-        self.truncated = truncated
-        self.warc_records = warc_records

     def is_text(self):
         """Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types
         Alternative method: try to decode('ascii') first N bytes to make sure
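
One consequence of the move: the mimetype normalization now runs in RequestedUrl.__init__ for every subclass rather than only in RecordedUrl (which, incidentally, had been setting custom_type twice). A quick standalone check of what that split does, using the r'[;\s]' pattern shown in the first hunk:

    import re

    RE_MIMETYPE = re.compile(r'[;\s]')  # pattern from the hunk above

    def normalize(content_type):
        # keep only type/subtype, dropping parameters and whitespace,
        # as RequestedUrl.__init__ does when it sets self.mimetype
        if content_type:
            return RE_MIMETYPE.split(content_type, 2)[0]
        return content_type

    print(normalize('text/html; charset=UTF-8'))  # text/html
    print(normalize('application/pdf'))           # application/pdf
    print(normalize(None))                        # None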

@@ -72,7 +72,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
         self.close_prefix_reqs.put(prefix)

     def _process_url(self, recorded_url):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
             return
         try:
             records = []