Mirror of https://github.com/internetarchive/warcprox.git

commit 4ceebe1fa9 (parent e88a88f247)

    Moving more variables from RecordedUrl to RequestedUrl
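In broad strokes, this commit promotes fields that previously existed only on RecordedUrl into the shared RequestedUrl base class, so that FailedUrl instances carry the same attributes (defaulting to None) and downstream processors can drop their hasattr() guards. A simplified, hypothetical sketch of the resulting class shape (the real constructors in warcprox.warcproxy take more arguments and do more work):

    # Simplified sketch only; not the actual warcprox constructors.
    class RequestedUrl:
        def __init__(self, url, request_data, response_recorder=None,
                content_type=None, custom_type=None, size=None,
                payload_digest=None, truncated=None, warc_records=None,
                do_not_archive=False):
            self.url = url
            self.request_data = request_data
            # Attributes formerly set only by RecordedUrl now exist on every
            # RequestedUrl, so a FailedUrl has them too (as None).
            self.response_recorder = response_recorder
            self.content_type = content_type
            self.custom_type = custom_type
            self.size = size
            self.payload_digest = payload_digest
            self.truncated = truncated
            self.warc_records = warc_records
            self.do_not_archive = do_not_archive

    class FailedUrl(RequestedUrl):
        def __init__(self, url, request_data, exception=None, **kwargs):
            super().__init__(url, request_data, **kwargs)  # shared fields stay None
            self.exception = exception

    class RecordedUrl(RequestedUrl):
        def __init__(self, url, request_data, remote_ip=None, **kwargs):
            super().__init__(url, request_data, **kwargs)  # real values flow through
            self.remote_ip = remote_ip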
@@ -223,7 +223,7 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
         self.name = listener.__class__.__name__
 
     def _process_url(self, recorded_url):
-        return self.listener.notify(recorded_url, recorded_url.warc_records if hasattr(recorded_url, "warc_records") else None)
+        return self.listener.notify(recorded_url, recorded_url.warc_records)
 
     def start(self):
         if hasattr(self.listener, 'start'):
@@ -42,13 +42,13 @@ class CrawlLogger(object):
         # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
         now = datetime.datetime.utcnow()
         status = self.get_artificial_status(recorded_url)
-        extra_info = {'contentSize': recorded_url.size,} if hasattr(recorded_url, "size") and recorded_url.size > 0 else {}
+        extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
         if records:
             extra_info['warcFilename'] = records[0].warc_filename
             extra_info['warcFileOffset'] = records[0].offset
         if recorded_url.method != 'GET':
             extra_info['method'] = recorded_url.method
-        if hasattr(recorded_url, "response_recorder") and recorded_url.response_recorder:
+        if recorded_url.response_recorder:
             content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
             payload_digest = warcprox.digest_str(
                     recorded_url.payload_digest,
@@ -67,7 +67,7 @@ class CrawlLogger(object):
             recorded_url.url,
             '-', # hop path
             recorded_url.referer or '-',
-            recorded_url.mimetype if hasattr(recorded_url, "mimetype") and recorded_url.mimetype is not None else '-',
+            recorded_url.mimetype if recorded_url.mimetype is not None else '-',
             '-',
             '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                 recorded_url.timestamp,
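Note on the two CrawlLogger hunks above: the hasattr() probes were only needed because a FailedUrl used to lack size, mimetype and response_recorder entirely; after the move those attributes always exist but may be None. A condensed sketch of how the log extras are assembled under the new checks (build_extra_info is a hypothetical helper, not warcprox API):

    import json

    def build_extra_info(recorded_url, records):
        # `size` is always present now, but is None for a failed fetch.
        extra_info = {'contentSize': recorded_url.size} \
                if recorded_url.size is not None and recorded_url.size > 0 else {}
        if records:
            extra_info['warcFilename'] = records[0].warc_filename
            extra_info['warcFileOffset'] = records[0].offset
        if recorded_url.method != 'GET':
            extra_info['method'] = recorded_url.method
        return json.dumps(extra_info, separators=(',', ':'))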
@@ -65,7 +65,7 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
         self.dedup_db = dedup_db
 
     def _process_url(self, recorded_url):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
             return
         if (recorded_url.response_recorder
                 and recorded_url.payload_digest
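The same guard replacement recurs in the stats, running-stats and WARC-writer hunks below: instead of probing for a response_recorder attribute (which every RequestedUrl now has, possibly None), the processors ask whether the object is a FailedUrl. A small sketch of the idea (should_skip is a hypothetical helper name):

    import warcprox.warcproxy

    def should_skip(url_obj):
        # Before this commit FailedUrl simply lacked `response_recorder`,
        # so `not hasattr(url_obj, 'response_recorder')` identified it.
        # The attribute now always exists (possibly None), so an explicit
        # type check is the reliable way to skip failed fetches.
        return isinstance(url_obj, warcprox.warcproxy.FailedUrl)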
@@ -162,7 +162,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
     def _tally_batch(self, batch):
         batch_buckets = {}
         for recorded_url in batch:
-            if not hasattr(recorded_url, 'response_recorder'):
+            if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
                 continue
             for bucket in self.buckets(recorded_url):
                 bucket_stats = batch_buckets.get(bucket)
@@ -299,7 +299,7 @@ class RunningStats:
                 (self.first_snap_time - 120 + i * 10, 0, 0))
 
     def notify(self, recorded_url, records):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
             return
         with self._lock:
             self.urls += 1
@@ -395,9 +395,12 @@ RE_MIMETYPE = re.compile(r'[;\s]')
 
 class RequestedUrl:
     logger = logging.getLogger("warcprox.warcproxy.RequestedUrl")
-    def __init__(self, url, request_data, warcprox_meta=None, status=None,
-            client_ip=None, method=None, timestamp=None, host=None, duration=None,
-            referer=None, do_not_archive=True):
+    def __init__(self, url, request_data, response_recorder=None, remote_ip=None,
+            warcprox_meta=None, content_type=None, custom_type=None,
+            status=None, size=None, client_ip=None, method=None,
+            timestamp=None, host=None, duration=None, referer=None,
+            payload_digest=None, truncated=None, warc_records=None,
+            do_not_archive=False):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -406,6 +409,7 @@ class RequestedUrl:
         self.url = url
 
         self.request_data = request_data
+        self.response_recorder = response_recorder
 
         if warcprox_meta:
             if 'captures-bucket' in warcprox_meta:
@@ -422,13 +426,25 @@ class RequestedUrl:
         else:
             self.warcprox_meta = {}
 
+        self.content_type = content_type
+
+        self.mimetype = content_type
+        if self.mimetype:
+            # chop off subtype, and ensure there's no whitespace
+            self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
+
+        self.custom_type = custom_type
         self.status = status
+        self.size = size
         self.client_ip = client_ip
         self.method = method
         self.timestamp = timestamp
         self.host = host
         self.duration = duration
         self.referer = referer
+        self.payload_digest = payload_digest
+        self.truncated = truncated
+        self.warc_records = warc_records
         self.do_not_archive = do_not_archive
 
 class FailedUrl(RequestedUrl):
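The mimetype normalization that moves into RequestedUrl above strips parameters and stray whitespace from the Content-Type value via the module-level RE_MIMETYPE pattern, for example:

    import re

    RE_MIMETYPE = re.compile(r'[;\s]')   # as defined in warcprox.warcproxy

    mimetype = RE_MIMETYPE.split('text/html; charset=UTF-8', 2)[0]
    assert mimetype == 'text/html'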
@@ -438,9 +454,9 @@ class FailedUrl(RequestedUrl):
             client_ip=None, method=None, timestamp=None, host=None, duration=None,
             referer=None, do_not_archive=True, exception=None):
 
-        super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status,
+        super().__init__(url, request_data, response_recorder=None, warcprox_meta=warcprox_meta, content_type=None, custom_type=None, status=status, size=None,
             client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration,
-            referer=referer, do_not_archive=do_not_archive)
+            referer=referer, payload_digest=None, truncated=None, warc_records=None, do_not_archive=do_not_archive)
 
         self.exception = exception
 
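Since a FailedUrl never has a recorded response, its super().__init__() call above passes the newly shared keyword arguments explicitly as None, so downstream code can read them without guards. A hypothetical construction example (argument values are invented):

    # Hypothetical usage; real FailedUrl objects are built inside the proxy.
    failed = FailedUrl(
            b'http://example.com/', request_data=b'',
            status=502, method='GET',
            exception=ConnectionError('connection refused'))
    assert failed.warc_records is None
    assert failed.payload_digest is None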
@@ -454,30 +470,18 @@ class RecordedUrl(RequestedUrl):
             payload_digest=None, truncated=None, warc_records=None,
             do_not_archive=False):
 
-        super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status,
-            client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration,
-            referer=referer, do_not_archive=do_not_archive)
+        super().__init__(url, request_data, response_recorder=response_recorder,
+            warcprox_meta=warcprox_meta, content_type=content_type,
+            custom_type=custom_type, status=status, size=size, client_ip=client_ip,
+            method=method, timestamp=timestamp, host=host, duration=duration,
+            referer=referer, payload_digest=payload_digest, truncated=truncated,
+            warc_records=warc_records, do_not_archive=do_not_archive)
 
         if type(remote_ip) is not bytes:
             self.remote_ip = remote_ip.encode('ascii')
         else:
             self.remote_ip = remote_ip
 
-        self.content_type = content_type
-
-        self.mimetype = content_type
-        if self.mimetype:
-            # chop off subtype, and ensure there's no whitespace
-            self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
-
-        self.custom_type = custom_type
-        self.size = size
-        self.response_recorder = response_recorder
-        self.custom_type = custom_type
-        self.payload_digest = payload_digest
-        self.truncated = truncated
-        self.warc_records = warc_records
-
     def is_text(self):
         """Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types
         Alternative method: try to decode('ascii') first N bytes to make sure
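After the refactor, RecordedUrl.__init__() above keeps only what is specific to a completed fetch, chiefly normalizing remote_ip to bytes; everything else is forwarded to RequestedUrl. A minimal illustration of that remaining normalization (the address value is made up):

    remote_ip = '93.184.216.34'
    if type(remote_ip) is not bytes:
        remote_ip = remote_ip.encode('ascii')
    assert remote_ip == b'93.184.216.34'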
@@ -72,7 +72,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
         self.close_prefix_reqs.put(prefix)
 
     def _process_url(self, recorded_url):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
             return
         try:
             records = []