From 4ceebe1fa92166844483eab599f6bf4d91af3bdb Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Sat, 4 Jan 2020 01:41:28 +0000 Subject: [PATCH] Moving more variables from RecordedUrl to RequestedUrl --- warcprox/__init__.py | 2 +- warcprox/crawl_log.py | 6 ++--- warcprox/dedup.py | 2 +- warcprox/stats.py | 4 ++-- warcprox/warcproxy.py | 50 ++++++++++++++++++++++------------------ warcprox/writerthread.py | 2 +- 6 files changed, 35 insertions(+), 31 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 3dd84c8..9cd09a8 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -223,7 +223,7 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): self.name = listener.__class__.__name__ def _process_url(self, recorded_url): - return self.listener.notify(recorded_url, recorded_url.warc_records if hasattr(recorded_url, "warc_records") else None) + return self.listener.notify(recorded_url, recorded_url.warc_records) def start(self): if hasattr(self.listener, 'start'): diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index b30dd30..eea17d5 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -42,13 +42,13 @@ class CrawlLogger(object): # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"} now = datetime.datetime.utcnow() status = self.get_artificial_status(recorded_url) - extra_info = {'contentSize': recorded_url.size,} if hasattr(recorded_url, "size") and recorded_url.size > 0 else {} + extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {} if records: extra_info['warcFilename'] = records[0].warc_filename extra_info['warcFileOffset'] = 
records[0].offset if recorded_url.method != 'GET': extra_info['method'] = recorded_url.method - if hasattr(recorded_url, "response_recorder") and recorded_url.response_recorder: + if recorded_url.response_recorder: content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset payload_digest = warcprox.digest_str( recorded_url.payload_digest, @@ -67,7 +67,7 @@ class CrawlLogger(object): recorded_url.url, '-', # hop path recorded_url.referer or '-', - recorded_url.mimetype if hasattr(recorded_url, "mimetype") and recorded_url.mimetype is not None else '-', + recorded_url.mimetype if recorded_url.mimetype is not None else '-', '-', '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format( recorded_url.timestamp, diff --git a/warcprox/dedup.py b/warcprox/dedup.py index a07091e..223427e 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -65,7 +65,7 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin): self.dedup_db = dedup_db def _process_url(self, recorded_url): - if not hasattr(recorded_url, 'response_recorder'): + if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): return if (recorded_url.response_recorder and recorded_url.payload_digest diff --git a/warcprox/stats.py b/warcprox/stats.py index 2fe0848..3bc560e 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -162,7 +162,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): def _tally_batch(self, batch): batch_buckets = {} for recorded_url in batch: - if not hasattr(recorded_url, 'response_recorder'): + if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): continue for bucket in self.buckets(recorded_url): bucket_stats = batch_buckets.get(bucket) @@ -299,7 +299,7 @@ class RunningStats: (self.first_snap_time - 120 + i * 10, 0, 0)) def notify(self, recorded_url, records): - if not hasattr(recorded_url, 'response_recorder'): + if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): return with self._lock: self.urls += 1 diff --git 
a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2603759..17d0682 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -395,9 +395,12 @@ RE_MIMETYPE = re.compile(r'[;\s]') class RequestedUrl: logger = logging.getLogger("warcprox.warcproxy.RequestedUrl") - def __init__(self, url, request_data, warcprox_meta=None, status=None, - client_ip=None, method=None, timestamp=None, host=None, duration=None, - referer=None, do_not_archive=True): + def __init__(self, url, request_data, response_recorder=None, remote_ip=None, + warcprox_meta=None, content_type=None, custom_type=None, + status=None, size=None, client_ip=None, method=None, + timestamp=None, host=None, duration=None, referer=None, + payload_digest=None, truncated=None, warc_records=None, + do_not_archive=False): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -406,6 +409,7 @@ class RequestedUrl: self.url = url self.request_data = request_data + self.response_recorder = response_recorder if warcprox_meta: if 'captures-bucket' in warcprox_meta: @@ -422,13 +426,25 @@ class RequestedUrl: else: self.warcprox_meta = {} + self.content_type = content_type + + self.mimetype = content_type + if self.mimetype: + # chop off subtype, and ensure there's no whitespace + self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0] + + self.custom_type = custom_type self.status = status + self.size = size self.client_ip = client_ip self.method = method self.timestamp = timestamp self.host = host self.duration = duration self.referer = referer + self.payload_digest = payload_digest + self.truncated = truncated + self.warc_records = warc_records self.do_not_archive = do_not_archive class FailedUrl(RequestedUrl): @@ -438,9 +454,9 @@ class FailedUrl(RequestedUrl): client_ip=None, method=None, timestamp=None, host=None, duration=None, referer=None, do_not_archive=True, exception=None): - super().__init__(url, request_data, warcprox_meta=warcprox_meta, 
status=status, + super().__init__(url, request_data, response_recorder=None, warcprox_meta=warcprox_meta, content_type=None, custom_type=None, status=status, size=None, client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration, - referer=referer, do_not_archive=do_not_archive) + referer=referer, payload_digest=None, truncated=None, warc_records=None, do_not_archive=do_not_archive) self.exception = exception @@ -454,30 +470,18 @@ class RecordedUrl(RequestedUrl): payload_digest=None, truncated=None, warc_records=None, do_not_archive=False): - super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status, - client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration, - referer=referer, do_not_archive=do_not_archive) + super().__init__(url, request_data, response_recorder=response_recorder, + warcprox_meta=warcprox_meta, content_type=content_type, + custom_type=custom_type, status=status, size=size, client_ip=client_ip, + method=method, timestamp=timestamp, host=host, duration=duration, + referer=referer, payload_digest=payload_digest, truncated=truncated, + warc_records=warc_records, do_not_archive=do_not_archive) if type(remote_ip) is not bytes: self.remote_ip = remote_ip.encode('ascii') else: self.remote_ip = remote_ip - self.content_type = content_type - - self.mimetype = content_type - if self.mimetype: - # chop off subtype, and ensure there's no whitespace - self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0] - - self.custom_type = custom_type - self.size = size - self.response_recorder = response_recorder - self.custom_type = custom_type - self.payload_digest = payload_digest - self.truncated = truncated - self.warc_records = warc_records - def is_text(self): """Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types Alternative method: try to decode('ascii') first N bytes to make sure diff --git a/warcprox/writerthread.py 
b/warcprox/writerthread.py index 968a90c..3cd6bc6 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -72,7 +72,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): self.close_prefix_reqs.put(prefix) def _process_url(self, recorded_url): - if not hasattr(recorded_url, 'response_recorder'): + if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): return try: records = []