From 4ceebe1fa92166844483eab599f6bf4d91af3bdb Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Sat, 4 Jan 2020 01:41:28 +0000 Subject: [PATCH] Moving more variables from RecordedUrl to RequestedUrl --- warcprox/__init__.py | 2 +- warcprox/crawl_log.py | 6 ++--- warcprox/dedup.py | 2 +- warcprox/stats.py | 4 ++-- warcprox/warcproxy.py | 50 ++++++++++++++++++++++------------------ warcprox/writerthread.py | 2 +- 6 files changed, 35 insertions(+), 31 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 3dd84c8..9cd09a8 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -223,7 +223,7 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): self.name = listener.__class__.__name__ def _process_url(self, recorded_url): - return self.listener.notify(recorded_url, recorded_url.warc_records if hasattr(recorded_url, "warc_records") else None) + return self.listener.notify(recorded_url, recorded_url.warc_records) def start(self): if hasattr(self.listener, 'start'): diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index b30dd30..eea17d5 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -42,13 +42,13 @@ class CrawlLogger(object): # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"} now = datetime.datetime.utcnow() status = self.get_artificial_status(recorded_url) - extra_info = {'contentSize': recorded_url.size,} if hasattr(recorded_url, "size") and recorded_url.size > 0 else {} + extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {} if records: extra_info['warcFilename'] = records[0].warc_filename extra_info['warcFileOffset'] = 
records[0].offset if recorded_url.method != 'GET': extra_info['method'] = recorded_url.method - if hasattr(recorded_url, "response_recorder") and recorded_url.response_recorder: + if recorded_url.response_recorder: content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset payload_digest = warcprox.digest_str( recorded_url.payload_digest, @@ -67,7 +67,7 @@ class CrawlLogger(object): recorded_url.url, '-', # hop path recorded_url.referer or '-', - recorded_url.mimetype if hasattr(recorded_url, "mimetype") and recorded_url.mimetype is not None else '-', + recorded_url.mimetype if recorded_url.mimetype is not None else '-', '-', '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format( recorded_url.timestamp, diff --git a/warcprox/dedup.py b/warcprox/dedup.py index a07091e..223427e 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -65,7 +65,7 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin): self.dedup_db = dedup_db def _process_url(self, recorded_url): - if not hasattr(recorded_url, 'response_recorder'): + if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): return if (recorded_url.response_recorder and recorded_url.payload_digest diff --git a/warcprox/stats.py b/warcprox/stats.py index 2fe0848..3bc560e 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -162,7 +162,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): def _tally_batch(self, batch): batch_buckets = {} for recorded_url in batch: - if not hasattr(recorded_url, 'response_recorder'): + if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): continue for bucket in self.buckets(recorded_url): bucket_stats = batch_buckets.get(bucket) @@ -299,7 +299,7 @@ class RunningStats: (self.first_snap_time - 120 + i * 10, 0, 0)) def notify(self, recorded_url, records): - if not hasattr(recorded_url, 'response_recorder'): + if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): return with self._lock: self.urls += 1 diff --git 
a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2603759..17d0682 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -395,9 +395,12 @@ RE_MIMETYPE = re.compile(r'[;\s]') class RequestedUrl: logger = logging.getLogger("warcprox.warcproxy.RequestedUrl") - def __init__(self, url, request_data, warcprox_meta=None, status=None, - client_ip=None, method=None, timestamp=None, host=None, duration=None, - referer=None, do_not_archive=True): + def __init__(self, url, request_data, response_recorder=None, remote_ip=None, + warcprox_meta=None, content_type=None, custom_type=None, + status=None, size=None, client_ip=None, method=None, + timestamp=None, host=None, duration=None, referer=None, + payload_digest=None, truncated=None, warc_records=None, + do_not_archive=False): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -406,6 +409,7 @@ class RequestedUrl: self.url = url self.request_data = request_data + self.response_recorder = response_recorder if warcprox_meta: if 'captures-bucket' in warcprox_meta: @@ -422,13 +426,25 @@ class RequestedUrl: else: self.warcprox_meta = {} + self.content_type = content_type + + self.mimetype = content_type + if self.mimetype: + # chop off subtype, and ensure there's no whitespace + self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0] + + self.custom_type = custom_type self.status = status + self.size = size self.client_ip = client_ip self.method = method self.timestamp = timestamp self.host = host self.duration = duration self.referer = referer + self.payload_digest = payload_digest + self.truncated = truncated + self.warc_records = warc_records self.do_not_archive = do_not_archive class FailedUrl(RequestedUrl): @@ -438,9 +454,9 @@ class FailedUrl(RequestedUrl): client_ip=None, method=None, timestamp=None, host=None, duration=None, referer=None, do_not_archive=True, exception=None): - super().__init__(url, request_data, warcprox_meta=warcprox_meta, 
status=status, + super().__init__(url, request_data, response_recorder=None, warcprox_meta=warcprox_meta, content_type=None, custom_type=None, status=status, size=None, client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration, - referer=referer, do_not_archive=do_not_archive) + referer=referer, payload_digest=None, truncated=None, warc_records=None, do_not_archive=do_not_archive) self.exception = exception @@ -454,30 +470,18 @@ class RecordedUrl(RequestedUrl): payload_digest=None, truncated=None, warc_records=None, do_not_archive=False): - super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status, - client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration, - referer=referer, do_not_archive=do_not_archive) + super().__init__(url, request_data, response_recorder=response_recorder, + warcprox_meta=warcprox_meta, content_type=content_type, + custom_type=custom_type, status=status, size=size, client_ip=client_ip, + method=method, timestamp=timestamp, host=host, duration=duration, + referer=referer, payload_digest=payload_digest, truncated=truncated, + warc_records=warc_records, do_not_archive=do_not_archive) if type(remote_ip) is not bytes: self.remote_ip = remote_ip.encode('ascii') else: self.remote_ip = remote_ip - self.content_type = content_type - - self.mimetype = content_type - if self.mimetype: - # chop off subtype, and ensure there's no whitespace - self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0] - - self.custom_type = custom_type - self.size = size - self.response_recorder = response_recorder - self.custom_type = custom_type - self.payload_digest = payload_digest - self.truncated = truncated - self.warc_records = warc_records - def is_text(self): """Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types Alternative method: try to decode('ascii') first N bytes to make sure diff --git a/warcprox/writerthread.py 
b/warcprox/writerthread.py index 968a90c..3cd6bc6 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -72,7 +72,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): self.close_prefix_reqs.put(prefix) def _process_url(self, recorded_url): - if not hasattr(recorded_url, 'response_recorder'): + if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): return try: records = []