Moving more variables from RecordedUrl to RequestedUrl

2025-01-18 13:22:09 +01:00 · 2020-01-04 01:41:28 +00:00 · 2020-01-04 01:41:28 +00:00 · 4ceebe1fa9
commit 4ceebe1fa9
parent e88a88f247
6 changed files with 35 additions and 31 deletions
--- a/warcprox/__init__.py
+++ b/warcprox/__init__.py
@@ -223,7 +223,7 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
        self.name = listener.__class__.__name__

    def _process_url(self, recorded_url):
-        return self.listener.notify(recorded_url, recorded_url.warc_records if hasattr(recorded_url, "warc_records") else None)
+        return self.listener.notify(recorded_url, recorded_url.warc_records)

    def start(self):
        if hasattr(self.listener, 'start'):
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -42,13 +42,13 @@ class CrawlLogger(object):
        # 2017-08-03T21:45:24.496Z   200       2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
        now = datetime.datetime.utcnow()
        status = self.get_artificial_status(recorded_url)
-        extra_info = {'contentSize': recorded_url.size,} if hasattr(recorded_url, "size") and recorded_url.size > 0 else {}
+        extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
        if records:
            extra_info['warcFilename'] = records[0].warc_filename
            extra_info['warcFileOffset'] = records[0].offset
        if recorded_url.method != 'GET':
            extra_info['method'] = recorded_url.method
-        if hasattr(recorded_url, "response_recorder") and recorded_url.response_recorder:
+        if recorded_url.response_recorder:
            content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
            payload_digest = warcprox.digest_str(
                recorded_url.payload_digest,
@@ -67,7 +67,7 @@ class CrawlLogger(object):
            recorded_url.url,
            '-', # hop path
            recorded_url.referer or '-',
-            recorded_url.mimetype if hasattr(recorded_url, "mimetype") and recorded_url.mimetype is not None else '-',
+            recorded_url.mimetype if recorded_url.mimetype is not None else '-',
            '-',
            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                recorded_url.timestamp,
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -65,7 +65,7 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
        self.dedup_db = dedup_db

    def _process_url(self, recorded_url):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
            return
        if (recorded_url.response_recorder
                and recorded_url.payload_digest
--- a/warcprox/stats.py
+++ b/warcprox/stats.py
@@ -162,7 +162,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
    def _tally_batch(self, batch):
        batch_buckets = {}
        for recorded_url in batch:
-            if not hasattr(recorded_url, 'response_recorder'):
+            if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
                continue
            for bucket in self.buckets(recorded_url):
                bucket_stats = batch_buckets.get(bucket)
@@ -299,7 +299,7 @@ class RunningStats:
                    (self.first_snap_time - 120 + i * 10, 0, 0))

    def notify(self, recorded_url, records):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
            return
        with self._lock:
            self.urls += 1
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -395,9 +395,12 @@ RE_MIMETYPE = re.compile(r'[;\s]')

 class RequestedUrl:
    logger = logging.getLogger("warcprox.warcproxy.RequestedUrl")
-    def __init__(self, url, request_data, warcprox_meta=None, status=None,
-            client_ip=None, method=None, timestamp=None, host=None, duration=None,
-            referer=None, do_not_archive=True):
+    def __init__(self, url, request_data, response_recorder=None, remote_ip=None,
+            warcprox_meta=None, content_type=None, custom_type=None,
+            status=None, size=None, client_ip=None, method=None,
+            timestamp=None, host=None, duration=None, referer=None,
+            payload_digest=None, truncated=None, warc_records=None,
+            do_not_archive=False):
        # XXX should test what happens with non-ascii url (when does
        # url-encoding happen?)
        if type(url) is not bytes:
@@ -406,6 +409,7 @@ class RequestedUrl:
            self.url = url

        self.request_data = request_data
+        self.response_recorder = response_recorder

        if warcprox_meta:
            if 'captures-bucket' in warcprox_meta:
@@ -422,13 +426,25 @@ class RequestedUrl:
        else:
            self.warcprox_meta = {}

+        self.content_type = content_type
+
+        self.mimetype = content_type
+        if self.mimetype:
+            # chop off subtype, and ensure there's no whitespace
+            self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
+
+        self.custom_type = custom_type
        self.status = status
+        self.size = size
        self.client_ip = client_ip
        self.method = method
        self.timestamp = timestamp
        self.host = host
        self.duration = duration
        self.referer = referer
+        self.payload_digest = payload_digest
+        self.truncated = truncated
+        self.warc_records = warc_records
        self.do_not_archive = do_not_archive

 class FailedUrl(RequestedUrl):
@@ -438,9 +454,9 @@ class FailedUrl(RequestedUrl):
            client_ip=None, method=None, timestamp=None, host=None, duration=None,
            referer=None, do_not_archive=True, exception=None):

-        super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status,
+        super().__init__(url, request_data, response_recorder=None, warcprox_meta=warcprox_meta, content_type=None, custom_type=None, status=status, size=None,
        client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration,
-        referer=referer, do_not_archive=do_not_archive)
+        referer=referer, payload_digest=None, truncated=None, warc_records=None, do_not_archive=do_not_archive)

        self.exception = exception

@@ -454,30 +470,18 @@ class RecordedUrl(RequestedUrl):
            payload_digest=None, truncated=None, warc_records=None,
            do_not_archive=False):

-        super().__init__(url, request_data, warcprox_meta=warcprox_meta, status=status,
-        client_ip=client_ip, method=method, timestamp=timestamp, host=host, duration=duration,
-        referer=referer, do_not_archive=do_not_archive)
+        super().__init__(url, request_data, response_recorder=response_recorder,
+        warcprox_meta=warcprox_meta, content_type=content_type,
+        custom_type=custom_type, status=status, size=size, client_ip=client_ip,
+        method=method, timestamp=timestamp, host=host, duration=duration,
+        referer=referer, payload_digest=payload_digest, truncated=truncated,
+        warc_records=warc_records, do_not_archive=do_not_archive)

        if type(remote_ip) is not bytes:
            self.remote_ip = remote_ip.encode('ascii')
        else:
            self.remote_ip = remote_ip

-        self.content_type = content_type
-
-        self.mimetype = content_type
-        if self.mimetype:
-            # chop off subtype, and ensure there's no whitespace
-            self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
-
-        self.custom_type = custom_type
-        self.size = size
-        self.response_recorder = response_recorder
-        self.custom_type = custom_type
-        self.payload_digest = payload_digest
-        self.truncated = truncated
-        self.warc_records = warc_records
-
    def is_text(self):
        """Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types
        Alternative method: try to decode('ascii') first N bytes to make sure
--- a/warcprox/writerthread.py
+++ b/warcprox/writerthread.py
@@ -72,7 +72,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
        self.close_prefix_reqs.put(prefix)

    def _process_url(self, recorded_url):
-        if not hasattr(recorded_url, 'response_recorder'):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
            return
        try:
            records = []