diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 387d05c..c3a9bd8 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -233,8 +233,6 @@ class RethinkCapturesDedup: "url": entry["url"].encode("utf-8"), "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"), } - if "warc_id" in entry: - dedup_info["id"] = entry["warc_id"].encode("utf-8") return dedup_info else: return None diff --git a/warcprox/dedup.py b/warcprox/dedup.py index fd1ada4..79be80f 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -55,13 +55,12 @@ class DedupDb(object): conn.close() def save(self, digest_key, response_record, bucket=""): - record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') key = digest_key.decode('utf-8') + "|" + bucket - py_value = {'id':record_id, 'url':url, 'date':date} + py_value = {'url':url, 'date':date} json_value = json.dumps(py_value, separators=(',',':')) conn = sqlite3.connect(self.file) @@ -81,7 +80,6 @@ class DedupDb(object): conn.close() if result_tuple: result = json.loads(result_tuple[0]) - result['id'] = result['id'].encode('latin1') result['url'] = result['url'].encode('latin1') result['date'] = result['date'].encode('latin1') self.logger.debug('dedup db lookup of key=%s returning %s', key, result) @@ -144,10 +142,9 @@ class RethinkDedupDb: def save(self, digest_key, response_record, bucket=""): k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key k = "{}|{}".format(k, bucket) - record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') - record = {'key':k,'url':url,'date':date,'id':record_id} + record = {'key': k, 'url': url, 'date': date} result = self.rr.table(self.table).insert( record, conflict="replace").run() if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: diff --git a/warcprox/playback.py b/warcprox/playback.py index 663e10a..af4639f 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -120,9 +120,12 @@ class PlaybackProxyHandler(MitmProxyHandler): def _send_headers_and_refd_payload( - self, headers, refers_to, refers_to_target_uri, refers_to_date): + self, headers, refers_to_target_uri, refers_to_date, payload_digest): + """Parameters: + + """ location = self.server.playback_index_db.lookup_exact( - refers_to_target_uri, refers_to_date, record_id=refers_to) + refers_to_target_uri, refers_to_date, payload_digest) self.logger.debug('loading http payload from {}'.format(location)) fh = self._open_warc_at_offset(location['f'], location['o']) @@ -177,20 +180,19 @@ class PlaybackProxyHandler(MitmProxyHandler): if warc_profile != warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST: raise Exception('unknown revisit record profile {}'.format(warc_profile)) - refers_to = record.get_header( - warctools.WarcRecord.REFERS_TO).decode('latin1') refers_to_target_uri = record.get_header( warctools.WarcRecord.REFERS_TO_TARGET_URI).decode( 'latin1') refers_to_date = record.get_header( warctools.WarcRecord.REFERS_TO_DATE).decode('latin1') - + payload_digest = record.get_header( + warctools.WarcRecord.PAYLOAD_DIGEST).decode('latin1') self.logger.debug( 'revisit record references %s:%s capture of %s', - refers_to_date, refers_to, refers_to_target_uri) + refers_to_date, payload_digest, refers_to_target_uri) return self._send_headers_and_refd_payload( - record.content[1], refers_to, refers_to_target_uri, - refers_to_date) + record.content[1], refers_to_target_uri, refers_to_date, + payload_digest) else: # send it back raw, whatever it is @@ -264,12 +266,12 @@ class PlaybackIndexDb(object): # XXX canonicalize url? url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') date_str = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') - record_id_str = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') + payload_digest_str = response_record.get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode('latin1') # there could be two visits of same url in the same second, and WARC-Date is # prescribed as YYYY-MM-DDThh:mm:ssZ, so we have to handle it :-\ - # url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'i':record_id},record2,...],date2:[{...}],...} + # url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'d':payload_digest},record2,...],date2:[{...}],...} with self._lock: conn = sqlite3.connect(self.file) @@ -283,10 +285,10 @@ class PlaybackIndexDb(object): if date_str in py_value: py_value[date_str].append( - {'f':warcfile, 'o':offset, 'i':record_id_str}) + {'f': warcfile, 'o': offset, 'd': payload_digest_str}) else: py_value[date_str] = [ - {'f':warcfile, 'o':offset, 'i':record_id_str}] + {'f': warcfile, 'o': offset, 'd': payload_digest_str}] json_value = json.dumps(py_value, separators=(',',':')) @@ -314,11 +316,11 @@ class PlaybackIndexDb(object): latest_date = max(py_value) result = py_value[latest_date][0] - result['i'] = result['i'].encode('ascii') + result['d'] = result['d'].encode('ascii') return latest_date, result # in python3 params are bytes - def lookup_exact(self, url, warc_date, record_id): + def lookup_exact(self, url, warc_date, payload_digest): conn = sqlite3.connect(self.file) cursor = conn.execute( 'select value from playback where url = ?', (url,)) @@ -334,14 +336,13 @@ class PlaybackIndexDb(object): if warc_date in py_value: for record in py_value[warc_date]: - if record['i'] == record_id: + if record['d'] == payload_digest: self.logger.debug( "found exact match for (%r,%r,%r)", - warc_date, record_id, url) - record['i'] = record['i'] + warc_date, payload_digest, url) + record['d'] = record['d'] return record else: self.logger.info( - "match not found for (%r,%r,%r)", warc_date, record_id, url) + "match not found for (%r,%r,%r)", warc_date, payload_digest, url) return None - diff --git a/warcprox/warc.py b/warcprox/warc.py index 51b1c35..53e049f 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -50,7 +50,6 @@ class WarcRecordBuilder: url=recorded_url.url, warc_date=warc_date, data=response_header_block, warc_type=warctools.WarcRecord.REVISIT, - refers_to=recorded_url.dedup_info['id'], refers_to_target_uri=recorded_url.dedup_info['url'], refers_to_date=recorded_url.dedup_info['date'], payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32), @@ -87,8 +86,8 @@ class WarcRecordBuilder: def build_warc_record(self, url, warc_date=None, recorder=None, data=None, concurrent_to=None, warc_type=None, content_type=None, remote_ip=None, - profile=None, refers_to=None, refers_to_target_uri=None, - refers_to_date=None, payload_digest=None): + profile=None, refers_to_target_uri=None, refers_to_date=None, + payload_digest=None): if warc_date is None: warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow()) @@ -105,8 +104,6 @@ class WarcRecordBuilder: headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) if profile is not None: headers.append((warctools.WarcRecord.PROFILE, profile)) - if refers_to is not None: - headers.append((warctools.WarcRecord.REFERS_TO, refers_to)) if refers_to_target_uri is not None: headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri)) if refers_to_date is not None: