Apply blackout only when dedup URL equals request URL

This commit is contained in:
Vangelis Banos 2018-07-24 07:16:21 +00:00
parent 2c2c1d008a
commit 6b1d60c390
2 changed files with 3 additions and 8 deletions

View File

@ -166,8 +166,7 @@ def test_special_dont_write_prefix():
recorder.read()
old = datetime.utcnow() - timedelta(0, 3600)
ru = RecordedUrl(
url='http://example.com/yes',
# content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
url='http://example.com/dup',
content_type='text/plain',
status=200, client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3',
@ -181,7 +180,7 @@ def test_special_dont_write_prefix():
recorder.read()
recent = datetime.utcnow() - timedelta(0, 5)
ru = RecordedUrl(
url='http://example.com/yes', content_type='text/plain',
url='http://example.com/dup', content_type='text/plain',
status=200, client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3',
timestamp=datetime.utcnow(),

View File

@ -114,10 +114,6 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
if recorded_url.warcprox_meta
and 'warc-prefix' in recorded_url.warcprox_meta
else self.options.prefix)
res = (prefix != '-' and not recorded_url.do_not_archive
and self._filter_accepts(recorded_url)
and not self._in_blackout(recorded_url))
# special warc name prefix '-' means "don't archive"
return (prefix != '-' and not recorded_url.do_not_archive
and self._filter_accepts(recorded_url)
@ -132,7 +128,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
if self.blackout_period and hasattr(recorded_url, "dedup_info") and \
recorded_url.dedup_info:
dedup_date = recorded_url.dedup_info.get('date')
if dedup_date:
if dedup_date and recorded_url.dedup_info.get('url') == recorded_url.url:
try:
dt = datetime.strptime(dedup_date.decode('utf-8'),
'%Y-%m-%dT%H:%M:%SZ')