mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Apply blackout only when dedup URL equals request URL
This commit is contained in:
parent
2c2c1d008a
commit
6b1d60c390
@ -166,8 +166,7 @@ def test_special_dont_write_prefix():
|
|||||||
recorder.read()
|
recorder.read()
|
||||||
old = datetime.utcnow() - timedelta(0, 3600)
|
old = datetime.utcnow() - timedelta(0, 3600)
|
||||||
ru = RecordedUrl(
|
ru = RecordedUrl(
|
||||||
url='http://example.com/yes',
|
url='http://example.com/dup',
|
||||||
# content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
|
|
||||||
content_type='text/plain',
|
content_type='text/plain',
|
||||||
status=200, client_ip='127.0.0.2', request_data=b'abc',
|
status=200, client_ip='127.0.0.2', request_data=b'abc',
|
||||||
response_recorder=recorder, remote_ip='127.0.0.3',
|
response_recorder=recorder, remote_ip='127.0.0.3',
|
||||||
@ -181,7 +180,7 @@ def test_special_dont_write_prefix():
|
|||||||
recorder.read()
|
recorder.read()
|
||||||
recent = datetime.utcnow() - timedelta(0, 5)
|
recent = datetime.utcnow() - timedelta(0, 5)
|
||||||
ru = RecordedUrl(
|
ru = RecordedUrl(
|
||||||
url='http://example.com/yes', content_type='text/plain',
|
url='http://example.com/dup', content_type='text/plain',
|
||||||
status=200, client_ip='127.0.0.2', request_data=b'abc',
|
status=200, client_ip='127.0.0.2', request_data=b'abc',
|
||||||
response_recorder=recorder, remote_ip='127.0.0.3',
|
response_recorder=recorder, remote_ip='127.0.0.3',
|
||||||
timestamp=datetime.utcnow(),
|
timestamp=datetime.utcnow(),
|
||||||
|
@ -114,10 +114,6 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
if recorded_url.warcprox_meta
|
if recorded_url.warcprox_meta
|
||||||
and 'warc-prefix' in recorded_url.warcprox_meta
|
and 'warc-prefix' in recorded_url.warcprox_meta
|
||||||
else self.options.prefix)
|
else self.options.prefix)
|
||||||
res = (prefix != '-' and not recorded_url.do_not_archive
|
|
||||||
and self._filter_accepts(recorded_url)
|
|
||||||
and not self._in_blackout(recorded_url))
|
|
||||||
|
|
||||||
# special warc name prefix '-' means "don't archive"
|
# special warc name prefix '-' means "don't archive"
|
||||||
return (prefix != '-' and not recorded_url.do_not_archive
|
return (prefix != '-' and not recorded_url.do_not_archive
|
||||||
and self._filter_accepts(recorded_url)
|
and self._filter_accepts(recorded_url)
|
||||||
@ -132,7 +128,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
if self.blackout_period and hasattr(recorded_url, "dedup_info") and \
|
if self.blackout_period and hasattr(recorded_url, "dedup_info") and \
|
||||||
recorded_url.dedup_info:
|
recorded_url.dedup_info:
|
||||||
dedup_date = recorded_url.dedup_info.get('date')
|
dedup_date = recorded_url.dedup_info.get('date')
|
||||||
if dedup_date:
|
if dedup_date and recorded_url.dedup_info.get('url') == recorded_url.url:
|
||||||
try:
|
try:
|
||||||
dt = datetime.strptime(dedup_date.decode('utf-8'),
|
dt = datetime.strptime(dedup_date.decode('utf-8'),
|
||||||
'%Y-%m-%dT%H:%M:%SZ')
|
'%Y-%m-%dT%H:%M:%SZ')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user