mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-23 06:32:24 +01:00
Add support for excluding certain headers when writing WARCs. Tests: add first batch of tests for recorder, using live upstream server.
63 lines
2.0 KiB
Python
63 lines
2.0 KiB
Python
from pywb.utils.canonicalize import calc_search_range
|
|
from pywb.cdx.cdxobject import CDXObject
|
|
from pywb.warc.cdxindexer import write_cdx_index
|
|
from pywb.utils.timeutils import timestamp_to_datetime
|
|
from pywb.utils.timeutils import datetime_to_iso_date, iso_date_to_timestamp
|
|
|
|
from io import BytesIO
|
|
import os
|
|
|
|
from webagg.indexsource import RedisIndexSource
|
|
from webagg.aggregator import SimpleAggregator
|
|
from webagg.utils import res_template
|
|
|
|
|
|
#==============================================================================
|
|
class WritableRedisIndexer(RedisIndexSource):
    """Redis index source that can also be written to.

    Besides the lookup behaviour inherited from ``RedisIndexSource``, this
    class indexes newly written records into a Redis sorted set
    (``add_record``) and queries that same index for an earlier capture
    that a new record could be written as a revisit of (``lookup_revisit``).
    """

    def __init__(self, redis_url, rel_path_template='', name='recorder'):
        """Set up the index source and a self-referencing lookup aggregator.

        :param redis_url: passed unchanged to ``RedisIndexSource.__init__``.
        :param rel_path_template: template expanded with the per-request
            params to obtain the directory that indexed filenames are made
            relative to.
        :param name: key under which this source is registered in the
            aggregator used by ``lookup_revisit``.
        """
        super(WritableRedisIndexer, self).__init__(redis_url)

        # Lookups go through an aggregator wrapping this very source, so
        # revisit checks see the entries that add_record() writes.
        self.cdx_lookup = SimpleAggregator({name: self})
        self.rel_path_template = rel_path_template

    def add_record(self, stream, params, filename=None):
        """Index the records in ``stream`` and store the CDXJ lines in Redis.

        :param stream: readable stream of record data; its ``name``
            attribute is used as the filename when ``filename`` is not given.
        :param params: values used to expand both the relative-path template
            and the Redis key template.
        :param filename: path of the file being indexed (optional if
            ``stream`` has a ``name``).
        :return: the last non-empty CDXJ line added to Redis as ``bytes``,
            or ``b''`` if the indexer produced no lines.

        NOTE(review): ``self.redis`` and ``self.redis_key_template`` are
        presumably set up by ``RedisIndexSource.__init__`` -- confirm.
        """
        if not filename and hasattr(stream, 'name'):
            filename = stream.name

        # Store the filename relative to the templated root directory.
        rel_path = res_template(self.rel_path_template, params)
        filename = os.path.relpath(filename, rel_path)

        cdxout = BytesIO()
        write_cdx_index(cdxout, stream, filename,
                        cdxj=True, append_post=True, rel_root=rel_path)

        z_key = res_template(self.redis_key_template, params)

        # Track the last line actually written. Returning the raw loop
        # variable instead would yield the empty fragment that follows the
        # final newline of the index output.
        last_cdx = b''
        for cdx in cdxout.getvalue().split(b'\n'):
            if not cdx:
                continue

            # All members share score 0.
            self.redis.zadd(z_key, 0, cdx)
            last_cdx = cdx

        return last_cdx

    def lookup_revisit(self, params, digest, url, iso_dt):
        """Find an existing non-revisit capture matching ``url``/``digest``.

        Note that ``params`` is modified in place: its ``url``, ``closest``
        and ``filter`` entries are overwritten before the lookup is made.

        :param params: lookup parameters (mutated, see above).
        :param digest: payload digest; anything before the last ``:`` is
            dropped. No digest filter is applied when it is empty or ``'-'``.
        :param url: url to search for.
        :param iso_dt: ISO date of the new capture, used as the ``closest``
            timestamp for the lookup.
        :return: ``('revisit', original_url, original_iso_date)`` built from
            the first matching index entry, or ``None`` if there is none.
        """
        params['url'] = url
        params['closest'] = iso_date_to_timestamp(iso_dt)

        # Never match against entries that are themselves revisits.
        filters = ['!mime:warc/revisit']

        if digest and digest != '-':
            filters.append('digest:' + digest.split(':')[-1])

        params['filter'] = filters

        cdx_iter, errs = self.cdx_lookup(params)

        # Only the first result is of interest.
        for cdx in cdx_iter:
            dt = timestamp_to_datetime(cdx['timestamp'])
            return ('revisit', cdx['url'],
                    datetime_to_iso_date(dt))

        return None
|