1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

recorder: support overriding _get_params() in a subclass

multiwarcwriter: support multiple WARCs in the same directory, a random component in the file path, and a custom
key template for selecting the current WARC file, independent of the current directory
This commit is contained in:
Ilya Kreymer 2016-06-07 12:55:04 -04:00
parent 3fec766e39
commit 4c7da0f6ef
3 changed files with 52 additions and 17 deletions

View File

@ -115,11 +115,14 @@ class RecorderApp(object):
'200 OK',
start_response)
def _get_params(self, environ):
    """Build the request parameters from the WSGI query string.

    Kept as a separate method so that a subclass can override how the
    parameters are derived from ``environ``.

    :param environ: WSGI environ mapping; ``QUERY_STRING`` is optional.
    :return: dict mapping query parameter names to values (for a
        repeated name, ``dict()`` keeps the last pair).
    """
    # QUERY_STRING may be missing from the environ; fall back to an
    # empty string so parse_qsl() is never handed None.
    query_string = environ.get('QUERY_STRING') or ''
    return dict(parse_qsl(query_string))
def __call__(self, environ, start_response):
input_req = DirectWSGIInputRequest(environ)
params = dict(parse_qsl(environ.get('QUERY_STRING')))
params = self._get_params(environ)
request_uri = input_req.get_full_request_uri()

View File

@ -168,10 +168,10 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert ('Cookie', 'boo=far') in stored_req.status_headers.headers
def test_record_cookies_skip_header(self):
base_path = to_path(self.root_dir + '/warcs/cookieskip/')
warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(base_path, header_filter=header_filter),
PerRecordWARCWriter(warc_path, header_filter=header_filter),
accept_colls='live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
@ -182,7 +182,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
stored_req, stored_resp = self._load_resp_req(base_path)
stored_req, stored_resp = self._load_resp_req(warc_path)
assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers
@ -201,7 +201,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
self._test_all_warcs('/warcs/', 2)
def test_record_param_user_coll(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = self._get_dedup_index()
@ -234,6 +233,25 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
full_path = self.root_dir + '/warcs/' + cdx['filename']
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
def test_record_param_user_coll_same_dir(self):
    """Record two collections of one user into a single shared directory.

    The writer is keyed by '{user}:{coll}' rather than by directory, and
    the test finishes by checking '/warcs2' via ``_test_all_warcs`` with
    an expected count of 2.
    """
    target_dir = to_path(self.root_dir + '/warcs2/')
    index = self._get_dedup_index()
    writer_kwargs = dict(dedup_index=index, key_template='{user}:{coll}')

    app = RecorderApp(self.upstream_url,
                      PerRecordWARCWriter(target_dir, **writer_kwargs))

    # Same user, two different collections, identical upstream request.
    recorder_params = (
        '&param.recorder.user=USER2&param.recorder.coll=COLL2',
        '&param.recorder.user=USER2&param.recorder.coll=COLL3',
    )

    for extra in recorder_params:
        resp = self._test_warc_write(app, 'httpbin.org', '/get?foo=bar', extra)
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs2', 2)
def test_record_param_user_coll_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -395,8 +413,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert len(writer.fh_cache) == 1
writer.close_dir(self.root_dir + '/warcs/FOO/')
#writer.close_file({'param.recorder.coll': 'FOO'})
writer.close_key(self.root_dir + '/warcs/FOO/')
assert len(writer.fh_cache) == 0

View File

@ -332,6 +332,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
filename_template = self.FILE_TEMPLATE
self.dir_template = dir_template
self.key_template = kwargs.get('key_template', self.dir_template)
self.filename_template = filename_template
self.max_size = max_size
if max_idle_secs > 0:
@ -344,9 +345,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
def _open_file(self, dir_, params):
timestamp = timestamp20_now()
randstr = base64.b32encode(os.urandom(5)).decode('utf-8')
filename = dir_ + res_template(self.filename_template, params,
hostname=self.hostname,
timestamp=timestamp)
timestamp=timestamp,
random=randstr)
path, name = os.path.split(filename)
@ -366,9 +370,14 @@ class MultiFileWARCWriter(BaseWARCWriter):
fcntl.flock(fh, fcntl.LOCK_UN)
fh.close()
def close_dir(self, full_dir):
#full_dir = res_template(self.dir_template, params)
result = self.fh_cache.pop(full_dir, None)
def get_dir_key(self, params):
    """Return the ``fh_cache`` key for *params*.

    The key is produced by passing ``self.key_template`` and *params*
    to ``res_template``.
    """
    template = self.key_template
    return res_template(template, params)
def close_key(self, dir_key):
if isinstance(dir_key, dict):
dir_key = self.get_dir_key(dir_key)
result = self.fh_cache.pop(dir_key, None)
if not result:
return
@ -376,6 +385,11 @@ class MultiFileWARCWriter(BaseWARCWriter):
self._close_file(out)
return filename
def close_file(self, match_filename):
    """Close the open file whose filename equals *match_filename*.

    Only the first matching entry from ``iter_open_files()`` is closed.

    :return: whatever ``close_key`` returns for that entry, or ``None``
        when no open file matches.
    """
    matching_keys = (key
                     for key, _out, name in self.iter_open_files()
                     if name == match_filename)

    # Stop at the first hit: return straight out of the loop.
    for key in matching_keys:
        return self.close_key(key)

    return None
def _is_write_resp(self, resp, params):
return True
@ -389,8 +403,9 @@ class MultiFileWARCWriter(BaseWARCWriter):
def _do_write_req_resp(self, req, resp, params):
full_dir = res_template(self.dir_template, params)
dir_key = self.get_dir_key(params)
result = self.fh_cache.get(full_dir)
result = self.fh_cache.get(dir_key)
close_file = False
@ -436,11 +451,11 @@ class MultiFileWARCWriter(BaseWARCWriter):
if close_file:
self._close_file(out)
if not is_new:
self.fh_cache.pop(full_dir, None)
self.fh_cache.pop(dir_key, None)
elif is_new:
fcntl.flock(out, fcntl.LOCK_EX | fcntl.LOCK_NB)
self.fh_cache[full_dir] = (out, filename)
self.fh_cache[dir_key] = (out, filename)
def iter_open_files(self):
for n, v in list(self.fh_cache.items()):
@ -448,7 +463,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
yield n, out, filename
def close(self):
for dirname, out, filename in self.iter_open_files():
for dir_key, out, filename in self.iter_open_files():
self._close_file(out)
self.fh_cache = {}
@ -459,12 +474,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
now = datetime.datetime.now()
for dirname, out, filename in self.iter_open_files():
for dir_key, out, filename in self.iter_open_files():
mtime = os.path.getmtime(filename)
mtime = datetime.datetime.fromtimestamp(mtime)
if (now - mtime) > self.max_idle_time:
print('Closing idle ' + filename)
self.close_dir(dirname)
self.close_key(dir_key)
# ============================================================================