2016-03-11 11:12:25 -08:00
|
|
|
#from gevent import monkey; monkey.patch_all()
|
|
|
|
import gevent
|
|
|
|
|
|
|
|
from webagg.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
|
2016-03-24 10:45:48 -04:00
|
|
|
from webagg.test.testutils import FakeRedisTests
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
import os
|
|
|
|
import webtest
|
|
|
|
|
2016-03-18 15:28:24 -07:00
|
|
|
from pytest import raises
|
2016-03-24 10:45:48 -04:00
|
|
|
from fakeredis import FakeStrictRedis
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
from recorder.recorderapp import RecorderApp
|
|
|
|
from recorder.redisindexer import WritableRedisIndexer
|
2016-04-03 12:19:54 -07:00
|
|
|
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
|
2016-03-19 10:24:28 -07:00
|
|
|
from recorder.filters import ExcludeSpecificHeaders
|
|
|
|
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
from webagg.utils import MementoUtils
|
|
|
|
|
|
|
|
from pywb.cdx.cdxobject import CDXObject
|
|
|
|
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
|
|
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
|
|
|
from pywb.warc.recordloader import ArcWarcRecordLoader
|
2016-03-18 15:28:24 -07:00
|
|
|
from pywb.warc.cdxindexer import write_cdx_index
|
2016-04-27 09:52:56 -07:00
|
|
|
from pywb.warc.archiveiterator import ArchiveIterator
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
from six.moves.urllib.parse import quote, unquote
|
|
|
|
from io import BytesIO
|
2016-03-18 15:28:24 -07:00
|
|
|
import time
|
2016-04-03 12:19:54 -07:00
|
|
|
import json
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
# Raw HTTP/1.1 GET request used as the POST payload in the recorder tests.
# ``{path}`` and ``{host}`` are filled in with ``str.format``; every header
# line ends with CRLF and the request is terminated by an empty CRLF line.
general_req_data = (
    "GET {path} HTTP/1.1\r\n"
    "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n"
    "User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36\r\n"
    "X-Other: foo\r\n"
    "Host: {host}\r\n"
    "Cookie: boo=far\r\n"
    "\r\n"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
2016-03-24 10:45:48 -04:00
|
|
|
class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass):
|
2016-03-11 11:12:25 -08:00
|
|
|
@classmethod
def setup_class(cls):
    """Run the inherited class setup, then prepare recorder fixtures.

    Creates the ``warcs`` directory under ``cls.root_dir`` and stores the
    URL of the local upstream server in ``cls.upstream_url``.
    """
    super(TestRecorder, cls).setup_class()

    # NOTE(review): root_dir and server are presumably provided by the
    # base test classes -- confirm against testutils.
    warcs_dir = to_path(cls.root_dir + '/warcs')
    cls.warcs_dir = warcs_dir
    os.makedirs(warcs_dir)

    port = cls.server.port
    cls.upstream_url = 'http://localhost:{0}'.format(port)
|
|
|
|
|
2016-03-19 10:24:28 -07:00
|
|
|
def _get_dedup_index(self, dupe_policy=None):
    """Build a ``WritableRedisIndexer`` keyed by ``{user}`` and ``{coll}``.

    :param dupe_policy: duplicate-handling policy passed to the indexer.
        When omitted, a fresh ``WriteRevisitDupePolicy`` is created for
        this call.
    :return: the configured ``WritableRedisIndexer``.
    """
    # A default written as ``dupe_policy=WriteRevisitDupePolicy()`` is
    # evaluated once at definition time and then shared by every test that
    # relies on the default; build a new policy per call instead.
    if dupe_policy is None:
        dupe_policy = WriteRevisitDupePolicy()

    dedup_index = WritableRedisIndexer(redis_url='redis://localhost/2/{user}:{coll}:cdxj',
                                       file_key_template='{user}:{coll}:warc',
                                       rel_path_template=self.root_dir + '/warcs/',
                                       dupe_policy=dupe_policy)

    return dedup_index
|
2016-03-11 11:12:25 -08:00
|
|
|
|
2016-03-18 15:28:24 -07:00
|
|
|
def _test_warc_write(self, recorder_app, host, path, other_params=''):
    """POST a canned request through ``recorder_app`` and check the reply headers.

    :param recorder_app: WSGI application wrapped in a ``webtest.TestApp``.
    :param host: host name used for the target URL and the ``Host`` header.
    :param path: path (plus query) used for the target URL and request line.
    :param other_params: raw text appended to the request URL.
    :return: the ``webtest`` response.
    """
    target_url = 'http://' + host + path
    payload = general_req_data.format(host=host, path=path).encode('utf-8')

    client = webtest.TestApp(recorder_app)
    resp = client.post('/live/resource/postreq?url=' + target_url + other_params, payload)

    # If a write is still queued, process one entry now so that callers can
    # inspect the output right after this helper returns.
    pending = recorder_app.write_queue
    if not pending.empty():
        recorder_app._write_one()

    headers = resp.headers
    assert headers['WebAgg-Source-Coll'] == 'live'
    assert headers['Link'] == MementoUtils.make_link(unquote(target_url), 'original')
    assert headers['Memento-Datetime'] != ''

    return resp
|
|
|
|
|
|
|
|
def _test_all_warcs(self, dirname, num):
    """Assert that ``dirname`` holds exactly ``num`` regular files, all ``.warc.gz``.

    :param dirname: directory path appended to ``self.root_dir``.
    :param num: expected number of regular files directly inside it.
    :return: tuple ``(files, coll_dir)`` of the file names and the directory.
    """
    coll_dir = to_path(self.root_dir + dirname)
    assert os.path.isdir(coll_dir)

    # Only regular files count; sub-directories are skipped.
    files = []
    for entry in os.listdir(coll_dir):
        if os.path.isfile(os.path.join(coll_dir, entry)):
            files.append(entry)

    assert len(files) == num
    for entry in files:
        assert entry.endswith('.warc.gz')

    return files, coll_dir
|
2016-03-11 11:12:25 -08:00
|
|
|
|
2016-04-27 09:52:56 -07:00
|
|
|
def _load_resp_req(self, base_path):
    """Read the single WARC in ``base_path`` and return its request/response records.

    Asserts that the directory has exactly one entry and that both a
    ``request`` and a ``response`` record were found. If several records
    of one type exist, the last one read is kept.

    :param base_path: directory containing the WARC file.
    :return: tuple ``(stored_req, stored_resp)``.
    """
    entries = os.listdir(base_path)
    assert len(entries) == 1
    warc_file = os.path.join(base_path, entries[0])

    # Slot per record type of interest; other record types are ignored.
    found = {'response': None, 'request': None}

    with open(warc_file, 'rb') as fh:
        for rec in ArchiveIterator(fh)():
            if rec.rec_type in found:
                found[rec.rec_type] = rec

    stored_resp = found['response']
    stored_req = found['request']

    assert stored_resp is not None
    assert stored_req is not None
    return stored_req, stored_resp
|
|
|
|
|
2016-03-11 11:12:25 -08:00
|
|
|
def test_record_warc_1(self):
    """Record one request with default settings and expect one WARC in ``/warcs/``."""
    writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(self.upstream_url, writer)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    self._test_all_warcs('/warcs/', 1)
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
def test_record_warc_2(self):
    """Record with ``accept_colls='live'`` and expect the WARC count in ``/warcs/`` to be 2."""
    writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    # The expected count is cumulative: it includes the file already
    # present in the shared directory.
    self._test_all_warcs('/warcs/', 2)
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
def test_error_url(self):
    """Use an upstream URL with ``'01'`` appended and expect a 400 with an error message.

    The number of WARC files in ``/warcs/`` must stay at 2.
    """
    bad_upstream = self.upstream_url + '01'
    writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(bad_upstream, writer, accept_colls='live')

    client = webtest.TestApp(recorder_app)
    resp = client.get('/live/resource?url=http://example.com/', status=400)

    assert resp.json['error'] != ''

    self._test_all_warcs('/warcs/', 2)
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
def test_record_cookies_header(self):
    """Without a header filter, cookie headers appear in both the reply and the stored WARC."""
    base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
    writer = PerRecordWARCWriter(base_path)
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
    assert b'HTTP/1.1 302' in resp.body

    # Headers of the record returned to the client.
    live_record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
    live_headers = live_record.status_headers.headers
    assert ('Set-Cookie', 'name=value; Path=/') in live_headers
    assert ('Set-Cookie', 'foo=bar; Path=/') in live_headers

    # Headers of the records written to disk.
    stored_req, stored_resp = self._load_resp_req(base_path)

    resp_headers = stored_resp.status_headers.headers
    assert ('Set-Cookie', 'name=value; Path=/') in resp_headers
    assert ('Set-Cookie', 'foo=bar; Path=/') in resp_headers

    req_headers = stored_req.status_headers.headers
    assert ('X-Other', 'foo') in req_headers
    assert ('Cookie', 'boo=far') in req_headers
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
def test_record_cookies_skip_header(self):
    """With ``ExcludeSpecificHeaders``, cookie headers reach the client but not the stored WARC."""
    base_path = to_path(self.root_dir + '/warcs/cookieskip/')
    header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
    writer = PerRecordWARCWriter(base_path, header_filter=header_filter)
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
    assert b'HTTP/1.1 302' in resp.body

    # The reply sent to the client still carries the cookies.
    live_record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
    live_headers = live_record.status_headers.headers
    assert ('Set-Cookie', 'name=value; Path=/') in live_headers
    assert ('Set-Cookie', 'foo=bar; Path=/') in live_headers

    # The records written to disk must have the filtered headers removed.
    stored_req, stored_resp = self._load_resp_req(base_path)

    resp_headers = stored_resp.status_headers.headers
    assert ('Set-Cookie', 'name=value; Path=/') not in resp_headers
    assert ('Set-Cookie', 'foo=bar; Path=/') not in resp_headers

    req_headers = stored_req.status_headers.headers
    assert ('X-Other', 'foo') in req_headers
    assert ('Cookie', 'boo=far') not in req_headers
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
def test_record_skip_wrong_coll(self):
    """With ``accept_colls='not-live'`` the reply is served but no new WARC is written."""
    warc_writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(self.upstream_url, writer=warc_writer, accept_colls='not-live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    # Count is unchanged from the earlier tests.
    self._test_all_warcs('/warcs/', 2)
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
def test_record_param_user_coll(self):
    """Record into a ``{user}/{coll}`` templated path and verify the Redis index.

    Checks that one WARC lands in ``/warcs/USER/COLL/``, that a single CDXJ
    entry exists under ``USER:COLL:cdxj`` and that ``USER:COLL:warc`` maps
    the file name to its full path.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/', 2)

    # The extra parameters must be separated with '&'. The literal had been
    # corrupted to a pilcrow ('&para' decoded as an HTML entity), which glues
    # them onto the target URL instead of passing them as query arguments.
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 1

    cdx = CDXObject(res[0])
    assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
    assert cdx['mime'] == 'application/json'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith('USER/COLL/')
    assert cdx['filename'].endswith('.warc.gz')

    # The file-key hash must hold exactly this one file.
    warcs = r.hgetall('USER:COLL:warc')
    full_path = self.root_dir + '/warcs/' + cdx['filename']
    assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
|
|
|
|
|
2016-03-11 11:12:25 -08:00
|
|
|
|
|
|
|
def test_record_param_user_coll_revisit(self):
    """Record the same URL again and expect a ``revisit`` record.

    Uses the default dedup policy from ``_get_dedup_index``. Verifies the
    second CDXJ entry has mime ``warc/revisit`` and that the written WARC
    record carries the ``WARC-Refers-To-*`` headers.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/', 2)

    # '&' separators restored: the literal had been corrupted to a pilcrow
    # ('&para' decoded as an HTML entity), so user/coll were never passed.
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    cdx = CDXObject(res[1])
    assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith('USER/COLL/')
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get?foo=bar'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
|
|
|
|
|
2016-03-18 15:28:24 -07:00
|
|
|
def test_record_param_user_coll_skip(self):
    """With ``SkipDupePolicy`` a repeated URL adds neither a WARC nor a CDXJ entry."""
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    # No new entries written
    self._test_all_warcs('/warcs/', 2)

    # '&' separators restored: the literal had been corrupted to a pilcrow
    # ('&para' decoded as an HTML entity), so user/coll were never passed.
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Still two files: the duplicate was skipped.
    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2
|
|
|
|
|
|
|
|
def test_record_param_user_coll_write_dupe_no_revisit(self):
    """With ``WriteDupePolicy`` a repeated URL is stored again as a full record.

    Expects a third WARC and a third CDXJ entry whose mime is
    ``application/json`` rather than ``warc/revisit``, and no file handles
    left in the writer's ``fh_cache``.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # '&' separators restored: the literal had been corrupted to a pilcrow
    # ('&para' decoded as an HTML entity), so user/coll were never passed.
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 3)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 3

    mimes = [CDXObject(x)['mime'] for x in res]

    # Two full captures plus the one revisit entry already in the index.
    assert sorted(mimes) == ['application/json', 'application/json', 'warc/revisit']

    assert len(writer.fh_cache) == 0
|
|
|
|
|
|
|
|
# Keep Open
|
|
|
|
def test_record_file_warc_keep_open(self):
    """Write through a ``MultiFileWARCWriter`` with a fixed file name.

    Expects the file to exist afterwards and one entry in ``fh_cache``.
    """
    warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
    writer = MultiFileWARCWriter(warc_file)
    recorder_app = RecorderApp(self.upstream_url, writer)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')

    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    assert os.path.isfile(warc_file)
    assert len(writer.fh_cache) == 1
|
2016-03-18 15:28:24 -07:00
|
|
|
|
2016-03-18 21:40:41 -07:00
|
|
|
def test_record_multiple_writes_keep_open(self):
    """Write two records into one open WARC, then reopen after closing.

    Steps:
      1. Two requests for collection ``FOO`` must share one WARC file.
      2. The Redis CDXJ entries must equal an index generated from the
         file with ``write_cdx_index``.
      3. After ``close_file``/``close`` the cache is empty and a further
         request creates a second WARC file.
    """
    warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

    rel_path = self.root_dir + '/warcs/'

    dedup_index = WritableRedisIndexer(redis_url='redis://localhost/2/{coll}:cdxj',
                                       file_key_template='{coll}:warc',
                                       rel_path_template=rel_path)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # First Record
    # ('&' restored: the literal had been corrupted to a pilcrow, which
    # stops the collection name from being passed as a query argument.)
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', '&param.recorder.coll=FOO')

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', '&param.recorder.coll=FOO')

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/FOO/', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')
    res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
    # os.path.join gives the same path when coll_dir ends with a separator
    # and a correct one when it does not.
    fullname = os.path.join(coll_dir, files[0])

    # Re-index the WARC from disk to compare with what was put in Redis.
    cdxout = BytesIO()
    with open(fullname, 'rb') as fh:
        filename = os.path.relpath(fullname, rel_path)
        write_cdx_index(cdxout, fh, filename,
                        cdxj=True, append_post=True, sort=True)

    res = [CDXObject(x) for x in res]

    cdxres = cdxout.getvalue().strip()
    cdxres = cdxres.split(b'\n')
    cdxres = [CDXObject(x) for x in cdxres]

    assert cdxres == res

    assert len(writer.fh_cache) == 1

    writer.close_file(self.root_dir + '/warcs/FOO/')
    # Alternative call form left disabled in the original:
    # writer.close_file({'param.recorder.coll': 'FOO'})

    assert len(writer.fh_cache) == 0

    writer.close()

    # Writing after the close must start a new file.
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', '&param.recorder.coll=FOO')

    self._test_all_warcs('/warcs/FOO/', 2)

    warcs = r.hgetall('FOO:warc')
    assert len(warcs) == 2
|
2016-04-03 12:19:54 -07:00
|
|
|
|
|
|
|
def test_warcinfo_record(self):
    """Create a ``warcinfo`` record in memory and verify it round-trips.

    The record is written uncompressed with ``SimpleTempWARCWriter``,
    parsed back with ``ArcWarcRecordLoader``, and its headers, declared
    ``Content-Length`` and payload fields are checked.
    """
    simplewriter = SimpleTempWARCWriter(gzip=False)
    params = {'software': 'recorder test',
              'format': 'WARC File Format 1.0',
              'json-metadata': json.dumps({'foo': 'bar'})}

    record = simplewriter.create_warcinfo_record('testfile.warc.gz', **params)
    simplewriter.write_record(record)
    buff = simplewriter.get_buffer()
    assert isinstance(buff, bytes)

    buff = BytesIO(buff)
    parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)

    assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
    assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'

    buff = parsed_record.stream.read().decode('utf-8')

    length = parsed_record.rec_headers.get_header('Content-Length')

    # The payload is plain ASCII here, so character count equals byte count.
    assert len(buff) == int(length)

    # One assertion per field passed in ``params``. The original repeated
    # the json-metadata check and never verified ``software``.
    # NOTE(review): assumes ``software`` is emitted as ``name: value`` like
    # the other fields -- confirm against the writer.
    assert 'software: recorder test\r\n' in buff
    assert 'format: WARC File Format 1.0\r\n' in buff
    assert 'json-metadata: {"foo": "bar"}\r\n' in buff
|
|
|
|
|