diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..98b9385d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +#webrecorder/webrecore 1.0 + +FROM python:3.5.2 + +RUN pip install gevent uwsgi bottle urllib3 youtube-dl + +RUN pip install git+https://github.com/ikreymer/pywb.git@master#egg=pywb-0.33.0 +#RUN pip install pywb + +RUN pip install git+https://github.com/t0m/pyamf.git@python3 + +RUN pip install boto webassets + +ADD . /webrecore/ +WORKDIR /webrecore/ + +RUN pip install -e ./ + +RUN useradd -ms /bin/bash -u 1000 apprun + +USER apprun + + diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..463ec243 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,19 @@ +version: '2' + +services: + proxy: + build: ./proxy/ + links: + - webagg:webagg + + environment: + - "WEBAGG=http://webrecplatform_webagg_1:8080" + + ports: + - 9080:9080 + + volumes: + - ${HOME}/.mitmproxy/:/root/.mitmproxy/ + + webagg: + build: ./webagg/ diff --git a/recorder/__init__.py b/recorder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/recorder/filters.py b/recorder/filters.py new file mode 100644 index 00000000..c9ab74ee --- /dev/null +++ b/recorder/filters.py @@ -0,0 +1,84 @@ +from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date +import re + + +# ============================================================================ +# Header Exclusions +# ============================================================================ +class ExcludeNone(object): + def __call__(self, record): + return None + + +# ============================================================================ +class ExcludeSpecificHeaders(object): + def __init__(self, exclude_headers=[]): + self.exclude_headers = [x.lower() for x in exclude_headers] + + def __call__(self, record): + return self.exclude_headers + + +# ============================================================================ +# Revisit Policy +# ============================================================================ +class WriteRevisitDupePolicy(object): + def __call__(self, cdx, params): + dt = timestamp_to_datetime(cdx['timestamp']) + return ('revisit', cdx['url'], datetime_to_iso_date(dt)) + + +# ============================================================================ +class SkipDupePolicy(object): + def __call__(self, cdx, params): + if cdx['url'] == params['url']: + return 'skip' + else: + return 'write' + + +# ============================================================================ +class WriteDupePolicy(object): + def __call__(self, cdx, params): + return 'write' + + +# ============================================================================ +# Skip Record Filters +# ============================================================================ +class SkipNothingFilter(object): + def skip_request(self, req_headers): + return False + + def skip_response(self, req_headers, resp_headers): + return False + + +# ============================================================================ +class CollectionFilter(SkipNothingFilter): + def __init__(self, accept_colls): + self.rx_accept_colls = re.compile(accept_colls) + + def skip_request(self, req_headers): + if req_headers.get('Recorder-Skip') == '1': + return True + + return False + + def skip_response(self, req_headers, resp_headers): + if not self.rx_accept_colls.match(resp_headers.get('WebAgg-Source-Coll', '')): + return True + + return False + + +# 
============================================================================ +class SkipRangeRequestFilter(SkipNothingFilter): + def skip_request(self, req_headers): + range_ = req_headers.get('Range') + if range_ and not range_.lower().startswith('bytes=0-'): + return True + + return False + + diff --git a/recorder/recorderapp.py b/recorder/recorderapp.py new file mode 100644 index 00000000..eeded251 --- /dev/null +++ b/recorder/recorderapp.py @@ -0,0 +1,293 @@ +from webagg.utils import StreamIter, chunk_encode_iter, BUFF_SIZE +from webagg.inputrequest import DirectWSGIInputRequest + +from recorder.filters import SkipRangeRequestFilter, CollectionFilter + +from six.moves.urllib.parse import parse_qsl + +import json +import tempfile + +from requests.structures import CaseInsensitiveDict +import requests + +import traceback + +import gevent.queue +import gevent + + +#============================================================================== +class RecorderApp(object): + def __init__(self, upstream_host, writer, skip_filters=None, **kwargs): + self.upstream_host = upstream_host + + self.writer = writer + + self.write_queue = gevent.queue.Queue() + gevent.spawn(self._write_loop) + + if not skip_filters: + skip_filters = self.create_default_filters(kwargs) + + self.skip_filters = skip_filters + + @staticmethod + def create_default_filters(kwargs): + skip_filters = [SkipRangeRequestFilter()] + + accept_colls = kwargs.get('accept_colls') + if accept_colls: + skip_filters.append(CollectionFilter(accept_colls)) + + return skip_filters + + def _write_loop(self): + while True: + try: + self._write_one() + except: + traceback.print_exc() + + def _write_one(self): + req = None + resp = None + try: + result = self.write_queue.get() + + req_head, req_pay, resp_head, resp_pay, params = result + + resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay) + + if resp_type == 'response': + req = self.writer.create_req_record(req_head, req_pay) + + self.writer.write_req_resp(req, resp, params) + + else: + self.writer.write_record(resp, params) + + + finally: + try: + if req: + req.stream.close() + + if resp: + resp.stream.close() + except Exception as e: + traceback.print_exc() + + def send_error(self, exc, start_response): + return self.send_message({'error': repr(exc)}, + '400 Bad Request', + start_response) + + def send_message(self, msg, status, start_response): + message = json.dumps(msg) + headers = [('Content-Type', 'application/json; charset=utf-8'), + ('Content-Length', str(len(message)))] + + start_response(status, headers) + return [message.encode('utf-8')] + + def _put_record(self, request_uri, input_buff, record_type, + headers, params, start_response): + + if record_type == 'stream': + if self.writer.write_stream_to_file(params, input_buff): + msg = {'success': 'true'} + else: + msg = {'error_message': 'upload_error'} + + return self.send_message(msg, '200 OK', + start_response) + + req_stream = ReqWrapper(input_buff, headers) + + while True: + buff = req_stream.read() + if not buff: + break + + content_type = headers.get('Content-Type') + + record = self.writer.create_custom_record(params['url'], + req_stream.out, + record_type, + content_type, + req_stream.headers) + + self.writer.write_record(record, params) + + msg = {'success': 'true', + 'WARC-Date': record.rec_headers.get('WARC-Date')} + + return self.send_message(msg, + '200 OK', + start_response) + + def _get_params(self, environ): + params = dict(parse_qsl(environ.get('QUERY_STRING'))) + return params + + def 
__call__(self, environ, start_response): + try: + return self.handle_call(environ, start_response) + except: + import traceback + traceback.print_exc() + + def handle_call(self, environ, start_response): + input_req = DirectWSGIInputRequest(environ) + + params = self._get_params(environ) + + request_uri = input_req.get_full_request_uri() + + input_buff = input_req.get_req_body() + + headers = input_req.get_req_headers() + + method = input_req.get_req_method() + + # write request body as metadata/resource + put_record = params.get('put_record') + if put_record and method in ('PUT', 'POST'): + return self._put_record(request_uri, + input_buff, + put_record, + headers, + params, + start_response) + + skipping = any(x.skip_request(headers) for x in self.skip_filters) + + if not skipping: + req_stream = ReqWrapper(input_buff, headers) + else: + req_stream = input_buff + + data = None + if input_buff: + data = req_stream + + try: + res = requests.request(url=self.upstream_host + request_uri, + method=method, + data=data, + headers=headers, + allow_redirects=False, + stream=True) + res.raise_for_status() + except Exception as e: + #traceback.print_exc() + return self.send_error(e, start_response) + + start_response('200 OK', list(res.headers.items())) + + if not skipping: + resp_stream = RespWrapper(res.raw, + res.headers, + req_stream, + params, + self.write_queue, + self.skip_filters) + else: + resp_stream = res.raw + + resp_iter = StreamIter(resp_stream) + + if res.headers.get('Transfer-Encoding') == 'chunked': + resp_iter = chunk_encode_iter(resp_iter) + + return resp_iter + + +#============================================================================== +class Wrapper(object): + def __init__(self, stream): + self.stream = stream + self.out = self._create_buffer() + self.interrupted = False + + def _create_buffer(self): + return tempfile.SpooledTemporaryFile(max_size=512*1024) + + def read(self, *args, **kwargs): + try: + buff = self.stream.read(*args, **kwargs) + except Exception as e: + print('INTERRUPT READ') + self.interrupted = True + raise + + self.out.write(buff) + return buff + + +#============================================================================== +class RespWrapper(Wrapper): + def __init__(self, stream, headers, req, + params, queue, skip_filters): + + super(RespWrapper, self).__init__(stream) + self.headers = headers + self.req = req + self.params = params + self.queue = queue + self.skip_filters = skip_filters + + def close(self): + try: + while True: + if not self.read(BUFF_SIZE): + break + + except Exception as e: + print(e) + self.interrupted = True + + finally: + try: + self.stream.close() + except Exception as e: + traceback.print_exc() + + self._write_to_file() + + def _write_to_file(self): + skipping = any(x.skip_response(self.req.headers, self.headers) + for x in self.skip_filters) + + if self.interrupted or skipping: + self.out.close() + self.req.out.close() + self.req.close() + return + + try: + entry = (self.req.headers, self.req.out, + self.headers, self.out, self.params) + self.queue.put(entry) + self.req.close() + self.req = None + except: + traceback.print_exc() + + +#============================================================================== +class ReqWrapper(Wrapper): + def __init__(self, stream, req_headers): + super(ReqWrapper, self).__init__(stream) + self.headers = CaseInsensitiveDict(req_headers) + + for n in req_headers.keys(): + if not n.upper().startswith('WARC-'): + del self.headers[n] + + def close(self): + # no need to close wsgi.input + 
pass + + diff --git a/recorder/redisindexer.py b/recorder/redisindexer.py new file mode 100644 index 00000000..577bf036 --- /dev/null +++ b/recorder/redisindexer.py @@ -0,0 +1,83 @@ +from pywb.utils.canonicalize import calc_search_range +from pywb.cdx.cdxobject import CDXObject +from pywb.warc.cdxindexer import write_cdx_index +from pywb.utils.timeutils import iso_date_to_timestamp + +from io import BytesIO +import os + +from webagg.indexsource import RedisIndexSource +from webagg.aggregator import SimpleAggregator +from webagg.utils import res_template + +from recorder.filters import WriteRevisitDupePolicy + + +#============================================================================== +class WritableRedisIndexer(RedisIndexSource): + def __init__(self, *args, **kwargs): + redis_url = kwargs.get('redis_url') + redis = kwargs.get('redis') + cdx_key_template = kwargs.get('cdx_key_template') + + super(WritableRedisIndexer, self).__init__(redis_url, + redis, + cdx_key_template) + + name = kwargs.get('name', 'recorder') + self.cdx_lookup = SimpleAggregator({name: self}) + + self.rel_path_template = kwargs.get('rel_path_template', '') + self.file_key_template = kwargs.get('file_key_template', '') + self.full_warc_prefix = kwargs.get('full_warc_prefix', '') + self.dupe_policy = kwargs.get('dupe_policy', WriteRevisitDupePolicy()) + + def add_warc_file(self, full_filename, params): + rel_path = res_template(self.rel_path_template, params) + rel_filename = os.path.relpath(full_filename, rel_path) + + file_key = res_template(self.file_key_template, params) + + full_load_path = self.full_warc_prefix + full_filename + + self.redis.hset(file_key, rel_filename, full_load_path) + + def add_urls_to_index(self, stream, params, filename, length): + rel_path = res_template(self.rel_path_template, params) + filename = os.path.relpath(filename, rel_path) + + cdxout = BytesIO() + write_cdx_index(cdxout, stream, filename, + cdxj=True, append_post=True) + + z_key = res_template(self.redis_key_template, params) + + cdx_list = cdxout.getvalue().rstrip().split(b'\n') + + for cdx in cdx_list: + if cdx: + self.redis.zadd(z_key, 0, cdx) + + return cdx_list + + def lookup_revisit(self, params, digest, url, iso_dt): + params['url'] = url + params['closest'] = iso_date_to_timestamp(iso_dt) + + filters = [] + + filters.append('!mime:warc/revisit') + + if digest and digest != '-': + filters.append('digest:' + digest.split(':')[-1]) + + params['filter'] = filters + + cdx_iter, errs = self.cdx_lookup(params) + + for cdx in cdx_iter: + res = self.dupe_policy(cdx, params) + if res: + return res + + return None diff --git a/recorder/test/rec.ini b/recorder/test/rec.ini new file mode 100644 index 00000000..06a5f8ea --- /dev/null +++ b/recorder/test/rec.ini @@ -0,0 +1,17 @@ +[uwsgi] +if-not-env = PORT +http-socket = :8010 +endif = + +master = true +buffer-size = 65536 +die-on-term = true + +if-env = VIRTUAL_ENV +venv = $(VIRTUAL_ENV) +endif = + +gevent = 100 +#gevent-early-monkey-patch = + +wsgi = recorder.test.simplerec diff --git a/recorder/test/simplerec.py b/recorder/test/simplerec.py new file mode 100644 index 00000000..f9c73b99 --- /dev/null +++ b/recorder/test/simplerec.py @@ -0,0 +1,42 @@ +from gevent import monkey; monkey.patch_all() + +from recorder.recorderapp import RecorderApp +from recorder.redisindexer import WritableRedisIndexer + +from recorder.warcwriter import MultiFileWARCWriter +from recorder.filters import SkipDupePolicy + +import atexit +import tempfile +import redis + +upstream_url = 
'http://localhost:8080'
+
+import shutil
+
+target = tempfile.mkdtemp(prefix='tmprec') + '/'
+
+print('Recording to ' + target)
+
+def rm_target():
+    print('Removing ' + target)
+    shutil.rmtree(target)
+
+atexit.register(rm_target)
+
+local_r = redis.StrictRedis.from_url('redis://localhost/2')
+local_r.delete('rec:cdxj')
+local_r.delete('rec:warc')
+
+#target = './_recordings/'
+
+dedup_index = WritableRedisIndexer(
+    redis_url='redis://localhost/2/rec:cdxj',
+    file_key_template='rec:warc',
+    rel_path_template=target,
+    dupe_policy=SkipDupePolicy())
+
+recorder_app = RecorderApp(upstream_url,
+                           MultiFileWARCWriter(target, dedup_index=dedup_index),
+                           accept_colls='live')
+
+application = recorder_app
+
diff --git a/recorder/test/test_recorder.py b/recorder/test/test_recorder.py
new file mode 100644
index 00000000..e6f75d9e
--- /dev/null
+++ b/recorder/test/test_recorder.py
@@ -0,0 +1,582 @@
+#from gevent import monkey; monkey.patch_all()
+import gevent
+
+from webagg.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
+from webagg.test.testutils import FakeRedisTests
+
+import os
+import webtest
+
+from pytest import raises
+from fakeredis import FakeStrictRedis
+
+from recorder.recorderapp import RecorderApp
+from recorder.redisindexer import WritableRedisIndexer
+from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
+from recorder.filters import ExcludeSpecificHeaders
+from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
+
+from webagg.utils import MementoUtils
+
+from pywb.cdx.cdxobject import CDXObject
+from pywb.utils.statusandheaders import StatusAndHeadersParser
+from pywb.utils.bufferedreaders import DecompressingBufferedReader
+from pywb.warc.recordloader import ArcWarcRecordLoader
+from pywb.warc.cdxindexer import write_cdx_index
+from pywb.warc.archiveiterator import ArchiveIterator
+
+from six.moves.urllib.parse import quote, unquote, urlencode
+from io import BytesIO
+import time
+import json
+
+general_req_data = "\
+GET {path} HTTP/1.1\r\n\
+Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n\
+User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36\r\n\
+X-Other: foo\r\n\
+Host: {host}\r\n\
+Cookie: boo=far\r\n\
+\r\n"
+
+
+class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass):
+    @classmethod
+    def setup_class(cls):
+        super(TestRecorder, cls).setup_class()
+
+        cls.warcs_dir = to_path(cls.root_dir + '/warcs')
+
+        os.makedirs(cls.warcs_dir)
+
+        cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)
+
+    def _get_dedup_index(self, dupe_policy=WriteRevisitDupePolicy(), user=True):
+        if user:
+            file_key_template = '{user}:{coll}:warc'
+            redis_url = 'redis://localhost/2/{user}:{coll}:cdxj'
+        else:
+            file_key_template = '{coll}:warc'
+            redis_url = 'redis://localhost/2/{coll}:cdxj'
+
+        dedup_index = WritableRedisIndexer(redis_url=redis_url,
+                                           file_key_template=file_key_template,
+                                           rel_path_template=self.root_dir + '/warcs/',
+                                           dupe_policy=dupe_policy)
+
+        return dedup_index
+
+    def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
+        url = 'http://' + host + path
+        req_url = '/live/resource/postreq?url=' + url + other_params
+        testapp = webtest.TestApp(recorder_app)
+        resp = testapp.post(req_url, general_req_data.format(host=host, path=path).encode('utf-8'))
+
+        if not recorder_app.write_queue.empty():
recorder_app._write_one() + + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + if not link_url: + link_url = unquote(url) + + assert resp.headers['Link'] == MementoUtils.make_link(link_url, 'original') + assert resp.headers['Memento-Datetime'] != '' + + return resp + + def _test_all_warcs(self, dirname, num): + coll_dir = to_path(self.root_dir + dirname) + assert os.path.isdir(coll_dir) + + files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))] + assert len(files) == num + assert all(x.endswith('.warc.gz') for x in files) + return files, coll_dir + + def _load_resp_req(self, base_path): + warcs = os.listdir(base_path) + assert len(warcs) == 1 + warc = warcs[0] + + stored_resp = None + stored_req = None + + with open(os.path.join(base_path, warc), 'rb') as fh: + for rec in ArchiveIterator(fh)(): + if rec.rec_type == 'response': + stored_resp = rec + elif rec.rec_type == 'request': + stored_req = rec + + assert stored_resp is not None + assert stored_req is not None + return stored_req, stored_resp + + def test_record_warc_1(self): + recorder_app = RecorderApp(self.upstream_url, + PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))) + + resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar') + assert b'HTTP/1.1 200 OK' in resp.body + assert b'"foo": "bar"' in resp.body + + self._test_all_warcs('/warcs/', 1) + + def test_record_warc_2(self): + recorder_app = RecorderApp(self.upstream_url, + PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live') + + resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar') + assert b'HTTP/1.1 200 OK' in resp.body + assert b'"foo": "bar"' in resp.body + + self._test_all_warcs('/warcs/', 2) + + def test_error_url(self): + recorder_app = RecorderApp(self.upstream_url + '01', + PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live') + + + testapp = webtest.TestApp(recorder_app) + resp = testapp.get('/live/resource?url=http://example.com/', status=400) + + assert resp.json['error'] != '' + + self._test_all_warcs('/warcs/', 2) + + def test_record_cookies_header(self): + base_path = to_path(self.root_dir + '/warcs/cookiecheck/') + recorder_app = RecorderApp(self.upstream_url, + PerRecordWARCWriter(base_path), accept_colls='live') + + resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar') + assert b'HTTP/1.1 302' in resp.body + + buff = BytesIO(resp.body) + record = ArcWarcRecordLoader().parse_record_stream(buff) + assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers + assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers + + stored_req, stored_resp = self._load_resp_req(base_path) + + assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers + assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers + + assert ('X-Other', 'foo') in stored_req.status_headers.headers + assert ('Cookie', 'boo=far') in stored_req.status_headers.headers + + def test_record_cookies_skip_header(self): + warc_path = to_path(self.root_dir + '/warcs/cookieskip/') + header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie']) + recorder_app = RecorderApp(self.upstream_url, + PerRecordWARCWriter(warc_path, header_filter=header_filter), + accept_colls='live') + + resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar') + assert b'HTTP/1.1 302' in resp.body + + buff = BytesIO(resp.body) + record = 
ArcWarcRecordLoader().parse_record_stream(buff)
+        assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
+        assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
+
+        stored_req, stored_resp = self._load_resp_req(warc_path)
+
+        assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers
+        assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers
+
+        assert ('X-Other', 'foo') in stored_req.status_headers.headers
+        assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers
+
+    def test_record_skip_wrong_coll(self):
+        recorder_app = RecorderApp(self.upstream_url,
+                        writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        self._test_all_warcs('/warcs/', 2)
+
+    def test_record_param_user_coll(self):
+        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
+
+        dedup_index = self._get_dedup_index()
+
+        recorder_app = RecorderApp(self.upstream_url,
+                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
+
+        self._test_all_warcs('/warcs/', 2)
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        self._test_all_warcs('/warcs/USER/COLL/', 1)
+
+        r = FakeStrictRedis.from_url('redis://localhost/2')
+
+        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
+        assert len(res) == 1
+
+        cdx = CDXObject(res[0])
+        assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
+        assert cdx['mime'] == 'application/json'
+        assert cdx['offset'] == '0'
+        assert cdx['filename'].startswith('USER/COLL/')
+        assert cdx['filename'].endswith('.warc.gz')
+
+        warcs = r.hgetall('USER:COLL:warc')
+        full_path = self.root_dir + '/warcs/' + cdx['filename']
+        assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
+
+    def test_record_param_user_coll_same_dir(self):
+        warc_path = to_path(self.root_dir + '/warcs2/')
+
+        dedup_index = self._get_dedup_index()
+
+        recorder_app = RecorderApp(self.upstream_url,
+                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index, key_template='{user}:{coll}'))
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?foo=bar', '&param.recorder.user=USER2&param.recorder.coll=COLL2')
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?foo=bar', '&param.recorder.user=USER2&param.recorder.coll=COLL3')
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        self._test_all_warcs('/warcs2', 2)
+
+    def test_record_param_user_coll_revisit(self):
+        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
+
+        dedup_index = self._get_dedup_index()
+
+        recorder_app = RecorderApp(self.upstream_url,
+                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
+
+        self._test_all_warcs('/warcs/', 2)
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        self._test_all_warcs('/warcs/USER/COLL/', 2)
+
+        # Test Redis CDX
+        r = FakeStrictRedis.from_url('redis://localhost/2')
+
+        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
+        assert len(res) == 2
+
+        cdx = CDXObject(res[1])
+        assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
+        assert cdx['mime'] == 'warc/revisit'
+        assert cdx['offset'] == '0'
+        assert cdx['filename'].startswith('USER/COLL/')
+        assert cdx['filename'].endswith('.warc.gz')
+
+        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])
+
+        warcs = r.hgetall('USER:COLL:warc')
+        assert len(warcs) == 2
+        assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')
+
+        with open(fullwarc, 'rb') as fh:
+            decomp = DecompressingBufferedReader(fh)
+            # Test refers-to headers
+            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
+            assert status_headers.get_header('WARC-Type') == 'revisit'
+            assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get?foo=bar'
+            assert status_headers.get_header('WARC-Date') != ''
+            assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
+            assert status_headers.get_header('WARC-Refers-To-Date') != ''
+
+    def test_record_param_user_coll_skip(self):
+        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
+
+        dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())
+
+        recorder_app = RecorderApp(self.upstream_url,
+                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
+
+        # No new entries written
+        self._test_all_warcs('/warcs/', 2)
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        self._test_all_warcs('/warcs/USER/COLL/', 2)
+
+        # Test Redis CDX
+        r = FakeStrictRedis.from_url('redis://localhost/2')
+
+        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
+        assert len(res) == 2
+
+    def test_record_param_user_coll_write_dupe_no_revisit(self):
+        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
+
+        dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())
+
+        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
+        recorder_app = RecorderApp(self.upstream_url, writer)
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        self._test_all_warcs('/warcs/USER/COLL/', 3)
+
+        r = FakeStrictRedis.from_url('redis://localhost/2')
+
+        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
+        assert len(res) == 3
+
+        mimes = [CDXObject(x)['mime'] for x in res]
+
+        assert sorted(mimes) == ['application/json', 'application/json', 'warc/revisit']
+
+        assert len(writer.fh_cache) == 0
+
+    # Keep Open
+    def test_record_file_warc_keep_open(self):
+        path = to_path(self.root_dir + '/warcs/A.warc.gz')
+        writer = MultiFileWARCWriter(path)
+        recorder_app = RecorderApp(self.upstream_url, writer)
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        assert os.path.isfile(path)
+        assert len(writer.fh_cache) == 1
+
+    def test_record_multiple_writes_keep_open(self):
+        warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
+
+        rel_path = self.root_dir + '/warcs/'
+
+        dedup_index = self._get_dedup_index(user=False)
+
+        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
+        recorder_app = RecorderApp(self.upstream_url, writer)
+
+        # First Record
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?foo=bar', '&param.recorder.coll=FOO')
+
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        # Second Record
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?boo=far', '&param.recorder.coll=FOO')
+
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"boo": "far"' in resp.body
+
+        self._test_all_warcs('/warcs/FOO/', 1)
+
+        # Check two records in WARC
+        r = FakeStrictRedis.from_url('redis://localhost/2')
+        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
+        assert len(res) == 2
+
+        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
+        fullname = coll_dir + files[0]
+
+        cdxout = BytesIO()
+        with open(fullname, 'rb') as fh:
+            filename = os.path.relpath(fullname, rel_path)
+            write_cdx_index(cdxout, fh, filename,
+                            cdxj=True, append_post=True, sort=True)
+
+        res = [CDXObject(x) for x in res]
+
+        cdxres = cdxout.getvalue().strip()
+        cdxres = cdxres.split(b'\n')
+        cdxres = [CDXObject(x) for x in cdxres]
+
+        assert cdxres == res
+
+        assert len(writer.fh_cache) == 1
+
+        writer.close_key(self.root_dir + '/warcs/FOO/')
+
+        assert len(writer.fh_cache) == 0
+
+        writer.close()
+
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?boo=far', '&param.recorder.coll=FOO')
+
+        self._test_all_warcs('/warcs/FOO/', 2)
+
+        warcs = r.hgetall('FOO:warc')
+        assert len(warcs) == 2
+
+    def test_record_multiple_writes_rollover_idle(self):
+        warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')
+
+        rel_path = self.root_dir + '/warcs/'
+
+        dedup_index = self._get_dedup_index(user=False)
+
+        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
+        recorder_app = RecorderApp(self.upstream_url, writer)
+
+        # First Record
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?foo=bar', '&param.recorder.coll=GOO')
+
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"foo": "bar"' in resp.body
+
+        # Second Record
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?boo=far', '&param.recorder.coll=GOO')
+
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"boo": "far"' in resp.body
+
+        self._test_all_warcs('/warcs/GOO/', 1)
+
+        time.sleep(1.0)
+        writer.close_idle_files()
+
+        # Third Record
+        resp = self._test_warc_write(recorder_app, 'httpbin.org',
+                                     '/get?goo=bar', '&param.recorder.coll=GOO')
+
+        assert b'HTTP/1.1 200 OK' in resp.body
+        assert b'"goo": "bar"' in resp.body
+
+        self._test_all_warcs('/warcs/GOO/', 2)
+
+    def test_warcinfo_record(self):
+        simplewriter = SimpleTempWARCWriter(gzip=False)
+        params = {'software': 'recorder test',
+                  'format': 'WARC File Format 1.0',
+                  'json-metadata': json.dumps({'foo': 'bar'})}
+
+        record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
+        simplewriter.write_record(record)
+        buff = simplewriter.get_buffer()
+        assert isinstance(buff, bytes)
+
+        buff = BytesIO(buff)
+        parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)
+
+        assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
+        assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
+        assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
+
+        buff = parsed_record.stream.read().decode('utf-8')
+
+        length = parsed_record.rec_headers.get_header('Content-Length')
+
+        assert len(buff) == int(length)
+
+        assert 'json-metadata: {"foo": "bar"}\r\n' in buff
+        assert 'format: WARC File Format 1.0\r\n' in buff
+
+    def test_record_custom_record(self):
+        dedup_index = self._get_dedup_index(user=False)
+
+        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
+
+        recorder_app = RecorderApp(self.upstream_url,
+                        MultiFileWARCWriter(warc_path, dedup_index=dedup_index))
+
+        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'
+
+        buff = b'Some Data'
+
+        testapp = webtest.TestApp(recorder_app)
+        headers = {'content-type': 'text/plain',
+                   'WARC-Custom': 'foo'
+                  }
+
+        resp = testapp.put(req_url, headers=headers, params=buff)
+
+        assert resp.json['success'] == 'true'
+        assert resp.json['WARC-Date'] != ''
+
+        self._test_all_warcs('/warcs/meta', 1)
+
+        r = FakeStrictRedis.from_url('redis://localhost/2')
+
+        warcs = r.hgetall('META:warc')
+        assert len(warcs) == 1
+
+        with open(warcs[b'meta/meta.warc.gz'], 'rb') as fh:
+            decomp = DecompressingBufferedReader(fh)
+            record = ArcWarcRecordLoader().parse_record_stream(decomp)
+
+        status_headers = record.rec_headers
+        assert len(record.rec_headers.headers) == 9
+        assert status_headers.get_header('WARC-Type') == 'resource'
+        assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
+        assert status_headers.get_header('WARC-Record-ID') != ''
+        assert status_headers.get_header('WARC-Date') != ''
+        assert status_headers.get_header('WARC-Block-Digest') != ''
+        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
+        assert status_headers.get_header('Content-Type') == 'text/plain'
+        assert status_headers.get_header('Content-Length') == str(len(buff))
+        assert status_headers.get_header('WARC-Custom') == 'foo'
+
+        assert record.stream.read() == buff
+
+        status_headers = record.status_headers
+        assert len(record.status_headers.headers) == 2
+
+        assert status_headers.get_header('Content-Type') == 'text/plain'
+        assert status_headers.get_header('Content-Length') == str(len(buff))
+
+    def test_record_video_metadata(self):
+        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
+
+        dedup_index = self._get_dedup_index()
+
+        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
+        recorder_app = RecorderApp(self.upstream_url, writer)
+
+        params = {'param.recorder.user': 'USER',
+                  'param.recorder.coll': 'VIDEO',
+                  'content_type': 'application/vnd.youtube-dl_formats+json'
+                 }
+
+        resp = self._test_warc_write(recorder_app,
+            'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
+            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')
+
+        r = FakeStrictRedis.from_url('redis://localhost/2')
+
+        warcs = r.hgetall('USER:VIDEO:warc')
+        assert len(warcs) == 1
+
+        filename = list(warcs.values())[0]
+
+        with open(filename, 'rb') as fh:
+            decomp = DecompressingBufferedReader(fh)
+            record = ArcWarcRecordLoader().parse_record_stream(decomp)
+
+        status_headers = record.rec_headers
+        assert status_headers.get_header('WARC-Type') == 'metadata'
+        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
+        assert status_headers.get_header('WARC-Block-Digest') != ''
+        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
+
diff --git a/recorder/warcwriter.py b/recorder/warcwriter.py
new file mode 100644
index 00000000..b125da5e
--- /dev/null
+++ b/recorder/warcwriter.py
@@ -0,0 +1,553 @@
+import tempfile
+import uuid
+import base64
+import hashlib
+import datetime
+import zlib
+import sys
+import os
+import six
+import shutil
+
+import traceback
+
+from collections import 
OrderedDict + +from socket import gethostname +from io import BytesIO + +import fcntl + +from pywb.utils.loaders import LimitReader, to_native_str +from pywb.utils.bufferedreaders import BufferedReader +from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date + +from pywb.utils.statusandheaders import StatusAndHeadersParser +from pywb.warc.recordloader import ArcWarcRecord +from pywb.warc.recordloader import ArcWarcRecordLoader + +from requests.structures import CaseInsensitiveDict +from webagg.utils import ParamFormatter, res_template + +from recorder.filters import ExcludeNone + + +# ============================================================================ +class BaseWARCWriter(object): + WARC_RECORDS = {'warcinfo': 'application/warc-fields', + 'response': 'application/http; msgtype=response', + 'revisit': 'application/http; msgtype=response', + 'request': 'application/http; msgtype=request', + 'metadata': 'application/warc-fields', + } + + REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest' + + BUFF_SIZE = 8192 + + FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz' + + def __init__(self, gzip=True, dedup_index=None, name='recorder', + header_filter=ExcludeNone(), *args, **kwargs): + self.gzip = gzip + self.dedup_index = dedup_index + self.rec_source_name = name + self.header_filter = header_filter + self.hostname = gethostname() + + self.parser = StatusAndHeadersParser([], verify=False) + + def ensure_digest(self, record): + block_digest = record.rec_headers.get('WARC-Block-Digest') + payload_digest = record.rec_headers.get('WARC-Payload-Digest') + if block_digest and payload_digest: + return + + block_digester = self._create_digester() + payload_digester = self._create_digester() + + pos = record.stream.tell() + + if record.status_headers and hasattr(record.status_headers, 'headers_buff'): + block_digester.update(record.status_headers.headers_buff) + + while True: + buf = record.stream.read(self.BUFF_SIZE) + if not buf: + break + + block_digester.update(buf) + payload_digester.update(buf) + + record.stream.seek(pos) + record.rec_headers['WARC-Block-Digest'] = str(block_digester) + record.rec_headers['WARC-Payload-Digest'] = str(payload_digester) + + def _create_digester(self): + return Digester('sha1') + + def _set_header_buff(self, record): + exclude_list = self.header_filter(record) + buff = record.status_headers.to_bytes(exclude_list) + record.status_headers.headers_buff = buff + + def write_req_resp(self, req, resp, params): + url = resp.rec_headers.get('WARC-Target-URI') + dt = resp.rec_headers.get('WARC-Date') + + #req.rec_headers['Content-Type'] = req.content_type + req.rec_headers['WARC-Target-URI'] = url + req.rec_headers['WARC-Date'] = dt + + resp_id = resp.rec_headers.get('WARC-Record-ID') + if resp_id: + req.rec_headers['WARC-Concurrent-To'] = resp_id + + resp = self._check_revisit(resp, params) + if not resp: + print('Skipping due to dedup') + return + + params['_formatter'] = ParamFormatter(params, name=self.rec_source_name) + self._do_write_req_resp(req, resp, params) + + def create_req_record(self, req_headers, payload): + len_ = payload.tell() + payload.seek(0) + + warc_headers = req_headers + warc_headers['WARC-Type'] = 'request' + if not warc_headers.get('WARC-Record-ID'): + warc_headers['WARC-Record-ID'] = self._make_warc_id() + + status_headers = self.parser.parse(payload) + + record = ArcWarcRecord('warc', 'request', warc_headers, payload, + status_headers, '', len_) + + self._set_header_buff(record) 
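# ============================================================================
# Illustrative sketch (not part of the patch): the header_filter hook consumed
# by _set_header_buff() above returns a list of header names to exclude, and
# StatusAndHeaders.to_bytes(exclude_list) serializes only the remaining
# headers into record.status_headers.headers_buff, so excluded headers never
# reach the WARC file. Assuming only the classes defined in this diff:
#
#     from recorder.filters import ExcludeSpecificHeaders
#     from recorder.warcwriter import MultiFileWARCWriter
#
#     writer = MultiFileWARCWriter('/tmp/warcs/',
#                                  header_filter=ExcludeSpecificHeaders(['Set-Cookie', 'Cookie']))
#     # records written via writer.write_req_resp() omit the cookie headers
#     # (exercised by test_record_cookies_skip_header in test_recorder.py)
# ============================================================================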
+ + return record + + def read_resp_record(self, resp_headers, payload): + len_ = payload.tell() + payload.seek(0) + + warc_headers = self.parser.parse(payload) + warc_headers = CaseInsensitiveDict(warc_headers.headers) + + record_type = warc_headers.get('WARC-Type', 'response') + + if record_type == 'response': + status_headers = self.parser.parse(payload) + else: + status_headers = None + + record = ArcWarcRecord('warc', record_type, warc_headers, payload, + status_headers, '', len_) + + if record_type == 'response': + self._set_header_buff(record) + + self.ensure_digest(record) + + return record_type, record + + def create_warcinfo_record(self, filename, info): + warc_headers = {} + warc_headers['WARC-Record-ID'] = self._make_warc_id() + warc_headers['WARC-Type'] = 'warcinfo' + if filename: + warc_headers['WARC-Filename'] = filename + warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow()) + + warcinfo = BytesIO() + for n, v in six.iteritems(info): + self._header(warcinfo, n, v) + + warcinfo.seek(0) + + record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo, + None, '', len(warcinfo.getvalue())) + + return record + + def create_custom_record(self, uri, payload, record_type, content_type, + warc_headers=None): + len_ = payload.tell() + payload.seek(0) + + warc_headers = warc_headers or {} + warc_headers['WARC-Record-ID'] = self._make_warc_id() + warc_headers['WARC-Type'] = record_type + warc_headers['WARC-Target-URI'] = uri + + if 'WARC-Date' not in warc_headers: + warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow()) + + record = ArcWarcRecord('warc', record_type, warc_headers, payload, + None, content_type, len_) + + self.ensure_digest(record) + + return record + + def _check_revisit(self, record, params): + if not self.dedup_index: + return record + + try: + url = record.rec_headers.get('WARC-Target-URI') + digest = record.rec_headers.get('WARC-Payload-Digest') + iso_dt = record.rec_headers.get('WARC-Date') + result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt) + except Exception as e: + traceback.print_exc() + result = None + + if result == 'skip': + return None + + if isinstance(result, tuple) and result[0] == 'revisit': + record.rec_headers['WARC-Type'] = 'revisit' + record.rec_headers['WARC-Profile'] = self.REVISIT_PROFILE + + record.rec_headers['WARC-Refers-To-Target-URI'] = result[1] + record.rec_headers['WARC-Refers-To-Date'] = result[2] + + return record + + def _write_warc_record(self, out, record): + if self.gzip: + out = GzippingWrapper(out) + + self._line(out, b'WARC/1.0') + + for n, v in six.iteritems(record.rec_headers): + if n.lower() in ('content-length', 'content-type'): + continue + + self._header(out, n, v) + + content_type = record.rec_headers.get('Content-Type') + + if not content_type: + content_type = record.content_type + + if not content_type: + content_type = self.WARC_RECORDS.get(record.rec_headers['WARC-Type']) + + if content_type: + self._header(out, 'Content-Type', content_type) + + if record.rec_headers['WARC-Type'] == 'revisit': + http_headers_only = True + else: + http_headers_only = False + + if record.length: + actual_len = 0 + if record.status_headers: + actual_len = len(record.status_headers.headers_buff) + + if not http_headers_only: + diff = record.stream.tell() - actual_len + actual_len = record.length - diff + + self._header(out, 'Content-Length', str(actual_len)) + + # add empty line + self._line(out, b'') + + # write headers buffer, if any + if record.status_headers: + 
out.write(record.status_headers.headers_buff)
+
+            if not http_headers_only:
+                out.write(record.stream.read())
+
+            # add two lines
+            self._line(out, b'\r\n')
+        else:
+            # add three lines (1 for end of header, 2 for end of record)
+            self._line(out, b'Content-Length: 0\r\n\r\n')
+
+        out.flush()
+
+    def _header(self, out, name, value):
+        if not value:
+            return
+
+        self._line(out, (name + ': ' + str(value)).encode('latin-1'))
+
+    def _line(self, out, line):
+        out.write(line + b'\r\n')
+
+    @staticmethod
+    def _make_warc_id(id_=None):
+        if not id_:
+            id_ = uuid.uuid1()
+        return '<urn:uuid:{0}>'.format(id_)
+
+
+# ============================================================================
+class GzippingWrapper(object):
+    def __init__(self, out):
+        self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+        self.out = out
+
+    def write(self, buff):
+        #if isinstance(buff, str):
+        #    buff = buff.encode('utf-8')
+        buff = self.compressor.compress(buff)
+        self.out.write(buff)
+
+    def flush(self):
+        buff = self.compressor.flush()
+        self.out.write(buff)
+        self.out.flush()
+
+
+# ============================================================================
+class Digester(object):
+    def __init__(self, type_='sha1'):
+        self.type_ = type_
+        self.digester = hashlib.new(type_)
+
+    def update(self, buff):
+        self.digester.update(buff)
+
+    def __str__(self):
+        return self.type_ + ':' + to_native_str(base64.b32encode(self.digester.digest()))
+
+
+# ============================================================================
+class MultiFileWARCWriter(BaseWARCWriter):
+    def __init__(self, dir_template, filename_template=None, max_size=0,
+                 max_idle_secs=1800, *args, **kwargs):
+        super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
+
+        if not filename_template:
+            dir_template, filename_template = os.path.split(dir_template)
+            dir_template += os.path.sep
+
+        if not filename_template:
+            filename_template = self.FILE_TEMPLATE
+
+        self.dir_template = dir_template
+        self.key_template = kwargs.get('key_template', self.dir_template)
+        self.filename_template = filename_template
+        self.max_size = max_size
+        if max_idle_secs > 0:
+            self.max_idle_time = datetime.timedelta(seconds=max_idle_secs)
+        else:
+            self.max_idle_time = None
+
+        self.fh_cache = {}
+
+    def get_new_filename(self, dir_, params):
+        timestamp = timestamp20_now()
+
+        randstr = base64.b32encode(os.urandom(5)).decode('utf-8')
+
+        filename = dir_ + res_template(self.filename_template, params,
+                                       hostname=self.hostname,
+                                       timestamp=timestamp,
+                                       random=randstr)
+
+        return filename
+
+    def allow_new_file(self, filename, params):
+        return True
+
+    def _open_file(self, filename, params):
+        path, name = os.path.split(filename)
+
+        try:
+            os.makedirs(path)
+        except:
+            pass
+
+        fh = open(filename, 'a+b')
+
+        if self.dedup_index:
+            self.dedup_index.add_warc_file(filename, params)
+
+        return fh
+
+    def _close_file(self, fh):
+        try:
+            fcntl.flock(fh, fcntl.LOCK_UN)
+            fh.close()
+        except Exception as e:
+            print(e)
+
+    def get_dir_key(self, params):
+        return res_template(self.key_template, params)
+
+    def close_key(self, dir_key):
+        if isinstance(dir_key, dict):
+            dir_key = self.get_dir_key(dir_key)
+
+        result = self.fh_cache.pop(dir_key, None)
+        if not result:
+            return
+
+        out, filename = result
+        self._close_file(out)
+        return filename
+
+    def close_file(self, match_filename):
+        for dir_key, out, filename in self.iter_open_files():
+            if filename == match_filename:
+                return self.close_key(dir_key)
+
+    def _is_write_resp(self, resp, params):
+        return True
+
+    def _is_write_req(self, req, params):
+        return True
+
+    def write_record(self, record, params=None):
+        params = params or {}
+        params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
+        self._do_write_req_resp(None, record, params)
+
+    def _do_write_req_resp(self, req, resp, params):
+        def write_callback(out, filename):
+            url = resp.rec_headers.get('WARC-Target-URI')
+            print('Writing req/resp {0} to {1} '.format(url, filename))
+
+            if resp and self._is_write_resp(resp, params):
+                self._write_warc_record(out, resp)
+
+            if req and self._is_write_req(req, params):
+                self._write_warc_record(out, req)
+
+        return self._write_to_file(params, write_callback)
+
+    def write_stream_to_file(self, params, stream):
+        def write_callback(out, filename):
+            print('Writing stream to {0}'.format(filename))
+            shutil.copyfileobj(stream, out)
+
+        return self._write_to_file(params, write_callback)
+
+    def _write_to_file(self, params, write_callback):
+        full_dir = res_template(self.dir_template, params)
+        dir_key = self.get_dir_key(params)
+
+        result = self.fh_cache.get(dir_key)
+
+        close_file = False
+
+        if result:
+            out, filename = result
+            is_new = False
+        else:
+            filename = self.get_new_filename(full_dir, params)
+
+            if not self.allow_new_file(filename, params):
+                return False
+
+            out = self._open_file(filename, params)
+
+            is_new = True
+
+        try:
+            start = out.tell()
+            # ensure new_size is bound even if write_callback() raises
+            new_size = start
+
+            write_callback(out, filename)
+
+            out.flush()
+
+            new_size = out.tell()
+
+            out.seek(start)
+
+            if self.dedup_index:
+                self.dedup_index.add_urls_to_index(out, params,
+                                                   filename,
+                                                   new_size - start)
+
+            return True
+
+        except Exception as e:
+            traceback.print_exc()
+            close_file = True
+            return False
+
+        finally:
+            # check for rollover
+            if self.max_size and new_size > self.max_size:
+                close_file = True
+
+            if close_file:
+                self._close_file(out)
+                if not is_new:
+                    self.fh_cache.pop(dir_key, None)
+
+            elif is_new:
+                fcntl.flock(out, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                self.fh_cache[dir_key] = (out, filename)
+
+    def iter_open_files(self):
+        for n, v in list(self.fh_cache.items()):
+            out, filename = v
+            yield n, out, filename
+
+    def close(self):
+        for dir_key, out, filename in self.iter_open_files():
+            self._close_file(out)
+
+        self.fh_cache = {}
+
+    def close_idle_files(self):
+        if not self.max_idle_time:
+            return
+
+        now = datetime.datetime.now()
+
+        for dir_key, out, filename in self.iter_open_files():
+            try:
+                mtime = os.path.getmtime(filename)
+            except:
+                self.close_key(dir_key)
+                return
+
+            mtime = datetime.datetime.fromtimestamp(mtime)
+
+            if (now - mtime) > self.max_idle_time:
+                print('Closing idle ' + filename)
+                self.close_key(dir_key)
+
+
+# ============================================================================
+class PerRecordWARCWriter(MultiFileWARCWriter):
+    def __init__(self, *args, **kwargs):
+        kwargs['max_size'] = 1
+        super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
+
+
+# ============================================================================
+class SimpleTempWARCWriter(BaseWARCWriter):
+    def __init__(self, *args, **kwargs):
+        super(SimpleTempWARCWriter, self).__init__(*args, **kwargs)
+        self.out = self._create_buffer()
+
+    def _create_buffer(self):
+        return tempfile.SpooledTemporaryFile(max_size=512*1024)
+
+    def _do_write_req_resp(self, req, resp, params):
+        self._write_warc_record(self.out, resp)
+        self._write_warc_record(self.out, req)
+
+    def write_record(self, record, params=None):
+        self._write_warc_record(self.out, record)
+
+    def get_buffer(self):
+        pos = self.out.tell()
+        self.out.seek(0)
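# ============================================================================
# Usage sketch (illustrative, not part of the patch): get_buffer() saves the
# current write position, reads the whole spooled buffer from the start, then
# restores the position so later writes still append. Typical in-memory use,
# mirroring test_warcinfo_record in test_recorder.py:
#
#     writer = SimpleTempWARCWriter(gzip=False)
#     record = writer.create_warcinfo_record('testfile.warc.gz',
#                                            {'software': 'recorder test'})
#     writer.write_record(record)
#     raw = writer.get_buffer()   # bytes of the serialized warcinfo record
# ============================================================================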
+ buff = self.out.read() + self.out.seek(pos) + return buff diff --git a/testdata/dupes.cdxj b/testdata/dupes.cdxj new file mode 100644 index 00000000..6d42a7b1 --- /dev/null +++ b/testdata/dupes.cdxj @@ -0,0 +1,12 @@ +com,example)/ 20140127171200 {"url": "http://example.com", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1046", "offset": "334", "filename": "dupes.warc.gz"} +com,example)/ 20140127171251 {"url": "http://example.com", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "11875", "filename": "dupes.warc.gz"} +org,iana)/ 20140127171238 {"url": "http://iana.org", "mime": "unk", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "343", "offset": "1858", "filename": "dupes.warc.gz"} +org,iana)/ 20140127171238 {"url": "http://www.iana.org/", "mime": "warc/revisit", "digest": "OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB", "length": "536", "offset": "2678", "filename": "dupes.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140127171240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "556", "offset": "10826", "filename": "dupes.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "540", "offset": "9793", "filename": "dupes.warc.gz"} +org,iana)/_css/2013.1/print.css 20140127171239 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "6684", "filename": "dupes.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140127171239 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "4630", "filename": "dupes.warc.gz"} +org,iana)/_img/2013.1/iana-logo-homepage.png 20140127171240 {"url": "http://www.iana.org/_img/2013.1/iana-logo-homepage.png", "mime": "warc/revisit", "digest": "GCW2GM3SIMHEIQYZX25MLSRYVWUCZ7OK", "length": "549", "offset": "8750", "filename": "dupes.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140127171239 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "549", "offset": "7709", "filename": "dupes.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140127171239 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "3696", "filename": "dupes.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140127171239 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "547", "offset": "5658", "filename": "dupes.warc.gz"} diff --git a/testdata/dupes.warc.gz b/testdata/dupes.warc.gz new file mode 100644 index 00000000..48e6b6fd Binary files /dev/null and b/testdata/dupes.warc.gz differ diff --git a/testdata/example-url-agnostic-orig.warc.gz b/testdata/example-url-agnostic-orig.warc.gz new file mode 100644 index 00000000..98700373 Binary files /dev/null and b/testdata/example-url-agnostic-orig.warc.gz differ diff --git a/testdata/example-url-agnostic-revisit.warc.gz b/testdata/example-url-agnostic-revisit.warc.gz new file mode 100644 index 
00000000..3770ed0a Binary files /dev/null and b/testdata/example-url-agnostic-revisit.warc.gz differ diff --git a/testdata/example.cdxj b/testdata/example.cdxj new file mode 100644 index 00000000..72f092f5 --- /dev/null +++ b/testdata/example.cdxj @@ -0,0 +1 @@ +com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example.warc.gz"} diff --git a/testdata/example.warc.gz b/testdata/example.warc.gz new file mode 100644 index 00000000..143b947d Binary files /dev/null and b/testdata/example.warc.gz differ diff --git a/testdata/iana.cdxj b/testdata/iana.cdxj new file mode 100644 index 00000000..aadc54c0 --- /dev/null +++ b/testdata/iana.cdxj @@ -0,0 +1,171 @@ +org,iana)/ 20140126200624 {"url": "http://www.iana.org/", "mime": "text/html", "status": "200", "digest": "OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB", "length": "2258", "offset": "334", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "application/octet-stream", "status": "200", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "34054", "offset": "620049", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "546", "offset": "667073", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "534", "offset": "697255", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "547", "offset": "714833", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "551", "offset": "768625", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "application/octet-stream", "status": "200", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "117166", "offset": "198285", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "548", "offset": "482544", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "495230", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "536", "offset": "566542", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200738 {"url": 
"http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "578743", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "535", "offset": "593400", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "554", "offset": "608401", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "550", "offset": "654593", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "553", "offset": "670224", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "699343", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "712719", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "554", "offset": "731718", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "745730", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "757988", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "771773", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "783712", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200626 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "application/octet-stream", "status": "200", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "114499", "offset": "83293", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": 
"warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "550", "offset": "446529", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "493141", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "554", "offset": "567576", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200738 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "555", "offset": "580835", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "595503", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "554", "offset": "609468", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "655640", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "669172", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "698287", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "711664", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "730663", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "537", "offset": "743642", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "552", "offset": "755896", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": 
"GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "769676", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "784758", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "application/octet-stream", "status": "200", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "116641", "offset": "329393", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "494192", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "565504", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200738 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "539", "offset": "579795", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "555", "offset": "592333", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "607332", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "656690", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "554", "offset": "668113", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "700397", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "555", "offset": "713774", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "732779", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": 
"warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "744686", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "537", "offset": "756949", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "539", "offset": "770730", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "554", "offset": "782657", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200625 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "text/css", "status": "200", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "4662", "offset": "50482", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200653 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "534", "offset": "326315", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200706 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "534", "offset": "487982", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200716 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "561375", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200737 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "536", "offset": "574583", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200804 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "538", "offset": "588168", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200816 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "602081", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200825 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "613943", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200912 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "536", "offset": "662904", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126200929 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "693076", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126201054 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "526", "offset": "707519", "filename": "iana.warc.gz"} 
+org,iana)/_css/2013.1/print.css 20140126201127 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "525", "offset": "726489", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126201227 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "527", "offset": "738432", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126201239 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "526", "offset": "750710", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126201248 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "763424", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/print.css 20140126201307 {"url": "https://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "539", "offset": "777477", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200625 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "text/css", "status": "200", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "8754", "offset": "41238", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200653 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "533", "offset": "328367", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200706 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "539", "offset": "489005", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200716 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "542", "offset": "563417", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200737 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "528", "offset": "572623", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200804 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "527", "offset": "589212", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200816 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "528", "offset": "603125", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200825 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "527", "offset": "614971", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200912 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "531", "offset": "661876", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126200929 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "538", "offset": "691096", 
"filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126201054 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "706476", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126201127 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "725445", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126201227 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "739461", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126201239 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "751731", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126201248 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "764454", "filename": "iana.warc.gz"} +org,iana)/_css/2013.1/screen.css 20140126201307 {"url": "https://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "537", "offset": "779533", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "image/svg+xml", "status": "200", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "9739", "offset": "447577", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200706 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "553", "offset": "491049", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200718 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "551", "offset": "564454", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200737 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "550", "offset": "576643", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200805 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "591269", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "605204", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200826 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "617954", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200912 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "553", "offset": "664967", "filename": "iana.warc.gz"} 
+org,iana)/_img/2013.1/iana-logo-header.svg 20140126200929 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "550", "offset": "695150", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201054 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "548", "offset": "709577", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "728551", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201228 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "548", "offset": "741538", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201239 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "549", "offset": "753801", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201249 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "551", "offset": "766525", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201307 {"url": "https://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "780562", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 {"url": "http://www.iana.org/_img/2013.1/iana-logo-homepage.png", "mime": "image/png", "status": "200", "digest": "GCW2GM3SIMHEIQYZX25MLSRYVWUCZ7OK", "length": "27163", "offset": "55631", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200625 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "image/svg+xml", "status": "200", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "2809", "offset": "4009", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "457816", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200706 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "545", "offset": "492101", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200719 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "568628", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200738 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "577695", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200805 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "594444", 
"filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200816 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "606272", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200826 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "545", "offset": "619007", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200912 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "666025", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126200930 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "696207", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126201055 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "529", "offset": "710633", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126201128 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "549", "offset": "729609", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126201228 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "544", "offset": "742593", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126201240 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "754853", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126201249 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "544", "offset": "767580", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/icann-logo.svg 20140126201308 {"url": "https://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "781613", "filename": "iana.warc.gz"} +org,iana)/_img/2013.1/rir-map.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/rir-map.svg", "mime": "image/svg+xml", "status": "200", "digest": "C4LTM7ATRZYZL3W2UCEEX6A26L6PIT4K", "length": "23189", "offset": "458860", "filename": "iana.warc.gz"} +org,iana)/_img/bookmark_icon.ico 20140126200631 {"url": "http://www.iana.org/_img/bookmark_icon.ico", "mime": "application/octet-stream", "status": "200", "digest": "PG3PAWWE72JQ37CXJSPCJNNF7QI3SNX7", "length": "4968", "offset": "315944", "filename": "iana.warc.gz"} +org,iana)/_img/bookmark_icon.ico 20140126201310 {"url": "https://www.iana.org/_img/bookmark_icon.ico", "mime": "warc/revisit", "digest": "PG3PAWWE72JQ37CXJSPCJNNF7QI3SNX7", "length": "548", "offset": "785806", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200625 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "3074", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 
20140126200653 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "325380", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200706 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "487044", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200716 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "560436", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200737 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "573645", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200804 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "460", "offset": "587215", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200816 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "459", "offset": "601126", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200825 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "615991", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200912 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "660937", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126200929 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "692132", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126201054 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "705534", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126201127 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "724500", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126201227 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "737484", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126201239 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "749770", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126201248 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", 
"offset": "762480", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/iana.js 20140126201307 {"url": "https://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "453", "offset": "776543", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200625 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "application/x-javascript", "status": "200", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "33449", "offset": "7311", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200653 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "542", "offset": "327341", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200706 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "529", "offset": "490037", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200716 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "529", "offset": "562402", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200737 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "575613", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200804 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "530", "offset": "590244", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200816 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "604162", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200825 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "616929", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200912 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "663936", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126200929 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "546", "offset": "694112", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126201054 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "708544", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126201127 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "545", "offset": "727515", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126201227 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "740505", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126201239 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", 
"length": "545", "offset": "752769", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126201248 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "765491", "filename": "iana.warc.gz"} +org,iana)/_js/2013.1/jquery.js 20140126201307 {"url": "https://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "778507", "filename": "iana.warc.gz"} +org,iana)/about 20140126200706 {"url": "http://www.iana.org/about", "mime": "text/html", "status": "200", "digest": "6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3", "length": "2962", "offset": "483588", "filename": "iana.warc.gz"} +org,iana)/about/performance/ietf-draft-status 20140126200815 {"url": "http://www.iana.org/about/performance/ietf-draft-status", "mime": "text/html", "status": "302", "digest": "Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ", "length": "584", "offset": "596566", "filename": "iana.warc.gz"} +org,iana)/about/performance/ietf-statistics 20140126200804 {"url": "http://www.iana.org/about/performance/ietf-statistics", "mime": "text/html", "status": "302", "digest": "HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD", "length": "582", "offset": "581890", "filename": "iana.warc.gz"} +org,iana)/dnssec 20140126201306 {"url": "http://www.iana.org/dnssec", "mime": "text/html", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "442", "offset": "772827", "filename": "iana.warc.gz"} +org,iana)/dnssec 20140126201307 {"url": "https://www.iana.org/dnssec", "mime": "text/html", "status": "200", "digest": "PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI", "length": "2278", "offset": "773766", "filename": "iana.warc.gz"} +org,iana)/domains 20140126200825 {"url": "http://www.iana.org/domains", "mime": "text/html", "status": "200", "digest": "7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7", "length": "2912", "offset": "610534", "filename": "iana.warc.gz"} +org,iana)/domains/arpa 20140126201248 {"url": "http://www.iana.org/domains/arpa", "mime": "text/html", "status": "200", "digest": "QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT", "length": "2939", "offset": "759039", "filename": "iana.warc.gz"} +org,iana)/domains/idn-tables 20140126201127 {"url": "http://www.iana.org/domains/idn-tables", "mime": "text/html", "status": "200", "digest": "HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW", "length": "8118", "offset": "715878", "filename": "iana.warc.gz"} +org,iana)/domains/int 20140126201239 {"url": "http://www.iana.org/domains/int", "mime": "text/html", "status": "200", "digest": "X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q", "length": "2482", "offset": "746788", "filename": "iana.warc.gz"} +org,iana)/domains/reserved 20140126201054 {"url": "http://www.iana.org/domains/reserved", "mime": "text/html", "status": "200", "digest": "R5AAEQX5XY5X5DG66B23ODN5DUBWRA27", "length": "3573", "offset": "701457", "filename": "iana.warc.gz"} +org,iana)/domains/root 20140126200912 {"url": "http://www.iana.org/domains/root", "mime": "text/html", "status": "200", "digest": "YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX", "length": "2691", "offset": "657746", "filename": "iana.warc.gz"} +org,iana)/domains/root/db 20140126200927 {"url": "http://www.iana.org/domains/root/db/", "mime": "text/html", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "446", "offset": "671278", "filename": "iana.warc.gz"} +org,iana)/domains/root/db 20140126200928 {"url": "http://www.iana.org/domains/root/db", "mime": "text/html", "status": "200", "digest": 
"DHXA725IW5VJJFRTWBQT6BEZKRE7H57S", "length": "18365", "offset": "672225", "filename": "iana.warc.gz"} +org,iana)/domains/root/servers 20140126201227 {"url": "http://www.iana.org/domains/root/servers", "mime": "text/html", "status": "200", "digest": "AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU", "length": "3137", "offset": "733840", "filename": "iana.warc.gz"} +org,iana)/numbers 20140126200651 {"url": "http://www.iana.org/numbers", "mime": "text/html", "status": "200", "digest": "HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK", "length": "3498", "offset": "321385", "filename": "iana.warc.gz"} +org,iana)/performance/ietf-draft-status 20140126200815 {"url": "http://www.iana.org/performance/ietf-draft-status", "mime": "text/html", "status": "200", "digest": "T5IQTX6DWV5KABGH454CYEDWKRI5Y23E", "length": "2940", "offset": "597667", "filename": "iana.warc.gz"} +org,iana)/performance/ietf-statistics 20140126200804 {"url": "http://www.iana.org/performance/ietf-statistics", "mime": "text/html", "status": "200", "digest": "XOFML5WNBQMTSULLIIPLSP6U5MX33HN6", "length": "3712", "offset": "582987", "filename": "iana.warc.gz"} +org,iana)/protocols 20140126200715 {"url": "http://www.iana.org/protocols", "mime": "text/html", "status": "200", "digest": "IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT", "length": "63663", "offset": "496277", "filename": "iana.warc.gz"} +org,iana)/time-zones 20140126200737 {"url": "http://www.iana.org/time-zones", "mime": "text/html", "status": "200", "digest": "4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R", "length": "2449", "offset": "569675", "filename": "iana.warc.gz"} diff --git a/testdata/iana.warc.gz b/testdata/iana.warc.gz new file mode 100644 index 00000000..3a88a71a Binary files /dev/null and b/testdata/iana.warc.gz differ diff --git a/testdata/post-test.cdxj b/testdata/post-test.cdxj new file mode 100644 index 00000000..5856b8b1 --- /dev/null +++ b/testdata/post-test.cdxj @@ -0,0 +1,3 @@ +org,httpbin)/post?foo=bar&test=abc 20140610000859 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M532K5WS4GY2H4OVZO6HRPOP47A7KDWU", "length": "720", "offset": "0", "filename": "post-test.warc.gz"} +org,httpbin)/post?a=1&b=[]&c=3 20140610001151 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2", "length": "723", "offset": "1196", "filename": "post-test.warc.gz"} +org,httpbin)/post?data=^&foo=bar 20140610001255 {"url": "http://httpbin.org/post?foo=bar", "mime": "application/json", "status": "200", "digest": "B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ", "length": "723", "offset": "2395", "filename": "post-test.warc.gz"} diff --git a/testdata/post-test.warc.gz b/testdata/post-test.warc.gz new file mode 100644 index 00000000..b9cc1f48 Binary files /dev/null and b/testdata/post-test.warc.gz differ diff --git a/testdata/url-agnost-example.cdxj b/testdata/url-agnost-example.cdxj new file mode 100644 index 00000000..6eebd255 --- /dev/null +++ b/testdata/url-agnost-example.cdxj @@ -0,0 +1,2 @@ +com,example)/ 20130729195151 {"url": "http://test@example.com/", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "591", "offset": "355", "filename": "example-url-agnostic-revisit.warc.gz"} +org,iana,example)/ 20130702195402 {"url": "http://example.iana.org/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1001", "offset": "353", "filename": "example-url-agnostic-orig.warc.gz"} diff --git a/urlrewrite/__init__.py b/urlrewrite/__init__.py new file mode 100644 index 
diff --git a/urlrewrite/__init__.py b/urlrewrite/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/urlrewrite/cookies.py b/urlrewrite/cookies.py
new file mode 100644
index 00000000..9823da47
--- /dev/null
+++ b/urlrewrite/cookies.py
@@ -0,0 +1,153 @@
+from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter
+from pywb.utils.timeutils import datetime_to_http_date
+from six.moves.http_cookiejar import CookieJar, DefaultCookiePolicy
+from six.moves import zip
+
+import redis
+
+import tldextract
+import time
+import datetime
+import six
+
+
+# =============================================================================
+class CookieTracker(object):
+    def __init__(self, redis, expire_time=120):
+        self.redis = redis
+        self.expire_time = expire_time
+
+    def get_rewriter(self, url_rewriter, cookie_key):
+        return DomainCacheCookieRewriter(url_rewriter, self, cookie_key)
+
+    def get_cookie_headers(self, url, cookie_key):
+        subds = self.get_subdomains(url)
+
+        if not subds:
+            return None, None
+
+        with redis.utils.pipeline(self.redis) as pi:
+            for domain in subds:
+                pi.hgetall(cookie_key + '.' + domain)
+
+            all_res = pi.execute()
+
+        cookies = []
+        set_cookies = []
+
+        with redis.utils.pipeline(self.redis) as pi:
+            for res, domain in zip(all_res, subds):
+                if not res:
+                    continue
+
+                for n, v in six.iteritems(res):
+                    n = n.decode('utf-8')
+                    v = v.decode('utf-8')
+                    full = n + '=' + v
+                    cookies.append(full.split(';')[0])
+                    set_cookies.append(('Set-Cookie', full + '; Max-Age=' + str(self.expire_time)))
+
+                pi.expire(cookie_key + '.' + domain, self.expire_time)
+
+        cookies = ';'.join(cookies)
+        return cookies, set_cookies
+
+    def add_cookie(self, cookie_key, domain, name, value):
+        if domain[0] != '.':
+            domain = '.' + domain
+
+        with redis.utils.pipeline(self.redis) as pi:
+            pi.hset(cookie_key + domain, name, value)
+            pi.expire(cookie_key + domain, self.expire_time)
+
+    @staticmethod
+    def get_subdomains(url):
+        tld = tldextract.extract(url)
+
+        if not tld.subdomain:
+            return None
+
+        main = tld.domain + '.' + tld.suffix
+        full = tld.subdomain + '.' + main
+
+        def get_all_subdomains(main, full):
+            doms = []
+            while main != full:
+                full = full.split('.', 1)[1]
+                doms.append(full)
+
+            return doms
+
+        all_subs = get_all_subdomains(main, full)
+        return all_subs
+
+
+# =============================================================================
+class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
+    def __init__(self, url_rewriter, cookie_tracker, cookie_key):
+        super(DomainCacheCookieRewriter, self).__init__(url_rewriter)
+        self.cookie_tracker = cookie_tracker
+        self.cookie_key = cookie_key
+
+    def rewrite_cookie(self, name, morsel):
+        # if domain set, no choice but to expand cookie path to root
+        domain = morsel.pop('domain', '')
+
+        if domain:
+            string = morsel.value
+            if morsel.get('path'):
+                string += '; Path=' + morsel.get('path')
+
+            if morsel.get('httponly'):
+                string += '; HttpOnly'
+
+            if morsel.get('secure'):
+                string += '; Secure'
+
+            self.cookie_tracker.add_cookie(self.cookie_key,
+                                           domain,
+                                           morsel.key,
+                                           string)
+
+        # else set cookie to rewritten path
+        if morsel.get('path'):
+            morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
+
+        return morsel
+
+    def get_expire_sec(self, morsel):
+        if morsel.get('max-age'):
+            return int(morsel['max-age'])
+
+        expires = morsel.get('expires')
+        if not expires:
+            return None
+
+        expires = expires.replace(' UTC', ' GMT')
+
+        # try both common Expires formats; give up if neither matches
+        parsed = None
+        for format_ in ('%a, %d-%b-%Y %H:%M:%S GMT',
+                        '%a, %d %b %Y %H:%M:%S GMT'):
+            try:
+                parsed = time.strptime(expires, format_)
+                break
+            except ValueError:
+                continue
+
+        if parsed is None:
+            return None
+
+        expires = time.mktime(parsed)
+        expires = expires - time.timezone - time.time()
+        return expires
+
+
+# ============================================================================
+
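A minimal usage sketch for the cookie tracker above, assuming a local redis instance and an arbitrary key prefix (both are illustrative, not part of this changeset):

from redis import StrictRedis

tracker = CookieTracker(StrictRedis(host='localhost'), expire_time=120)

# cache a cookie under the parent domain, as DomainCacheCookieRewriter does
tracker.add_cookie('cookies:sesh', '.example.com', 'sid', 'abc123; Path=/')

# a later request to any subdomain replays cookies cached for its parent domains
cookie_header, set_cookies = tracker.get_cookie_headers('http://www.example.com/page',
                                                        'cookies:sesh')
# cookie_header == 'sid=abc123'
# set_cookies == [('Set-Cookie', 'sid=abc123; Path=/; Max-Age=120')]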
diff --git a/urlrewrite/platformhandler.py b/urlrewrite/platformhandler.py
new file mode 100644
index 00000000..02e0c117
--- /dev/null
+++ b/urlrewrite/platformhandler.py
@@ -0,0 +1,99 @@
+from gevent.monkey import patch_all; patch_all()
+
+import requests
+
+from pywb.framework.archivalrouter import Route
+
+from pywb.rewrite.rewrite_content import RewriteContent
+from pywb.rewrite.wburl import WbUrl
+from pywb.warc.recordloader import ArcWarcRecordLoader
+from pywb.webapp.live_rewrite_handler import RewriteHandler
+from pywb.utils.canonicalize import canonicalize
+from pywb.utils.timeutils import http_date_to_timestamp
+from pywb.cdx.cdxobject import CDXObject
+
+from io import BytesIO
+
+from urlrewrite.rewriteinputreq import RewriteInputRequest
+
+from six.moves.urllib.parse import quote
+
+
+# ============================================================================
+class PlatformRoute(Route):
+    def apply_filters(self, wbrequest, matcher):
+        wbrequest.matchdict = matcher.groupdict()
+
+
+# ============================================================================
+class PlatformHandler(RewriteHandler):
+    def __init__(self, config):
+        super(PlatformHandler, self).__init__(config)
+        self.upstream_url = config.get('upstream_url')
+        self.loader = ArcWarcRecordLoader()
+
+        framed = config.get('framed_replay')
+        self.content_rewriter = RewriteContent(is_framed_replay=framed)
+
+    def render_content(self, wbrequest):
+        if wbrequest.wb_url.mod == 'vi_':
+            return self._get_video_info(wbrequest)
+
+        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
+        if ref_wburl_str:
+            wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url
+
+        urlkey = canonicalize(wbrequest.wb_url.url)
+        url = wbrequest.wb_url.url
+
+        inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
+                                       self.content_rewriter)
+
+        req_data = inputreq.reconstruct_request(url)
+
+        headers = {'Content-Length': str(len(req_data)),
+                   'Content-Type': 'application/request'}
+
+        if wbrequest.wb_url.is_latest_replay():
+            closest = 'now'
+        else:
+            closest = wbrequest.wb_url.timestamp
+
+        upstream_url = self.upstream_url.format(url=quote(url),
+                                                closest=closest,
+                                                #coll=wbrequest.coll,
+                                                **wbrequest.matchdict)
+
+        r = requests.post(upstream_url,
+                          data=BytesIO(req_data),
+                          headers=headers,
+                          stream=True,
+                          allow_redirects=False)
+
+        r.raise_for_status()
+
+        record = self.loader.parse_record_stream(r.raw)
+
+        cdx = CDXObject()
+        cdx['urlkey'] = urlkey
+        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
+        cdx['url'] = url
+
+        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
+        result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
+                                                       record.status_headers,
+                                                       record.stream,
+                                                       head_insert_func,
+                                                       urlkey,
+                                                       cdx)
+
+        return self._make_response(wbrequest, *result)
+
+
+if __name__ == "__main__":
+    from gevent.wsgi import WSGIServer
+    from pywb.apps.wayback import application
+
+    server = WSGIServer(('', 8090), application)
+    server.serve_forever()
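The handler above speaks a simple protocol to the upstream webagg service: the client's request is reconstructed verbatim, POSTed as an application/request body, and a single WARC record comes back, with the capture time in the Memento-Datetime response header. A rough standalone equivalent; the upstream URL is an assumed example, only the headers and record parsing mirror the code above:

import requests
from pywb.warc.recordloader import ArcWarcRecordLoader

req_data = b'GET / HTTP/1.1\r\nHost: example.com\r\n\r\n'

r = requests.post('http://localhost:8080/live/resource/postreq'
                  '?url=http://example.com/&closest=now',
                  data=req_data,
                  headers={'Content-Length': str(len(req_data)),
                           'Content-Type': 'application/request'},
                  stream=True)
r.raise_for_status()

# the body is a raw WARC record; parse it back into status line + headers + stream
record = ArcWarcRecordLoader().parse_record_stream(r.raw)
print(r.headers.get('Memento-Datetime'), record.status_headers.get_statuscode())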
diff --git a/urlrewrite/rewriteinputreq.py b/urlrewrite/rewriteinputreq.py
new file mode 100644
index 00000000..18d84905
--- /dev/null
+++ b/urlrewrite/rewriteinputreq.py
@@ -0,0 +1,134 @@
+from webagg.inputrequest import DirectWSGIInputRequest
+from pywb.utils.loaders import extract_client_cookie
+
+from six import iteritems
+from six.moves.urllib.parse import urlsplit
+import re
+
+
+#=============================================================================
+class RewriteInputRequest(DirectWSGIInputRequest):
+    RANGE_ARG_RX = re.compile(r'.*\.googlevideo\.com/videoplayback.*([&?]range=(\d+)-(\d+))')
+
+    RANGE_HEADER = re.compile(r'bytes=(\d+)-(\d+)?')
+
+    def __init__(self, env, urlkey, url, rewriter):
+        super(RewriteInputRequest, self).__init__(env)
+        self.urlkey = urlkey
+        self.url = url
+        self.rewriter = rewriter
+        self.extra_cookie = None
+
+        self.splits = urlsplit(self.url)
+
+    def get_full_request_uri(self):
+        uri = self.splits.path
+        if self.splits.query:
+            uri += '?' + self.splits.query
+
+        return uri
+
+    def get_req_headers(self):
+        headers = {}
+
+        has_cookies = False
+
+        for name, value in iteritems(self.env):
+            if name == 'HTTP_HOST':
+                name = 'Host'
+                value = self.splits.netloc
+
+            elif name == 'HTTP_ORIGIN':
+                name = 'Origin'
+                value = (self.splits.scheme + '://' + self.splits.netloc)
+
+            elif name == 'HTTP_X_CSRFTOKEN':
+                name = 'X-CSRFToken'
+                cookie_val = extract_client_cookie(self.env, 'csrftoken')
+                if cookie_val:
+                    value = cookie_val
+
+            elif name == 'HTTP_X_PYWB_REQUESTED_WITH':
+                continue
+
+            elif name == 'HTTP_X_FORWARDED_PROTO':
+                name = 'X-Forwarded-Proto'
+                value = self.splits.scheme
+
+            elif name == 'HTTP_COOKIE':
+                name = 'Cookie'
+                value = self._req_cookie_rewrite(value)
+                has_cookies = True
+
+            elif name.startswith('HTTP_'):
+                name = name[5:].title().replace('_', '-')
+
+            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
+                name = name.title().replace('_', '-')
+
+            else:
+                value = None
+
+            if value:
+                headers[name] = value
+
+        if not has_cookies:
+            value = self._req_cookie_rewrite('')
+            if value:
+                headers['Cookie'] = value
+
+        if self.extra_cookie:
+            headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')
+
+        return headers
+
+    def _req_cookie_rewrite(self, value):
+        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
+        if not rule or not rule.req_cookie_rewrite:
+            return value
+
+        for cr in rule.req_cookie_rewrite:
+            try:
+                value = cr['rx'].sub(cr['replace'], value)
+            except KeyError:
+                pass
+
+        return value
+
+    def extract_range(self):
+        use_206 = False
+        start = None
+        end = None
+        url = self.url
+
+        range_h = self.env.get('HTTP_RANGE')
+
+        if range_h:
+            m = self.RANGE_HEADER.match(range_h)
+            if m:
+                start = m.group(1)
+                end = m.group(2)
+                use_206 = True
+
+        else:
+            m = self.RANGE_ARG_RX.match(url)
+            if m:
+                start = m.group(2)
+                end = m.group(3)
+                url = url[:m.start(1)] + url[m.end(1):]
+                use_206 = False
+
+        if not start:
+            return None
+
+        start = int(start)
+
+        if end:
+            end = int(end)
+        else:
+            end = ''
+
+        result = (url, start, end, use_206)
+        return result
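extract_range() folds the two ways a range can arrive (a standard Range header, or a range= argument embedded in a googlevideo.com/videoplayback URL) into one (url, start, end, use_206) tuple. A quick illustration; the one-key environ is a simplification, a real WSGI environ carries many more keys:

req = RewriteInputRequest({'HTTP_RANGE': 'bytes=0-'}, 'com,example)/',
                          'http://example.com/', rewriter=None)

print(req.extract_range())
# -> ('http://example.com/', 0, '', True); the bytes=0- case is what
#    rewriterapp.py below strips and proxies as a normal request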
diff --git a/urlrewrite/rewriterapp.py b/urlrewrite/rewriterapp.py
new file mode 100644
index 00000000..ab7eba17
--- /dev/null
+++ b/urlrewrite/rewriterapp.py
@@ -0,0 +1,425 @@
+import requests
+
+from pywb.rewrite.rewrite_amf import RewriteContentAMF
+from pywb.rewrite.wburl import WbUrl
+from pywb.rewrite.url_rewriter import UrlRewriter
+
+from pywb.utils.wbexception import WbException
+from pywb.utils.canonicalize import canonicalize
+from pywb.utils.timeutils import http_date_to_timestamp
+from pywb.utils.loaders import extract_client_cookie
+
+from pywb.cdx.cdxobject import CDXObject
+from pywb.warc.recordloader import ArcWarcRecordLoader
+from pywb.framework.wbrequestresponse import WbResponse
+
+from six.moves.urllib.parse import urlencode
+
+from urlrewrite.rewriteinputreq import RewriteInputRequest
+from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
+
+from io import BytesIO
+
+import gevent
+import json
+
+
+# ============================================================================
+class UpstreamException(WbException):
+    def __init__(self, status_code, url, details):
+        super(UpstreamException, self).__init__(url=url, msg=details)
+        self.status_code = status_code
+
+
+# ============================================================================
+class RewriterApp(object):
+    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
+
+    def __init__(self, framed_replay=False, jinja_env=None, config=None):
+        self.loader = ArcWarcRecordLoader()
+
+        config = config or {}
+        self.paths = config['url_templates']
+
+        self.framed_replay = framed_replay
+        self.frame_mod = ''
+        self.replay_mod = 'mp_'
+
+        frame_type = 'inverse' if framed_replay else False
+
+        self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)
+
+        if not jinja_env:
+            jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
+
+        self.jinja_env = jinja_env
+
+        self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html')
+        self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html')
+        self.error_view = BaseInsertView(self.jinja_env, 'error.html')
+        self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
+
+        self.cookie_tracker = None
+
+    def call_with_params(self, **kwargs):
+        def run_app(environ, start_response):
+            environ['pywb.kwargs'] = kwargs
+            return self(environ, start_response)
+
+        return run_app
+
+    def __call__(self, environ, start_response):
+        wb_url = self.get_wburl(environ)
+        kwargs = environ.get('pywb.kwargs', {})
+
+        try:
+            response = self.render_content(wb_url, kwargs, environ)
+        except UpstreamException as ue:
+            response = self.handle_error(environ, ue)
+
+        return response(environ, start_response)
+
+    def is_framed_replay(self, wb_url):
+        return (self.framed_replay and
+                wb_url.mod == self.frame_mod and
+                wb_url.is_replay())
+
+    def render_content(self, wb_url, kwargs, environ):
+        wb_url = WbUrl(wb_url)
+
+        host_prefix = self.get_host_prefix(environ)
+        rel_prefix = self.get_rel_prefix(environ)
+        full_prefix = host_prefix + rel_prefix
+
+        resp = self.handle_custom_response(environ, wb_url,
+                                           full_prefix, host_prefix, kwargs)
+        if resp is not None:
+            content_type = 'text/html'
+
+            # if not replay outer frame, specify utf-8 charset
+            if not self.is_framed_replay(wb_url):
+                content_type += '; charset=utf-8'
+
+            return WbResponse.text_response(resp, content_type=content_type)
+
+        urlrewriter = UrlRewriter(wb_url,
+                                  prefix=full_prefix,
+                                  full_prefix=full_prefix,
+                                  rel_prefix=rel_prefix)
+
+        self.unrewrite_referrer(environ)
+
+        urlkey = canonicalize(wb_url.url)
+
+        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
+                                       self.content_rewriter)
+
+        inputreq.include_post_query(wb_url.url)
+
+        mod_url = None
+        use_206 = False
+        rangeres = None
+
+        readd_range = False
+        async_record_url = None
+
+        if kwargs.get('type') in ('record', 'patch'):
+            rangeres = inputreq.extract_range()
+
+            if rangeres:
+                mod_url, start, end, use_206 = rangeres
+
+                # if bytes=0- Range request,
+                # simply remove the range and still proxy
+                if start == 0 and not end and use_206:
+                    wb_url.url = mod_url
+                    inputreq.url = mod_url
+
+                    del environ['HTTP_RANGE']
+                    readd_range = True
+                else:
+                    async_record_url = mod_url
+
+        skip = async_record_url is not None
+
+        setcookie_headers = None
+        if self.cookie_tracker:
+            cookie_key = self.get_cookie_key(kwargs)
+            res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key)
+            inputreq.extra_cookie, setcookie_headers = res
+
+        r = self._do_req(inputreq, wb_url, kwargs, skip)
+
+        if r.status_code >= 400:
+            error = None
+            try:
+                error = r.raw.read()
+                r.raw.close()
+            except Exception:
+                pass
+
+            if error:
+                error = error.decode('utf-8')
+            else:
+                error = ''
+
+            details = dict(args=kwargs, error=error)
+            raise UpstreamException(r.status_code, url=wb_url.url, details=details)
+
+        if async_record_url:
+            environ.pop('HTTP_RANGE', '')
+            gevent.spawn(self._do_async_req,
+                         inputreq,
+                         async_record_url,
+                         wb_url,
+                         kwargs,
+                         False)
+
+        record = self.loader.parse_record_stream(r.raw)
+
+        cdx = CDXObject()
+        cdx['urlkey'] = urlkey
+        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
+        cdx['url'] = wb_url.url
+
+        self._add_custom_params(cdx, r.headers, kwargs)
+
+        if readd_range:
+            content_length = (record.status_headers.
+                              get_header('Content-Length'))
+            try:
+                content_length = int(content_length)
+                record.status_headers.add_range(0, content_length,
+                                                content_length)
+            except (ValueError, TypeError):
+                pass
+
+        if self.is_ajax(environ):
+            head_insert_func = None
+            urlrewriter.rewrite_opts['is_ajax'] = True
+        else:
+            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
+            head_insert_func = (self.head_insert_view.
+                                create_insert_func(wb_url,
+                                                   full_prefix,
+                                                   host_prefix,
+                                                   top_url,
+                                                   environ,
+                                                   self.framed_replay))
+
+        cookie_rewriter = None
+        if self.cookie_tracker:
+            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
+                                                               cookie_key)
+
+        result = self.content_rewriter.rewrite_content(urlrewriter,
+                                                       record.status_headers,
+                                                       record.stream,
+                                                       head_insert_func,
+                                                       urlkey,
+                                                       cdx,
+                                                       cookie_rewriter,
+                                                       environ)
+
+        status_headers, gen, is_rw = result
+
+        if setcookie_headers:
+            status_headers.headers.extend(setcookie_headers)
+
+        return WbResponse(status_headers, gen)
+
+    def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
+        top_url = full_prefix
+        top_url += wb_url.to_str(mod='')
+        return top_url
+
+    def _do_async_req(self, *args):
+        r = None
+        try:
+            r = self._do_req(*args)
+            # drain the response so the async record request completes
+            while True:
+                buff = r.raw.read(8192)
+                if not buff:
+                    return
+        except Exception:
+            import traceback
+            traceback.print_exc()
+
+        finally:
+            if r:
+                try:
+                    r.raw.close()
+                except Exception:
+                    pass
+
+    def handle_error(self, environ, ue):
+        error_html = self.error_view.render_to_string(environ,
+                                                      err_msg=ue.url,
+                                                      err_details=ue.msg)
+
+        return WbResponse.text_response(error_html, content_type='text/html')
+
+    def _do_req(self, inputreq, wb_url, kwargs, skip):
+        req_data = inputreq.reconstruct_request(wb_url.url)
+
+        headers = {'Content-Length': str(len(req_data)),
+                   'Content-Type': 'application/request'}
+
+        if skip:
+            headers['Recorder-Skip'] = '1'
+
+        if wb_url.is_latest_replay():
+            closest = 'now'
+        else:
+            closest = wb_url.timestamp
+
+        params = {}
+        params['url'] = wb_url.url
+        params['closest'] = closest
+
+        if wb_url.mod == 'vi_':
+            params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE
+
+        upstream_url = self.get_upstream_url(wb_url, kwargs, params)
+
+        r = requests.post(upstream_url,
+                          data=BytesIO(req_data),
+                          headers=headers,
+                          stream=True)
+
+        return r
+
+    def do_query(self, wb_url, kwargs):
+        params = {}
+        params['url'] = wb_url.url
+        params['output'] = 'json'
+        params['from'] = wb_url.timestamp
+        params['to'] = wb_url.end_timestamp
+
+        upstream_url = self.get_upstream_url(wb_url, kwargs, params)
+        upstream_url = upstream_url.replace('/resource/postreq', '/index')
+
+        r = requests.get(upstream_url)
+
+        return r.text
params.update(extra_params) + + return self.query_view.render_to_string(environ, **params) + + def process_query_cdx(self, cdx, wb_url, kwargs): + return + + def get_query_params(self, wb_url, kwargs): + return None + + def get_host_prefix(self, environ): + #return request.urlparts.scheme + '://' + request.urlparts.netloc + url = environ['wsgi.url_scheme'] + '://' + if environ.get('HTTP_HOST'): + url += environ['HTTP_HOST'] + else: + url += environ['SERVER_NAME'] + if environ['wsgi.url_scheme'] == 'https': + if environ['SERVER_PORT'] != '443': + url += ':' + environ['SERVER_PORT'] + else: + if environ['SERVER_PORT'] != '80': + url += ':' + environ['SERVER_PORT'] + + return url + + def get_rel_prefix(self, environ): + #return request.script_name + return environ.get('SCRIPT_NAME') + '/' + + def get_full_prefix(self, environ): + return self.get_host_prefix(environ) + self.get_rel_prefix(environ) + + def get_wburl(self, environ): + wb_url = environ.get('PATH_INFO', '/')[1:] + if environ.get('QUERY_STRING'): + wb_url += '?' + environ.get('QUERY_STRING') + + return wb_url + + def unrewrite_referrer(self, environ): + referrer = environ.get('HTTP_REFERER') + if not referrer: + return False + + full_prefix = self.get_full_prefix(environ) + + if referrer.startswith(full_prefix): + referrer = referrer[len(full_prefix):] + environ['HTTP_REFERER'] = WbUrl(referrer).url + return True + + return False + + def is_ajax(self, environ): + value = environ.get('HTTP_X_REQUESTED_WITH') + value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH') + if value and value.lower() == 'xmlhttprequest': + return True + + return False + + def get_base_url(self, wb_url, kwargs): + type = kwargs.get('type') + return self.paths[type] + + def get_upstream_url(self, wb_url, kwargs, params): + base_url = self.get_base_url(wb_url, kwargs) + param_str = urlencode(params, True) + if param_str: + base_url += '&' + param_str + return base_url + + def get_cookie_key(self, kwargs): + raise NotImplemented() + + def _add_custom_params(self, cdx, headers, kwargs): + cdx['is_live'] = 'true' + pass + + def get_top_frame_params(self, wb_url, kwargs): + return None + + def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs): + if wb_url.is_query(): + return self.handle_query(environ, wb_url, kwargs) + + if self.is_framed_replay(wb_url): + extra_params = self.get_top_frame_params(wb_url, kwargs) + return self.frame_insert_view.get_top_frame(wb_url, + full_prefix, + host_prefix, + environ, + self.frame_mod, + self.replay_mod, + coll='', + extra_params=extra_params) + + return None diff --git a/urlrewrite/templateview.py b/urlrewrite/templateview.py new file mode 100644 index 00000000..e6b8cdd3 --- /dev/null +++ b/urlrewrite/templateview.py @@ -0,0 +1,225 @@ +from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec +from pywb.utils.timeutils import timestamp_now +from six.moves.urllib.parse import urlsplit + +from jinja2 import Environment +from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader + +from webassets.ext.jinja2 import AssetsExtension +from webassets.loaders import YAMLLoader +from webassets.env import Resolver + +from pkg_resources import resource_filename + +import json +import os + + +# ============================================================================ +class FileOnlyPackageLoader(PackageLoader): + def get_source(self, env, template): + dir_, file_ = os.path.split(template) + return super(FileOnlyPackageLoader, self).get_source(env, file_) + + +# 
============================================================================ +class RelEnvironment(Environment): + """Override join_path() to enable relative template paths.""" + def join_path(self, template, parent): + return os.path.join(os.path.dirname(parent), template) + + +# ============================================================================ +class JinjaEnv(object): + def __init__(self, paths=['templates', '.', '/'], + packages=['pywb'], + assets_path=None, + globals=None, + overlay=None, + extensions=None): + + self._init_filters() + + loader = ChoiceLoader(self._make_loaders(paths, packages)) + + extensions = extensions or [] + + if assets_path: + extensions.append(AssetsExtension) + + if overlay: + jinja_env = overlay.jinja_env.overlay(loader=loader, + trim_blocks=True, + extensions=extensions) + else: + jinja_env = RelEnvironment(loader=loader, + trim_blocks=True, + extensions=extensions) + + jinja_env.filters.update(self.filters) + + if globals: + jinja_env.globals.update(globals) + + self.jinja_env = jinja_env + + # init assets + if assets_path: + assets_loader = YAMLLoader(assets_path) + assets_env = assets_loader.load_environment() + assets_env.resolver = PkgResResolver() + jinja_env.assets_environment = assets_env + + def _make_loaders(self, paths, packages): + loaders = [] + # add loaders for paths + for path in paths: + loaders.append(FileSystemLoader(path)) + + # add loaders for all specified packages + for package in packages: + loaders.append(FileOnlyPackageLoader(package)) + + return loaders + + def template_filter(self, param=None): + def deco(func): + name = param or func.__name__ + self.filters[name] = func + return func + + return deco + + def _init_filters(self): + self.filters = {} + + @self.template_filter() + def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): + if format_ == '%s': + return timestamp_to_sec(value) + else: + value = timestamp_to_datetime(value) + return value.strftime(format_) + + @self.template_filter('urlsplit') + def get_urlsplit(url): + split = urlsplit(url) + return split + + @self.template_filter() + def tojson(obj): + return json.dumps(obj) + + +# ============================================================================ +class BaseInsertView(object): + def __init__(self, jenv, insert_file, banner_file=''): + self.jenv = jenv + self.insert_file = insert_file + self.banner_file = banner_file + + def render_to_string(self, env, **kwargs): + template = self.jenv.jinja_env.get_template(self.insert_file) + params = env.get('webrec.template_params') + if params: + kwargs.update(params) + + return template.render(**kwargs) + + +# ============================================================================ +class HeadInsertView(BaseInsertView): + def create_insert_func(self, wb_url, + wb_prefix, + host_prefix, + top_url, + env, + is_framed, + coll='', + include_ts=True): + + url = wb_url.get_url() + + include_wombat = not wb_url.is_banner_only + + wbrequest = {'host_prefix': host_prefix, + 'wb_prefix': wb_prefix, + 'wb_url': wb_url, + 'coll': coll, + 'env': env, + 'options': {'is_framed': is_framed}, + 'rewrite_opts': {} + } + + def make_head_insert(rule, cdx): + return (self.render_to_string(env, wbrequest=wbrequest, + cdx=cdx, + top_url=top_url, + include_ts=include_ts, + include_wombat=include_wombat, + banner_html=self.banner_file, + rule=rule)) + return make_head_insert + + +# ============================================================================ +class TopFrameView(BaseInsertView): + def get_top_frame(self, wb_url, + 
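
The `template_filter()` decorator above is a small filter registry: each decorated function is collected into `self.filters` and installed into the Jinja environment, so templates can use them like built-ins. A usage sketch, assuming this module is importable as `urlrewrite.templateview` and pywb is installed:

from urlrewrite.templateview import JinjaEnv

jenv = JinjaEnv(globals={'static_path': 'static/__pywb'})

# the built-in 'format_ts' filter converts a 14-digit timestamp
tmpl = jenv.jinja_env.from_string('{{ "20140126200624" | format_ts("%Y-%m-%d") }}')
assert tmpl.render() == '2014-01-26'
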
wb_prefix, + host_prefix, + env, + frame_mod, + replay_mod, + coll='', + extra_params=None): + + embed_url = wb_url.to_str(mod=replay_mod) + + if wb_url.timestamp: + timestamp = wb_url.timestamp + else: + timestamp = timestamp_now() + + wbrequest = {'host_prefix': host_prefix, + 'wb_prefix': wb_prefix, + 'wb_url': wb_url, + 'coll': coll, + + 'options': {'frame_mod': frame_mod, + 'replay_mod': replay_mod}, + } + + params = dict(embed_url=embed_url, + wbrequest=wbrequest, + timestamp=timestamp, + url=wb_url.get_url(), + banner_html=self.banner_file) + + if extra_params: + params.update(extra_params) + + return self.render_to_string(env, **params) + + +# ============================================================================ +class PkgResResolver(Resolver): + def get_pkg_path(self, item): + if not isinstance(item, str): + return None + + parts = urlsplit(item) + if parts.scheme == 'pkg' and parts.netloc: + return (parts.netloc, parts.path) + + return None + + def resolve_source(self, ctx, item): + pkg = self.get_pkg_path(item) + if pkg: + filename = resource_filename(pkg[0], pkg[1]) + if filename: + return filename + + return super(PkgResResolver, self).resolve_source(ctx, item) + + diff --git a/urlrewrite/test/__init__.py b/urlrewrite/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/urlrewrite/test/simpleapp.py b/urlrewrite/test/simpleapp.py new file mode 100644 index 00000000..b651e24f --- /dev/null +++ b/urlrewrite/test/simpleapp.py @@ -0,0 +1,74 @@ +from gevent.monkey import patch_all; patch_all() + +from bottle import run, Bottle, request, response, debug + +from six.moves.urllib.parse import quote + +from pywb.utils.loaders import LocalFileLoader + +import mimetypes +import redis + +from urlrewrite.rewriterapp import RewriterApp +from urlrewrite.cookies import CookieTracker + + +# ============================================================================ +class RWApp(RewriterApp): + def __init__(self, upstream_urls, cookie_key_templ, redis): + config = {} + config['url_templates'] = upstream_urls + + self.cookie_key_templ = cookie_key_templ + self.app = Bottle() + self.block_loader = LocalFileLoader() + self.init_routes() + + super(RWApp, self).__init__(True, config=config) + + self.cookie_tracker = CookieTracker(redis) + + self.orig_error_handler = self.app.default_error_handler + self.app.default_error_handler = self.err_handler + + def err_handler(self, exc): + print(exc) + import traceback + traceback.print_exc() + return self.orig_error_handler(exc) + + def get_cookie_key(self, kwargs): + return self.cookie_key_templ.format(**kwargs) + + def init_routes(self): + @self.app.get('/static/__pywb/') + def server_static(filepath): + data = self.block_loader.load('pywb/static/' + filepath) + guessed = mimetypes.guess_type(filepath) + if guessed[0]: + response.headers['Content-Type'] = guessed[0] + + return data + + self.app.mount('/live/', self.call_with_params(type='live')) + self.app.mount('/record/', self.call_with_params(type='record')) + self.app.mount('/replay/', self.call_with_params(type='replay')) + + @staticmethod + def create_app(replay_port=8080, record_port=8010): + upstream_urls = {'live': 'http://localhost:%s/live/resource/postreq?' % replay_port, + 'record': 'http://localhost:%s/live/resource/postreq?' % record_port, + 'replay': 'http://localhost:%s/replay/resource/postreq?' 
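
The `PkgResResolver` above lets the webassets YAML refer to bundle sources inside installed packages via a `pkg://` scheme rather than filesystem paths. The mapping it performs is essentially the following (the example URL is hypothetical):

from six.moves.urllib.parse import urlsplit
from pkg_resources import resource_filename

def resolve_pkg_url(item):
    parts = urlsplit(item)
    if parts.scheme == 'pkg' and parts.netloc:
        # e.g. 'pkg://pywb/static/wb.css' -> .../site-packages/pywb/static/wb.css
        return resource_filename(parts.netloc, parts.path)
    return None
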
% replay_port,
+                         }
+
+        r = redis.StrictRedis.from_url('redis://localhost/2')
+        rwapp = RWApp(upstream_urls, 'cookies:', r)
+        return rwapp
+
+
+# ============================================================================
+if __name__ == "__main__":
+    application = RWApp.create_app()
+    application.app.run(port=8090, server='gevent')
+
+
diff --git a/urlrewrite/test/test_rewriter.py b/urlrewrite/test/test_rewriter.py
new file mode 100644
index 00000000..4fdaff48
--- /dev/null
+++ b/urlrewrite/test/test_rewriter.py
@@ -0,0 +1,43 @@
+
+from webagg.test.testutils import LiveServerTests, BaseTestClass
+from webagg.test.testutils import FakeRedisTests
+
+from .simpleapp import RWApp, debug
+
+import os
+import webtest
+
+
+class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
+    @classmethod
+    def setup_class(cls):
+        super(TestRewriter, cls).setup_class()
+        #cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)
+        #cls.upstream_url += '/{type}/resource/postreq?url={url}&closest={closest}'
+        #cls.app = RWApp(cls.upstream_url)
+
+        cls.app = RWApp.create_app(replay_port=cls.server.port)
+        cls.testapp = webtest.TestApp(cls.app.app)
+        debug(True)
+
+    def test_replay(self):
+        resp = self.testapp.get('/live/mp_/http://example.com/')
+        resp.charset = 'utf-8'
+
+        assert '"http://localhost:80/live/mp_/http://www.iana.org/domains/example"' in resp.text
+
+        assert 'wbinfo.url = "http://example.com/"' in resp.text
+
+    def test_top_frame(self):
+        resp = self.testapp.get('/live/http://example.com/')
+        resp.charset = 'utf-8'
+
+        assert '"http://localhost:80/live/mp_/http://example.com/"' in resp.text
+
+        assert 'wbinfo.capture_url = "http://example.com/"' in resp.text
+
+    def test_cookie_track_1(self):
+        resp = self.testapp.get('/live/mp_/https://twitter.com/')
+
+        assert resp.headers['set-cookie'] is not None
+
diff --git a/urlrewrite/test/uwsgi.ini b/urlrewrite/test/uwsgi.ini
new file mode 100644
index 00000000..7acd4f0b
--- /dev/null
+++ b/urlrewrite/test/uwsgi.ini
@@ -0,0 +1,18 @@
+[uwsgi]
+if-not-env = PORT
+http-socket = :8090
+endif =
+
+master = true
+buffer-size = 65536
+die-on-term = true
+
+if-env = VIRTUAL_ENV
+venv = $(VIRTUAL_ENV)
+endif =
+
+gevent = 100
+
+wsgi = urlrewrite.test.simpleapp
+
+
diff --git a/webagg/Dockerfile b/webagg/Dockerfile
new file mode 100644
index 00000000..9dc3c623
--- /dev/null
+++ b/webagg/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.5
+
+WORKDIR /code/
+
+RUN pip install -U git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.30.0-develop
+RUN pip install uwsgi gevent bottle
+
+ADD . 
/code/webagg/ +ADD ./test/ /code/test/ + +WORKDIR /code/ +CMD uwsgi /code/test/live.ini + + diff --git a/webagg/README.rst b/webagg/README.rst new file mode 100644 index 00000000..f06334b6 --- /dev/null +++ b/webagg/README.rst @@ -0,0 +1,6 @@ +Resource Memento/Aggregator +=========================== + +This is a reference implementation of the `Resource/Memento Aggregator `_ +from the `Webrecorder Platform `_ + diff --git a/webagg/__init__.py b/webagg/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/webagg/aggregator.py b/webagg/aggregator.py new file mode 100644 index 00000000..9ca59b52 --- /dev/null +++ b/webagg/aggregator.py @@ -0,0 +1,287 @@ +from gevent.pool import Pool +import gevent + +from concurrent import futures + +import json +import time +import os + +from pywb.utils.timeutils import timestamp_now +from pywb.cdx.cdxops import process_cdx +from pywb.cdx.query import CDXQuery + +from heapq import merge +from collections import deque +from itertools import chain + +from webagg.indexsource import FileIndexSource, RedisIndexSource +from pywb.utils.wbexception import NotFoundException, WbException + +from webagg.utils import ParamFormatter, res_template + +import six +import glob + + +#============================================================================= +class BaseAggregator(object): + def __call__(self, params): + if params.get('closest') == 'now': + params['closest'] = timestamp_now() + + content_type = params.get('content_type') + if content_type: + params['filter'] = '=mime:' + content_type + + query = CDXQuery(params) + + cdx_iter, errs = self.load_index(query.params) + + cdx_iter = process_cdx(cdx_iter, query) + return cdx_iter, dict(errs) + + def load_child_source(self, name, source, params): + try: + params['_formatter'] = ParamFormatter(params, name) + res = source.load_index(params) + if isinstance(res, tuple): + cdx_iter, err_list = res + else: + cdx_iter = res + err_list = [] + except WbException as wbe: + #print('Not found in ' + name) + cdx_iter = iter([]) + err_list = [(name, repr(wbe))] + + def add_name(cdx, name): + if cdx.get('source'): + cdx['source'] = name + ':' + cdx['source'] + else: + cdx['source'] = name + return cdx + + return (add_name(cdx, name) for cdx in cdx_iter), err_list + + def load_index(self, params): + res_list = self._load_all(params) + + iter_list = [res[0] for res in res_list] + err_list = chain(*[res[1] for res in res_list]) + + #optimization: if only a single entry (or empty) just load directly + if len(iter_list) <= 1: + cdx_iter = iter_list[0] if iter_list else iter([]) + else: + cdx_iter = merge(*(iter_list)) + + return cdx_iter, err_list + + def _on_source_error(self, name): #pragma: no cover + pass + + def _load_all(self, params): #pragma: no cover + raise NotImplemented() + + def _iter_sources(self, params): #pragma: no cover + raise NotImplemented() + + def get_source_list(self, params): + srcs = self._iter_sources(params) + result = [(name, str(value)) for name, value in srcs] + result = {'sources': dict(result)} + return result + + +#============================================================================= +class BaseSourceListAggregator(BaseAggregator): + def __init__(self, sources, **kwargs): + self.sources = sources + + def get_all_sources(self, params): + return self.sources + + def _iter_sources(self, params): + sources = self.get_all_sources(params) + srcs_list = params.get('sources') + if not srcs_list: + return sources.items() + + sel_sources = tuple(srcs_list.split(',')) + + return [(name, 
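
The `load_index()` method above leans on an invariant: every index source yields CDX entries already sorted by (urlkey, timestamp), so `heapq.merge` can lazily interleave any number of sources without buffering them. A toy illustration with plain cdxj-style strings:

from heapq import merge

source_a = ['com,example)/ 20140101000000', 'com,example)/ 20160101000000']
source_b = ['com,example)/ 20150101000000']

# lexicographic order on 'urlkey timestamp' strings is chronological order
assert list(merge(source_a, source_b)) == [
    'com,example)/ 20140101000000',
    'com,example)/ 20150101000000',
    'com,example)/ 20160101000000',
]
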
sources[name]) for name in sources.keys() if name in sel_sources] + + +#============================================================================= +class SeqAggMixin(object): + def __init__(self, *args, **kwargs): + super(SeqAggMixin, self).__init__(*args, **kwargs) + + + def _load_all(self, params): + sources = self._iter_sources(params) + return [self.load_child_source(name, source, params) + for name, source in sources] + + +#============================================================================= +class SimpleAggregator(SeqAggMixin, BaseSourceListAggregator): + pass + + +#============================================================================= +class TimeoutMixin(object): + def __init__(self, *args, **kwargs): + super(TimeoutMixin, self).__init__(*args, **kwargs) + self.t_count = kwargs.get('t_count', 3) + self.t_dura = kwargs.get('t_duration', 20) + self.timeouts = {} + + def is_timed_out(self, name): + timeout_deq = self.timeouts.get(name) + if not timeout_deq: + return False + + the_time = time.time() + for t in list(timeout_deq): + if (the_time - t) > self.t_dura: + timeout_deq.popleft() + + if len(timeout_deq) >= self.t_count: + print('Skipping {0}, {1} timeouts in {2} seconds'. + format(name, self.t_count, self.t_dura)) + return True + + return False + + def _iter_sources(self, params): + sources = super(TimeoutMixin, self)._iter_sources(params) + for name, source in sources: + if not self.is_timed_out(name): + yield name, source + + def _on_source_error(self, name): + the_time = time.time() + if name not in self.timeouts: + self.timeouts[name] = deque() + + self.timeouts[name].append(the_time) + print(name + ' timed out!') + + +#============================================================================= +class GeventMixin(object): + def __init__(self, *args, **kwargs): + super(GeventMixin, self).__init__(*args, **kwargs) + self.pool = Pool(size=kwargs.get('size')) + self.timeout = kwargs.get('timeout', 5.0) + + def _load_all(self, params): + params['_timeout'] = self.timeout + + sources = list(self._iter_sources(params)) + + def do_spawn(name, source): + return self.pool.spawn(self.load_child_source, name, source, params) + + jobs = [do_spawn(name, source) for name, source in sources] + + gevent.joinall(jobs, timeout=self.timeout) + + results = [] + for (name, source), job in zip(sources, jobs): + if job.value is not None: + results.append(job.value) + else: + results.append((iter([]), [(name, 'timeout')])) + self._on_source_error(name) + + return results + + +#============================================================================= +class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregator): + pass + + +#============================================================================= +class BaseDirectoryIndexSource(BaseAggregator): + CDX_EXT = ('.cdx', '.cdxj') + + def __init__(self, base_prefix, base_dir=''): + self.base_prefix = base_prefix + self.base_dir = base_dir + + def _iter_sources(self, params): + the_dir = res_template(self.base_dir, params) + the_dir = os.path.join(self.base_prefix, the_dir) + try: + sources = list(self._load_files(the_dir)) + except Exception: + raise NotFoundException(the_dir) + + return sources + + def _load_files(self, glob_dir): + for the_dir in glob.iglob(glob_dir): + for result in self._load_files_single_dir(the_dir): + yield result + + def _load_files_single_dir(self, the_dir): + for name in os.listdir(the_dir): + filename = os.path.join(the_dir, name) + + if filename.endswith(self.CDX_EXT): + 
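
The two mixins above separate concerns: `GeventMixin` fans a lookup out to all sources with a shared deadline, and `TimeoutMixin` quarantines sources that keep missing it. The bookkeeping behind `is_timed_out()` condenses to a sliding window of timestamps, sketched here in isolation:

import time
from collections import deque

class TimeoutTracker(object):
    def __init__(self, count=3, duration=20.0):
        self.count = count          # timeouts tolerated per window
        self.duration = duration    # window length in seconds
        self.events = {}

    def record_timeout(self, name):
        self.events.setdefault(name, deque()).append(time.time())

    def should_skip(self, name):
        dq = self.events.get(name)
        if not dq:
            return False
        now = time.time()
        while dq and now - dq[0] > self.duration:
            dq.popleft()            # expire timeouts outside the window
        return len(dq) >= self.count
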
print('Adding ' + filename)
+                rel_path = os.path.relpath(the_dir, self.base_prefix)
+                if rel_path == '.':
+                    full_name = name
+                else:
+                    full_name = rel_path + '/' + name
+
+                yield full_name, FileIndexSource(filename)
+
+    def __str__(self):
+        return 'file_dir'
+
+
+#=============================================================================
+class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
+    pass
+
+
+#=============================================================================
+class CacheDirectoryIndexSource(DirectoryIndexSource):
+    def __init__(self, *args, **kwargs):
+        super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs)
+        self.cached_file_list = {}
+
+    def _load_files_single_dir(self, the_dir):
+        try:
+            stat = os.stat(the_dir)
+        except Exception:
+            stat = 0
+
+        result = self.cached_file_list.get(the_dir)
+
+        if result:
+            last_stat, files = result
+            if stat and last_stat == stat:
+                print('Dir {0} unchanged'.format(the_dir))
+                return files
+
+        files = super(CacheDirectoryIndexSource, self)._load_files_single_dir(the_dir)
+        files = list(files)
+        self.cached_file_list[the_dir] = (stat, files)
+        return files
+
+
+#=============================================================================
+class RedisMultiKeyIndexSource(SeqAggMixin, BaseAggregator, RedisIndexSource):
+    def _iter_sources(self, params):
+        redis_key_pattern = res_template(self.redis_key_template, params)
+
+        for key in self.redis.scan_iter(match=redis_key_pattern):
+            key = key.decode('utf-8')
+            yield key, RedisIndexSource(None, self.redis, key)
diff --git a/webagg/app.py b/webagg/app.py
new file mode 100644
index 00000000..e045480b
--- /dev/null
+++ b/webagg/app.py
@@ -0,0 +1,125 @@
+from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
+from werkzeug.routing import Map, Rule
+from werkzeug.exceptions import HTTPException
+
+import requests
+import traceback
+import json
+
+from six.moves.urllib.parse import parse_qsl
+import six
+
+JSON_CT = 'application/json; charset=utf-8'
+
+
+#=============================================================================
+class ResAggApp(object):
+    def __init__(self, *args, **kwargs):
+        self.route_dict = {}
+        self.debug = kwargs.get('debug', False)
+
+        self.url_map = Map()
+
+        def list_routes(environ):
+            return {}, self.route_dict, {}
+
+        self.url_map.add(Rule('/', endpoint=list_routes))
+
+    def add_route(self, path, handler):
+        def direct_input_request(environ, mode=''):
+            params = self.get_query_dict(environ)
+            params['mode'] = mode
+            params['_input_req'] = DirectWSGIInputRequest(environ)
+            return handler(params)
+
+        def post_fullrequest(environ, mode=''):
+            params = self.get_query_dict(environ)
+            params['mode'] = mode
+            params['_input_req'] = POSTInputRequest(environ)
+            return handler(params)
+
+        self.url_map.add(Rule(path, endpoint=direct_input_request))
+        self.url_map.add(Rule(path + '/<mode>', endpoint=direct_input_request))
+
+        self.url_map.add(Rule(path + '/postreq', endpoint=post_fullrequest))
+        self.url_map.add(Rule(path + '/<mode>/postreq', endpoint=post_fullrequest))
+
+        handler_dict = handler.get_supported_modes()
+
+        self.route_dict[path] = handler_dict
+        self.route_dict[path + '/postreq'] = handler_dict
+
+    def get_query_dict(self, environ):
+        query_str = environ.get('QUERY_STRING')
+        if query_str:
+            return dict(parse_qsl(query_str))
+        else:
+            return {}
+
+    def __call__(self, environ, start_response):
+        urls = self.url_map.bind_to_environ(environ)
+        try:
+            endpoint, args = urls.match()
+        except HTTPException as e:
+            return e(environ, start_response)
+
+        try:
+            result = 
endpoint(environ, **args) + + out_headers, res, errs = result + + if not res: + return self.send_error(errs, start_response) + + if isinstance(res, dict): + res = self.json_encode(res, out_headers) + + if errs: + if 'last_exc' in errs: + errs['last_exc'] = str(errs['last_exc']) + out_headers['ResErrors'] = json.dumps(errs) + + start_response('200 OK', list(out_headers.items())) + return res + + except Exception as e: + if self.debug: + traceback.print_exc() + message = 'Internal Error: ' + str(e) + status = 500 + return self.send_error({}, start_response, + message=message, + status=status) + + def json_encode(self, res, out_headers): + res = json.dumps(res).encode('utf-8') + out_headers['Content-Type'] = JSON_CT + out_headers['Content-Length'] = str(len(res)) + return [res] + + def send_error(self, errs, start_response, + message='No Resource Found', status=404): + last_exc = errs.pop('last_exc', None) + if last_exc: + if self.debug: + traceback.print_exc() + + status = last_exc.status() + message = last_exc.msg + + res = {'message': message} + if errs: + res['errors'] = errs + + out_headers = {} + res = self.json_encode(res, out_headers) + + if six.PY3: + out_headers['ResErrors'] = res[0].decode('utf-8') + else: + out_headers['ResErrors'] = res[0] + message = message.encode('utf-8') + + message = str(status) + ' ' + message + start_response(message, list(out_headers.items())) + return res diff --git a/webagg/handlers.py b/webagg/handlers.py new file mode 100644 index 00000000..a8e067f3 --- /dev/null +++ b/webagg/handlers.py @@ -0,0 +1,194 @@ +from webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader +from webagg.utils import MementoUtils +from pywb.utils.wbexception import BadRequestException, WbException +from pywb.utils.wbexception import NotFoundException + +from pywb.cdx.query import CDXQuery +from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules + +import six + + +#============================================================================= +def to_cdxj(cdx_iter, fields): + content_type = 'text/x-cdxj' + return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter) + +def to_json(cdx_iter, fields): + content_type = 'application/x-ndjson' + return content_type, (cdx.to_json(fields) for cdx in cdx_iter) + +def to_text(cdx_iter, fields): + content_type = 'text/plain' + return content_type, (cdx.to_text(fields) for cdx in cdx_iter) + +def to_link(cdx_iter, fields): + content_type = 'application/link' + return content_type, MementoUtils.make_timemap(cdx_iter) + + + +#============================================================================= +class FuzzyMatcher(object): + def __init__(self): + res = load_domain_specific_cdx_rules('pywb/rules.yaml', True) + self.url_canon, self.fuzzy_query = res + + def __call__(self, index_source, params): + cdx_iter, errs = index_source(params) + return self.do_fuzzy(cdx_iter, index_source, params), errs + + def do_fuzzy(self, cdx_iter, index_source, params): + found = False + for cdx in cdx_iter: + found = True + yield cdx + + fuzzy_query_params = None + if not found: + query = CDXQuery(params) + fuzzy_query_params = self.fuzzy_query(query) + + if not fuzzy_query_params: + return + + fuzzy_query_params.pop('alt_url', '') + + new_iter, errs = index_source(fuzzy_query_params) + + for cdx in new_iter: + yield cdx + + +#============================================================================= +class IndexHandler(object): + OUTPUTS = { + 'cdxj': to_cdxj, + 'json': to_json, + 'text': to_text, + 'link': to_link, + } 
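
The OUTPUTS table above maps the `output` query param onto a serializer for `/index` responses. For example, against a running instance (the host, port, and `/many` route here are assumptions based on the test setup later in this change):

import requests

res = requests.get('http://localhost:8080/many/index',
                   params={'url': 'http://example.com/', 'output': 'json'})

for line in res.text.rstrip().split('\n'):
    print(line)   # one JSON CDX record per line ('application/x-ndjson')
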
+ + DEF_OUTPUT = 'cdxj' + + def __init__(self, index_source, opts=None, *args, **kwargs): + self.index_source = index_source + self.opts = opts or {} + self.fuzzy = FuzzyMatcher() + + def get_supported_modes(self): + return dict(modes=['list_sources', 'index']) + + def _load_index_source(self, params): + url = params.get('url') + if not url: + errs = dict(last_exc=BadRequestException('The "url" param is required')) + return None, errs + + input_req = params.get('_input_req') + if input_req: + params['alt_url'] = input_req.include_post_query(url) + + return self.fuzzy(self.index_source, params) + + def __call__(self, params): + mode = params.get('mode', 'index') + if mode == 'list_sources': + return {}, self.index_source.get_source_list(params), {} + + if mode != 'index': + return {}, self.get_supported_modes(), {} + + output = params.get('output', self.DEF_OUTPUT) + fields = params.get('fields') + + handler = self.OUTPUTS.get(output) + if not handler: + errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output))) + return None, None, errs + + cdx_iter, errs = self._load_index_source(params) + if not cdx_iter: + return None, None, errs + + content_type, res = handler(cdx_iter, fields) + out_headers = {'Content-Type': content_type} + + def check_str(lines): + for line in lines: + if isinstance(line, six.text_type): + line = line.encode('utf-8') + yield line + + return out_headers, check_str(res), errs + + +#============================================================================= +class ResourceHandler(IndexHandler): + def __init__(self, index_source, resource_loaders): + super(ResourceHandler, self).__init__(index_source) + self.resource_loaders = resource_loaders + + def get_supported_modes(self): + res = super(ResourceHandler, self).get_supported_modes() + res['modes'].append('resource') + return res + + def __call__(self, params): + if params.get('mode', 'resource') != 'resource': + return super(ResourceHandler, self).__call__(params) + + cdx_iter, errs = self._load_index_source(params) + if not cdx_iter: + return None, None, errs + + last_exc = None + + for cdx in cdx_iter: + for loader in self.resource_loaders: + try: + out_headers, resp = loader(cdx, params) + if resp is not None: + return out_headers, resp, errs + except WbException as e: + last_exc = e + errs[str(loader)] = str(e) + + if last_exc: + errs['last_exc'] = last_exc + + return None, None, errs + + +#============================================================================= +class DefaultResourceHandler(ResourceHandler): + def __init__(self, index_source, warc_paths=''): + loaders = [WARCPathLoader(warc_paths, index_source), + LiveWebLoader(), + VideoLoader() + ] + super(DefaultResourceHandler, self).__init__(index_source, loaders) + + +#============================================================================= +class HandlerSeq(object): + def __init__(self, handlers): + self.handlers = handlers + + def get_supported_modes(self): + if self.handlers: + return self.handlers[0].get_supported_modes() + else: + return {} + + def __call__(self, params): + all_errs = {} + for handler in self.handlers: + out_headers, res, errs = handler(params) + all_errs.update(errs) + if res is not None: + return out_headers, res, all_errs + + return None, None, all_errs + + diff --git a/webagg/indexsource.py b/webagg/indexsource.py new file mode 100644 index 00000000..a52bb11a --- /dev/null +++ b/webagg/indexsource.py @@ -0,0 +1,226 @@ +import redis + +from pywb.utils.binsearch import iter_range +from 
pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp +from pywb.utils.timeutils import timestamp_now +from pywb.utils.canonicalize import canonicalize +from pywb.utils.wbexception import NotFoundException + +from pywb.cdx.cdxobject import CDXObject + +#from webagg.liverec import patched_requests as requests +import requests + +from webagg.utils import ParamFormatter, res_template +from webagg.utils import MementoUtils + + +WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}' + + +#============================================================================= +class BaseIndexSource(object): + def load_index(self, params): #pragma: no cover + raise NotImplemented() + + +#============================================================================= +class FileIndexSource(BaseIndexSource): + def __init__(self, filename): + self.filename_template = filename + + def load_index(self, params): + filename = res_template(self.filename_template, params) + + try: + fh = open(filename, 'rb') + except IOError: + raise NotFoundException(filename) + + def do_load(fh): + with fh: + gen = iter_range(fh, params['key'], params['end_key']) + for line in gen: + yield CDXObject(line) + + return do_load(fh) + + def __str__(self): + return 'file' + + +#============================================================================= +class RemoteIndexSource(BaseIndexSource): + def __init__(self, api_url, replay_url, url_field='load_url'): + self.api_url_template = api_url + self.replay_url = replay_url + self.url_field = url_field + + def load_index(self, params): + api_url = res_template(self.api_url_template, params) + r = requests.get(api_url, timeout=params.get('_timeout')) + if r.status_code >= 400: + raise NotFoundException(api_url) + + lines = r.content.strip().split(b'\n') + def do_load(lines): + for line in lines: + cdx = CDXObject(line) + self._set_load_url(cdx) + yield cdx + + return do_load(lines) + + def _set_load_url(self, cdx): + cdx[self.url_field] = self.replay_url.format( + timestamp=cdx['timestamp'], + url=cdx['url']) + + def __str__(self): + return 'remote' + + +#============================================================================= +class LiveIndexSource(BaseIndexSource): + def __init__(self, proxy_url='{url}'): + self.proxy_url = proxy_url + + def load_index(self, params): + cdx = CDXObject() + cdx['urlkey'] = params.get('key').decode('utf-8') + cdx['timestamp'] = timestamp_now() + cdx['url'] = params['url'] + cdx['load_url'] = res_template(self.proxy_url, params) + cdx['is_live'] = 'true' + cdx['mime'] = params.get('content_type', '') + def live(): + yield cdx + + return live() + + def __str__(self): + return 'live' + + +#============================================================================= +class RedisIndexSource(BaseIndexSource): + def __init__(self, redis_url, redis=None, key_template=None): + if redis_url and not redis: + redis, key_template = self.parse_redis_url(redis_url) + + self.redis = redis + self.redis_key_template = key_template + + @staticmethod + def parse_redis_url(redis_url): + parts = redis_url.split('/') + key_prefix = '' + if len(parts) > 4: + key_prefix = parts[4] + redis_url = 'redis://' + parts[2] + '/' + parts[3] + + redis_key_template = key_prefix + red = redis.StrictRedis.from_url(redis_url) + return red, key_prefix + + def load_index(self, params): + return self.load_key_index(self.redis_key_template, params) + + def load_key_index(self, key_template, params): + z_key = res_template(key_template, params) + index_list = 
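
`RedisIndexSource` stores cdxj lines as members of a sorted set, all with score 0, so redis orders them lexicographically and the `zrangebylex` call below can perform the same prefix range scan that `iter_range` does over a flat file. A sketch of writing and querying such a key (the local redis URL and key name are assumptions for illustration):

import redis

r = redis.StrictRedis.from_url('redis://localhost/2')

line = 'com,example)/ 20160225042329 {"url": "http://example.com/"}'
# ZADD via execute_command to stay compatible across redis-py versions
r.execute_command('ZADD', 'rec:cdxj', 0, line)

# every member starting with the urlkey 'com,example)/':
# '[' = inclusive min, '(' = exclusive max ('0' is the byte after '/')
entries = r.zrangebylex('rec:cdxj', b'[com,example)/', b'(com,example)0')
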
self.redis.zrangebylex(z_key, + b'[' + params['key'], + b'(' + params['end_key']) + + def do_load(index_list): + for line in index_list: + yield CDXObject(line) + + return do_load(index_list) + + def __str__(self): + return 'redis' + + +#============================================================================= +class MementoIndexSource(BaseIndexSource): + def __init__(self, timegate_url, timemap_url, replay_url): + self.timegate_url = timegate_url + self.timemap_url = timemap_url + self.replay_url = replay_url + + def links_to_cdxobject(self, link_header, def_name): + results = MementoUtils.parse_links(link_header, def_name) + + #meta = MementoUtils.meta_field('timegate', results) + #if meta: + # yield meta + + #meta = MementoUtils.meta_field('timemap', results) + #if meta: + # yield meta + + #meta = MementoUtils.meta_field('original', results) + #if meta: + # yield meta + + original = results['original']['url'] + key = canonicalize(original) + + mementos = results['mementos'] + + for val in mementos: + dt = val['datetime'] + ts = http_date_to_timestamp(dt) + cdx = CDXObject() + cdx['urlkey'] = key + cdx['timestamp'] = ts + cdx['url'] = original + cdx['mem_rel'] = val.get('rel', '') + cdx['memento_url'] = val['url'] + + load_url = self.replay_url.format(timestamp=cdx['timestamp'], + url=original) + + cdx['load_url'] = load_url + yield cdx + + def get_timegate_links(self, params, closest): + url = res_template(self.timegate_url, params) + accept_dt = timestamp_to_http_date(closest) + res = requests.head(url, headers={'Accept-Datetime': accept_dt}) + if res.status_code >= 400: + raise NotFoundException(url) + + return res.headers.get('Link') + + def get_timemap_links(self, params): + url = res_template(self.timemap_url, params) + res = requests.get(url, timeout=params.get('_timeout')) + if res.status_code >= 400: + raise NotFoundException(url) + + return res.text + + def load_index(self, params): + closest = params.get('closest') + + if not closest: + links = self.get_timemap_links(params) + def_name = 'timemap' + else: + links = self.get_timegate_links(params, closest) + def_name = 'timegate' + + return self.links_to_cdxobject(links, def_name) + + @staticmethod + def from_timegate_url(timegate_url, path='link'): + return MementoIndexSource(timegate_url + '{url}', + timegate_url + 'timemap/' + path + '/{url}', + timegate_url + WAYBACK_ORIG_SUFFIX) + + def __str__(self): + return 'memento' + + diff --git a/webagg/inputrequest.py b/webagg/inputrequest.py new file mode 100644 index 00000000..50112959 --- /dev/null +++ b/webagg/inputrequest.py @@ -0,0 +1,170 @@ +from pywb.utils.loaders import extract_post_query, append_post_query +from pywb.utils.loaders import LimitReader +from pywb.utils.statusandheaders import StatusAndHeadersParser + +from six.moves.urllib.parse import urlsplit, quote +from six import iteritems, StringIO +from io import BytesIO + + +#============================================================================= +class DirectWSGIInputRequest(object): + def __init__(self, env): + self.env = env + + def get_req_method(self): + return self.env['REQUEST_METHOD'].upper() + + def get_req_protocol(self): + return self.env['SERVER_PROTOCOL'] + + def get_req_headers(self): + headers = {} + + for name, value in iteritems(self.env): + # will be set by requests to match actual host + if name == 'HTTP_HOST': + continue + + elif name.startswith('HTTP_'): + name = name[5:].title().replace('_', '-') + + elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'): + name = name.title().replace('_', 
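
The `get_req_headers()` method here reconstructs real header names from the WSGI environ: `HTTP_*` keys are de-prefixed, the two unprefixed CGI variables are special-cased, and the `title()` plus underscore swap restores canonical casing. In isolation:

def environ_name_to_header(name):
    # illustrative helper mirroring the mapping in get_req_headers()
    if name.startswith('HTTP_'):
        name = name[5:]
    elif name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
        return None
    return name.title().replace('_', '-')

assert environ_name_to_header('HTTP_USER_AGENT') == 'User-Agent'
assert environ_name_to_header('CONTENT_TYPE') == 'Content-Type'
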
'-') + + else: + value = None + + if value: + headers[name] = value + + return headers + + def get_req_body(self): + input_ = self.env['wsgi.input'] + len_ = self._get_content_length() + enc = self._get_header('Transfer-Encoding') + + if len_: + data = LimitReader(input_, int(len_)) + elif enc: + data = input_ + else: + data = None + + return data + + def _get_content_type(self): + return self.env.get('CONTENT_TYPE') + + def _get_content_length(self): + return self.env.get('CONTENT_LENGTH') + + def _get_header(self, name): + return self.env.get('HTTP_' + name.upper().replace('-', '_')) + + def include_post_query(self, url): + if not url or self.get_req_method() != 'POST': + return url + + mime = self._get_content_type() + #mime = mime.split(';')[0] if mime else '' + length = self._get_content_length() + stream = self.env['wsgi.input'] + + buffered_stream = BytesIO() + + post_query = extract_post_query('POST', mime, length, stream, + buffered_stream=buffered_stream, + environ=self.env) + + if post_query: + self.env['wsgi.input'] = buffered_stream + url = append_post_query(url, post_query) + + return url + + def get_full_request_uri(self): + req_uri = self.env.get('REQUEST_URI') + if req_uri and not self.env.get('SCRIPT_NAME'): + return req_uri + + req_uri = quote(self.env.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@') + query = self.env.get('QUERY_STRING') + if query: + req_uri += '?' + query + + return req_uri + + def reconstruct_request(self, url=None): + buff = StringIO() + buff.write(self.get_req_method()) + buff.write(' ') + buff.write(self.get_full_request_uri()) + buff.write(' ') + buff.write(self.get_req_protocol()) + buff.write('\r\n') + + headers = self.get_req_headers() + + if url: + parts = urlsplit(url) + buff.write('Host: ') + buff.write(parts.netloc) + buff.write('\r\n') + + for name, value in iteritems(headers): + if name.lower() == 'host': + continue + + buff.write(name) + buff.write(': ') + buff.write(value) + buff.write('\r\n') + + buff.write('\r\n') + buff = buff.getvalue().encode('latin-1') + + body = self.get_req_body() + if body: + buff += body.read() + + return buff + + +#============================================================================= +class POSTInputRequest(DirectWSGIInputRequest): + def __init__(self, env): + self.env = env + + parser = StatusAndHeadersParser([], verify=False) + + self.status_headers = parser.parse(self.env['wsgi.input']) + + def get_req_method(self): + return self.status_headers.protocol + + def get_req_headers(self): + headers = {} + for n, v in self.status_headers.headers: + headers[n] = v + + return headers + + def get_full_request_uri(self): + return self.status_headers.statusline.split(' ', 1)[0] + + def get_req_protocol(self): + return self.status_headers.statusline.split(' ', 1)[-1] + + def _get_content_type(self): + return self.status_headers.get_header('Content-Type') + + def _get_content_length(self): + return self.status_headers.get_header('Content-Length') + + def _get_header(self, name): + return self.status_headers.get_header(name) + + + diff --git a/webagg/proxyindexsource.py b/webagg/proxyindexsource.py new file mode 100644 index 00000000..435c9240 --- /dev/null +++ b/webagg/proxyindexsource.py @@ -0,0 +1,54 @@ +from pywb.cdx.cdxobject import CDXObject +from pywb.utils.wbexception import NotFoundException +from webagg.indexsource import BaseIndexSource, RemoteIndexSource +from webagg.responseloader import LiveWebLoader +from webagg.utils import ParamFormatter, res_template +from pywb.utils.timeutils import 
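
`reconstruct_request()` above serializes the client's request as a literal HTTP message, which travels upstream as the body of a POST (the `application/request` payload seen earlier), and `POSTInputRequest` parses it back on the receiving side. A round-trip sketch of that framing, independent of either class:

req = ('GET /path HTTP/1.1\r\n'
       'Host: example.com\r\n'
       'Accept: */*\r\n'
       '\r\n')

statusline, rest = req.split('\r\n', 1)
method, uri, protocol = statusline.split(' ', 2)
headers = dict(line.split(': ', 1) for line in rest.split('\r\n') if line)

assert (method, uri, protocol) == ('GET', '/path', 'HTTP/1.1')
assert headers['Host'] == 'example.com'
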
timestamp_now + + +#============================================================================= +class UpstreamAggIndexSource(RemoteIndexSource): + def __init__(self, base_url): + api_url = base_url + '/index?url={url}' + proxy_url = base_url + '/resource?url={url}&closest={timestamp}' + super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename') + + def _set_load_url(self, cdx): + super(UpstreamAggIndexSource, self)._set_load_url(cdx) + cdx['offset'] = '0' + cdx.pop('load_url', '') + + +#============================================================================= +class ProxyMementoIndexSource(BaseIndexSource): + def __init__(self, proxy_url='{url}'): + self.proxy_url = proxy_url + self.loader = LiveWebLoader() + + def load_index(self, params): + cdx = CDXObject() + cdx['urlkey'] = params.get('key').decode('utf-8') + + closest = params.get('closest') + cdx['timestamp'] = closest if closest else timestamp_now() + cdx['url'] = params['url'] + cdx['load_url'] = res_template(self.proxy_url, params) + cdx['memento_url'] = cdx['load_url'] + return self._do_load(cdx, params) + + def _do_load(self, cdx, params): + result = self.loader.load_resource(cdx, params) + if not result: + raise NotFoundException('Not a memento: ' + cdx['url']) + + cdx['_cached_result'] = result + yield cdx + + def __str__(self): + return 'proxy' + + @staticmethod + def upstream_resource(base_url): + return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}') + + diff --git a/webagg/responseloader.py b/webagg/responseloader.py new file mode 100644 index 00000000..ecda0723 --- /dev/null +++ b/webagg/responseloader.py @@ -0,0 +1,436 @@ +from webagg.utils import MementoUtils, StreamIter, chunk_encode_iter +from webagg.utils import ParamFormatter +from webagg.indexsource import RedisIndexSource + +from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp +from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date +from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date + +from pywb.utils.wbexception import LiveResourceException, WbException +from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser + +from pywb.warc.resolvingloader import ResolvingLoader + +from six.moves.urllib.parse import urlsplit, quote, unquote + +from io import BytesIO + +import uuid +import six +import itertools +import json + +from requests.models import PreparedRequest +import urllib3 + + +#============================================================================= +class BaseLoader(object): + def __call__(self, cdx, params): + entry = self.load_resource(cdx, params) + if not entry: + return None, None + + warc_headers, other_headers, stream = entry + + out_headers = {} + out_headers['WebAgg-Type'] = 'warc' + out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/') + out_headers['Content-Type'] = 'application/warc-record' + + if not warc_headers: + if other_headers: + out_headers['Link'] = other_headers.get('Link') + out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime') + out_headers['Content-Length'] = other_headers.get('Content-Length') + + return out_headers, StreamIter(stream) + + out_headers['Link'] = MementoUtils.make_link( + warc_headers.get_header('WARC-Target-URI'), + 'original') + + memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date')) + out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt) + + warc_headers_buff = warc_headers.to_bytes() + + lenset 
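
On a hit, `BaseLoader.__call__` answers with a bare WARC record: the WARC header block, then the original HTTP status line and headers, then the raw payload, served under `Content-Type: application/warc-record` (chunked if no total length can be computed). The framing reduces to a simple generator:

def serialize_warc_record(warc_headers_buff, http_headers_buff, payload_stream):
    # illustrative only: StreamIter plays this role in the real code
    yield warc_headers_buff          # WARC/1.0 header block, already encoded
    if http_headers_buff:
        yield http_headers_buff      # original HTTP status line + headers
    while True:
        buff = payload_stream.read(8192)
        if not buff:
            break
        yield buff                   # raw response body, streamed through
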
= self._set_content_len(warc_headers.get_header('Content-Length'),
+                                    out_headers,
+                                    len(warc_headers_buff))
+
+        streamiter = StreamIter(stream,
+                                header1=warc_headers_buff,
+                                header2=other_headers)
+
+        if not lenset:
+            out_headers['Transfer-Encoding'] = 'chunked'
+            streamiter = chunk_encode_iter(streamiter)
+
+        return out_headers, streamiter
+
+    def _set_content_len(self, content_len_str, headers, existing_len):
+        # Try to set content-length, if it is available and valid
+        try:
+            content_len = int(content_len_str)
+        except (KeyError, TypeError, ValueError):
+            content_len = -1
+
+        if content_len >= 0:
+            content_len += existing_len
+            headers['Content-Length'] = str(content_len)
+            return True
+
+        return False
+
+    def raise_on_self_redirect(self, params, cdx, status_code, location_url):
+        """
+        Check if response is a 3xx redirect to the same url
+        If so, reject this capture to avoid causing redirect loop
+        """
+        if cdx.get('is_live'):
+            return
+
+        if not status_code.startswith('3') or status_code == '304':
+            return
+
+        request_url = params['url'].lower()
+        if not location_url:
+            return
+
+        location_url = location_url.lower()
+        if location_url.startswith('/'):
+            host = urlsplit(cdx['url']).netloc
+            location_url = host + location_url
+
+        if request_url == location_url:
+            msg = 'Self Redirect {0} -> {1}'
+            msg = msg.format(request_url, location_url)
+            #print(msg)
+            raise LiveResourceException(msg)
+
+    @staticmethod
+    def _make_warc_id(id_=None):
+        if not id_:
+            id_ = uuid.uuid1()
+        return '<urn:uuid:{0}>'.format(id_)
+
+
+#=============================================================================
+class PrefixResolver(object):
+    def __init__(self, template):
+        self.template = template
+
+    def __call__(self, filename, cdx):
+        full_path = self.template
+        if hasattr(cdx, '_formatter') and cdx._formatter:
+            full_path = cdx._formatter.format(full_path)
+
+        return full_path + filename
+
+
+#=============================================================================
+class RedisResolver(RedisIndexSource):
+    def __call__(self, filename, cdx):
+        redis_key = self.redis_key_template
+        if hasattr(cdx, '_formatter') and cdx._formatter:
+            redis_key = cdx._formatter.format(redis_key)
+
+        res = None
+
+        if '*' in redis_key:
+            for key in self.redis.scan_iter(redis_key):
+                #key = key.decode('utf-8')
+                res = self.redis.hget(key, filename)
+                if res:
+                    break
+        else:
+            res = self.redis.hget(redis_key, filename)
+
+        if res and six.PY3:
+            res = res.decode('utf-8')
+
+        return res
+
+
+#=============================================================================
+class WARCPathLoader(BaseLoader):
+    def __init__(self, paths, cdx_source):
+        self.paths = paths
+        if isinstance(paths, six.string_types):
+            self.paths = [paths]
+
+        self.resolvers = [self._make_resolver(path) for path in self.paths]
+
+        self.resolve_loader = ResolvingLoader(self.resolvers,
+                                              no_record_parse=True)
+
+        self.headers_parser = StatusAndHeadersParser([], verify=False)
+
+        self.cdx_source = cdx_source
+
+    def _make_resolver(self, path):
+        if hasattr(path, '__call__'):
+            return path
+
+        if path.startswith('redis://'):
+            return RedisResolver(path)
+
+        else:
+            return PrefixResolver(path)
+
+    def load_resource(self, cdx, params):
+        if cdx.get('_cached_result'):
+            return cdx.get('_cached_result')
+
+        if not cdx.get('filename') or cdx.get('offset') is None:
+            return None
+
+        orig_source = cdx.get('source', '').split(':')[0]
+        formatter = ParamFormatter(params, orig_source)
+        cdx._formatter = formatter
+
+        def local_index_query(local_params):
+            for n, v in six.iteritems(params):
+ if n.startswith('param.'): + local_params[n] = v + + cdx_iter, errs = self.cdx_source(local_params) + for cdx in cdx_iter: + cdx._formatter = formatter + yield cdx + + return cdx_iter + + failed_files = [] + headers, payload = (self.resolve_loader. + load_headers_and_payload(cdx, + failed_files, + local_index_query)) + + status = cdx.get('status') + if not status or status.startswith('3'): + status_headers = self.headers_parser.parse(payload.stream) + self.raise_on_self_redirect(params, cdx, + status_headers.get_statuscode(), + status_headers.get_header('Location')) + http_headers_buff = status_headers.to_bytes() + else: + http_headers_buff = None + + warc_headers = payload.rec_headers + + if headers != payload: + warc_headers.replace_header('WARC-Refers-To-Target-URI', + payload.rec_headers.get_header('WARC-Target-URI')) + + warc_headers.replace_header('WARC-Refers-To-Date', + payload.rec_headers.get_header('WARC-Date')) + + warc_headers.replace_header('WARC-Target-URI', + headers.rec_headers.get_header('WARC-Target-URI')) + + warc_headers.replace_header('WARC-Date', + headers.rec_headers.get_header('WARC-Date')) + + headers.stream.close() + + return (warc_headers, http_headers_buff, payload.stream) + + def __str__(self): + return 'WARCPathLoader' + + +#============================================================================= +class LiveWebLoader(BaseLoader): + SKIP_HEADERS = ('link', + 'memento-datetime', + 'content-location', + 'x-archive') + + def __init__(self): + self.num_retries = 3 + self.num_pools = 10 + self.num_conn_per_pool = 10 + + self.pool = urllib3.PoolManager(num_pools=self.num_pools, + maxsize=self.num_conn_per_pool) + + def load_resource(self, cdx, params): + load_url = cdx.get('load_url') + if not load_url: + return None + + if params.get('content_type') == VideoLoader.CONTENT_TYPE: + return None + + input_req = params['_input_req'] + + req_headers = input_req.get_req_headers() + + dt = timestamp_to_datetime(cdx['timestamp']) + + if cdx.get('memento_url'): + req_headers['Accept-Datetime'] = datetime_to_http_date(dt) + + method = input_req.get_req_method() + data = input_req.get_req_body() + + p = PreparedRequest() + p.prepare_url(load_url, None) + p.prepare_headers(None) + p.prepare_auth(None, load_url) + + auth = p.headers.get('Authorization') + if auth: + req_headers['Authorization'] = auth + + load_url = p.url + + try: + upstream_res = self.pool.urlopen(method=method, + url=load_url, + body=data, + headers=req_headers, + redirect=False, + assert_same_host=False, + preload_content=False, + decode_content=False, + retries=self.num_retries, + timeout=params.get('_timeout')) + + except Exception as e: + raise LiveResourceException(load_url) + + memento_dt = upstream_res.headers.get('Memento-Datetime') + if memento_dt: + dt = http_date_to_datetime(memento_dt) + cdx['timestamp'] = datetime_to_timestamp(dt) + elif cdx.get('memento_url'): + # if 'memento_url' set and no Memento-Datetime header present + # then its an error + return None + + agg_type = upstream_res.headers.get('WebAgg-Type') + if agg_type == 'warc': + cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll')) + return None, upstream_res.headers, upstream_res + + self.raise_on_self_redirect(params, cdx, + str(upstream_res.status), + upstream_res.headers.get('Location')) + + + if upstream_res.version == 11: + version = '1.1' + else: + version = '1.0' + + status = 'HTTP/{version} {status} {reason}\r\n' + status = status.format(version=version, + status=upstream_res.status, + 
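
When the resolved record is a revisit, the header juggling above serves the original record's payload under the revisit's URI and date, while the `WARC-Refers-To-*` fields keep pointing back at the capture that actually holds the bytes. Schematically, with plain dicts standing in for `StatusAndHeaders`:

def merge_revisit(revisit_rec_headers, original_rec_headers):
    # illustrative reduction of the replace_header() calls above
    merged = dict(original_rec_headers)
    merged['WARC-Refers-To-Target-URI'] = original_rec_headers['WARC-Target-URI']
    merged['WARC-Refers-To-Date'] = original_rec_headers['WARC-Date']
    merged['WARC-Target-URI'] = revisit_rec_headers['WARC-Target-URI']
    merged['WARC-Date'] = revisit_rec_headers['WARC-Date']
    return merged
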
reason=upstream_res.reason) + + http_headers_buff = status + + orig_resp = upstream_res._original_response + + try: #pragma: no cover + #PY 3 + resp_headers = orig_resp.headers._headers + for n, v in resp_headers: + if n.lower() in self.SKIP_HEADERS: + continue + + http_headers_buff += n + ': ' + v + '\r\n' + except: #pragma: no cover + #PY 2 + resp_headers = orig_resp.msg.headers + for n, v in zip(orig_resp.getheaders(), resp_headers): + if n in self.SKIP_HEADERS: + continue + + http_headers_buff += v + + http_headers_buff += '\r\n' + http_headers_buff = http_headers_buff.encode('latin-1') + + try: + fp = upstream_res._fp.fp + if hasattr(fp, 'raw'): #pragma: no cover + fp = fp.raw + remote_ip = fp._sock.getpeername()[0] + except: #pragma: no cover + remote_ip = None + + warc_headers = {} + + warc_headers['WARC-Type'] = 'response' + warc_headers['WARC-Record-ID'] = self._make_warc_id() + warc_headers['WARC-Target-URI'] = cdx['url'] + warc_headers['WARC-Date'] = datetime_to_iso_date(dt) + if remote_ip: + warc_headers['WARC-IP-Address'] = remote_ip + + warc_headers['Content-Type'] = 'application/http; msgtype=response' + + self._set_content_len(upstream_res.headers.get('Content-Length', -1), + warc_headers, + len(http_headers_buff)) + + warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) + return (warc_headers, http_headers_buff, upstream_res) + + def __str__(self): + return 'LiveWebLoader' + + +#============================================================================= +class VideoLoader(BaseLoader): + CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json' + + def __init__(self): + try: + from youtube_dl import YoutubeDL as YoutubeDL + except ImportError: + self.ydl = None + return + + self.ydl = YoutubeDL(dict(simulate=True, + youtube_include_dash_manifest=False)) + + self.ydl.add_default_info_extractors() + + def load_resource(self, cdx, params): + load_url = cdx.get('load_url') + if not load_url: + return None + + if params.get('content_type') != self.CONTENT_TYPE: + return None + + if not self.ydl: + return None + + info = self.ydl.extract_info(load_url) + info_buff = json.dumps(info) + info_buff = info_buff.encode('utf-8') + + warc_headers = {} + + schema, rest = load_url.split('://', 1) + target_url = 'metadata://' + rest + + dt = timestamp_to_datetime(cdx['timestamp']) + + warc_headers['WARC-Type'] = 'metadata' + warc_headers['WARC-Record-ID'] = self._make_warc_id() + warc_headers['WARC-Target-URI'] = target_url + warc_headers['WARC-Date'] = datetime_to_iso_date(dt) + warc_headers['Content-Type'] = self.CONTENT_TYPE + warc_headers['Content-Length'] = str(len(info_buff)) + + warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) + + return warc_headers, None, BytesIO(info_buff) + diff --git a/webagg/test/__init__.py b/webagg/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/webagg/test/live.ini b/webagg/test/live.ini new file mode 100644 index 00000000..f63d5896 --- /dev/null +++ b/webagg/test/live.ini @@ -0,0 +1,17 @@ +[uwsgi] +if-not-env = PORT +http-socket = :8080 +endif = + +master = true +buffer-size = 65536 +die-on-term = true + +if-env = VIRTUAL_ENV +venv = $(VIRTUAL_ENV) +endif = + +gevent = 100 +gevent-monkey-patch = + +wsgi = webagg.test.live diff --git a/webagg/test/live.py b/webagg/test/live.py new file mode 100644 index 00000000..2e4f84a9 --- /dev/null +++ b/webagg/test/live.py @@ -0,0 +1,44 @@ +from gevent.monkey import patch_all; patch_all() + +from webagg.test.testutils import LiveServerTests +from webagg.handlers 
import DefaultResourceHandler
+from webagg.app import ResAggApp
+from webagg.indexsource import LiveIndexSource, RedisIndexSource
+from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
+
+def simpleapp():
+    app = ResAggApp(debug=True)
+    app.add_route('/live',
+        DefaultResourceHandler(SimpleAggregator(
+                               {'live': LiveIndexSource()})
+        )
+    )
+
+    app.add_route('/replay',
+        DefaultResourceHandler(SimpleAggregator(
+                               {'replay': RedisIndexSource('redis://localhost/2/rec:cdxj')}),
+                               'redis://localhost/2/rec:warc'
+        )
+    )
+
+    app.add_route('/replay-testdata',
+        DefaultResourceHandler(SimpleAggregator(
+                               {'test': CacheDirectoryIndexSource('./testdata/')}),
+                               './testdata/'
+        )
+    )
+    return app
+
+
+
+application = simpleapp()
+
+
+if __name__ == "__main__":
+#    from bottle import run
+#    run(application, server='gevent', port=8080, fast=True)
+
+    from gevent.wsgi import WSGIServer
+    server = WSGIServer(('', 8080), application)
+    server.serve_forever()
+
diff --git a/webagg/test/test_dir_agg.py b/webagg/test/test_dir_agg.py
new file mode 100644
index 00000000..bce07046
--- /dev/null
+++ b/webagg/test/test_dir_agg.py
@@ -0,0 +1,216 @@
+import tempfile
+import os
+import shutil
+import json
+
+from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
+
+from mock import patch
+
+import time
+
+from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
+from webagg.aggregator import SimpleAggregator
+from webagg.indexsource import MementoIndexSource
+
+
+#=============================================================================
+linkheader = """\
+<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
+"""
+
+
+def mock_link_header(*args, **kwargs):
+    return linkheader
+
+
+class TestDirAgg(TempDirTests, BaseTestClass):
+    @classmethod
+    def setup_class(cls):
+        super(TestDirAgg, cls).setup_class()
+        coll_A = to_path(cls.root_dir + '/colls/A/indexes')
+        coll_B = to_path(cls.root_dir + '/colls/B/indexes')
+        coll_C = to_path(cls.root_dir + '/colls/C/indexes')
+
+        os.makedirs(coll_A)
+        os.makedirs(coll_B)
+        os.makedirs(coll_C)
+
+        dir_prefix = to_path(cls.root_dir)
+        dir_path = 'colls/{coll}/indexes'
+
+        shutil.copy(to_path('testdata/example.cdxj'), coll_A)
+        shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
+        shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
+
+        with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
+            fh.write('foo')
+
+        cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
+        cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
+
+    def test_agg_no_coll_set(self):
+        res, errs = self.dir_loader(dict(url='example.com/'))
+        assert(to_json_list(res) == [])
+        assert(errs == {})
+
+    def test_agg_collA_found(self):
+        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
+
+        exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
+
+        assert(to_json_list(res) == exp)
+        assert(errs == {})
+
+    def test_agg_collB(self):
+        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'B'})
+
+        exp = []
+
+        assert(to_json_list(res) == exp)
+        assert(errs == {})
+
+    def test_agg_collB_found(self):
+        res, errs = 
self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'}) + + exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + + def test_extra_agg_collB(self): + agg_source = SimpleAggregator({'dir': self.dir_loader}) + res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'}) + + exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + + def test_agg_all_found_1(self): + res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'}) + + exp = [ + {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + ] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + + def test_agg_all_found_2(self): + res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'}) + + exp = [ + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + ] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + @patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header) + def test_agg_dir_and_memento(self): + sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), + 'local': self.dir_loader} + agg_source = SimpleAggregator(sources) + + res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6}) + + exp = [ + {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'}, + {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'}, + {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'}, + {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + ] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + + def test_agg_no_dir_1(self): + res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'X'}) + + exp = [] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + + def test_agg_no_dir_2(self): + loader = DirectoryIndexSource(self.root_dir, '') + res, errs = loader({'url': 'example.com/', 'param.coll': 'X'}) + + exp = [] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + + def test_agg_dir_sources_1(self): + res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) + exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + 'colls/B/indexes/iana.cdxj': 'file', + 'colls/C/indexes/dupes.cdxj': 'file'} + } + + assert(res == exp) + + + def test_agg_dir_sources_2(self): + res = self.dir_loader.get_source_list({'url': 'example.com/', 
'param.coll': '[A,C]'}) + exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + 'colls/C/indexes/dupes.cdxj': 'file'} + } + + assert(res == exp) + + + def test_agg_dir_sources_single_dir(self): + loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '') + res = loader.get_source_list({'url': 'example.com/'}) + + exp = {'sources': {'example.cdxj': 'file'}} + + assert(res == exp) + + + def test_agg_dir_sources_not_found_dir(self): + loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'Z', 'indexes'), '') + res = loader.get_source_list({'url': 'example.com/'}) + + exp = {'sources': {}} + + assert(res == exp) + + + + def test_cache_dir_sources_1(self): + exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + 'colls/B/indexes/iana.cdxj': 'file', + 'colls/C/indexes/dupes.cdxj': 'file'} + } + + res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) + assert(res == exp) + + res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) + assert(res == exp) + + new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj') + + with open(new_file, 'a') as fh: + os.utime(new_file, None) + + res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) + + # New File Included + exp['sources']['colls/C/indexes/empty.cdxj'] = 'file' + assert(res == exp) diff --git a/webagg/test/test_handlers.py b/webagg/test/test_handlers.py new file mode 100644 index 00000000..6fb5c8d8 --- /dev/null +++ b/webagg/test/test_handlers.py @@ -0,0 +1,463 @@ +#from gevent import monkey; monkey.patch_all(thread=False) + +from collections import OrderedDict + +from webagg.handlers import DefaultResourceHandler, HandlerSeq + +from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource +from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator +from webagg.aggregator import DirectoryIndexSource + +from webagg.app import ResAggApp +from webagg.utils import MementoUtils + +from pywb.utils.statusandheaders import StatusAndHeadersParser +from pywb.utils.bufferedreaders import ChunkedDataReader +from io import BytesIO +from six.moves.urllib.parse import urlencode + +import webtest +from fakeredis import FakeStrictRedis + +from .testutils import to_path, FakeRedisTests, BaseTestClass + +import json + +sources = { + 'local': DirectoryIndexSource(to_path('testdata/'), ''), + 'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), + 'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'), + 'live': LiveIndexSource(), +} + + +class TestResAgg(FakeRedisTests, BaseTestClass): + def setup_class(cls): + super(TestResAgg, cls).setup_class() + + live_source = SimpleAggregator({'live': LiveIndexSource()}) + live_handler = DefaultResourceHandler(live_source) + app = ResAggApp() + app.add_route('/live', live_handler) + + source1 = GeventTimeoutAggregator(sources) + handler1 = DefaultResourceHandler(source1, to_path('testdata/')) + app.add_route('/many', handler1) + + source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))}) + handler2 = DefaultResourceHandler(source2, to_path('testdata/')) + app.add_route('/posttest', handler2) + + source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))}) + handler3 = DefaultResourceHandler(source3, to_path('testdata/')) + + app.add_route('/fallback', HandlerSeq([handler3, + handler2, + live_handler])) + + 
app.add_route('/seq', HandlerSeq([handler3, + handler2])) + + app.add_route('/allredis', DefaultResourceHandler(source3, 'redis://localhost/2/test:warc')) + + app.add_route('/empty', HandlerSeq([])) + app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})])) + + url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(to_path('testdata/url-agnost-example.cdxj'))}) + app.add_route('/urlagnost', DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc')) + + cls.testapp = webtest.TestApp(app) + + def _check_uri_date(self, resp, uri, dt): + buff = BytesIO(resp.body) + buff = ChunkedDataReader(buff) + status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff) + assert status_headers.get_header('WARC-Target-URI') == uri + if dt == True: + assert status_headers.get_header('WARC-Date') != '' + else: + assert status_headers.get_header('WARC-Date') == dt + + def test_list_routes(self): + resp = self.testapp.get('/') + res = resp.json + assert set(res.keys()) == set(['/empty', '/empty/postreq', + '/fallback', '/fallback/postreq', + '/live', '/live/postreq', + '/many', '/many/postreq', + '/posttest', '/posttest/postreq', + '/seq', '/seq/postreq', + '/allredis', '/allredis/postreq', + '/urlagnost', '/urlagnost/postreq', + '/invalid', '/invalid/postreq']) + + assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']} + + def test_list_handlers(self): + resp = self.testapp.get('/many') + assert resp.json == {'modes': ['list_sources', 'index', 'resource']} + assert 'ResErrors' not in resp.headers + + resp = self.testapp.get('/many/other') + assert resp.json == {'modes': ['list_sources', 'index', 'resource']} + assert 'ResErrors' not in resp.headers + + def test_list_errors(self): + # must specify url for index or resource + resp = self.testapp.get('/many/index', status=400) + assert resp.json == {'message': 'The "url" param is required'} + assert resp.text == resp.headers['ResErrors'] + + resp = self.testapp.get('/many/resource', status=400) + assert resp.json == {'message': 'The "url" param is required'} + assert resp.text == resp.headers['ResErrors'] + + def test_list_sources(self): + resp = self.testapp.get('/many/list_sources') + assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}} + assert 'ResErrors' not in resp.headers + + def test_live_index(self): + resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=json') + resp.charset = 'utf-8' + + cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')]) + cdxlist[0]['timestamp'] = '2016' + assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true', + 'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}]) + + def test_live_resource(self): + headers = {'foo': 'bar'} + resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers) + + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True) + + assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original') + assert resp.headers['Memento-Datetime'] != '' + + assert b'HTTP/1.1 200 OK' in resp.body + assert b'"foo": "bar"' in resp.body + + assert 
'ResErrors' not in resp.headers + + def test_live_post_resource(self): + resp = self.testapp.post('/live/resource?url=http://httpbin.org/post', + OrderedDict([('foo', 'bar')])) + + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://httpbin.org/post', True) + + assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original') + assert resp.headers['Memento-Datetime'] != '' + + assert b'HTTP/1.1 200 OK' in resp.body + assert b'"foo": "bar"' in resp.body + + assert 'ResErrors' not in resp.headers + + def test_agg_select_mem_1(self): + resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001') + + assert resp.headers['WebAgg-Source-Coll'] == 'rhiz' + + self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z') + + assert b'HTTP/1.1 200 OK' in resp.body + + assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original') + assert resp.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT' + + assert 'ResErrors' not in resp.headers + + def test_agg_select_mem_2(self): + resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231') + + assert resp.headers['WebAgg-Source-Coll'] == 'ia' + + self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z') + + assert b'HTTP/1.1 200 OK' in resp.body + + assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original') + assert resp.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT' + + assert 'ResErrors' not in resp.headers + + def test_agg_select_live(self): + resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016') + + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://vvork.com/', True) + + assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original') + assert resp.headers['Memento-Datetime'] != '' + + assert 'ResErrors' not in resp.headers + + def test_agg_select_local(self): + resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624') + + assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj' + + self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z') + + assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original') + assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT' + + assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"} + + def test_agg_select_local_postreq(self): + req_data = """\ +GET / HTTP/1.1 +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 +User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 +Host: iana.org +""" + + resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data) + + assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj' + + self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z') + + assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original') + assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT' + + assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"} + + def test_agg_live_postreq(self): + req_data = """\ +GET /get?foo=bar HTTP/1.1 +Accept: 
text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 +User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 +Host: httpbin.org +""" + + resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data) + + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True) + + assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original') + assert resp.headers['Memento-Datetime'] != '' + + assert b'HTTP/1.1 200 OK' in resp.body + assert b'"foo": "bar"' in resp.body + + assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"} + + def test_agg_post_resolve_postreq(self): + req_data = """\ +POST /post HTTP/1.1 +content-length: 16 +accept-encoding: gzip, deflate +accept: */* +host: httpbin.org +content-type: application/x-www-form-urlencoded + +foo=bar&test=abc""" + + resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data) + + assert resp.headers['WebAgg-Source-Coll'] == 'post' + + self._check_uri_date(resp, 'http://httpbin.org/post', True) + + assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original') + assert resp.headers['Memento-Datetime'] != '' + + assert b'HTTP/1.1 200 OK' in resp.body + assert b'"foo": "bar"' in resp.body + assert b'"test": "abc"' in resp.body + assert b'"url": "http://httpbin.org/post"' in resp.body + + assert 'ResErrors' not in resp.headers + + def test_agg_post_resolve_fallback(self): + req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')]) + + resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data) + + assert resp.headers['WebAgg-Source-Coll'] == 'post' + + self._check_uri_date(resp, 'http://httpbin.org/post', True) + + assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original') + + assert b'HTTP/1.1 200 OK' in resp.body + assert b'"foo": "bar"' in resp.body + assert b'"test": "abc"' in resp.body + assert b'"url": "http://httpbin.org/post"' in resp.body + + assert 'ResErrors' not in resp.headers + + def test_agg_seq_fallback_1(self): + resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/') + + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + self._check_uri_date(resp, 'http://www.iana.org/', True) + + assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original') + + assert b'HTTP/1.1 200 OK' in resp.body + + assert 'ResErrors' not in resp.headers + + def test_agg_seq_fallback_2(self): + resp = self.testapp.get('/fallback/resource?url=http://www.example.com/') + + assert resp.headers['WebAgg-Source-Coll'] == 'example' + + self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z') + + assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original') + assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT' + + assert b'HTTP/1.1 200 OK' in resp.body + + assert 'ResErrors' not in resp.headers + + def test_redis_warc_1(self): + f = FakeStrictRedis.from_url('redis://localhost/2') + f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz') + + resp = self.testapp.get('/allredis/resource?url=http://www.example.com/') + + assert resp.headers['WebAgg-Source-Coll'] == 'example' + + def 
test_url_agnost(self): + f = FakeStrictRedis.from_url('redis://localhost/2') + f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', './testdata/example-url-agnostic-revisit.warc.gz') + f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', './testdata/example-url-agnostic-orig.warc.gz') + + resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo') + + assert resp.status_int == 200 + assert resp.headers['Link'] == MementoUtils.make_link('http://test@example.com/', 'original') + assert resp.headers['WebAgg-Source-Coll'] == 'url-agnost' + assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT' + + def test_live_video_loader(self): + params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc', + 'content_type': 'application/vnd.youtube-dl_formats+json' + } + + resp = self.testapp.get('/live/resource', params=params) + + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True) + + assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original') + assert resp.headers['Memento-Datetime'] != '' + + assert b'WARC-Type: metadata' in resp.body + assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body + + def test_live_video_loader_post(self): + req_data = """\ +GET /v/BfBgWtAIbRc HTTP/1.1 +accept-encoding: gzip, deflate +accept: */* +host: www.youtube.com\ +""" + + params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc', + 'content_type': 'application/vnd.youtube-dl_formats+json' + } + + resp = self.testapp.post('/live/resource/postreq?&' + urlencode(params), req_data) + + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True) + + assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original') + assert resp.headers['Memento-Datetime'] != '' + + assert b'WARC-Type: metadata' in resp.body + assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body + + def test_error_redis_file_not_found(self): + f = FakeStrictRedis.from_url('redis://localhost/2') + f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz') + + resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503) + assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'" + + f.hdel('test:warc', 'example.warc.gz') + resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503) + + assert resp.json == {'message': 'example.warc.gz: Archive File Not Found', + 'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}} + + f.delete('test:warc') + resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503) + + assert resp.json == {'message': 'example.warc.gz: Archive File Not Found', + 'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}} + + + def test_error_fallback_live_not_found(self): + resp = self.testapp.get('/fallback/resource?url=http://invalid.url-not-found', status=400) + + assert resp.json == {'message': 'http://invalid.url-not-found/', + 'errors': {'LiveWebLoader': 'http://invalid.url-not-found/'}} + + assert resp.text == resp.headers['ResErrors'] + + def test_agg_local_revisit(self): + resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local') + + assert 
resp.headers['WebAgg-Source-Coll'] == 'local:dupes.cdxj' + + buff = BytesIO(resp.body) + status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff) + assert status_headers.get_header('WARC-Target-URI') == 'http://example.com' + assert status_headers.get_header('WARC-Date') == '2014-01-27T17:12:51Z' + assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://example.com' + assert status_headers.get_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z' + + assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original') + assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' + + assert b'HTTP/1.1 200 OK' in resp.body + assert b'<!doctype html>' in resp.body + + assert 'ResErrors' not in resp.headers + + def test_error_invalid_index_output(self): + resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=foobar', status=400) + + assert resp.json == {'message': 'output=foobar not supported'} + assert resp.text == resp.headers['ResErrors'] + + def test_error_local_not_found(self): + resp = self.testapp.get('/many/resource?url=http://not-found.error/&sources=local', status=404) + + assert resp.json == {'message': 'No Resource Found'} + assert resp.text == resp.headers['ResErrors'] + + def test_error_empty(self): + resp = self.testapp.get('/empty/resource?url=http://example.com/', status=404) + + assert resp.json == {'message': 'No Resource Found'} + assert resp.text == resp.headers['ResErrors'] + + def test_error_invalid(self): + resp = self.testapp.get('/invalid/resource?url=http://example.com/', status=500) + + assert resp.json == {'message': "Internal Error: 'list' object is not callable"} + assert resp.text == resp.headers['ResErrors'] + + diff --git a/webagg/test/test_indexsource.py b/webagg/test/test_indexsource.py new file mode 100644 index 00000000..40dc825e --- /dev/null +++ b/webagg/test/test_indexsource.py @@ -0,0 +1,219 @@ +from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource +from webagg.indexsource import LiveIndexSource + +from webagg.aggregator import SimpleAggregator + +from pywb.utils.timeutils import timestamp_now + +from .testutils import key_ts_res + + +import pytest + +from fakeredis import FakeStrictRedis +from mock import patch + +redismock = patch('redis.StrictRedis', FakeStrictRedis) +redismock.start() + +def setup_module(): + r = FakeStrictRedis.from_url('redis://localhost:6379/2') + r.delete('test:rediscdx') + with open('testdata/iana.cdxj', 'rb') as fh: + for line in fh: + r.zadd('test:rediscdx', 0, line.rstrip()) + + +def teardown_module(): + redismock.stop() + + +local_sources = [ + FileIndexSource('testdata/iana.cdxj'), + RedisIndexSource('redis://localhost:6379/2/test:rediscdx') +] + + +remote_sources = [ + RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}', + 'http://webenact.rhizome.org/all/{timestamp}id_/{url}'), + + MementoIndexSource('http://webenact.rhizome.org/all/{url}', + 'http://webenact.rhizome.org/all/timemap/*/{url}', + 'http://webenact.rhizome.org/all/{timestamp}id_/{url}') +] + +ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}', + 'http://wayback.archive-it.org/all/{timestamp}id_/{url}') + + +def query_single_source(source, params): + string = str(source) + return SimpleAggregator({'source': source})(params) + + + +# Url Match -- Local Loaders +# ============================================================================ +@pytest.mark.parametrize("source", local_sources, ids=["file", 
"redis"]) +def test_local_cdxj_loader(source): + url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf' + res, errs = query_single_source(source, dict(url=url, limit=3)) + + expected = """\ +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz""" + + assert(key_ts_res(res) == expected) + assert(errs == {}) + + +# Closest -- Local Loaders +# ============================================================================ +@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"]) +def test_local_closest_loader(source): + url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf' + res, errs = query_single_source(source, dict(url=url, + closest='20140126200930', + limit=3)) + + expected = """\ +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz""" + + assert(key_ts_res(res) == expected) + assert(errs == {}) + + +# Prefix -- Local Loaders +# ============================================================================ +@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"]) +def test_file_prefix_loader(source): + res, errs = query_single_source(source, dict(url='http://iana.org/domains/root/*')) + + expected = """\ +org,iana)/domains/root/db 20140126200927 iana.warc.gz +org,iana)/domains/root/db 20140126200928 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 iana.warc.gz""" + + assert(key_ts_res(res) == expected) + assert(errs == {}) + + +# Url Match -- Remote Loaders +# ============================================================================ +@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"]) +def test_remote_loader(source): + url = 'http://instagram.com/amaliaulman' + res, errs = query_single_source(source, dict(url=url)) + + expected = """\ +com,instagram)/amaliaulman 20141014150552 http://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman +com,instagram)/amaliaulman 20141014155217 http://webenact.rhizome.org/all/20141014155217id_/http://instagram.com/amaliaulman +com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman +com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/20141014171636id_/http://instagram.com/amaliaulman""" + + assert(key_ts_res(res, 'load_url') == expected) + assert(errs == {}) + + +# Url Match -- Remote Loaders +# ============================================================================ +@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"]) +def test_remote_closest_loader(source): + url = 'http://instagram.com/amaliaulman' + res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1)) + + expected = """\ +com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman""" + + assert(key_ts_res(res, 'load_url') == expected) + assert(errs == {}) + + +# Url Match -- Memento +# ============================================================================ +@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"]) +def test_remote_closest_loader(source): + url = 'http://instagram.com/amaliaulman' + res, errs = 
query_single_source(source, dict(url=url, closest='20141014162332', limit=1)) + + expected = """\ +com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman""" + + assert(key_ts_res(res, 'load_url') == expected) + assert(errs == {}) + + +# Live Index -- No Load! +# ============================================================================ +def test_live(): + url = 'http://example.com/' + source = LiveIndexSource() + res, errs = query_single_source(source, dict(url=url)) + + expected = 'com,example)/ {0} http://example.com/'.format(timestamp_now()) + + assert(key_ts_res(res, 'load_url') == expected) + assert(errs == {}) + + +# Errors -- Not Found All +# ============================================================================ +@pytest.mark.parametrize("source", local_sources + remote_sources, ids=["file", "redis", "remote_cdx", "memento"]) +def test_all_not_found(source): + url = 'http://x-not-found-x.notfound/' + res, errs = query_single_source(source, dict(url=url, limit=3)) + + expected = '' + assert(key_ts_res(res) == expected) + if source == remote_sources[0]: + assert('http://x-not-found-x.notfound/' in errs['source']) + else: + assert(errs == {}) + + +# ============================================================================ +def test_another_remote_not_found(): + source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/') + url = 'http://x-not-found-x.notfound/' + res, errs = query_single_source(source, dict(url=url, limit=3)) + + + expected = '' + assert(key_ts_res(res) == expected) + assert(errs['source'] == "NotFoundException('http://www.webarchive.org.uk/wayback/archive/timemap/link/http://x-not-found-x.notfound/',)") + +# ============================================================================ +def test_file_not_found(): + source = FileIndexSource('testdata/not-found-x') + url = 'http://x-not-found-x.notfound/' + res, errs = query_single_source(source, dict(url=url, limit=3)) + + expected = '' + assert(key_ts_res(res) == expected) + assert(errs['source'] == "NotFoundException('testdata/not-found-x',)"), errs + + +# ============================================================================ +def test_ait_filters(): + ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*', + 'http://wayback.archive-it.org/all/{timestamp}id_/{url}') + + cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'}) + filenames = [cdx['filename'] for cdx in cdxlist] + + prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-') + + assert(all([x.startswith(prefix) for x in filenames])) + + + cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'}) + filenames = [cdx['filename'] for cdx in cdxlist] + + prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-') + + assert(all([x.startswith(prefix) for x in filenames])) + diff --git a/webagg/test/test_inputreq.py b/webagg/test/test_inputreq.py new file mode 100644 index 00000000..bdc47705 --- /dev/null +++ b/webagg/test/test_inputreq.py @@ -0,0 +1,67 @@ +from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest +from bottle import Bottle, request, response, debug +import webtest +import traceback + + +#============================================================================= +class InputReqApp(object): + def __init__(self): + self.application = 
Bottle() + debug(True) + + @self.application.route('/test/<url:re:.*>', 'ANY') + def direct_input_request(url=''): + inputreq = DirectWSGIInputRequest(request.environ) + response['Content-Type'] = 'text/plain; charset=utf-8' + return inputreq.reconstruct_request(url) + + @self.application.route('/test-postreq', 'POST') + def post_fullrequest(): + params = dict(request.query) + inputreq = POSTInputRequest(request.environ) + response['Content-Type'] = 'text/plain; charset=utf-8' + return inputreq.reconstruct_request(params.get('url')) + + +#============================================================================= +class TestInputReq(object): + def setup(self): + self.app = InputReqApp() + self.testapp = webtest.TestApp(self.app.application) + + def test_get_direct(self): + res = self.testapp.get('/test/http://example.com/', headers={'Foo': 'Bar'}) + assert res.text == '\ +GET /test/http://example.com/ HTTP/1.0\r\n\ +Host: example.com\r\n\ +Foo: Bar\r\n\ +\r\n\ +' + + def test_post_direct(self): + res = self.testapp.post('/test/http://example.com/', headers={'Foo': 'Bar'}, params='ABC') + lines = res.text.split('\r\n') + assert lines[0] == 'POST /test/http://example.com/ HTTP/1.0' + assert 'Host: example.com' in lines + assert 'Content-Length: 3' in lines + assert 'Content-Type: application/x-www-form-urlencoded' in lines + assert 'Foo: Bar' in lines + + assert 'ABC' in lines + + def test_post_req(self): + postdata = '\ +GET /example.html HTTP/1.0\r\n\ +Foo: Bar\r\n\ +\r\n\ +' + res = self.testapp.post('/test-postreq?url=http://example.com/', params=postdata) + + assert res.text == '\ +GET /example.html HTTP/1.0\r\n\ +Host: example.com\r\n\ +Foo: Bar\r\n\ +\r\n\ +' + diff --git a/webagg/test/test_memento_agg.py b/webagg/test/test_memento_agg.py new file mode 100644 index 00000000..73bd0409 --- /dev/null +++ b/webagg/test/test_memento_agg.py @@ -0,0 +1,241 @@ +from gevent import monkey; monkey.patch_all(thread=False) + +from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator +from webagg.aggregator import BaseAggregator + +from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource +from .testutils import to_json_list, to_path + +import json +import pytest +import time +import six + +from webagg.handlers import IndexHandler + + +sources = { + 'local': FileIndexSource(to_path('testdata/iana.cdxj')), + 'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), + 'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'), + 'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'), + 'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*') +} + + +aggs = {'simple': SimpleAggregator(sources), + 'gevent': GeventTimeoutAggregator(sources, timeout=5.0), + } + +agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)} + +nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))} +agg_nf = {'simple': SimpleAggregator(nf), + 'gevent': GeventTimeoutAggregator(nf, timeout=5.0), + } + + +@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +def test_mem_agg_index_1(agg): + url = 'http://iana.org/' + res, errs = agg(dict(url=url, closest='20140126000000', limit=5)) + + + exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"}, + {"timestamp": "20140126200624", "filename": "iana.warc.gz", "source": "local"}, + {"timestamp": "20140123034755", 
"load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source": "ia"}, + {"timestamp": "20140129175203", "load_url": "http://web.archive.org/web/20140129175203id_/http://iana.org/", "source": "ia"}, + {"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source": "ait"} + ] + + assert(to_json_list(res) == exp) + assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)", + 'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}) + +@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +def test_mem_agg_index_2(agg): + url = 'http://example.com/' + res, errs = agg(dict(url=url, closest='20100512', limit=6)) + + exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"}, + {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"}, + #{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"}, + {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"}, + {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}, + {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"}, + {"timestamp": "20100510233601", "load_url": "http://web.archive.org/web/20100510233601id_/http://example.com/", "source": "ia"}] + + assert(to_json_list(res) == exp) + assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"}) + + +@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +def test_mem_agg_index_3(agg): + url = 'http://vvork.com/' + res, errs = agg(dict(url=url, closest='20141001', limit=5)) + + exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}, + {"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source": "ia"}, + {"timestamp": "20141020161243", "load_url": "http://web.archive.org/web/20141020161243id_/http://vvork.com/", "source": "ia"}, + {"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source": "ia"}, + {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + +@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +def test_mem_agg_index_4(agg): + url = 'http://vvork.com/' + res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait')) + + exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}, + {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + +@pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys())) +def test_mem_agg_not_found(agg): + url = 
'http://vvork.com/' + res, errs = agg(dict(url=url, closest='20141001', limit=2)) + + assert(to_json_list(res) == []) + assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"}) + + +@pytest.mark.parametrize("agg", list(agg_tm.values()), ids=list(agg_tm.keys())) +def test_mem_agg_timeout(agg): + url = 'http://vvork.com/' + + orig_source = BaseAggregator.load_child_source + def load_child_source(self, name, source, params): + time.sleep(0.1) + return orig_source(self, name, source, params) + + BaseAggregator.load_child_source = load_child_source + res, errs = agg(dict(url=url, closest='20141001', limit=2)) + BaseAggregator.load_child_source = orig_source + + assert(to_json_list(res) == []) + assert(errs == {'local': 'timeout', + 'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'}) + + +def test_handler_output_cdxj(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + url = 'http://vvork.com/' + headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait')) + + exp = b"""\ +com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"} +com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"} +""" + + assert(headers['Content-Type'] == 'text/x-cdxj') + assert(b''.join(res) == exp) + assert(errs == {}) + + +def test_handler_output_json(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + url = 'http://vvork.com/' + headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json')) + + exp = b"""\ +{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"} +{"urlkey": "com,vvork)/", "timestamp": "20131004231540", "url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"} +""" + + assert(headers['Content-Type'] == 'application/x-ndjson') + assert(b''.join(res) == exp) + assert(errs == {}) + +def test_handler_output_link(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + url = 'http://vvork.com/' + headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link')) + + exp = b"""\ +<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz", +<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait" +""" + assert(headers['Content-Type'] == 'application/link') + assert(b''.join(res) == exp) + assert(errs == {}) + + +def test_handler_output_link_2(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + url = 'http://iana.org/' + headers, res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link')) + + exp = b"""\ +; rel="memento"; 
datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia", +; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local", +; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia", +; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia", +; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait" +""" + assert(headers['Content-Type'] == 'application/link') + assert(b''.join(res) == exp) + + exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)", + 'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"} + + assert(errs == exp_errs) + + + +def test_handler_output_link_3(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + url = 'http://foo.bar.non-existent' + headers, res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link')) + + exp = b'' + + assert(headers['Content-Type'] == 'application/link') + assert(b''.join(res) == exp) + + exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)", + 'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://foo.bar.non-existent',)", + 'ia': "NotFoundException('http://web.archive.org/web/http://foo.bar.non-existent',)", + 'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://foo.bar.non-existent',)"} + + assert(errs == exp_errs) + +def test_handler_output_text(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + url = 'http://vvork.com/' + headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text')) + + exp = b"""\ +com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz +com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait +""" + assert(headers['Content-Type'] == 'text/plain') + assert(b''.join(res) == exp) + assert(errs == {}) + + +def test_handler_list_sources(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + headers, res, errs = handler(dict(mode='list_sources')) + + assert(headers == {}) + assert(res == {'sources': {'bl': 'memento', + 'ait': 'memento', + 'ia': 'memento', + 'rhiz': 'memento', + 'local': 'file'}}) + assert(errs == {}) + + diff --git a/webagg/test/test_redis_agg.py b/webagg/test/test_redis_agg.py new file mode 100644 index 00000000..505350f7 --- /dev/null +++ b/webagg/test/test_redis_agg.py @@ -0,0 +1,45 @@ +from webagg.aggregator import RedisMultiKeyIndexSource +from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass + + +class TestRedisAgg(FakeRedisTests, BaseTestClass): + @classmethod + def setup_class(cls): + super(TestRedisAgg, cls).setup_class() + cls.add_cdx_to_redis(to_path('testdata/example.cdxj'), 'FOO:example:cdxj') + cls.add_cdx_to_redis(to_path('testdata/dupes.cdxj'), 'FOO:dupes:cdxj') + + cls.indexloader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj') + + def test_redis_agg_all(self): + res, errs = self.indexloader({'url': 'example.com/', 'param.user': 'FOO', 'param.coll': '*'}) + + exp = [ + {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': 'FOO:dupes:cdxj', 
'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + ] + + assert(errs == {}) + assert(to_json_list(res) == exp) + + def test_redis_agg_one(self): + res, errs = self.indexloader({'url': 'example.com/', 'param.user': 'FOO', 'param.coll': 'dupes'}) + + exp = [ + {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + ] + + assert(errs == {}) + assert(to_json_list(res) == exp) + + def test_redis_not_found(self): + res, errs = self.indexloader({'url': 'example.com/'}) + + exp = [] + + assert(errs == {}) + assert(to_json_list(res) == exp) + + diff --git a/webagg/test/test_timeouts.py b/webagg/test/test_timeouts.py new file mode 100644 index 00000000..60080ce6 --- /dev/null +++ b/webagg/test/test_timeouts.py @@ -0,0 +1,118 @@ +from gevent import monkey; monkey.patch_all(thread=False) +import time +from webagg.indexsource import FileIndexSource + +from webagg.aggregator import SimpleAggregator, TimeoutMixin +from webagg.aggregator import GeventTimeoutAggregator + +from .testutils import to_json_list + + +class TimeoutFileSource(FileIndexSource): + def __init__(self, filename, timeout): + super(TimeoutFileSource, self).__init__(filename) + self.timeout = timeout + self.calls = 0 + + def load_index(self, params): + self.calls += 1 + print('Sleeping') + time.sleep(self.timeout) + return super(TimeoutFileSource, self).load_index(params) + +TimeoutAggregator = GeventTimeoutAggregator + + + +def setup_module(): + global sources + sources = {'slow': TimeoutFileSource('testdata/example.cdxj', 0.2), + 'slower': TimeoutFileSource('testdata/dupes.cdxj', 0.5) + } + + + +def test_timeout_long_all_pass(): + agg = TimeoutAggregator(sources, timeout=1.0) + + res, errs = agg(dict(url='http://example.com/')) + + exp = [{'source': 'slower', 'timestamp': '20140127171200'}, + {'source': 'slower', 'timestamp': '20140127171251'}, + {'source': 'slow', 'timestamp': '20160225042329'}] + + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) + + assert(errs == {}) + + +def test_timeout_slower_skipped_1(): + agg = GeventTimeoutAggregator(sources, timeout=0.49) + + res, errs = agg(dict(url='http://example.com/')) + + exp = [{'source': 'slow', 'timestamp': '20160225042329'}] + + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) + + assert(errs == {'slower': 'timeout'}) + + +def test_timeout_slower_skipped_2(): + agg = GeventTimeoutAggregator(sources, timeout=0.19) + + res, errs = agg(dict(url='http://example.com/')) + + exp = [] + + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) + + assert(errs == {'slower': 'timeout', 'slow': 'timeout'}) + + +def test_timeout_skipping(): + # each of the three tests above called both sources exactly once + assert(sources['slow'].calls == 3) + assert(sources['slower'].calls == 3) + + # after t_count=2 timeouts, a source is skipped for t_duration=2.0 secs + agg = GeventTimeoutAggregator(sources, timeout=0.49, + t_count=2, t_duration=2.0) + + exp = [{'source': 'slow', 'timestamp': '20160225042329'}] + + res, errs = agg(dict(url='http://example.com/')) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) + assert(sources['slow'].calls == 4) + assert(sources['slower'].calls == 4) + + assert(errs == {'slower': 'timeout'}) + + res, errs = agg(dict(url='http://example.com/')) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) + assert(sources['slow'].calls == 5) + assert(sources['slower'].calls == 5) + + 
assert(errs == {'slower': 'timeout'}) + + res, errs = agg(dict(url='http://example.com/')) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) + assert(sources['slow'].calls == 6) + assert(sources['slower'].calls == 5) + + assert(errs == {}) + + res, errs = agg(dict(url='http://example.com/')) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) + assert(sources['slow'].calls == 7) + assert(sources['slower'].calls == 5) + + assert(errs == {}) + + time.sleep(2.01) + + res, errs = agg(dict(url='http://example.com/')) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) + assert(sources['slow'].calls == 8) + assert(sources['slower'].calls == 6) + + assert(errs == {'slower': 'timeout'}) + diff --git a/webagg/test/test_upstream.py b/webagg/test/test_upstream.py new file mode 100644 index 00000000..59854f90 --- /dev/null +++ b/webagg/test/test_upstream.py @@ -0,0 +1,74 @@ +import webtest + +from io import BytesIO +from webagg.app import ResAggApp +import requests + +from webagg.handlers import DefaultResourceHandler +from webagg.aggregator import SimpleAggregator +from webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource + +from pywb.warc.recordloader import ArcWarcRecordLoader + +from .testutils import LiveServerTests, BaseTestClass + + +class TestUpstream(LiveServerTests, BaseTestClass): + def setup(self): + app = ResAggApp() + + base_url = 'http://localhost:{0}'.format(self.server.port) + app.add_route('/upstream', + DefaultResourceHandler(SimpleAggregator( + {'upstream': UpstreamAggIndexSource(base_url + '/live')}) + ) + ) + + app.add_route('/upstream_opt', + DefaultResourceHandler(SimpleAggregator( + {'upstream_opt': ProxyMementoIndexSource.upstream_resource(base_url + '/live')}) + ) + ) + + self.base_url = base_url + self.testapp = webtest.TestApp(app) + + + def test_live_paths(self): + res = requests.get(self.base_url + '/') + assert set(res.json().keys()) == {'/live/postreq', '/live'} + + def test_upstream_paths(self): + res = self.testapp.get('/') + assert set(res.json.keys()) == {'/upstream/postreq', '/upstream', '/upstream_opt', '/upstream_opt/postreq'} + + def test_live_1(self): + resp = requests.get(self.base_url + '/live/resource?url=http://httpbin.org/get', stream=True) + assert resp.headers['WebAgg-Source-Coll'] == 'live' + + record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False) + assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' + assert record.status_headers.get_header('Date') != '' + + def test_upstream_1(self): + resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get') + assert resp.headers['WebAgg-Source-Coll'] == 'upstream:live' + + raw = BytesIO(resp.body) + + record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False) + assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' + assert record.status_headers.get_header('Date') != '' + + def test_upstream_2(self): + resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get') + assert resp.headers['WebAgg-Source-Coll'] == 'upstream_opt:live', resp.headers + + raw = BytesIO(resp.body) + + record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False) + assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' + assert record.status_headers.get_header('Date') != '' + + + diff --git a/webagg/test/testutils.py b/webagg/test/testutils.py new file mode 100644 index 
00000000..c9ba5be0 --- /dev/null +++ b/webagg/test/testutils.py @@ -0,0 +1,127 @@ +import json +import os +import tempfile +import shutil + +from multiprocessing import Process + +from fakeredis import FakeStrictRedis +from mock import patch + +from wsgiref.simple_server import make_server + +from webagg.aggregator import SimpleAggregator +from webagg.app import ResAggApp +from webagg.handlers import DefaultResourceHandler +from webagg.indexsource import LiveIndexSource + + +# ============================================================================ +def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']): + return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist]) + +def key_ts_res(cdxlist, extra='filename'): + return '\n'.join([cdx['urlkey'] + ' ' + cdx['timestamp'] + ' ' + cdx[extra] for cdx in cdxlist]) + +def to_path(path): + if os.path.sep != '/': + path = path.replace('/', os.path.sep) + + return path + + +# ============================================================================ +class BaseTestClass(object): + @classmethod + def setup_class(cls): + pass + + @classmethod + def teardown_class(cls): + pass + + +# ============================================================================ +PUBSUBS = [] + +class FakeStrictRedisSharedPubSub(FakeStrictRedis): + def __init__(self, *args, **kwargs): + super(FakeStrictRedisSharedPubSub, self).__init__(*args, **kwargs) + self._pubsubs = PUBSUBS + + +# ============================================================================ +class FakeRedisTests(object): + @classmethod + def setup_class(cls): + super(FakeRedisTests, cls).setup_class() + cls.redismock = patch('redis.StrictRedis', FakeStrictRedisSharedPubSub) + cls.redismock.start() + + @staticmethod + def add_cdx_to_redis(filename, key, redis_url='redis://localhost:6379/2'): + r = FakeStrictRedis.from_url(redis_url) + with open(filename, 'rb') as fh: + for line in fh: + r.zadd(key, 0, line.rstrip()) + + @classmethod + def teardown_class(cls): + super(FakeRedisTests, cls).teardown_class() + FakeStrictRedis().flushall() + cls.redismock.stop() + + +# ============================================================================ +class TempDirTests(object): + @classmethod + def setup_class(cls): + super(TempDirTests, cls).setup_class() + cls.root_dir = tempfile.mkdtemp() + + @classmethod + def teardown_class(cls): + super(TempDirTests, cls).teardown_class() + shutil.rmtree(cls.root_dir) + + +# ============================================================================ +class LiveServerTests(object): + @classmethod + def setup_class(cls): + super(LiveServerTests, cls).setup_class() + cls.server = ServerThreadRunner(cls.make_live_app()) + + @staticmethod + def make_live_app(): + app = ResAggApp() + app.add_route('/live', + DefaultResourceHandler(SimpleAggregator( + {'live': LiveIndexSource()}) + ) + ) + return app + + @classmethod + def teardown_class(cls): + super(LiveServerTests, cls).teardown_class() + cls.server.stop() + + +# ============================================================================ +class ServerThreadRunner(object): + def __init__(self, app): + self.httpd = make_server('', 0, app) + self.port = self.httpd.socket.getsockname()[1] + + def run(): + self.httpd.serve_forever() + + self.proc = Process(target=run) + #self.proc.daemon = True + self.proc.start() + + def stop(self): + self.proc.terminate() + + diff --git a/webagg/utils.py b/webagg/utils.py new file mode 100644 index 00000000..5617d048 --- /dev/null +++ b/webagg/utils.py @@ 
-0,0 +1,200 @@ +import re +import six +import string +import yaml +import os + +from contextlib import closing + +from pywb.utils.timeutils import timestamp_to_http_date +from pywb.utils.wbexception import BadRequestException + +LINK_SPLIT = re.compile(',\s*(?=[<])') +LINK_SEG_SPLIT = re.compile(';\s*') +LINK_URL = re.compile('<(.*)>') +LINK_PROP = re.compile('([\w]+)="([^"]+)') + +BUFF_SIZE = 16384 + + +#============================================================================= +class MementoException(BadRequestException): + pass + + +#============================================================================= +class MementoUtils(object): + @staticmethod + def parse_links(link_header, def_name='timemap'): + links = LINK_SPLIT.split(link_header) + results = {} + mementos = [] + + for link in links: + props = LINK_SEG_SPLIT.split(link) + m = LINK_URL.match(props[0]) + if not m: + raise MementoException('Invalid Link Url: ' + props[0]) + + result = dict(url=m.group(1)) + key = '' + is_mem = False + + for prop in props[1:]: + m = LINK_PROP.match(prop) + if not m: + raise MementoException('Invalid prop ' + prop) + + name = m.group(1) + value = m.group(2) + + if name == 'rel': + if 'memento' in value: + is_mem = True + result[name] = value + elif value == 'self': + key = def_name + else: + key = value + else: + result[name] = value + + if key: + results[key] = result + elif is_mem: + mementos.append(result) + + results['mementos'] = mementos + return results + + @staticmethod + def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'): + url = cdx.get('load_url') + if not url: + url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) + + memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end + + if not datetime: + datetime = timestamp_to_http_date(cdx['timestamp']) + + return memento.format(url, rel, datetime, cdx.get('source', '')) + + + @staticmethod + def make_timemap(cdx_iter): + # get first memento as it'll be used for 'from' field + try: + first_cdx = six.next(cdx_iter) + from_date = timestamp_to_http_date(first_cdx['timestamp']) + except StopIteration: + first_cdx = None + return + + # first memento link + yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date) + + prev_cdx = None + + for cdx in cdx_iter: + if prev_cdx: + yield MementoUtils.make_timemap_memento_link(prev_cdx) + + prev_cdx = cdx + + # last memento link, if any + if prev_cdx: + yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n') + + @staticmethod + def make_link(url, type): + return '<{0}>; rel="{1}"'.format(url, type) + + +#============================================================================= +class ParamFormatter(string.Formatter): + def __init__(self, params, name='', prefix='param.'): + self.params = params + self.prefix = prefix + self.name = name + + def get_value(self, key, args, kwargs): + # First, try the named param 'param.{name}.{key}' + if self.name: + named_key = self.prefix + self.name + '.' 
+ key + value = self.params.get(named_key) + if value is not None: + return value + + # Then, try 'param.{key}' + named_key = self.prefix + key + value = self.params.get(named_key) + if value is not None: + return value + + # default to just '{key}' + value = kwargs.get(key, '') + return value + + +#============================================================================= +def res_template(template, params, **extra_params): + formatter = params.get('_formatter') + if not formatter: + formatter = ParamFormatter(params) + res = formatter.format(template, url=params.get('url', ''), **extra_params) + + return res + + +#============================================================================= +def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE): + with closing(stream): + if header1: + yield header1 + + if header2: + yield header2 + + while True: + buff = stream.read(size) + if not buff: + break + yield buff + + +#============================================================================= +def chunk_encode_iter(orig_iter): + for chunk in orig_iter: + if not len(chunk): + continue + chunk_len = b'%X\r\n' % len(chunk) + yield chunk_len + yield chunk + yield b'\r\n' + + yield b'0\r\n\r\n' + + +#============================================================================= +def load_config(main_env_var, main_default_file='', + overlay_env_var='', overlay_file=''): + + configfile = os.environ.get(main_env_var, main_default_file) + + if configfile: + # Load config + with open(configfile, 'rb') as fh: + config = yaml.load(fh) + + else: + config = {} + + overlay_configfile = os.environ.get(overlay_env_var, overlay_file) + + if overlay_configfile: + with open(overlay_configfile, 'rb') as fh: + config.update(yaml.load(fh)) + + return config +
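
A minimal usage sketch of the ParamFormatter/res_template helpers defined in webagg/utils.py above, assuming the webagg package from this diff is installed. The Redis key template and all param values are illustrative only (borrowed from the tests earlier in the diff), not part of the patch itself:

from webagg.utils import ParamFormatter, res_template

# res_template() fills each '{key}' placeholder from a 'param.{key}' entry,
# and always passes the 'url' param through as {url}.
params = {'param.user': 'FOO', 'param.coll': 'dupes', 'url': 'http://example.com/'}
key = res_template('redis://localhost/2/{user}:{coll}:cdxj', params)
assert key == 'redis://localhost/2/FOO:dupes:cdxj'

# A formatter bound to a source name prefers 'param.{name}.{key}' over the
# generic 'param.{key}' (the values here are hypothetical):
fmt = ParamFormatter({'param.ait.coll': '5610', 'param.coll': '999'}, name='ait')
assert fmt.format('{coll}') == '5610'

This is the same lookup order the aggregators rely on, e.g. RedisMultiKeyIndexSource resolving 'redis://localhost/2/{user}:{coll}:cdxj' in test_redis_agg.py, and the per-source 'param.source.colls' filter in test_indexsource.py.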