Mirror of https://github.com/webrecorder/pywb.git, synced 2025-03-15 00:03:28 +01:00
Merge remote-tracking branch 'webrec-platform' into pywb for further refactoring
This commit is contained in:
commit 88d6b9e097
23
Dockerfile
Normal file
@@ -0,0 +1,23 @@
#webrecorder/webrecore 1.0

FROM python:3.5.2

RUN pip install gevent uwsgi bottle urllib3 youtube-dl

RUN pip install git+https://github.com/ikreymer/pywb.git@master#egg=pywb-0.33.0
#RUN pip install pywb

RUN pip install git+https://github.com/t0m/pyamf.git@python3

RUN pip install boto webassets

ADD . /webrecore/
WORKDIR /webrecore/

RUN pip install -e ./

RUN useradd -ms /bin/bash -u 1000 apprun

USER apprun
19
docker-compose.yml
Normal file
@@ -0,0 +1,19 @@
version: '2'

services:
    proxy:
        build: ./proxy/
        links:
            - webagg:webagg

        environment:
            - "WEBAGG=http://webrecplatform_webagg_1:8080"

        ports:
            - 9080:9080

        volumes:
            - ${HOME}/.mitmproxy/:/root/.mitmproxy/

    webagg:
        build: ./webagg/
0
recorder/__init__.py
Normal file
84
recorder/filters.py
Normal file
@@ -0,0 +1,84 @@
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date
import re


# ============================================================================
# Header Exclusions
# ============================================================================
class ExcludeNone(object):
    def __call__(self, record):
        return None


# ============================================================================
class ExcludeSpecificHeaders(object):
    def __init__(self, exclude_headers=[]):
        self.exclude_headers = [x.lower() for x in exclude_headers]

    def __call__(self, record):
        return self.exclude_headers


# ============================================================================
# Revisit Policy
# ============================================================================
class WriteRevisitDupePolicy(object):
    def __call__(self, cdx, params):
        dt = timestamp_to_datetime(cdx['timestamp'])
        return ('revisit', cdx['url'], datetime_to_iso_date(dt))


# ============================================================================
class SkipDupePolicy(object):
    def __call__(self, cdx, params):
        if cdx['url'] == params['url']:
            return 'skip'
        else:
            return 'write'


# ============================================================================
class WriteDupePolicy(object):
    def __call__(self, cdx, params):
        return 'write'


# ============================================================================
# Skip Record Filters
# ============================================================================
class SkipNothingFilter(object):
    def skip_request(self, req_headers):
        return False

    def skip_response(self, req_headers, resp_headers):
        return False


# ============================================================================
class CollectionFilter(SkipNothingFilter):
    def __init__(self, accept_colls):
        self.rx_accept_colls = re.compile(accept_colls)

    def skip_request(self, req_headers):
        if req_headers.get('Recorder-Skip') == '1':
            return True

        return False

    def skip_response(self, req_headers, resp_headers):
        if not self.rx_accept_colls.match(resp_headers.get('WebAgg-Source-Coll', '')):
            return True

        return False


# ============================================================================
class SkipRangeRequestFilter(SkipNothingFilter):
    def skip_request(self, req_headers):
        range_ = req_headers.get('Range')
        if range_ and not range_.lower().startswith('bytes=0-'):
            return True

        return False
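The file above defines two small protocols: a skip filter exposes skip_request(req_headers) and skip_response(req_headers, resp_headers), and a dupe policy is any callable taking (cdx, params) and returning 'skip', 'write', or a ('revisit', url, iso_date) tuple. A minimal custom filter, as a hypothetical sketch (the header name is illustrative and not part of this commit):

# Hypothetical sketch: a skip filter honoring an app-specific header.
from recorder.filters import SkipNothingFilter

class SkipMarkedRequestsFilter(SkipNothingFilter):
    def skip_request(self, req_headers):
        # 'X-Do-Not-Record' is an illustrative header name, not used by pywb
        return req_headers.get('X-Do-Not-Record') == '1'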
293
recorder/recorderapp.py
Normal file
@@ -0,0 +1,293 @@
from webagg.utils import StreamIter, chunk_encode_iter, BUFF_SIZE
from webagg.inputrequest import DirectWSGIInputRequest

from recorder.filters import SkipRangeRequestFilter, CollectionFilter

from six.moves.urllib.parse import parse_qsl

import json
import tempfile

from requests.structures import CaseInsensitiveDict
import requests

import traceback

import gevent.queue
import gevent


#==============================================================================
class RecorderApp(object):
    def __init__(self, upstream_host, writer, skip_filters=None, **kwargs):
        self.upstream_host = upstream_host

        self.writer = writer

        self.write_queue = gevent.queue.Queue()
        gevent.spawn(self._write_loop)

        if not skip_filters:
            skip_filters = self.create_default_filters(kwargs)

        self.skip_filters = skip_filters

    @staticmethod
    def create_default_filters(kwargs):
        skip_filters = [SkipRangeRequestFilter()]

        accept_colls = kwargs.get('accept_colls')
        if accept_colls:
            skip_filters.append(CollectionFilter(accept_colls))

        return skip_filters

    def _write_loop(self):
        while True:
            try:
                self._write_one()
            except:
                traceback.print_exc()

    def _write_one(self):
        req = None
        resp = None
        try:
            result = self.write_queue.get()

            req_head, req_pay, resp_head, resp_pay, params = result

            resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)

            if resp_type == 'response':
                req = self.writer.create_req_record(req_head, req_pay)

                self.writer.write_req_resp(req, resp, params)

            else:
                self.writer.write_record(resp, params)

        finally:
            try:
                if req:
                    req.stream.close()

                if resp:
                    resp.stream.close()
            except Exception as e:
                traceback.print_exc()

    def send_error(self, exc, start_response):
        return self.send_message({'error': repr(exc)},
                                 '400 Bad Request',
                                 start_response)

    def send_message(self, msg, status, start_response):
        message = json.dumps(msg)
        headers = [('Content-Type', 'application/json; charset=utf-8'),
                   ('Content-Length', str(len(message)))]

        start_response(status, headers)
        return [message.encode('utf-8')]

    def _put_record(self, request_uri, input_buff, record_type,
                    headers, params, start_response):

        if record_type == 'stream':
            if self.writer.write_stream_to_file(params, input_buff):
                msg = {'success': 'true'}
            else:
                msg = {'error_message': 'upload_error'}

            return self.send_message(msg, '200 OK',
                                     start_response)

        req_stream = ReqWrapper(input_buff, headers)

        while True:
            buff = req_stream.read()
            if not buff:
                break

        content_type = headers.get('Content-Type')

        record = self.writer.create_custom_record(params['url'],
                                                  req_stream.out,
                                                  record_type,
                                                  content_type,
                                                  req_stream.headers)

        self.writer.write_record(record, params)

        msg = {'success': 'true',
               'WARC-Date': record.rec_headers.get('WARC-Date')}

        return self.send_message(msg,
                                 '200 OK',
                                 start_response)

    def _get_params(self, environ):
        params = dict(parse_qsl(environ.get('QUERY_STRING')))
        return params

    def __call__(self, environ, start_response):
        try:
            return self.handle_call(environ, start_response)
        except:
            traceback.print_exc()

    def handle_call(self, environ, start_response):
        input_req = DirectWSGIInputRequest(environ)

        params = self._get_params(environ)

        request_uri = input_req.get_full_request_uri()

        input_buff = input_req.get_req_body()

        headers = input_req.get_req_headers()

        method = input_req.get_req_method()

        # write request body as metadata/resource
        put_record = params.get('put_record')
        if put_record and method in ('PUT', 'POST'):
            return self._put_record(request_uri,
                                    input_buff,
                                    put_record,
                                    headers,
                                    params,
                                    start_response)

        skipping = any(x.skip_request(headers) for x in self.skip_filters)

        if not skipping:
            req_stream = ReqWrapper(input_buff, headers)
        else:
            req_stream = input_buff

        data = None
        if input_buff:
            data = req_stream

        try:
            res = requests.request(url=self.upstream_host + request_uri,
                                   method=method,
                                   data=data,
                                   headers=headers,
                                   allow_redirects=False,
                                   stream=True)
            res.raise_for_status()
        except Exception as e:
            #traceback.print_exc()
            return self.send_error(e, start_response)

        start_response('200 OK', list(res.headers.items()))

        if not skipping:
            resp_stream = RespWrapper(res.raw,
                                      res.headers,
                                      req_stream,
                                      params,
                                      self.write_queue,
                                      self.skip_filters)
        else:
            resp_stream = res.raw

        resp_iter = StreamIter(resp_stream)

        if res.headers.get('Transfer-Encoding') == 'chunked':
            resp_iter = chunk_encode_iter(resp_iter)

        return resp_iter


#==============================================================================
class Wrapper(object):
    def __init__(self, stream):
        self.stream = stream
        self.out = self._create_buffer()
        self.interrupted = False

    def _create_buffer(self):
        return tempfile.SpooledTemporaryFile(max_size=512*1024)

    def read(self, *args, **kwargs):
        try:
            buff = self.stream.read(*args, **kwargs)
        except Exception as e:
            print('INTERRUPT READ')
            self.interrupted = True
            raise

        self.out.write(buff)
        return buff


#==============================================================================
class RespWrapper(Wrapper):
    def __init__(self, stream, headers, req,
                 params, queue, skip_filters):

        super(RespWrapper, self).__init__(stream)
        self.headers = headers
        self.req = req
        self.params = params
        self.queue = queue
        self.skip_filters = skip_filters

    def close(self):
        try:
            while True:
                if not self.read(BUFF_SIZE):
                    break

        except Exception as e:
            print(e)
            self.interrupted = True

        finally:
            try:
                self.stream.close()
            except Exception as e:
                traceback.print_exc()

            self._write_to_file()

    def _write_to_file(self):
        skipping = any(x.skip_response(self.req.headers, self.headers)
                       for x in self.skip_filters)

        if self.interrupted or skipping:
            self.out.close()
            self.req.out.close()
            self.req.close()
            return

        try:
            entry = (self.req.headers, self.req.out,
                     self.headers, self.out, self.params)
            self.queue.put(entry)
            self.req.close()
            self.req = None
        except:
            traceback.print_exc()


#==============================================================================
class ReqWrapper(Wrapper):
    def __init__(self, stream, req_headers):
        super(ReqWrapper, self).__init__(stream)
        self.headers = CaseInsensitiveDict(req_headers)

        for n in req_headers.keys():
            if not n.upper().startswith('WARC-'):
                del self.headers[n]

    def close(self):
        # no need to close wsgi.input
        pass
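For orientation (not part of this diff): RecorderApp proxies each request to the upstream webagg host, tees the request and response bodies through the wrappers above, and a gevent writer loop drains write_queue in the background. The put_record branch writes the request body directly as a custom WARC record and can be exercised with a plain HTTP PUT; the host and port below are assumptions taken from recorder/test/rec.ini later in this commit:

# Hypothetical client call against a recorder on localhost:8010
# (per recorder/test/rec.ini); stores the body as a 'resource' record.
import requests

res = requests.put('http://localhost:8010/live/resource/postreq'
                   '?url=custom://example.com&put_record=resource',
                   headers={'Content-Type': 'text/plain'},
                   data=b'Some Data')
print(res.json())  # expected shape: {'success': 'true', 'WARC-Date': '...'}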
83
recorder/redisindexer.py
Normal file
@@ -0,0 +1,83 @@
from pywb.utils.canonicalize import calc_search_range
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.cdxindexer import write_cdx_index
from pywb.utils.timeutils import iso_date_to_timestamp

from io import BytesIO
import os

from webagg.indexsource import RedisIndexSource
from webagg.aggregator import SimpleAggregator
from webagg.utils import res_template

from recorder.filters import WriteRevisitDupePolicy


#==============================================================================
class WritableRedisIndexer(RedisIndexSource):
    def __init__(self, *args, **kwargs):
        redis_url = kwargs.get('redis_url')
        redis = kwargs.get('redis')
        cdx_key_template = kwargs.get('cdx_key_template')

        super(WritableRedisIndexer, self).__init__(redis_url,
                                                   redis,
                                                   cdx_key_template)

        name = kwargs.get('name', 'recorder')
        self.cdx_lookup = SimpleAggregator({name: self})

        self.rel_path_template = kwargs.get('rel_path_template', '')
        self.file_key_template = kwargs.get('file_key_template', '')
        self.full_warc_prefix = kwargs.get('full_warc_prefix', '')
        self.dupe_policy = kwargs.get('dupe_policy', WriteRevisitDupePolicy())

    def add_warc_file(self, full_filename, params):
        rel_path = res_template(self.rel_path_template, params)
        rel_filename = os.path.relpath(full_filename, rel_path)

        file_key = res_template(self.file_key_template, params)

        full_load_path = self.full_warc_prefix + full_filename

        self.redis.hset(file_key, rel_filename, full_load_path)

    def add_urls_to_index(self, stream, params, filename, length):
        rel_path = res_template(self.rel_path_template, params)
        filename = os.path.relpath(filename, rel_path)

        cdxout = BytesIO()
        write_cdx_index(cdxout, stream, filename,
                        cdxj=True, append_post=True)

        z_key = res_template(self.redis_key_template, params)

        cdx_list = cdxout.getvalue().rstrip().split(b'\n')

        for cdx in cdx_list:
            if cdx:
                self.redis.zadd(z_key, 0, cdx)

        return cdx_list

    def lookup_revisit(self, params, digest, url, iso_dt):
        params['url'] = url
        params['closest'] = iso_date_to_timestamp(iso_dt)

        filters = []

        filters.append('!mime:warc/revisit')

        if digest and digest != '-':
            filters.append('digest:' + digest.split(':')[-1])

        params['filter'] = filters

        cdx_iter, errs = self.cdx_lookup(params)

        for cdx in cdx_iter:
            res = self.dupe_policy(cdx, params)
            if res:
                return res

        return None
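The resulting Redis layout (under the key templates used in recorder/test/simplerec.py below) is a hash mapping relative WARC filenames to full load paths, plus a sorted set of CDXJ lines. A sketch of inspecting it, assuming a local Redis on db 2 as in the test config:

# Assumes the 'rec:warc' / 'rec:cdxj' keys from recorder/test/simplerec.py.
import redis

r = redis.StrictRedis.from_url('redis://localhost/2')
print(r.hgetall('rec:warc'))        # {rel_filename: full_load_path, ...}
print(r.zrange('rec:cdxj', 0, -1))  # one CDXJ line per indexed URL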
17
recorder/test/rec.ini
Normal file
@@ -0,0 +1,17 @@
[uwsgi]
if-not-env = PORT
http-socket = :8010
endif =

master = true
buffer-size = 65536
die-on-term = true

if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =

gevent = 100
#gevent-early-monkey-patch =

wsgi = recorder.test.simplerec
42
recorder/test/simplerec.py
Normal file
@@ -0,0 +1,42 @@
from gevent import monkey; monkey.patch_all()

from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer

from recorder.warcwriter import MultiFileWARCWriter
from recorder.filters import SkipDupePolicy

import atexit
import shutil
import tempfile
import redis

upstream_url = 'http://localhost:8080'

target = tempfile.mkdtemp(prefix='tmprec') + '/'

print('Recording to ' + target)

def rm_target():
    print('Removing ' + target)
    shutil.rmtree(target)

atexit.register(rm_target)

local_r = redis.StrictRedis.from_url('redis://localhost/2')
local_r.delete('rec:cdxj')
local_r.delete('rec:warc')

#target = './_recordings/'

dedup_index = WritableRedisIndexer(
        redis_url='redis://localhost/2/rec:cdxj',
        file_key_template='rec:warc',
        rel_path_template=target,
        dupe_policy=SkipDupePolicy())

recorder_app = RecorderApp(upstream_url,
                           MultiFileWARCWriter(target, dedup_index=dedup_index),
                           accept_colls='live')

application = recorder_app
582
recorder/test/test_recorder.py
Normal file
@@ -0,0 +1,582 @@
#from gevent import monkey; monkey.patch_all()
import gevent

from webagg.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
from webagg.test.testutils import FakeRedisTests

import os
import webtest

from pytest import raises
from fakeredis import FakeStrictRedis

from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
from recorder.filters import ExcludeSpecificHeaders
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy

from webagg.utils import MementoUtils

from pywb.cdx.cdxobject import CDXObject
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.cdxindexer import write_cdx_index
from pywb.warc.archiveiterator import ArchiveIterator

from six.moves.urllib.parse import quote, unquote, urlencode
from io import BytesIO
import time
import json

general_req_data = "\
GET {path} HTTP/1.1\r\n\
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n\
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36\r\n\
X-Other: foo\r\n\
Host: {host}\r\n\
Cookie: boo=far\r\n\
\r\n"


class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass):
    @classmethod
    def setup_class(cls):
        super(TestRecorder, cls).setup_class()

        cls.warcs_dir = to_path(cls.root_dir + '/warcs')

        os.makedirs(cls.warcs_dir)

        cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)

    def _get_dedup_index(self, dupe_policy=WriteRevisitDupePolicy(), user=True):
        if user:
            file_key_template = '{user}:{coll}:warc'
            redis_url = 'redis://localhost/2/{user}:{coll}:cdxj'
        else:
            file_key_template = '{coll}:warc'
            redis_url = 'redis://localhost/2/{coll}:cdxj'

        dedup_index = WritableRedisIndexer(redis_url=redis_url,
                                           file_key_template=file_key_template,
                                           rel_path_template=self.root_dir + '/warcs/',
                                           dupe_policy=dupe_policy)

        return dedup_index

    def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
        url = 'http://' + host + path
        req_url = '/live/resource/postreq?url=' + url + other_params
        testapp = webtest.TestApp(recorder_app)
        resp = testapp.post(req_url, general_req_data.format(host=host, path=path).encode('utf-8'))

        if not recorder_app.write_queue.empty():
            recorder_app._write_one()

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        if not link_url:
            link_url = unquote(url)

        assert resp.headers['Link'] == MementoUtils.make_link(link_url, 'original')
        assert resp.headers['Memento-Datetime'] != ''

        return resp

    def _test_all_warcs(self, dirname, num):
        coll_dir = to_path(self.root_dir + dirname)
        assert os.path.isdir(coll_dir)

        files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))]
        assert len(files) == num
        assert all(x.endswith('.warc.gz') for x in files)
        return files, coll_dir

    def _load_resp_req(self, base_path):
        warcs = os.listdir(base_path)
        assert len(warcs) == 1
        warc = warcs[0]

        stored_resp = None
        stored_req = None

        with open(os.path.join(base_path, warc), 'rb') as fh:
            for rec in ArchiveIterator(fh)():
                if rec.rec_type == 'response':
                    stored_resp = rec
                elif rec.rec_type == 'request':
                    stored_req = rec

        assert stored_resp is not None
        assert stored_req is not None
        return stored_req, stored_resp

    def test_record_warc_1(self):
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')))

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/', 1)

    def test_record_warc_2(self):
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/', 2)

    def test_error_url(self):
        recorder_app = RecorderApp(self.upstream_url + '01',
                                   PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')

        testapp = webtest.TestApp(recorder_app)
        resp = testapp.get('/live/resource?url=http://example.com/', status=400)

        assert resp.json['error'] != ''

        self._test_all_warcs('/warcs/', 2)

    def test_record_cookies_header(self):
        base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(base_path), accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
        assert b'HTTP/1.1 302' in resp.body

        buff = BytesIO(resp.body)
        record = ArcWarcRecordLoader().parse_record_stream(buff)
        assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
        assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers

        stored_req, stored_resp = self._load_resp_req(base_path)

        assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers
        assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers

        assert ('X-Other', 'foo') in stored_req.status_headers.headers
        assert ('Cookie', 'boo=far') in stored_req.status_headers.headers

    def test_record_cookies_skip_header(self):
        warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
        header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(warc_path, header_filter=header_filter),
                                   accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
        assert b'HTTP/1.1 302' in resp.body

        buff = BytesIO(resp.body)
        record = ArcWarcRecordLoader().parse_record_stream(buff)
        assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
        assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers

        stored_req, stored_resp = self._load_resp_req(warc_path)

        assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers
        assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers

        assert ('X-Other', 'foo') in stored_req.status_headers.headers
        assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers

    def test_record_skip_wrong_coll(self):
        recorder_app = RecorderApp(self.upstream_url,
                                   writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/', 2)

    def test_record_param_user_coll(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/', 2)

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 1

        cdx = CDXObject(res[0])
        assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
        assert cdx['mime'] == 'application/json'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith('USER/COLL/')
        assert cdx['filename'].endswith('.warc.gz')

        warcs = r.hgetall('USER:COLL:warc')
        full_path = self.root_dir + '/warcs/' + cdx['filename']
        assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}

    def test_record_param_user_coll_same_dir(self):
        warc_path = to_path(self.root_dir + '/warcs2/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(warc_path, dedup_index=dedup_index, key_template='{user}:{coll}'))

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar', '&param.recorder.user=USER2&param.recorder.coll=COLL2')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar', '&param.recorder.user=USER2&param.recorder.coll=COLL3')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs2', 2)

    def test_record_param_user_coll_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/', 2)

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        cdx = CDXObject(res[1])
        assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith('USER/COLL/')
        assert cdx['filename'].endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

        warcs = r.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            assert status_headers.get_header('WARC-Type') == 'revisit'
            assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get?foo=bar'
            assert status_headers.get_header('WARC-Date') != ''
            assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
            assert status_headers.get_header('WARC-Refers-To-Date') != ''

    def test_record_param_user_coll_skip(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        # No new entries written
        self._test_all_warcs('/warcs/', 2)

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

    def test_record_param_user_coll_write_dupe_no_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 3)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 3

        mimes = [CDXObject(x)['mime'] for x in res]

        assert sorted(mimes) == ['application/json', 'application/json', 'warc/revisit']

        assert len(writer.fh_cache) == 0

    # Keep Open
    def test_record_file_warc_keep_open(self):
        path = to_path(self.root_dir + '/warcs/A.warc.gz')
        writer = MultiFileWARCWriter(path)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert os.path.isfile(path)
        assert len(writer.fh_cache) == 1

    def test_record_multiple_writes_keep_open(self):
        warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = self.root_dir + '/warcs/'

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        r = FakeStrictRedis.from_url('redis://localhost/2')
        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]

        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            filename = os.path.relpath(fullname, rel_path)
            write_cdx_index(cdxout, fh, filename,
                            cdxj=True, append_post=True, sort=True)

        res = [CDXObject(x) for x in res]

        cdxres = cdxout.getvalue().strip()
        cdxres = cdxres.split(b'\n')
        cdxres = [CDXObject(x) for x in cdxres]

        assert cdxres == res

        assert len(writer.fh_cache) == 1

        writer.close_key(self.root_dir + '/warcs/FOO/')

        assert len(writer.fh_cache) == 0

        writer.close()

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far', '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2

    def test_record_multiple_writes_rollover_idle(self):
        warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = self.root_dir + '/warcs/'

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/GOO/', 1)

        time.sleep(1.0)
        writer.close_idle_files()

        # Third Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?goo=bar', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"goo": "bar"' in resp.body

        self._test_all_warcs('/warcs/GOO/', 2)

    def test_warcinfo_record(self):
        simplewriter = SimpleTempWARCWriter(gzip=False)
        params = {'software': 'recorder test',
                  'format': 'WARC File Format 1.0',
                  'json-metadata': json.dumps({'foo': 'bar'})}

        record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
        simplewriter.write_record(record)
        buff = simplewriter.get_buffer()
        assert isinstance(buff, bytes)

        buff = BytesIO(buff)
        parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)

        assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
        assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
        assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'

        buff = parsed_record.stream.read().decode('utf-8')

        length = parsed_record.rec_headers.get_header('Content-Length')

        assert len(buff) == int(length)

        assert 'format: WARC File Format 1.0\r\n' in buff
        assert 'json-metadata: {"foo": "bar"}\r\n' in buff

    def test_record_custom_record(self):
        dedup_index = self._get_dedup_index(user=False)

        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

        recorder_app = RecorderApp(self.upstream_url,
                                   MultiFileWARCWriter(warc_path, dedup_index=dedup_index))

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

        buff = b'Some Data'

        testapp = webtest.TestApp(recorder_app)
        headers = {'content-type': 'text/plain',
                   'WARC-Custom': 'foo'
                  }

        resp = testapp.put(req_url, headers=headers, params=buff)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('META:warc')
        assert len(warcs) == 1

        with open(warcs[b'meta/meta.warc.gz'], 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

            status_headers = record.rec_headers
            assert len(record.rec_headers.headers) == 9
            assert status_headers.get_header('WARC-Type') == 'resource'
            assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
            assert status_headers.get_header('WARC-Record-ID') != ''
            assert status_headers.get_header('WARC-Date') != ''
            assert status_headers.get_header('WARC-Block-Digest') != ''
            assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
            assert status_headers.get_header('Content-Type') == 'text/plain'
            assert status_headers.get_header('Content-Length') == str(len(buff))
            assert status_headers.get_header('WARC-Custom') == 'foo'

            assert record.stream.read() == buff

            status_headers = record.status_headers
            assert len(record.status_headers.headers) == 2

            assert status_headers.get_header('Content-Type') == 'text/plain'
            assert status_headers.get_header('Content-Length') == str(len(buff))

    def test_record_video_metadata(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        params = {'param.recorder.user': 'USER',
                  'param.recorder.coll': 'VIDEO',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self._test_warc_write(recorder_app,
                                     'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
                                     link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        filename = list(warcs.values())[0]

        with open(filename, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

            status_headers = record.rec_headers
            assert status_headers.get_header('WARC-Type') == 'metadata'
            assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
            assert status_headers.get_header('WARC-Block-Digest') != ''
            assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
553
recorder/warcwriter.py
Normal file
@ -0,0 +1,553 @@
|
||||
import tempfile
|
||||
import uuid
|
||||
import base64
|
||||
import hashlib
|
||||
import datetime
|
||||
import zlib
|
||||
import sys
|
||||
import os
|
||||
import six
|
||||
import shutil
|
||||
|
||||
import traceback
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from socket import gethostname
|
||||
from io import BytesIO
|
||||
|
||||
import fcntl
|
||||
|
||||
from pywb.utils.loaders import LimitReader, to_native_str
|
||||
from pywb.utils.bufferedreaders import BufferedReader
|
||||
from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date
|
||||
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
from pywb.warc.recordloader import ArcWarcRecord
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
from webagg.utils import ParamFormatter, res_template
|
||||
|
||||
from recorder.filters import ExcludeNone
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class BaseWARCWriter(object):
|
||||
WARC_RECORDS = {'warcinfo': 'application/warc-fields',
|
||||
'response': 'application/http; msgtype=response',
|
||||
'revisit': 'application/http; msgtype=response',
|
||||
'request': 'application/http; msgtype=request',
|
||||
'metadata': 'application/warc-fields',
|
||||
}
|
||||
|
||||
REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'
|
||||
|
||||
BUFF_SIZE = 8192
|
||||
|
||||
FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
|
||||
|
||||
def __init__(self, gzip=True, dedup_index=None, name='recorder',
|
||||
header_filter=ExcludeNone(), *args, **kwargs):
|
||||
self.gzip = gzip
|
||||
self.dedup_index = dedup_index
|
||||
self.rec_source_name = name
|
||||
self.header_filter = header_filter
|
||||
self.hostname = gethostname()
|
||||
|
||||
self.parser = StatusAndHeadersParser([], verify=False)
|
||||
|
||||
def ensure_digest(self, record):
|
||||
block_digest = record.rec_headers.get('WARC-Block-Digest')
|
||||
payload_digest = record.rec_headers.get('WARC-Payload-Digest')
|
||||
if block_digest and payload_digest:
|
||||
return
|
||||
|
||||
block_digester = self._create_digester()
|
||||
payload_digester = self._create_digester()
|
||||
|
||||
pos = record.stream.tell()
|
||||
|
||||
if record.status_headers and hasattr(record.status_headers, 'headers_buff'):
|
||||
block_digester.update(record.status_headers.headers_buff)
|
||||
|
||||
while True:
|
||||
buf = record.stream.read(self.BUFF_SIZE)
|
||||
if not buf:
|
||||
break
|
||||
|
||||
block_digester.update(buf)
|
||||
payload_digester.update(buf)
|
||||
|
||||
record.stream.seek(pos)
|
||||
record.rec_headers['WARC-Block-Digest'] = str(block_digester)
|
||||
record.rec_headers['WARC-Payload-Digest'] = str(payload_digester)
|
||||
|
||||
def _create_digester(self):
|
||||
return Digester('sha1')
|
||||
|
||||
def _set_header_buff(self, record):
|
||||
exclude_list = self.header_filter(record)
|
||||
buff = record.status_headers.to_bytes(exclude_list)
|
||||
record.status_headers.headers_buff = buff
|
||||
|
||||
def write_req_resp(self, req, resp, params):
|
||||
url = resp.rec_headers.get('WARC-Target-URI')
|
||||
dt = resp.rec_headers.get('WARC-Date')
|
||||
|
||||
#req.rec_headers['Content-Type'] = req.content_type
|
||||
req.rec_headers['WARC-Target-URI'] = url
|
||||
req.rec_headers['WARC-Date'] = dt
|
||||
|
||||
resp_id = resp.rec_headers.get('WARC-Record-ID')
|
||||
if resp_id:
|
||||
req.rec_headers['WARC-Concurrent-To'] = resp_id
|
||||
|
||||
resp = self._check_revisit(resp, params)
|
||||
if not resp:
|
||||
print('Skipping due to dedup')
|
||||
return
|
||||
|
||||
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
|
||||
self._do_write_req_resp(req, resp, params)
|
||||
|
||||
def create_req_record(self, req_headers, payload):
|
||||
len_ = payload.tell()
|
||||
payload.seek(0)
|
||||
|
||||
warc_headers = req_headers
|
||||
warc_headers['WARC-Type'] = 'request'
|
||||
if not warc_headers.get('WARC-Record-ID'):
|
||||
warc_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||
|
||||
status_headers = self.parser.parse(payload)
|
||||
|
||||
record = ArcWarcRecord('warc', 'request', warc_headers, payload,
|
||||
status_headers, '', len_)
|
||||
|
||||
self._set_header_buff(record)
|
||||
|
||||
return record
|
||||
|
||||
def read_resp_record(self, resp_headers, payload):
|
||||
len_ = payload.tell()
|
||||
payload.seek(0)
|
||||
|
||||
warc_headers = self.parser.parse(payload)
|
||||
warc_headers = CaseInsensitiveDict(warc_headers.headers)
|
||||
|
||||
record_type = warc_headers.get('WARC-Type', 'response')
|
||||
|
||||
if record_type == 'response':
|
||||
status_headers = self.parser.parse(payload)
|
||||
else:
|
||||
status_headers = None
|
||||
|
||||
record = ArcWarcRecord('warc', record_type, warc_headers, payload,
|
||||
status_headers, '', len_)
|
||||
|
||||
if record_type == 'response':
|
||||
self._set_header_buff(record)
|
||||
|
||||
self.ensure_digest(record)
|
||||
|
||||
return record_type, record
|
||||
|
||||
def create_warcinfo_record(self, filename, info):
|
||||
warc_headers = {}
|
||||
warc_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||
warc_headers['WARC-Type'] = 'warcinfo'
|
||||
if filename:
|
||||
warc_headers['WARC-Filename'] = filename
|
||||
warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
|
||||
|
||||
warcinfo = BytesIO()
|
||||
for n, v in six.iteritems(info):
|
||||
self._header(warcinfo, n, v)
|
||||
|
||||
warcinfo.seek(0)
|
||||
|
||||
record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo,
|
||||
None, '', len(warcinfo.getvalue()))
|
||||
|
||||
return record
|
||||
|
||||
def create_custom_record(self, uri, payload, record_type, content_type,
|
||||
warc_headers=None):
|
||||
len_ = payload.tell()
|
||||
payload.seek(0)
|
||||
|
||||
warc_headers = warc_headers or {}
|
||||
warc_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||
warc_headers['WARC-Type'] = record_type
|
||||
warc_headers['WARC-Target-URI'] = uri
|
||||
|
||||
if 'WARC-Date' not in warc_headers:
|
||||
warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
|
||||
|
||||
record = ArcWarcRecord('warc', record_type, warc_headers, payload,
|
||||
None, content_type, len_)
|
||||
|
||||
self.ensure_digest(record)
|
||||
|
||||
return record
|
||||
|
||||
def _check_revisit(self, record, params):
|
||||
if not self.dedup_index:
|
||||
return record
|
||||
|
||||
try:
|
||||
url = record.rec_headers.get('WARC-Target-URI')
|
||||
digest = record.rec_headers.get('WARC-Payload-Digest')
|
||||
iso_dt = record.rec_headers.get('WARC-Date')
|
||||
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
result = None
|
||||
|
||||
if result == 'skip':
|
||||
return None
|
||||
|
||||
if isinstance(result, tuple) and result[0] == 'revisit':
|
||||
record.rec_headers['WARC-Type'] = 'revisit'
|
||||
record.rec_headers['WARC-Profile'] = self.REVISIT_PROFILE
|
||||
|
||||
record.rec_headers['WARC-Refers-To-Target-URI'] = result[1]
|
||||
record.rec_headers['WARC-Refers-To-Date'] = result[2]
|
||||
|
||||
return record
|
||||
|
||||
def _write_warc_record(self, out, record):
|
||||
if self.gzip:
|
||||
out = GzippingWrapper(out)
|
||||
|
||||
self._line(out, b'WARC/1.0')
|
||||
|
||||
for n, v in six.iteritems(record.rec_headers):
|
||||
if n.lower() in ('content-length', 'content-type'):
|
||||
continue
|
||||
|
||||
self._header(out, n, v)
|
||||
|
||||
content_type = record.rec_headers.get('Content-Type')
|
||||
|
||||
if not content_type:
|
||||
content_type = record.content_type
|
||||
|
||||
if not content_type:
|
||||
content_type = self.WARC_RECORDS.get(record.rec_headers['WARC-Type'])
|
||||
|
||||
if content_type:
|
||||
self._header(out, 'Content-Type', content_type)
|
||||
|
||||
if record.rec_headers['WARC-Type'] == 'revisit':
|
||||
http_headers_only = True
|
||||
else:
|
||||
http_headers_only = False
|
||||
|
||||
if record.length:
|
||||
actual_len = 0
|
||||
if record.status_headers:
|
||||
actual_len = len(record.status_headers.headers_buff)
|
||||
|
||||
if not http_headers_only:
|
||||
diff = record.stream.tell() - actual_len
|
||||
actual_len = record.length - diff
|
||||
|
||||
self._header(out, 'Content-Length', str(actual_len))
|
||||
|
||||
# add empty line
|
||||
self._line(out, b'')
|
||||
|
||||
# write headers buffer, if any
|
||||
if record.status_headers:
|
||||
out.write(record.status_headers.headers_buff)
|
||||
|
||||
if not http_headers_only:
|
||||
out.write(record.stream.read())
|
||||
|
||||
# add two lines
|
||||
self._line(out, b'\r\n')
|
||||
else:
|
||||
# add three lines (1 for end of header, 2 for end of record)
|
||||
self._line(out, b'Content-Length: 0\r\n\r\n')
|
||||
|
||||
out.flush()
|
||||
|
||||
def _header(self, out, name, value):
|
||||
if not value:
|
||||
return
|
||||
|
||||
self._line(out, (name + ': ' + str(value)).encode('latin-1'))
|
||||
|
||||
def _line(self, out, line):
|
||||
out.write(line + b'\r\n')
|
||||
|
||||
@staticmethod
|
||||
def _make_warc_id(id_=None):
|
||||
if not id_:
|
||||
id_ = uuid.uuid1()
|
||||
return '<urn:uuid:{0}>'.format(id_)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class GzippingWrapper(object):
|
||||
def __init__(self, out):
|
||||
self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
|
||||
self.out = out
|
||||
|
||||
def write(self, buff):
|
||||
#if isinstance(buff, str):
|
||||
# buff = buff.encode('utf-8')
|
||||
buff = self.compressor.compress(buff)
|
||||
self.out.write(buff)
|
||||
|
||||
def flush(self):
|
||||
buff = self.compressor.flush()
|
||||
self.out.write(buff)
|
||||
self.out.flush()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class Digester(object):
|
||||
def __init__(self, type_='sha1'):
|
||||
self.type_ = type_
|
||||
self.digester = hashlib.new(type_)
|
||||
|
||||
def update(self, buff):
|
||||
self.digester.update(buff)
|
||||
|
||||
def __str__(self):
|
||||
return self.type_ + ':' + to_native_str(base64.b32encode(self.digester.digest()))
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class MultiFileWARCWriter(BaseWARCWriter):
|
||||
def __init__(self, dir_template, filename_template=None, max_size=0,
|
||||
max_idle_secs=1800, *args, **kwargs):
|
||||
super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
|
||||
|
||||
if not filename_template:
|
||||
dir_template, filename_template = os.path.split(dir_template)
|
||||
dir_template += os.path.sep
|
||||
|
||||
if not filename_template:
|
||||
filename_template = self.FILE_TEMPLATE
|
||||
|
||||
self.dir_template = dir_template
|
||||
self.key_template = kwargs.get('key_template', self.dir_template)
|
||||
self.filename_template = filename_template
|
||||
self.max_size = max_size
|
||||
if max_idle_secs > 0:
|
||||
self.max_idle_time = datetime.timedelta(seconds=max_idle_secs)
|
||||
else:
|
||||
self.max_idle_time = None
|
||||
|
||||
self.fh_cache = {}
|
||||
|
||||
def get_new_filename(self, dir_, params):
|
||||
timestamp = timestamp20_now()
|
||||
|
||||
randstr = base64.b32encode(os.urandom(5)).decode('utf-8')
|
||||
|
||||
filename = dir_ + res_template(self.filename_template, params,
|
||||
hostname=self.hostname,
|
||||
timestamp=timestamp,
|
||||
random=randstr)
|
||||
|
||||
return filename
|
||||
|
||||
def allow_new_file(self, filename, params):
|
||||
return True
|
||||
|
||||
def _open_file(self, filename, params):
|
||||
path, name = os.path.split(filename)
|
||||
|
||||
try:
|
||||
os.makedirs(path)
|
||||
except:
|
||||
pass
|
||||
|
||||
fh = open(filename, 'a+b')
|
||||
|
||||
if self.dedup_index:
|
||||
self.dedup_index.add_warc_file(filename, params)
|
||||
|
||||
return fh
|
||||
|
||||
def _close_file(self, fh):
|
||||
try:
|
||||
fcntl.flock(fh, fcntl.LOCK_UN)
|
||||
fh.close()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
def get_dir_key(self, params):
|
||||
return res_template(self.key_template, params)
|
||||
|
||||
def close_key(self, dir_key):
|
||||
if isinstance(dir_key, dict):
|
||||
dir_key = self.get_dir_key(dir_key)
|
||||
|
||||
result = self.fh_cache.pop(dir_key, None)
|
||||
if not result:
|
||||
return
|
||||
|
||||
out, filename = result
|
||||
self._close_file(out)
|
||||
return filename
|
||||
|
||||
def close_file(self, match_filename):
|
||||
        for dir_key, out, filename in self.iter_open_files():
            if filename == match_filename:
                return self.close_key(dir_key)

    def _is_write_resp(self, resp, params):
        return True

    def _is_write_req(self, req, params):
        return True

    def write_record(self, record, params=None):
        params = params or {}
        params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
        self._do_write_req_resp(None, record, params)

    def _do_write_req_resp(self, req, resp, params):
        def write_callback(out, filename):
            url = resp.rec_headers.get('WARC-Target-URI')
            print('Writing req/resp {0} to {1} '.format(url, filename))

            if resp and self._is_write_resp(resp, params):
                self._write_warc_record(out, resp)

            if req and self._is_write_req(req, params):
                self._write_warc_record(out, req)

        return self._write_to_file(params, write_callback)

    def write_stream_to_file(self, params, stream):
        def write_callback(out, filename):
            print('Writing stream to {0}'.format(filename))
            shutil.copyfileobj(stream, out)

        return self._write_to_file(params, write_callback)

    def _write_to_file(self, params, write_callback):
        full_dir = res_template(self.dir_template, params)
        dir_key = self.get_dir_key(params)

        result = self.fh_cache.get(dir_key)

        close_file = False

        # initialized up front so the rollover check in the finally block
        # can't hit an unbound name if write_callback raises early
        new_size = start = 0

        if result:
            out, filename = result
            is_new = False
        else:
            filename = self.get_new_filename(full_dir, params)

            if not self.allow_new_file(filename, params):
                return False

            out = self._open_file(filename, params)

            is_new = True

        try:
            start = out.tell()

            write_callback(out, filename)

            out.flush()

            new_size = out.tell()

            out.seek(start)

            if self.dedup_index:
                self.dedup_index.add_urls_to_index(out, params,
                                                   filename,
                                                   new_size - start)

            return True

        except Exception:
            traceback.print_exc()
            close_file = True
            return False

        finally:
            # check for rollover
            if self.max_size and new_size > self.max_size:
                close_file = True

            if close_file:
                self._close_file(out)
                if not is_new:
                    self.fh_cache.pop(dir_key, None)

            elif is_new:
                # keep an exclusive, non-blocking lock on cached handles
                fcntl.flock(out, fcntl.LOCK_EX | fcntl.LOCK_NB)
                self.fh_cache[dir_key] = (out, filename)

    def iter_open_files(self):
        for n, v in list(self.fh_cache.items()):
            out, filename = v
            yield n, out, filename

    def close(self):
        for dir_key, out, filename in self.iter_open_files():
            self._close_file(out)

        self.fh_cache = {}

    def close_idle_files(self):
        if not self.max_idle_time:
            return

        now = datetime.datetime.now()

        for dir_key, out, filename in self.iter_open_files():
            try:
                mtime = os.path.getmtime(filename)
            except OSError:
                # file disappeared out from under us -- just drop the handle
                self.close_key(dir_key)
                return

            mtime = datetime.datetime.fromtimestamp(mtime)

            if (now - mtime) > self.max_idle_time:
                print('Closing idle ' + filename)
                self.close_key(dir_key)


# ============================================================================
class PerRecordWARCWriter(MultiFileWARCWriter):
    def __init__(self, *args, **kwargs):
        # max_size=1 forces a rollover after every record,
        # yielding one WARC file per record
        kwargs['max_size'] = 1
        super(PerRecordWARCWriter, self).__init__(*args, **kwargs)


# ============================================================================
class SimpleTempWARCWriter(BaseWARCWriter):
    def __init__(self, *args, **kwargs):
        super(SimpleTempWARCWriter, self).__init__(*args, **kwargs)
        self.out = self._create_buffer()

    def _create_buffer(self):
        return tempfile.SpooledTemporaryFile(max_size=512*1024)

    def _do_write_req_resp(self, req, resp, params):
        self._write_warc_record(self.out, resp)
        self._write_warc_record(self.out, req)

    def write_record(self, record, params=None):
        self._write_warc_record(self.out, record)

    def get_buffer(self):
        pos = self.out.tell()
        self.out.seek(0)
        buff = self.out.read()
        self.out.seek(pos)
        return buff
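The three writers above make different durability trade-offs: MultiFileWARCWriter keeps one locked, open WARC per directory key and rolls it over past max_size; PerRecordWARCWriter reuses that rollover check (max_size=1) to emit one file per record; SimpleTempWARCWriter only buffers in memory, spooling to disk past 512KB. A minimal usage sketch of the in-memory variant -- BaseWARCWriter's constructor and the record object are outside this diff, so both are assumptions here:

    # sketch only: constructor args and `record` are assumed, not shown in this diff
    writer = SimpleTempWARCWriter()
    writer.write_record(record)           # any parsed WARC record object

    with open('/tmp/out.warc.gz', 'wb') as fh:
        fh.write(writer.get_buffer())     # whole buffer, read non-destructively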
12
testdata/dupes.cdxj
vendored
Normal file
@@ -0,0 +1,12 @@
com,example)/ 20140127171200 {"url": "http://example.com", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1046", "offset": "334", "filename": "dupes.warc.gz"}
com,example)/ 20140127171251 {"url": "http://example.com", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "11875", "filename": "dupes.warc.gz"}
org,iana)/ 20140127171238 {"url": "http://iana.org", "mime": "unk", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "343", "offset": "1858", "filename": "dupes.warc.gz"}
org,iana)/ 20140127171238 {"url": "http://www.iana.org/", "mime": "warc/revisit", "digest": "OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB", "length": "536", "offset": "2678", "filename": "dupes.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140127171240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "556", "offset": "10826", "filename": "dupes.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "540", "offset": "9793", "filename": "dupes.warc.gz"}
org,iana)/_css/2013.1/print.css 20140127171239 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "6684", "filename": "dupes.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140127171239 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "4630", "filename": "dupes.warc.gz"}
org,iana)/_img/2013.1/iana-logo-homepage.png 20140127171240 {"url": "http://www.iana.org/_img/2013.1/iana-logo-homepage.png", "mime": "warc/revisit", "digest": "GCW2GM3SIMHEIQYZX25MLSRYVWUCZ7OK", "length": "549", "offset": "8750", "filename": "dupes.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140127171239 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "549", "offset": "7709", "filename": "dupes.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140127171239 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "3696", "filename": "dupes.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140127171239 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "547", "offset": "5658", "filename": "dupes.warc.gz"}
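Each line in these vendored .cdxj fixtures is a SURT-ordered url key, a 14-digit timestamp, and a JSON blob. A quick way to pull the fields apart (a sketch, not pywb's own CDXObject parser):

    import json

    line = ('com,example)/ 20140127171200 {"url": "http://example.com", '
            '"mime": "text/html", "status": "200", '
            '"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", '
            '"length": "1046", "offset": "334", "filename": "dupes.warc.gz"}')

    # split on the first two spaces: urlkey, timestamp, then the JSON remainder
    urlkey, timestamp, json_blob = line.split(' ', 2)
    fields = json.loads(json_blob)

    assert urlkey == 'com,example)/'
    assert timestamp == '20140127171200'
    assert fields['filename'] == 'dupes.warc.gz'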
BIN
testdata/dupes.warc.gz
vendored
Normal file
Binary file not shown.
BIN
testdata/example-url-agnostic-orig.warc.gz
vendored
Normal file
Binary file not shown.
BIN
testdata/example-url-agnostic-revisit.warc.gz
vendored
Normal file
Binary file not shown.
1
testdata/example.cdxj
vendored
Normal file
@@ -0,0 +1 @@
com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example.warc.gz"}
BIN
testdata/example.warc.gz
vendored
Normal file
Binary file not shown.
171
testdata/iana.cdxj
vendored
Normal file
@@ -0,0 +1,171 @@
org,iana)/ 20140126200624 {"url": "http://www.iana.org/", "mime": "text/html", "status": "200", "digest": "OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB", "length": "2258", "offset": "334", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "application/octet-stream", "status": "200", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "34054", "offset": "620049", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "546", "offset": "667073", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "534", "offset": "697255", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "547", "offset": "714833", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "551", "offset": "768625", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "application/octet-stream", "status": "200", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "117166", "offset": "198285", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "548", "offset": "482544", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "495230", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "536", "offset": "566542", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200738 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "578743", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "535", "offset": "593400", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "554", "offset": "608401", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "550", "offset": "654593", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "553", "offset": "670224", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "699343", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "712719", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "554", "offset": "731718", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "745730", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "757988", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "771773", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "783712", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200626 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "application/octet-stream", "status": "200", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "114499", "offset": "83293", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "550", "offset": "446529", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "493141", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "554", "offset": "567576", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200738 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "555", "offset": "580835", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "595503", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "554", "offset": "609468", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "655640", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "669172", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "698287", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "711664", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "730663", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "537", "offset": "743642", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "552", "offset": "755896", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "769676", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "784758", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "application/octet-stream", "status": "200", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "116641", "offset": "329393", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "494192", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "565504", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200738 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "539", "offset": "579795", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "555", "offset": "592333", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "607332", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "656690", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "554", "offset": "668113", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "700397", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "555", "offset": "713774", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "732779", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "744686", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "537", "offset": "756949", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "539", "offset": "770730", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "554", "offset": "782657", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200625 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "text/css", "status": "200", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "4662", "offset": "50482", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200653 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "534", "offset": "326315", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200706 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "534", "offset": "487982", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200716 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "561375", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200737 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "536", "offset": "574583", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200804 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "538", "offset": "588168", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200816 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "602081", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200825 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "613943", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200912 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "536", "offset": "662904", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200929 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "693076", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201054 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "526", "offset": "707519", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201127 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "525", "offset": "726489", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201227 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "527", "offset": "738432", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201239 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "526", "offset": "750710", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201248 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "763424", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201307 {"url": "https://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "539", "offset": "777477", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200625 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "text/css", "status": "200", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "8754", "offset": "41238", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200653 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "533", "offset": "328367", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200706 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "539", "offset": "489005", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200716 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "542", "offset": "563417", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200737 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "528", "offset": "572623", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200804 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "527", "offset": "589212", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200816 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "528", "offset": "603125", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200825 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "527", "offset": "614971", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200912 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "531", "offset": "661876", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200929 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "538", "offset": "691096", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201054 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "706476", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201127 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "725445", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201227 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "739461", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201239 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "751731", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201248 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "764454", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201307 {"url": "https://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "537", "offset": "779533", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "image/svg+xml", "status": "200", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "9739", "offset": "447577", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200706 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "553", "offset": "491049", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200718 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "551", "offset": "564454", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200737 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "550", "offset": "576643", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200805 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "591269", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "605204", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200826 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "617954", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200912 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "553", "offset": "664967", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200929 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "550", "offset": "695150", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201054 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "548", "offset": "709577", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "728551", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201228 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "548", "offset": "741538", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201239 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "549", "offset": "753801", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201249 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "551", "offset": "766525", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201307 {"url": "https://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "780562", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 {"url": "http://www.iana.org/_img/2013.1/iana-logo-homepage.png", "mime": "image/png", "status": "200", "digest": "GCW2GM3SIMHEIQYZX25MLSRYVWUCZ7OK", "length": "27163", "offset": "55631", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200625 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "image/svg+xml", "status": "200", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "2809", "offset": "4009", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "457816", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200706 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "545", "offset": "492101", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "568628", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200738 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "577695", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200805 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "594444", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200816 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "606272", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200826 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "545", "offset": "619007", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "666025", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200930 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "696207", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201055 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "529", "offset": "710633", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201128 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "549", "offset": "729609", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201228 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "544", "offset": "742593", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "754853", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201249 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "544", "offset": "767580", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201308 {"url": "https://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "781613", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/rir-map.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/rir-map.svg", "mime": "image/svg+xml", "status": "200", "digest": "C4LTM7ATRZYZL3W2UCEEX6A26L6PIT4K", "length": "23189", "offset": "458860", "filename": "iana.warc.gz"}
org,iana)/_img/bookmark_icon.ico 20140126200631 {"url": "http://www.iana.org/_img/bookmark_icon.ico", "mime": "application/octet-stream", "status": "200", "digest": "PG3PAWWE72JQ37CXJSPCJNNF7QI3SNX7", "length": "4968", "offset": "315944", "filename": "iana.warc.gz"}
org,iana)/_img/bookmark_icon.ico 20140126201310 {"url": "https://www.iana.org/_img/bookmark_icon.ico", "mime": "warc/revisit", "digest": "PG3PAWWE72JQ37CXJSPCJNNF7QI3SNX7", "length": "548", "offset": "785806", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200625 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "3074", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200653 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "325380", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200706 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "487044", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200716 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "560436", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200737 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "573645", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200804 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "460", "offset": "587215", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200816 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "459", "offset": "601126", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200825 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "615991", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200912 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "660937", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200929 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "692132", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201054 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "705534", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201127 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "724500", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201227 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "737484", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201239 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "749770", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201248 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "762480", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201307 {"url": "https://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "453", "offset": "776543", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200625 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "application/x-javascript", "status": "200", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "33449", "offset": "7311", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200653 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "542", "offset": "327341", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200706 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "529", "offset": "490037", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200716 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "529", "offset": "562402", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200737 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "575613", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200804 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "530", "offset": "590244", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200816 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "604162", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200825 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "616929", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200912 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "663936", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200929 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "546", "offset": "694112", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201054 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "708544", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201127 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "545", "offset": "727515", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201227 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "740505", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201239 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "545", "offset": "752769", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201248 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "765491", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201307 {"url": "https://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "778507", "filename": "iana.warc.gz"}
org,iana)/about 20140126200706 {"url": "http://www.iana.org/about", "mime": "text/html", "status": "200", "digest": "6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3", "length": "2962", "offset": "483588", "filename": "iana.warc.gz"}
org,iana)/about/performance/ietf-draft-status 20140126200815 {"url": "http://www.iana.org/about/performance/ietf-draft-status", "mime": "text/html", "status": "302", "digest": "Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ", "length": "584", "offset": "596566", "filename": "iana.warc.gz"}
org,iana)/about/performance/ietf-statistics 20140126200804 {"url": "http://www.iana.org/about/performance/ietf-statistics", "mime": "text/html", "status": "302", "digest": "HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD", "length": "582", "offset": "581890", "filename": "iana.warc.gz"}
org,iana)/dnssec 20140126201306 {"url": "http://www.iana.org/dnssec", "mime": "text/html", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "442", "offset": "772827", "filename": "iana.warc.gz"}
org,iana)/dnssec 20140126201307 {"url": "https://www.iana.org/dnssec", "mime": "text/html", "status": "200", "digest": "PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI", "length": "2278", "offset": "773766", "filename": "iana.warc.gz"}
org,iana)/domains 20140126200825 {"url": "http://www.iana.org/domains", "mime": "text/html", "status": "200", "digest": "7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7", "length": "2912", "offset": "610534", "filename": "iana.warc.gz"}
org,iana)/domains/arpa 20140126201248 {"url": "http://www.iana.org/domains/arpa", "mime": "text/html", "status": "200", "digest": "QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT", "length": "2939", "offset": "759039", "filename": "iana.warc.gz"}
org,iana)/domains/idn-tables 20140126201127 {"url": "http://www.iana.org/domains/idn-tables", "mime": "text/html", "status": "200", "digest": "HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW", "length": "8118", "offset": "715878", "filename": "iana.warc.gz"}
org,iana)/domains/int 20140126201239 {"url": "http://www.iana.org/domains/int", "mime": "text/html", "status": "200", "digest": "X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q", "length": "2482", "offset": "746788", "filename": "iana.warc.gz"}
org,iana)/domains/reserved 20140126201054 {"url": "http://www.iana.org/domains/reserved", "mime": "text/html", "status": "200", "digest": "R5AAEQX5XY5X5DG66B23ODN5DUBWRA27", "length": "3573", "offset": "701457", "filename": "iana.warc.gz"}
org,iana)/domains/root 20140126200912 {"url": "http://www.iana.org/domains/root", "mime": "text/html", "status": "200", "digest": "YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX", "length": "2691", "offset": "657746", "filename": "iana.warc.gz"}
org,iana)/domains/root/db 20140126200927 {"url": "http://www.iana.org/domains/root/db/", "mime": "text/html", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "446", "offset": "671278", "filename": "iana.warc.gz"}
org,iana)/domains/root/db 20140126200928 {"url": "http://www.iana.org/domains/root/db", "mime": "text/html", "status": "200", "digest": "DHXA725IW5VJJFRTWBQT6BEZKRE7H57S", "length": "18365", "offset": "672225", "filename": "iana.warc.gz"}
org,iana)/domains/root/servers 20140126201227 {"url": "http://www.iana.org/domains/root/servers", "mime": "text/html", "status": "200", "digest": "AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU", "length": "3137", "offset": "733840", "filename": "iana.warc.gz"}
org,iana)/numbers 20140126200651 {"url": "http://www.iana.org/numbers", "mime": "text/html", "status": "200", "digest": "HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK", "length": "3498", "offset": "321385", "filename": "iana.warc.gz"}
org,iana)/performance/ietf-draft-status 20140126200815 {"url": "http://www.iana.org/performance/ietf-draft-status", "mime": "text/html", "status": "200", "digest": "T5IQTX6DWV5KABGH454CYEDWKRI5Y23E", "length": "2940", "offset": "597667", "filename": "iana.warc.gz"}
org,iana)/performance/ietf-statistics 20140126200804 {"url": "http://www.iana.org/performance/ietf-statistics", "mime": "text/html", "status": "200", "digest": "XOFML5WNBQMTSULLIIPLSP6U5MX33HN6", "length": "3712", "offset": "582987", "filename": "iana.warc.gz"}
org,iana)/protocols 20140126200715 {"url": "http://www.iana.org/protocols", "mime": "text/html", "status": "200", "digest": "IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT", "length": "63663", "offset": "496277", "filename": "iana.warc.gz"}
org,iana)/time-zones 20140126200737 {"url": "http://www.iana.org/time-zones", "mime": "text/html", "status": "200", "digest": "4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R", "length": "2449", "offset": "569675", "filename": "iana.warc.gz"}
BIN
testdata/iana.warc.gz
vendored
Normal file
Binary file not shown.
3
testdata/post-test.cdxj
vendored
Normal file
@@ -0,0 +1,3 @@
org,httpbin)/post?foo=bar&test=abc 20140610000859 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M532K5WS4GY2H4OVZO6HRPOP47A7KDWU", "length": "720", "offset": "0", "filename": "post-test.warc.gz"}
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2", "length": "723", "offset": "1196", "filename": "post-test.warc.gz"}
org,httpbin)/post?data=^&foo=bar 20140610001255 {"url": "http://httpbin.org/post?foo=bar", "mime": "application/json", "status": "200", "digest": "B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ", "length": "723", "offset": "2395", "filename": "post-test.warc.gz"}
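The keys above fold the POST body into the query string, so otherwise-identical POSTs to http://httpbin.org/post stay distinguishable at lookup time (the data=^ entry suggests a body that could not be decoded as form fields). A rough sketch of that kind of key construction -- the merge behavior is assumed here, not pywb's exact routine:

    from six.moves.urllib.parse import parse_qsl, urlencode, urlsplit

    def post_lookup_query(url, body):
        # Sketch: merge a form-encoded POST body into the URL's query so
        # the pair can be keyed like a GET. Assumed behavior for illustration.
        try:
            body_params = parse_qsl(body, strict_parsing=True)
        except ValueError:
            body_params = [('data', '^')]   # opaque-body placeholder (assumption)

        query = parse_qsl(urlsplit(url).query) + body_params
        return urlencode(sorted(query))

    print(post_lookup_query('http://httpbin.org/post', 'test=abc&foo=bar'))
    # -> foo=bar&test=abc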
BIN
testdata/post-test.warc.gz
vendored
Normal file
Binary file not shown.
2
testdata/url-agnost-example.cdxj
vendored
Normal file
@@ -0,0 +1,2 @@
com,example)/ 20130729195151 {"url": "http://test@example.com/", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "591", "offset": "355", "filename": "example-url-agnostic-revisit.warc.gz"}
org,iana,example)/ 20130702195402 {"url": "http://example.iana.org/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1001", "offset": "353", "filename": "example-url-agnostic-orig.warc.gz"}
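These two fixture entries share the payload digest B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A while pointing at different URLs, which is exactly the case url-agnostic revisit resolution has to handle: the revisit record carries no body, so the original capture must be found by digest rather than by URL. A sketch of that resolution over already-parsed entries (assumed shape, not pywb's actual index code):

    def find_original(revisit, entries):
        # entries: parsed cdxj dicts from an index lookup (assumed input)
        for e in entries:
            if (e.get('digest') == revisit.get('digest')
                    and e.get('mime') != 'warc/revisit'):
                return e    # the original capture holding the actual payload
        return None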
0
urlrewrite/__init__.py
Normal file
153
urlrewrite/cookies.py
Normal file
@@ -0,0 +1,153 @@
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter
from pywb.utils.timeutils import datetime_to_http_date

from six.moves.http_cookiejar import CookieJar, DefaultCookiePolicy
from six.moves import zip

import redis

import tldextract
import time
import datetime
import six


# =============================================================================
class CookieTracker(object):
    def __init__(self, redis, expire_time=120):
        self.redis = redis
        self.expire_time = expire_time

    def get_rewriter(self, url_rewriter, cookie_key):
        return DomainCacheCookieRewriter(url_rewriter, self, cookie_key)

    def get_cookie_headers(self, url, cookie_key):
        subds = self.get_subdomains(url)

        if not subds:
            return None, None

        with redis.utils.pipeline(self.redis) as pi:
            for domain in subds:
                pi.hgetall(cookie_key + '.' + domain)

            all_res = pi.execute()

        cookies = []
        set_cookies = []

        with redis.utils.pipeline(self.redis) as pi:
            for res, domain in zip(all_res, subds):
                if not res:
                    continue

                for n, v in six.iteritems(res):
                    n = n.decode('utf-8')
                    v = v.decode('utf-8')
                    full = n + '=' + v
                    cookies.append(full.split(';')[0])
                    set_cookies.append(('Set-Cookie',
                                        full + '; Max-Age=' + str(self.expire_time)))

                pi.expire(cookie_key + '.' + domain, self.expire_time)

        cookies = ';'.join(cookies)
        return cookies, set_cookies

    def add_cookie(self, cookie_key, domain, name, value):
        if domain[0] != '.':
            domain = '.' + domain

        with redis.utils.pipeline(self.redis) as pi:
            pi.hset(cookie_key + domain, name, value)
            pi.expire(cookie_key + domain, self.expire_time)

    @staticmethod
    def get_subdomains(url):
        tld = tldextract.extract(url)

        if not tld.subdomain:
            return None

        main = tld.domain + '.' + tld.suffix
        full = tld.subdomain + '.' + main

        def get_all_subdomains(main, full):
            # strip one leading label at a time down to the registered domain
            doms = []
            while main != full:
                full = full.split('.', 1)[1]
                doms.append(full)

            return doms

        all_subs = get_all_subdomains(main, full)
        return all_subs


# =============================================================================
class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
    def __init__(self, url_rewriter, cookie_tracker, cookie_key):
        super(DomainCacheCookieRewriter, self).__init__(url_rewriter)
        self.cookie_tracker = cookie_tracker
        self.cookie_key = cookie_key

    def rewrite_cookie(self, name, morsel):
        # if domain set, no choice but to expand cookie path to root
        domain = morsel.pop('domain', '')

        if domain:
            string = morsel.value
            if morsel.get('path'):
                string += '; Path=' + morsel.get('path')

            if morsel.get('httponly'):
                string += '; HttpOnly'

            if morsel.get('secure'):
                string += '; Secure'

            self.cookie_tracker.add_cookie(self.cookie_key,
                                           domain,
                                           morsel.key,
                                           string)

        # else set cookie to rewritten path
        if morsel.get('path'):
            morsel['path'] = self.url_rewriter.rewrite(morsel['path'])

        return morsel

    def get_expire_sec(self, morsel):
        if morsel.get('max-age'):
            return int(morsel['max-age'])

        expires = morsel.get('expires')
        if not expires:
            return None

        expires = expires.replace(' UTC', ' GMT')

        # try the two common Expires formats; give up cleanly on anything else
        # (previously an unparsed string could reach time.mktime and raise)
        parsed = None
        for fmt in ('%a, %d-%b-%Y %H:%M:%S GMT', '%a, %d %b %Y %H:%M:%S GMT'):
            try:
                parsed = time.strptime(expires, fmt)
                break
            except ValueError:
                continue

        if parsed is None:
            return None

        return time.mktime(parsed) - time.timezone - time.time()


# ============================================================================
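CookieTracker caches Set-Cookie state per subdomain in short-lived redis hashes, so cookies scoped to a parent domain can be replayed to sibling subdomains. A wiring sketch, assuming a local redis and an arbitrary per-recording key prefix (both illustrative, not fixed API):

    import redis as redis_mod

    # assumptions: local redis at the default port; the key prefix is arbitrary
    tracker = CookieTracker(redis_mod.StrictRedis('localhost'), expire_time=120)

    cookie_key = 'c:some-user:some-coll'
    tracker.add_cookie(cookie_key, '.example.com', 'sessid', 'abc123; Path=/')

    cookies, set_cookies = tracker.get_cookie_headers('http://www.example.com/page',
                                                      cookie_key)
    # `cookies` joins matching name=value pairs for a Cookie header;
    # `set_cookies` re-emits them as Set-Cookie tuples with a Max-Age.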
99
urlrewrite/platformhandler.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from gevent.monkey import patch_all; patch_all()

import requests

from pywb.framework.archivalrouter import Route

from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject

from io import BytesIO

from rewriteinputreq import RewriteInputRequest

from six.moves.urllib.parse import quote


# ============================================================================
class PlatformRoute(Route):
    def apply_filters(self, wbrequest, matcher):
        wbrequest.matchdict = matcher.groupdict()


# ============================================================================
class PlatformHandler(RewriteHandler):
    def __init__(self, config):
        super(PlatformHandler, self).__init__(config)
        self.upstream_url = config.get('upstream_url')
        self.loader = ArcWarcRecordLoader()

        framed = config.get('framed_replay')
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

    def render_content(self, wbrequest):
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url

        urlkey = canonicalize(wbrequest.wb_url.url)
        url = wbrequest.wb_url.url

        inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if wbrequest.wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wbrequest.wb_url.timestamp

        upstream_url = self.upstream_url.format(url=quote(url),
                                                closest=closest,
                                                **wbrequest.matchdict)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True,
                          allow_redirects=False)

        r.raise_for_status()

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        return self._make_response(wbrequest, *result)


if __name__ == "__main__":
    from gevent.wsgi import WSGIServer
    from pywb.apps.wayback import application

    server = WSGIServer(('', 8090), application)
    server.serve_forever()
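For context, a plausible config for PlatformHandler; the host, port, and {coll} match group are assumptions (the actual values depend on the route regex and the webagg deployment), not taken from this commit:

config = {
    'upstream_url': ('http://localhost:8080/{coll}/resource/postreq'
                     '?url={url}&closest={closest}'),
    'framed_replay': True,
}
handler = PlatformHandler(config)  # render_content() POSTs reconstructed requests upstream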
134
urlrewrite/rewriteinputreq.py
Normal file
@ -0,0 +1,134 @@
from webagg.inputrequest import DirectWSGIInputRequest
from pywb.utils.loaders import extract_client_cookie

from six import iteritems
from six.moves.urllib.parse import urlsplit
import re


#=============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
    RANGE_ARG_RX = re.compile(r'.*\.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')

    RANGE_HEADER = re.compile(r'bytes=(\d+)-(\d+)?')

    def __init__(self, env, urlkey, url, rewriter):
        super(RewriteInputRequest, self).__init__(env)
        self.urlkey = urlkey
        self.url = url
        self.rewriter = rewriter
        self.extra_cookie = None

        self.splits = urlsplit(self.url)

    def get_full_request_uri(self):
        uri = self.splits.path
        if self.splits.query:
            uri += '?' + self.splits.query

        return uri

    def get_req_headers(self):
        headers = {}

        has_cookies = False

        for name, value in iteritems(self.env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = self.splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (self.splits.scheme + '://' + self.splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                cookie_val = extract_client_cookie(self.env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_X_PYWB_REQUESTED_WITH':
                continue

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = self.splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            else:
                value = None

            if value:
                headers[name] = value

        if not has_cookies:
            value = self._req_cookie_rewrite('')
            if value:
                headers['Cookie'] = value

        if self.extra_cookie:
            headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')

        return headers

    def _req_cookie_rewrite(self, value):
        rule = self.rewriter.ruleset.get_first_match(self.urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                pass

        return value

    def extract_range(self):
        use_206 = False
        start = None
        end = None
        url = self.url

        range_h = self.env.get('HTTP_RANGE')

        if range_h:
            m = self.RANGE_HEADER.match(range_h)
            if m:
                start = m.group(1)
                end = m.group(2)
                use_206 = True

        else:
            m = self.RANGE_ARG_RX.match(url)
            if m:
                start = m.group(2)
                end = m.group(3)
                url = url[:m.start(1)] + url[m.end(1):]
                use_206 = False

        if not start:
            return None

        start = int(start)

        if end:
            end = int(end)
        else:
            end = ''

        return (url, start, end, use_206)
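A quick sketch of extract_range() on a standard Range header, assuming only that DirectWSGIInputRequest stores the environ (as defined in webagg/inputrequest.py later in this commit); the values are illustrative:

env = {'REQUEST_METHOD': 'GET', 'HTTP_RANGE': 'bytes=1000-1999'}
req = RewriteInputRequest(env, 'com,example)/', 'http://example.com/', rewriter=None)
print(req.extract_range())  # -> ('http://example.com/', 1000, 1999, True)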
425
urlrewrite/rewriterapp.py
Normal file
@ -0,0 +1,425 @@
import requests

from pywb.rewrite.rewrite_amf import RewriteContentAMF
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter

from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie

from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.framework.wbrequestresponse import WbResponse

from six.moves.urllib.parse import urlencode

from urlrewrite.rewriteinputreq import RewriteInputRequest
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView

from io import BytesIO

import gevent
import json


# ============================================================================
class UpstreamException(WbException):
    def __init__(self, status_code, url, details):
        super(UpstreamException, self).__init__(url=url, msg=details)
        self.status_code = status_code


# ============================================================================
class RewriterApp(object):
    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    def __init__(self, framed_replay=False, jinja_env=None, config=None):
        self.loader = ArcWarcRecordLoader()

        config = config or {}
        self.paths = config['url_templates']

        self.framed_replay = framed_replay
        self.frame_mod = ''
        self.replay_mod = 'mp_'

        frame_type = 'inverse' if framed_replay else False

        self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})

        self.jinja_env = jinja_env

        self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html')
        self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html')
        self.error_view = BaseInsertView(self.jinja_env, 'error.html')
        self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))

        self.cookie_tracker = None

    def call_with_params(self, **kwargs):
        def run_app(environ, start_response):
            environ['pywb.kwargs'] = kwargs
            return self(environ, start_response)

        return run_app

    def __call__(self, environ, start_response):
        wb_url = self.get_wburl(environ)
        kwargs = environ.get('pywb.kwargs', {})

        try:
            response = self.render_content(wb_url, kwargs, environ)
        except UpstreamException as ue:
            response = self.handle_error(environ, ue)

        return response(environ, start_response)

    def is_framed_replay(self, wb_url):
        return (self.framed_replay and
                wb_url.mod == self.frame_mod and
                wb_url.is_replay())

    def render_content(self, wb_url, kwargs, environ):
        wb_url = WbUrl(wb_url)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        resp = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix, kwargs)
        if resp is not None:
            content_type = 'text/html'

            # if not replay outer frame, specify utf-8 charset
            if not self.is_framed_replay(wb_url):
                content_type += '; charset=utf-8'

            return WbResponse.text_response(resp, content_type=content_type)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer(environ)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                       self.content_rewriter)

        inputreq.include_post_query(wb_url.url)

        mod_url = None
        use_206 = False
        rangeres = None

        readd_range = False
        async_record_url = None

        if kwargs.get('type') in ('record', 'patch'):
            rangeres = inputreq.extract_range()

            if rangeres:
                mod_url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wb_url.url = mod_url
                    inputreq.url = mod_url

                    del environ['HTTP_RANGE']
                    readd_range = True
                else:
                    async_record_url = mod_url

        skip = async_record_url is not None

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
                r.raw.close()
            except Exception:
                pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        if async_record_url:
            environ.pop('HTTP_RANGE', '')
            gevent.spawn(self._do_async_req,
                         inputreq,
                         async_record_url,
                         wb_url,
                         kwargs,
                         False)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url

        self._add_custom_params(cdx, r.headers, kwargs)

        if readd_range:
            content_length = (record.status_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                record.status_headers.add_range(0, content_length,
                                                content_length)
            except (ValueError, TypeError):
                pass

        if self.is_ajax(environ):
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wb_url,
                                                   full_prefix,
                                                   host_prefix,
                                                   top_url,
                                                   environ,
                                                   self.framed_replay))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx,
                                                       cookie_rewriter,
                                                       environ)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        return WbResponse(status_headers, gen)

    def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
        top_url = full_prefix
        top_url += wb_url.to_str(mod='')
        return top_url

    def _do_async_req(self, *args):
        count = 0
        r = None
        try:
            r = self._do_req(*args)
            while True:
                buff = r.raw.read(8192)
                count += len(buff)
                if not buff:
                    return
        except Exception:
            import traceback
            traceback.print_exc()

        finally:
            try:
                if r:
                    r.raw.close()
            except Exception:
                pass

    def handle_error(self, environ, ue):
        error_html = self.error_view.render_to_string(environ,
                                                      err_msg=ue.url,
                                                      err_details=ue.msg)

        return WbResponse.text_response(error_html, content_type='text/html')

    def _do_req(self, inputreq, wb_url, kwargs, skip):
        req_data = inputreq.reconstruct_request(wb_url.url)

        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if skip:
            headers['Recorder-Skip'] = '1'

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        params = {}
        params['url'] = wb_url.url
        params['closest'] = closest

        if wb_url.mod == 'vi_':
            params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        return r

    def do_query(self, wb_url, kwargs):
        params = {}
        params['url'] = wb_url.url
        params['output'] = 'json'
        params['from'] = wb_url.timestamp
        params['to'] = wb_url.end_timestamp

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)
        upstream_url = upstream_url.replace('/resource/postreq', '/index')

        r = requests.get(upstream_url)

        return r.text

    def handle_query(self, environ, wb_url, kwargs):
        res = self.do_query(wb_url, kwargs)

        def format_cdx(text):
            cdx_lines = text.rstrip().split('\n')
            for cdx in cdx_lines:
                if not cdx:
                    continue

                cdx = json.loads(cdx)
                self.process_query_cdx(cdx, wb_url, kwargs)
                yield cdx

        prefix = self.get_full_prefix(environ)

        params = dict(url=wb_url.url,
                      prefix=prefix,
                      cdx_lines=list(format_cdx(res)))

        extra_params = self.get_query_params(wb_url, kwargs)
        if extra_params:
            params.update(extra_params)

        return self.query_view.render_to_string(environ, **params)

    def process_query_cdx(self, cdx, wb_url, kwargs):
        return

    def get_query_params(self, wb_url, kwargs):
        return None

    def get_host_prefix(self, environ):
        url = environ['wsgi.url_scheme'] + '://'
        if environ.get('HTTP_HOST'):
            url += environ['HTTP_HOST']
        else:
            url += environ['SERVER_NAME']
            if environ['wsgi.url_scheme'] == 'https':
                if environ['SERVER_PORT'] != '443':
                    url += ':' + environ['SERVER_PORT']
            else:
                if environ['SERVER_PORT'] != '80':
                    url += ':' + environ['SERVER_PORT']

        return url

    def get_rel_prefix(self, environ):
        return environ.get('SCRIPT_NAME', '') + '/'

    def get_full_prefix(self, environ):
        return self.get_host_prefix(environ) + self.get_rel_prefix(environ)

    def get_wburl(self, environ):
        wb_url = environ.get('PATH_INFO', '/')[1:]
        if environ.get('QUERY_STRING'):
            wb_url += '?' + environ.get('QUERY_STRING')

        return wb_url

    def unrewrite_referrer(self, environ):
        referrer = environ.get('HTTP_REFERER')
        if not referrer:
            return False

        full_prefix = self.get_full_prefix(environ)

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            environ['HTTP_REFERER'] = WbUrl(referrer).url
            return True

        return False

    def is_ajax(self, environ):
        value = environ.get('HTTP_X_REQUESTED_WITH')
        value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        return False

    def get_base_url(self, wb_url, kwargs):
        type_ = kwargs.get('type')
        return self.paths[type_]

    def get_upstream_url(self, wb_url, kwargs, params):
        base_url = self.get_base_url(wb_url, kwargs)
        param_str = urlencode(params, True)
        if param_str:
            base_url += '&' + param_str
        return base_url

    def get_cookie_key(self, kwargs):
        raise NotImplementedError()

    def _add_custom_params(self, cdx, headers, kwargs):
        cdx['is_live'] = 'true'

    def get_top_frame_params(self, wb_url, kwargs):
        return None

    def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
        if wb_url.is_query():
            return self.handle_query(environ, wb_url, kwargs)

        if self.is_framed_replay(wb_url):
            extra_params = self.get_top_frame_params(wb_url, kwargs)
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        environ,
                                                        self.frame_mod,
                                                        self.replay_mod,
                                                        coll='',
                                                        extra_params=extra_params)

        return None
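A minimal sketch of the url_templates mapping RewriterApp expects; the ports and endpoint names are assumptions mirroring the test app later in this commit:

config = {'url_templates': {
    'live':   'http://localhost:8080/live/resource/postreq?',
    'replay': 'http://localhost:8080/replay/resource/postreq?',
}}
app = RewriterApp(framed_replay=True, config=config)
# get_upstream_url() selects the template by kwargs['type'] and
# appends the urlencoded url/closest params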
225
urlrewrite/templateview.py
Normal file
@ -0,0 +1,225 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.utils.timeutils import timestamp_now
from six.moves.urllib.parse import urlsplit

from jinja2 import Environment
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader

from webassets.ext.jinja2 import AssetsExtension
from webassets.loaders import YAMLLoader
from webassets.env import Resolver

from pkg_resources import resource_filename

import json
import os


# ============================================================================
class FileOnlyPackageLoader(PackageLoader):
    def get_source(self, env, template):
        dir_, file_ = os.path.split(template)
        return super(FileOnlyPackageLoader, self).get_source(env, file_)


# ============================================================================
class RelEnvironment(Environment):
    """Override join_path() to enable relative template paths."""
    def join_path(self, template, parent):
        return os.path.join(os.path.dirname(parent), template)


# ============================================================================
class JinjaEnv(object):
    def __init__(self, paths=['templates', '.', '/'],
                 packages=['pywb'],
                 assets_path=None,
                 globals=None,
                 overlay=None,
                 extensions=None):

        self._init_filters()

        loader = ChoiceLoader(self._make_loaders(paths, packages))

        extensions = extensions or []

        if assets_path:
            extensions.append(AssetsExtension)

        if overlay:
            jinja_env = overlay.jinja_env.overlay(loader=loader,
                                                  trim_blocks=True,
                                                  extensions=extensions)
        else:
            jinja_env = RelEnvironment(loader=loader,
                                       trim_blocks=True,
                                       extensions=extensions)

        jinja_env.filters.update(self.filters)

        if globals:
            jinja_env.globals.update(globals)

        self.jinja_env = jinja_env

        # init assets
        if assets_path:
            assets_loader = YAMLLoader(assets_path)
            assets_env = assets_loader.load_environment()
            assets_env.resolver = PkgResResolver()
            jinja_env.assets_environment = assets_env

    def _make_loaders(self, paths, packages):
        loaders = []
        # add loaders for paths
        for path in paths:
            loaders.append(FileSystemLoader(path))

        # add loaders for all specified packages
        for package in packages:
            loaders.append(FileOnlyPackageLoader(package))

        return loaders

    def template_filter(self, param=None):
        def deco(func):
            name = param or func.__name__
            self.filters[name] = func
            return func

        return deco

    def _init_filters(self):
        self.filters = {}

        @self.template_filter()
        def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
            if format_ == '%s':
                return timestamp_to_sec(value)
            else:
                value = timestamp_to_datetime(value)
                return value.strftime(format_)

        @self.template_filter('urlsplit')
        def get_urlsplit(url):
            return urlsplit(url)

        @self.template_filter()
        def tojson(obj):
            return json.dumps(obj)


# ============================================================================
class BaseInsertView(object):
    def __init__(self, jenv, insert_file, banner_file=''):
        self.jenv = jenv
        self.insert_file = insert_file
        self.banner_file = banner_file

    def render_to_string(self, env, **kwargs):
        template = self.jenv.jinja_env.get_template(self.insert_file)
        params = env.get('webrec.template_params')
        if params:
            kwargs.update(params)

        return template.render(**kwargs)


# ============================================================================
class HeadInsertView(BaseInsertView):
    def create_insert_func(self, wb_url,
                           wb_prefix,
                           host_prefix,
                           top_url,
                           env,
                           is_framed,
                           coll='',
                           include_ts=True):

        url = wb_url.get_url()

        include_wombat = not wb_url.is_banner_only

        wbrequest = {'host_prefix': host_prefix,
                     'wb_prefix': wb_prefix,
                     'wb_url': wb_url,
                     'coll': coll,
                     'env': env,
                     'options': {'is_framed': is_framed},
                     'rewrite_opts': {}
                    }

        def make_head_insert(rule, cdx):
            return (self.render_to_string(env, wbrequest=wbrequest,
                                          cdx=cdx,
                                          top_url=top_url,
                                          include_ts=include_ts,
                                          include_wombat=include_wombat,
                                          banner_html=self.banner_file,
                                          rule=rule))
        return make_head_insert


# ============================================================================
class TopFrameView(BaseInsertView):
    def get_top_frame(self, wb_url,
                      wb_prefix,
                      host_prefix,
                      env,
                      frame_mod,
                      replay_mod,
                      coll='',
                      extra_params=None):

        embed_url = wb_url.to_str(mod=replay_mod)

        if wb_url.timestamp:
            timestamp = wb_url.timestamp
        else:
            timestamp = timestamp_now()

        wbrequest = {'host_prefix': host_prefix,
                     'wb_prefix': wb_prefix,
                     'wb_url': wb_url,
                     'coll': coll,

                     'options': {'frame_mod': frame_mod,
                                 'replay_mod': replay_mod},
                    }

        params = dict(embed_url=embed_url,
                      wbrequest=wbrequest,
                      timestamp=timestamp,
                      url=wb_url.get_url(),
                      banner_html=self.banner_file)

        if extra_params:
            params.update(extra_params)

        return self.render_to_string(env, **params)


# ============================================================================
class PkgResResolver(Resolver):
    def get_pkg_path(self, item):
        if not isinstance(item, str):
            return None

        parts = urlsplit(item)
        if parts.scheme == 'pkg' and parts.netloc:
            return (parts.netloc, parts.path)

        return None

    def resolve_source(self, ctx, item):
        pkg = self.get_pkg_path(item)
        if pkg:
            filename = resource_filename(pkg[0], pkg[1])
            if filename:
                return filename

        return super(PkgResResolver, self).resolve_source(ctx, item)
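A small usage sketch of the built-in format_ts filter registered above; the template string is illustrative:

jenv = JinjaEnv(globals={'static_path': 'static/__pywb'})
tmpl = jenv.jinja_env.from_string('{{ "20170102030405" | format_ts("%Y-%m-%d") }}')
print(tmpl.render())  # -> 2017-01-02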
0
urlrewrite/test/__init__.py
Normal file
74
urlrewrite/test/simpleapp.py
Normal file
@ -0,0 +1,74 @@
from gevent.monkey import patch_all; patch_all()

from bottle import run, Bottle, request, response, debug

from six.moves.urllib.parse import quote

from pywb.utils.loaders import LocalFileLoader

import mimetypes
import redis

from urlrewrite.rewriterapp import RewriterApp
from urlrewrite.cookies import CookieTracker


# ============================================================================
class RWApp(RewriterApp):
    def __init__(self, upstream_urls, cookie_key_templ, redis):
        config = {}
        config['url_templates'] = upstream_urls

        self.cookie_key_templ = cookie_key_templ
        self.app = Bottle()
        self.block_loader = LocalFileLoader()
        self.init_routes()

        super(RWApp, self).__init__(True, config=config)

        self.cookie_tracker = CookieTracker(redis)

        self.orig_error_handler = self.app.default_error_handler
        self.app.default_error_handler = self.err_handler

    def err_handler(self, exc):
        print(exc)
        import traceback
        traceback.print_exc()
        return self.orig_error_handler(exc)

    def get_cookie_key(self, kwargs):
        return self.cookie_key_templ.format(**kwargs)

    def init_routes(self):
        @self.app.get('/static/__pywb/<filepath:path>')
        def server_static(filepath):
            data = self.block_loader.load('pywb/static/' + filepath)
            guessed = mimetypes.guess_type(filepath)
            if guessed[0]:
                response.headers['Content-Type'] = guessed[0]

            return data

        self.app.mount('/live/', self.call_with_params(type='live'))
        self.app.mount('/record/', self.call_with_params(type='record'))
        self.app.mount('/replay/', self.call_with_params(type='replay'))

    @staticmethod
    def create_app(replay_port=8080, record_port=8010):
        upstream_urls = {'live': 'http://localhost:%s/live/resource/postreq?' % replay_port,
                         'record': 'http://localhost:%s/live/resource/postreq?' % record_port,
                         'replay': 'http://localhost:%s/replay/resource/postreq?' % replay_port,
                        }

        r = redis.StrictRedis.from_url('redis://localhost/2')
        rwapp = RWApp(upstream_urls, 'cookies:', r)
        return rwapp


# ============================================================================
if __name__ == "__main__":
    application = RWApp.create_app()
    application.app.run(port=8090, server='gevent')
43
urlrewrite/test/test_rewriter.py
Normal file
@ -0,0 +1,43 @@
from webagg.test.testutils import LiveServerTests, BaseTestClass
from webagg.test.testutils import FakeRedisTests

from .simpleapp import RWApp, debug

import os
import webtest


class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
    @classmethod
    def setup_class(cls):
        super(TestRewriter, cls).setup_class()

        cls.app = RWApp.create_app(replay_port=cls.server.port)
        cls.testapp = webtest.TestApp(cls.app.app)
        debug(True)

    def test_replay(self):
        resp = self.testapp.get('/live/mp_/http://example.com/')
        resp.charset = 'utf-8'

        assert '"http://localhost:80/live/mp_/http://www.iana.org/domains/example"' in resp.text

        assert 'wbinfo.url = "http://example.com/"' in resp.text

    def test_top_frame(self):
        resp = self.testapp.get('/live/http://example.com/')
        resp.charset = 'utf-8'

        assert '"http://localhost:80/live/mp_/http://example.com/"' in resp.text

        assert 'wbinfo.capture_url = "http://example.com/"' in resp.text

    def test_cookie_track_1(self):
        resp = self.testapp.get('/live/mp_/https://twitter.com/')

        assert resp.headers['set-cookie'] is not None
18
urlrewrite/test/uwsgi.ini
Normal file
@ -0,0 +1,18 @@
[uwsgi]
if-not-env = PORT
http-socket = :8090
endif =

master = true
buffer-size = 65536
die-on-term = true

if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =

gevent = 100

wsgi = urlrewrite.test.simpleapp
14
webagg/Dockerfile
Normal file
@ -0,0 +1,14 @@
FROM python:3.5

WORKDIR /code/

RUN pip install -U git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.30.0-develop
RUN pip install uwsgi gevent bottle

ADD . /code/webagg/
ADD ./test/ /code/test/

WORKDIR /code/
CMD uwsgi /code/test/live.ini
6
webagg/README.rst
Normal file
@ -0,0 +1,6 @@
Resource Memento/Aggregator
===========================

This is a reference implementation of the `Resource/Memento Aggregator <https://github.com/webrecorder/platform-spec/wiki/ResourceMementoAggregator>`_
from the `Webrecorder Platform <https://github.com/webrecorder/platform-spec/wiki>`_.
0
webagg/__init__.py
Normal file
287
webagg/aggregator.py
Normal file
@ -0,0 +1,287 @@
from gevent.pool import Pool
import gevent

from concurrent import futures

import json
import time
import os

from pywb.utils.timeutils import timestamp_now
from pywb.cdx.cdxops import process_cdx
from pywb.cdx.query import CDXQuery

from heapq import merge
from collections import deque
from itertools import chain

from webagg.indexsource import FileIndexSource, RedisIndexSource
from pywb.utils.wbexception import NotFoundException, WbException

from webagg.utils import ParamFormatter, res_template

import six
import glob


#=============================================================================
class BaseAggregator(object):
    def __call__(self, params):
        if params.get('closest') == 'now':
            params['closest'] = timestamp_now()

        content_type = params.get('content_type')
        if content_type:
            params['filter'] = '=mime:' + content_type

        query = CDXQuery(params)

        cdx_iter, errs = self.load_index(query.params)

        cdx_iter = process_cdx(cdx_iter, query)
        return cdx_iter, dict(errs)

    def load_child_source(self, name, source, params):
        try:
            params['_formatter'] = ParamFormatter(params, name)
            res = source.load_index(params)
            if isinstance(res, tuple):
                cdx_iter, err_list = res
            else:
                cdx_iter = res
                err_list = []
        except WbException as wbe:
            cdx_iter = iter([])
            err_list = [(name, repr(wbe))]

        def add_name(cdx, name):
            if cdx.get('source'):
                cdx['source'] = name + ':' + cdx['source']
            else:
                cdx['source'] = name
            return cdx

        return (add_name(cdx, name) for cdx in cdx_iter), err_list

    def load_index(self, params):
        res_list = self._load_all(params)

        iter_list = [res[0] for res in res_list]
        err_list = chain(*[res[1] for res in res_list])

        # optimization: if only a single entry (or empty) just load directly
        if len(iter_list) <= 1:
            cdx_iter = iter_list[0] if iter_list else iter([])
        else:
            cdx_iter = merge(*(iter_list))

        return cdx_iter, err_list

    def _on_source_error(self, name):  #pragma: no cover
        pass

    def _load_all(self, params):  #pragma: no cover
        raise NotImplementedError()

    def _iter_sources(self, params):  #pragma: no cover
        raise NotImplementedError()

    def get_source_list(self, params):
        srcs = self._iter_sources(params)
        result = [(name, str(value)) for name, value in srcs]
        result = {'sources': dict(result)}
        return result


#=============================================================================
class BaseSourceListAggregator(BaseAggregator):
    def __init__(self, sources, **kwargs):
        self.sources = sources

    def get_all_sources(self, params):
        return self.sources

    def _iter_sources(self, params):
        sources = self.get_all_sources(params)
        srcs_list = params.get('sources')
        if not srcs_list:
            return sources.items()

        sel_sources = tuple(srcs_list.split(','))

        return [(name, sources[name]) for name in sources.keys() if name in sel_sources]


#=============================================================================
class SeqAggMixin(object):
    def __init__(self, *args, **kwargs):
        super(SeqAggMixin, self).__init__(*args, **kwargs)

    def _load_all(self, params):
        sources = self._iter_sources(params)
        return [self.load_child_source(name, source, params)
                for name, source in sources]


#=============================================================================
class SimpleAggregator(SeqAggMixin, BaseSourceListAggregator):
    pass


#=============================================================================
class TimeoutMixin(object):
    def __init__(self, *args, **kwargs):
        super(TimeoutMixin, self).__init__(*args, **kwargs)
        self.t_count = kwargs.get('t_count', 3)
        self.t_dura = kwargs.get('t_duration', 20)
        self.timeouts = {}

    def is_timed_out(self, name):
        timeout_deq = self.timeouts.get(name)
        if not timeout_deq:
            return False

        the_time = time.time()
        for t in list(timeout_deq):
            if (the_time - t) > self.t_dura:
                timeout_deq.popleft()

        if len(timeout_deq) >= self.t_count:
            print('Skipping {0}, {1} timeouts in {2} seconds'.
                  format(name, self.t_count, self.t_dura))
            return True

        return False

    def _iter_sources(self, params):
        sources = super(TimeoutMixin, self)._iter_sources(params)
        for name, source in sources:
            if not self.is_timed_out(name):
                yield name, source

    def _on_source_error(self, name):
        the_time = time.time()
        if name not in self.timeouts:
            self.timeouts[name] = deque()

        self.timeouts[name].append(the_time)
        print(name + ' timed out!')


#=============================================================================
class GeventMixin(object):
    def __init__(self, *args, **kwargs):
        super(GeventMixin, self).__init__(*args, **kwargs)
        self.pool = Pool(size=kwargs.get('size'))
        self.timeout = kwargs.get('timeout', 5.0)

    def _load_all(self, params):
        params['_timeout'] = self.timeout

        sources = list(self._iter_sources(params))

        def do_spawn(name, source):
            return self.pool.spawn(self.load_child_source, name, source, params)

        jobs = [do_spawn(name, source) for name, source in sources]

        gevent.joinall(jobs, timeout=self.timeout)

        results = []
        for (name, source), job in zip(sources, jobs):
            if job.value is not None:
                results.append(job.value)
            else:
                results.append((iter([]), [(name, 'timeout')]))
                self._on_source_error(name)

        return results


#=============================================================================
class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregator):
    pass


#=============================================================================
class BaseDirectoryIndexSource(BaseAggregator):
    CDX_EXT = ('.cdx', '.cdxj')

    def __init__(self, base_prefix, base_dir=''):
        self.base_prefix = base_prefix
        self.base_dir = base_dir

    def _iter_sources(self, params):
        the_dir = res_template(self.base_dir, params)
        the_dir = os.path.join(self.base_prefix, the_dir)
        try:
            sources = list(self._load_files(the_dir))
        except Exception:
            raise NotFoundException(the_dir)

        return sources

    def _load_files(self, glob_dir):
        for the_dir in glob.iglob(glob_dir):
            for result in self._load_files_single_dir(the_dir):
                yield result

    def _load_files_single_dir(self, the_dir):
        for name in os.listdir(the_dir):
            filename = os.path.join(the_dir, name)

            if filename.endswith(self.CDX_EXT):
                print('Adding ' + filename)
                rel_path = os.path.relpath(the_dir, self.base_prefix)
                if rel_path == '.':
                    full_name = name
                else:
                    full_name = rel_path + '/' + name

                yield full_name, FileIndexSource(filename)

    def __str__(self):
        return 'file_dir'


#=============================================================================
class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
    pass


#=============================================================================
class CacheDirectoryIndexSource(DirectoryIndexSource):
    def __init__(self, *args, **kwargs):
        super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs)
        self.cached_file_list = {}

    def _load_files_single_dir(self, the_dir):
        try:
            stat = os.stat(the_dir)
        except Exception:
            stat = 0

        result = self.cached_file_list.get(the_dir)

        if result:
            last_stat, files = result
            if stat and last_stat == stat:
                print('Dir {0} unchanged'.format(the_dir))
                return files

        files = super(CacheDirectoryIndexSource, self)._load_files_single_dir(the_dir)
        files = list(files)
        self.cached_file_list[the_dir] = (stat, files)
        return files


#=============================================================================
class RedisMultiKeyIndexSource(SeqAggMixin, BaseAggregator, RedisIndexSource):
    def _iter_sources(self, params):
        redis_key_pattern = res_template(self.redis_key_template, params)

        for key in self.redis.scan_iter(match=redis_key_pattern):
            key = key.decode('utf-8')
            yield key, RedisIndexSource(None, self.redis, key)
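A sketch of querying two file-backed sources through SimpleAggregator; the cdxj paths are hypothetical, and the key/end_key binary-search bounds are computed explicitly here in case the query pipeline does not supply them:

from pywb.utils.canonicalize import calc_search_range
from webagg.aggregator import SimpleAggregator
from webagg.indexsource import FileIndexSource

agg = SimpleAggregator({'ia': FileIndexSource('ia.cdxj'),
                        'local': FileIndexSource('local.cdxj')})

start, end = calc_search_range('http://example.com/', 'exact')
cdx_iter, errs = agg({'url': 'http://example.com/',
                      'key': start.encode('utf-8'),
                      'end_key': end.encode('utf-8')})
for cdx in cdx_iter:
    print(cdx['source'], cdx['timestamp'])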
124
webagg/app.py
Normal file
@ -0,0 +1,124 @@
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException

import requests
import traceback
import json

from six.moves.urllib.parse import parse_qsl
import six

JSON_CT = 'application/json; charset=utf-8'


#=============================================================================
class ResAggApp(object):
    def __init__(self, *args, **kwargs):
        self.route_dict = {}
        self.debug = kwargs.get('debug', False)

        self.url_map = Map()

        def list_routes(environ):
            return {}, self.route_dict, {}

        self.url_map.add(Rule('/', endpoint=list_routes))

    def add_route(self, path, handler):
        def direct_input_request(environ, mode=''):
            params = self.get_query_dict(environ)
            params['mode'] = mode
            params['_input_req'] = DirectWSGIInputRequest(environ)
            return handler(params)

        def post_fullrequest(environ, mode=''):
            params = self.get_query_dict(environ)
            params['mode'] = mode
            params['_input_req'] = POSTInputRequest(environ)
            return handler(params)

        self.url_map.add(Rule(path, endpoint=direct_input_request))
        self.url_map.add(Rule(path + '/<path:mode>', endpoint=direct_input_request))

        self.url_map.add(Rule(path + '/postreq', endpoint=post_fullrequest))
        self.url_map.add(Rule(path + '/<path:mode>/postreq', endpoint=post_fullrequest))

        handler_dict = handler.get_supported_modes()

        self.route_dict[path] = handler_dict
        self.route_dict[path + '/postreq'] = handler_dict

    def get_query_dict(self, environ):
        query_str = environ.get('QUERY_STRING')
        if query_str:
            return dict(parse_qsl(query_str))
        else:
            return {}

    def __call__(self, environ, start_response):
        urls = self.url_map.bind_to_environ(environ)
        try:
            endpoint, args = urls.match()
        except HTTPException as e:
            return e(environ, start_response)

        try:
            result = endpoint(environ, **args)

            out_headers, res, errs = result

            if not res:
                return self.send_error(errs, start_response)

            if isinstance(res, dict):
                res = self.json_encode(res, out_headers)

            if errs:
                if 'last_exc' in errs:
                    errs['last_exc'] = str(errs['last_exc'])
                out_headers['ResErrors'] = json.dumps(errs)

            start_response('200 OK', list(out_headers.items()))
            return res

        except Exception as e:
            if self.debug:
                traceback.print_exc()
            message = 'Internal Error: ' + str(e)
            status = 500
            return self.send_error({}, start_response,
                                   message=message,
                                   status=status)

    def json_encode(self, res, out_headers):
        res = json.dumps(res).encode('utf-8')
        out_headers['Content-Type'] = JSON_CT
        out_headers['Content-Length'] = str(len(res))
        return [res]

    def send_error(self, errs, start_response,
                   message='No Resource Found', status=404):
        last_exc = errs.pop('last_exc', None)
        if last_exc:
            if self.debug:
                traceback.print_exc()

            status = last_exc.status()
            message = last_exc.msg

        res = {'message': message}
        if errs:
            res['errors'] = errs

        out_headers = {}
        res = self.json_encode(res, out_headers)

        if six.PY3:
            out_headers['ResErrors'] = res[0].decode('utf-8')
        else:
            out_headers['ResErrors'] = res[0]
            message = message.encode('utf-8')

        message = str(status) + ' ' + message
        start_response(message, list(out_headers.items()))
        return res
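A sketch of assembling the aggregator app from the pieces in this commit; the route name is illustrative:

from webagg.app import ResAggApp
from webagg.handlers import DefaultResourceHandler
from webagg.aggregator import SimpleAggregator
from webagg.indexsource import LiveIndexSource

app = ResAggApp(debug=True)
app.add_route('/live',
              DefaultResourceHandler(SimpleAggregator({'live': LiveIndexSource()})))
# '/live/resource/postreq' then accepts the full reconstructed requests
# POSTed by RewriterApp._do_req()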
194
webagg/handlers.py
Normal file
@ -0,0 +1,194 @@
from webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
from webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException

from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules

import six


#=============================================================================
def to_cdxj(cdx_iter, fields):
    content_type = 'text/x-cdxj'
    return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter)

def to_json(cdx_iter, fields):
    content_type = 'application/x-ndjson'
    return content_type, (cdx.to_json(fields) for cdx in cdx_iter)

def to_text(cdx_iter, fields):
    content_type = 'text/plain'
    return content_type, (cdx.to_text(fields) for cdx in cdx_iter)

def to_link(cdx_iter, fields):
    content_type = 'application/link'
    return content_type, MementoUtils.make_timemap(cdx_iter)


#=============================================================================
class FuzzyMatcher(object):
    def __init__(self):
        res = load_domain_specific_cdx_rules('pywb/rules.yaml', True)
        self.url_canon, self.fuzzy_query = res

    def __call__(self, index_source, params):
        cdx_iter, errs = index_source(params)
        return self.do_fuzzy(cdx_iter, index_source, params), errs

    def do_fuzzy(self, cdx_iter, index_source, params):
        found = False
        for cdx in cdx_iter:
            found = True
            yield cdx

        fuzzy_query_params = None
        if not found:
            query = CDXQuery(params)
            fuzzy_query_params = self.fuzzy_query(query)

        if not fuzzy_query_params:
            return

        fuzzy_query_params.pop('alt_url', '')

        new_iter, errs = index_source(fuzzy_query_params)

        for cdx in new_iter:
            yield cdx


#=============================================================================
class IndexHandler(object):
    OUTPUTS = {
        'cdxj': to_cdxj,
        'json': to_json,
        'text': to_text,
        'link': to_link,
    }

    DEF_OUTPUT = 'cdxj'

    def __init__(self, index_source, opts=None, *args, **kwargs):
        self.index_source = index_source
        self.opts = opts or {}
        self.fuzzy = FuzzyMatcher()

    def get_supported_modes(self):
        return dict(modes=['list_sources', 'index'])

    def _load_index_source(self, params):
        url = params.get('url')
        if not url:
            errs = dict(last_exc=BadRequestException('The "url" param is required'))
            return None, errs

        input_req = params.get('_input_req')
        if input_req:
            params['alt_url'] = input_req.include_post_query(url)

        return self.fuzzy(self.index_source, params)

    def __call__(self, params):
        mode = params.get('mode', 'index')
        if mode == 'list_sources':
            return {}, self.index_source.get_source_list(params), {}

        if mode != 'index':
            return {}, self.get_supported_modes(), {}

        output = params.get('output', self.DEF_OUTPUT)
        fields = params.get('fields')

        handler = self.OUTPUTS.get(output)
        if not handler:
            errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output)))
            return None, None, errs

        cdx_iter, errs = self._load_index_source(params)
        if not cdx_iter:
            return None, None, errs

        content_type, res = handler(cdx_iter, fields)
        out_headers = {'Content-Type': content_type}

        def check_str(lines):
            for line in lines:
                if isinstance(line, six.text_type):
                    line = line.encode('utf-8')
                yield line

        return out_headers, check_str(res), errs


#=============================================================================
class ResourceHandler(IndexHandler):
    def __init__(self, index_source, resource_loaders):
        super(ResourceHandler, self).__init__(index_source)
        self.resource_loaders = resource_loaders

    def get_supported_modes(self):
        res = super(ResourceHandler, self).get_supported_modes()
        res['modes'].append('resource')
        return res

    def __call__(self, params):
        if params.get('mode', 'resource') != 'resource':
            return super(ResourceHandler, self).__call__(params)

        cdx_iter, errs = self._load_index_source(params)
        if not cdx_iter:
            return None, None, errs

        last_exc = None

        for cdx in cdx_iter:
            for loader in self.resource_loaders:
                try:
                    out_headers, resp = loader(cdx, params)
                    if resp is not None:
                        return out_headers, resp, errs
                except WbException as e:
                    last_exc = e
                    errs[str(loader)] = str(e)

        if last_exc:
            errs['last_exc'] = last_exc

        return None, None, errs


#=============================================================================
class DefaultResourceHandler(ResourceHandler):
    def __init__(self, index_source, warc_paths=''):
        loaders = [WARCPathLoader(warc_paths, index_source),
                   LiveWebLoader(),
                   VideoLoader()
                  ]
        super(DefaultResourceHandler, self).__init__(index_source, loaders)


#=============================================================================
class HandlerSeq(object):
    def __init__(self, handlers):
        self.handlers = handlers

    def get_supported_modes(self):
        if self.handlers:
            return self.handlers[0].get_supported_modes()
        else:
            return {}

    def __call__(self, params):
        all_errs = {}
        for handler in self.handlers:
            out_headers, res, errs = handler(params)
            all_errs.update(errs)
            if res is not None:
                return out_headers, res, all_errs

        return None, None, all_errs
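A sketch of composing handlers with HandlerSeq: try local archive indexes first, then fall back to the live web; the paths and directory layout are hypothetical:

from webagg.handlers import DefaultResourceHandler, HandlerSeq
from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
from webagg.indexsource import LiveIndexSource

seq = HandlerSeq([
    DefaultResourceHandler(DirectoryIndexSource('./indexes/', ''), './warcs/'),
    DefaultResourceHandler(SimpleAggregator({'live': LiveIndexSource()})),
])
# register with the app as usual: app.add_route('/fallback', seq)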
226
webagg/indexsource.py
Normal file
@ -0,0 +1,226 @@
import redis
|
||||
|
||||
from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
#from webagg.liverec import patched_requests as requests
|
||||
import requests
|
||||
|
||||
from webagg.utils import ParamFormatter, res_template
|
||||
from webagg.utils import MementoUtils
|
||||
|
||||
|
||||
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class BaseIndexSource(object):
|
||||
def load_index(self, params): #pragma: no cover
|
||||
raise NotImplemented()
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class FileIndexSource(BaseIndexSource):
|
||||
def __init__(self, filename):
|
||||
self.filename_template = filename
|
||||
|
||||
def load_index(self, params):
|
||||
filename = res_template(self.filename_template, params)
|
||||
|
||||
try:
|
||||
fh = open(filename, 'rb')
|
||||
except IOError:
|
||||
raise NotFoundException(filename)
|
||||
|
||||
def do_load(fh):
|
||||
with fh:
|
||||
gen = iter_range(fh, params['key'], params['end_key'])
|
||||
for line in gen:
|
||||
yield CDXObject(line)
|
||||
|
||||
return do_load(fh)
|
||||
|
||||
def __str__(self):
|
||||
return 'file'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class RemoteIndexSource(BaseIndexSource):
|
||||
    def __init__(self, api_url, replay_url, url_field='load_url'):
        self.api_url_template = api_url
        self.replay_url = replay_url
        self.url_field = url_field

    def load_index(self, params):
        api_url = res_template(self.api_url_template, params)
        r = requests.get(api_url, timeout=params.get('_timeout'))
        if r.status_code >= 400:
            raise NotFoundException(api_url)

        lines = r.content.strip().split(b'\n')

        def do_load(lines):
            for line in lines:
                cdx = CDXObject(line)
                self._set_load_url(cdx)
                yield cdx

        return do_load(lines)

    def _set_load_url(self, cdx):
        cdx[self.url_field] = self.replay_url.format(
                                timestamp=cdx['timestamp'],
                                url=cdx['url'])

    def __str__(self):
        return 'remote'


#=============================================================================
class LiveIndexSource(BaseIndexSource):
    def __init__(self, proxy_url='{url}'):
        self.proxy_url = proxy_url

    def load_index(self, params):
        cdx = CDXObject()
        cdx['urlkey'] = params.get('key').decode('utf-8')
        cdx['timestamp'] = timestamp_now()
        cdx['url'] = params['url']
        cdx['load_url'] = res_template(self.proxy_url, params)
        cdx['is_live'] = 'true'
        cdx['mime'] = params.get('content_type', '')

        def live():
            yield cdx

        return live()

    def __str__(self):
        return 'live'


#=============================================================================
class RedisIndexSource(BaseIndexSource):
    def __init__(self, redis_url, redis=None, key_template=None):
        if redis_url and not redis:
            redis, key_template = self.parse_redis_url(redis_url)

        self.redis = redis
        self.redis_key_template = key_template

    @staticmethod
    def parse_redis_url(redis_url):
        parts = redis_url.split('/')
        key_prefix = ''
        if len(parts) > 4:
            key_prefix = parts[4]
            redis_url = 'redis://' + parts[2] + '/' + parts[3]

        red = redis.StrictRedis.from_url(redis_url)
        return red, key_prefix

    def load_index(self, params):
        return self.load_key_index(self.redis_key_template, params)

    def load_key_index(self, key_template, params):
        z_key = res_template(key_template, params)
        index_list = self.redis.zrangebylex(z_key,
                                            b'[' + params['key'],
                                            b'(' + params['end_key'])

        def do_load(index_list):
            for line in index_list:
                yield CDXObject(line)

        return do_load(index_list)

    def __str__(self):
        return 'redis'


#=============================================================================
class MementoIndexSource(BaseIndexSource):
    def __init__(self, timegate_url, timemap_url, replay_url):
        self.timegate_url = timegate_url
        self.timemap_url = timemap_url
        self.replay_url = replay_url

    def links_to_cdxobject(self, link_header, def_name):
        results = MementoUtils.parse_links(link_header, def_name)

        #meta = MementoUtils.meta_field('timegate', results)
        #if meta:
        #    yield meta

        #meta = MementoUtils.meta_field('timemap', results)
        #if meta:
        #    yield meta

        #meta = MementoUtils.meta_field('original', results)
        #if meta:
        #    yield meta

        original = results['original']['url']
        key = canonicalize(original)

        mementos = results['mementos']

        for val in mementos:
            dt = val['datetime']
            ts = http_date_to_timestamp(dt)
            cdx = CDXObject()
            cdx['urlkey'] = key
            cdx['timestamp'] = ts
            cdx['url'] = original
            cdx['mem_rel'] = val.get('rel', '')
            cdx['memento_url'] = val['url']

            load_url = self.replay_url.format(timestamp=cdx['timestamp'],
                                              url=original)

            cdx['load_url'] = load_url
            yield cdx

    def get_timegate_links(self, params, closest):
        url = res_template(self.timegate_url, params)
        accept_dt = timestamp_to_http_date(closest)
        res = requests.head(url, headers={'Accept-Datetime': accept_dt})
        if res.status_code >= 400:
            raise NotFoundException(url)

        return res.headers.get('Link')

    def get_timemap_links(self, params):
        url = res_template(self.timemap_url, params)
        res = requests.get(url, timeout=params.get('_timeout'))
        if res.status_code >= 400:
            raise NotFoundException(url)

        return res.text

    def load_index(self, params):
        closest = params.get('closest')

        if not closest:
            links = self.get_timemap_links(params)
            def_name = 'timemap'
        else:
            links = self.get_timegate_links(params, closest)
            def_name = 'timegate'

        return self.links_to_cdxobject(links, def_name)

    @staticmethod
    def from_timegate_url(timegate_url, path='link'):
        return MementoIndexSource(timegate_url + '{url}',
                                  timegate_url + 'timemap/' + path + '/{url}',
                                  timegate_url + WAYBACK_ORIG_SUFFIX)

    def __str__(self):
        return 'memento'
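For orientation, a minimal sketch of how these index sources are driven. The CDX API and replay URL templates below are hypothetical placeholders, not real endpoints; in practice the aggregator layer builds the params dict passed to load_index.

# Hypothetical usage sketch -- endpoint URLs are made up for illustration.
source = RemoteIndexSource('http://archive.example/cdx?url={url}',
                           'http://archive.example/replay/{timestamp}id_/{url}')

for cdx in source.load_index({'url': 'http://example.com/', '_timeout': 10}):
    print(cdx['timestamp'], cdx['load_url'])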
170
webagg/inputrequest.py
Normal file
@ -0,0 +1,170 @@
from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.utils.loaders import LimitReader
from pywb.utils.statusandheaders import StatusAndHeadersParser

from six.moves.urllib.parse import urlsplit, quote
from six import iteritems, StringIO
from io import BytesIO


#=============================================================================
class DirectWSGIInputRequest(object):
    def __init__(self, env):
        self.env = env

    def get_req_method(self):
        return self.env['REQUEST_METHOD'].upper()

    def get_req_protocol(self):
        return self.env['SERVER_PROTOCOL']

    def get_req_headers(self):
        headers = {}

        for name, value in iteritems(self.env):
            # will be set by requests to match actual host
            if name == 'HTTP_HOST':
                continue

            elif name.startswith('HTTP_'):
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            else:
                value = None

            if value:
                headers[name] = value

        return headers

    def get_req_body(self):
        input_ = self.env['wsgi.input']
        len_ = self._get_content_length()
        enc = self._get_header('Transfer-Encoding')

        if len_:
            data = LimitReader(input_, int(len_))
        elif enc:
            data = input_
        else:
            data = None

        return data

    def _get_content_type(self):
        return self.env.get('CONTENT_TYPE')

    def _get_content_length(self):
        return self.env.get('CONTENT_LENGTH')

    def _get_header(self, name):
        return self.env.get('HTTP_' + name.upper().replace('-', '_'))

    def include_post_query(self, url):
        if not url or self.get_req_method() != 'POST':
            return url

        mime = self._get_content_type()
        #mime = mime.split(';')[0] if mime else ''
        length = self._get_content_length()
        stream = self.env['wsgi.input']

        buffered_stream = BytesIO()

        post_query = extract_post_query('POST', mime, length, stream,
                                        buffered_stream=buffered_stream,
                                        environ=self.env)

        if post_query:
            self.env['wsgi.input'] = buffered_stream
            url = append_post_query(url, post_query)

        return url

    def get_full_request_uri(self):
        req_uri = self.env.get('REQUEST_URI')
        if req_uri and not self.env.get('SCRIPT_NAME'):
            return req_uri

        req_uri = quote(self.env.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
        query = self.env.get('QUERY_STRING')
        if query:
            req_uri += '?' + query

        return req_uri

    def reconstruct_request(self, url=None):
        buff = StringIO()
        buff.write(self.get_req_method())
        buff.write(' ')
        buff.write(self.get_full_request_uri())
        buff.write(' ')
        buff.write(self.get_req_protocol())
        buff.write('\r\n')

        headers = self.get_req_headers()

        if url:
            parts = urlsplit(url)
            buff.write('Host: ')
            buff.write(parts.netloc)
            buff.write('\r\n')

        for name, value in iteritems(headers):
            if name.lower() == 'host':
                continue

            buff.write(name)
            buff.write(': ')
            buff.write(value)
            buff.write('\r\n')

        buff.write('\r\n')
        buff = buff.getvalue().encode('latin-1')

        body = self.get_req_body()
        if body:
            buff += body.read()

        return buff


#=============================================================================
class POSTInputRequest(DirectWSGIInputRequest):
    def __init__(self, env):
        self.env = env

        parser = StatusAndHeadersParser([], verify=False)

        self.status_headers = parser.parse(self.env['wsgi.input'])

    def get_req_method(self):
        return self.status_headers.protocol

    def get_req_headers(self):
        headers = {}
        for n, v in self.status_headers.headers:
            headers[n] = v

        return headers

    def get_full_request_uri(self):
        return self.status_headers.statusline.split(' ', 1)[0]

    def get_req_protocol(self):
        return self.status_headers.statusline.split(' ', 1)[-1]

    def _get_content_type(self):
        return self.status_headers.get_header('Content-Type')

    def _get_content_length(self):
        return self.status_headers.get_header('Content-Length')

    def _get_header(self, name):
        return self.status_headers.get_header(name)
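A rough illustration of reconstruct_request on a hand-built WSGI environ; the environ values here are made up for the example, and the output comment is approximate.

from io import BytesIO

env = {'REQUEST_METHOD': 'GET',
       'SERVER_PROTOCOL': 'HTTP/1.1',
       'PATH_INFO': '/get',
       'QUERY_STRING': 'foo=bar',
       'HTTP_ACCEPT': '*/*',
       'wsgi.input': BytesIO()}

req = DirectWSGIInputRequest(env)
# roughly: b'GET /get?foo=bar HTTP/1.1\r\nHost: httpbin.org\r\nAccept: */*\r\n\r\n'
print(req.reconstruct_request(url='http://httpbin.org/get?foo=bar'))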
54
webagg/proxyindexsource.py
Normal file
@ -0,0 +1,54 @@
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from webagg.indexsource import BaseIndexSource, RemoteIndexSource
|
||||
from webagg.responseloader import LiveWebLoader
|
||||
from webagg.utils import ParamFormatter, res_template
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class UpstreamAggIndexSource(RemoteIndexSource):
|
||||
def __init__(self, base_url):
|
||||
api_url = base_url + '/index?url={url}'
|
||||
proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
|
||||
super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
|
||||
|
||||
def _set_load_url(self, cdx):
|
||||
super(UpstreamAggIndexSource, self)._set_load_url(cdx)
|
||||
cdx['offset'] = '0'
|
||||
cdx.pop('load_url', '')
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class ProxyMementoIndexSource(BaseIndexSource):
|
||||
def __init__(self, proxy_url='{url}'):
|
||||
self.proxy_url = proxy_url
|
||||
self.loader = LiveWebLoader()
|
||||
|
||||
def load_index(self, params):
|
||||
cdx = CDXObject()
|
||||
cdx['urlkey'] = params.get('key').decode('utf-8')
|
||||
|
||||
closest = params.get('closest')
|
||||
cdx['timestamp'] = closest if closest else timestamp_now()
|
||||
cdx['url'] = params['url']
|
||||
cdx['load_url'] = res_template(self.proxy_url, params)
|
||||
cdx['memento_url'] = cdx['load_url']
|
||||
return self._do_load(cdx, params)
|
||||
|
||||
def _do_load(self, cdx, params):
|
||||
result = self.loader.load_resource(cdx, params)
|
||||
if not result:
|
||||
raise NotFoundException('Not a memento: ' + cdx['url'])
|
||||
|
||||
cdx['_cached_result'] = result
|
||||
yield cdx
|
||||
|
||||
def __str__(self):
|
||||
return 'proxy'
|
||||
|
||||
@staticmethod
|
||||
def upstream_resource(base_url):
|
||||
return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}')
|
||||
|
||||
|
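For context, upstream_resource simply templates the /resource endpoint onto a base URL; the base URL below is a hypothetical example.

# Hypothetical upstream base URL, for illustration only.
upstream = ProxyMementoIndexSource.upstream_resource('http://localhost:8080/many')
print(upstream.proxy_url)
# http://localhost:8080/many/resource?url={url}&closest={closest}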
436
webagg/responseloader.py
Normal file
@ -0,0 +1,436 @@
from webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
from webagg.utils import ParamFormatter
from webagg.indexsource import RedisIndexSource

from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date

from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser

from pywb.warc.resolvingloader import ResolvingLoader

from six.moves.urllib.parse import urlsplit, quote, unquote

from io import BytesIO

import uuid
import six
import itertools
import json

from requests.models import PreparedRequest
import urllib3


#=============================================================================
class BaseLoader(object):
    def __call__(self, cdx, params):
        entry = self.load_resource(cdx, params)
        if not entry:
            return None, None

        warc_headers, other_headers, stream = entry

        out_headers = {}
        out_headers['WebAgg-Type'] = 'warc'
        out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
        out_headers['Content-Type'] = 'application/warc-record'

        if not warc_headers:
            if other_headers:
                out_headers['Link'] = other_headers.get('Link')
                out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
                out_headers['Content-Length'] = other_headers.get('Content-Length')

            return out_headers, StreamIter(stream)

        out_headers['Link'] = MementoUtils.make_link(
                                warc_headers.get_header('WARC-Target-URI'),
                                'original')

        memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
        out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

        warc_headers_buff = warc_headers.to_bytes()

        lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
                                       out_headers,
                                       len(warc_headers_buff))

        streamiter = StreamIter(stream,
                                header1=warc_headers_buff,
                                header2=other_headers)

        if not lenset:
            out_headers['Transfer-Encoding'] = 'chunked'
            streamiter = chunk_encode_iter(streamiter)

        return out_headers, streamiter

    def _set_content_len(self, content_len_str, headers, existing_len):
        # Try to set content-length, if it is available and valid
        try:
            content_len = int(content_len_str)
        except (KeyError, TypeError, ValueError):
            content_len = -1

        if content_len >= 0:
            content_len += existing_len
            headers['Content-Length'] = str(content_len)
            return True

        return False

    def raise_on_self_redirect(self, params, cdx, status_code, location_url):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop
        """
        if cdx.get('is_live'):
            return

        if not status_code.startswith('3') or status_code == '304':
            return

        request_url = params['url'].lower()
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('/'):
            host = urlsplit(cdx['url']).netloc
            location_url = host + location_url

        if request_url == location_url:
            msg = 'Self Redirect {0} -> {1}'
            msg = msg.format(request_url, location_url)
            #print(msg)
            raise LiveResourceException(msg)

    @staticmethod
    def _make_warc_id(id_=None):
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)


#=============================================================================
class PrefixResolver(object):
    def __init__(self, template):
        self.template = template

    def __call__(self, filename, cdx):
        full_path = self.template
        if hasattr(cdx, '_formatter') and cdx._formatter:
            full_path = cdx._formatter.format(full_path)

        return full_path + filename


#=============================================================================
class RedisResolver(RedisIndexSource):
    def __call__(self, filename, cdx):
        redis_key = self.redis_key_template
        if hasattr(cdx, '_formatter') and cdx._formatter:
            redis_key = cdx._formatter.format(redis_key)

        res = None

        if '*' in redis_key:
            for key in self.redis.scan_iter(redis_key):
                #key = key.decode('utf-8')
                res = self.redis.hget(key, filename)
                if res:
                    break
        else:
            res = self.redis.hget(redis_key, filename)

        if res and six.PY3:
            res = res.decode('utf-8')

        return res


#=============================================================================
class WARCPathLoader(BaseLoader):
    def __init__(self, paths, cdx_source):
        self.paths = paths
        if isinstance(paths, six.string_types):
            self.paths = [paths]

        self.resolvers = [self._make_resolver(path) for path in self.paths]

        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)

        self.headers_parser = StatusAndHeadersParser([], verify=False)

        self.cdx_source = cdx_source

    def _make_resolver(self, path):
        if hasattr(path, '__call__'):
            return path

        if path.startswith('redis://'):
            return RedisResolver(path)

        else:
            return PrefixResolver(path)

    def load_resource(self, cdx, params):
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, errs = self.cdx_source(local_params)
            for cdx in cdx_iter:
                cdx._formatter = formatter
                yield cdx

        failed_files = []
        headers, payload = (self.resolve_loader.
                            load_headers_and_payload(cdx,
                                                     failed_files,
                                                     local_index_query))

        status = cdx.get('status')
        if not status or status.startswith('3'):
            status_headers = self.headers_parser.parse(payload.stream)
            self.raise_on_self_redirect(params, cdx,
                                        status_headers.get_statuscode(),
                                        status_headers.get_header('Location'))
            http_headers_buff = status_headers.to_bytes()
        else:
            http_headers_buff = None

        warc_headers = payload.rec_headers

        if headers != payload:
            warc_headers.replace_header('WARC-Refers-To-Target-URI',
                                        payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header('WARC-Refers-To-Date',
                                        payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header('WARC-Target-URI',
                                        headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header('WARC-Date',
                                        headers.rec_headers.get_header('WARC-Date'))

            headers.stream.close()

        return (warc_headers, http_headers_buff, payload.stream)

    def __str__(self):
        return 'WARCPathLoader'


#=============================================================================
class LiveWebLoader(BaseLoader):
    SKIP_HEADERS = ('link',
                    'memento-datetime',
                    'content-location',
                    'x-archive')

    def __init__(self):
        self.num_retries = 3
        self.num_pools = 10
        self.num_conn_per_pool = 10

        self.pool = urllib3.PoolManager(num_pools=self.num_pools,
                                        maxsize=self.num_conn_per_pool)

    def load_resource(self, cdx, params):
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        if params.get('content_type') == VideoLoader.CONTENT_TYPE:
            return None

        input_req = params['_input_req']

        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        p = PreparedRequest()
        p.prepare_url(load_url, None)
        p.prepare_headers(None)
        p.prepare_auth(None, load_url)

        auth = p.headers.get('Authorization')
        if auth:
            req_headers['Authorization'] = auth

        load_url = p.url

        try:
            upstream_res = self.pool.urlopen(method=method,
                                             url=load_url,
                                             body=data,
                                             headers=req_headers,
                                             redirect=False,
                                             assert_same_host=False,
                                             preload_content=False,
                                             decode_content=False,
                                             retries=self.num_retries,
                                             timeout=params.get('_timeout'))

        except Exception as e:
            raise LiveResourceException(load_url)

        memento_dt = upstream_res.headers.get('Memento-Datetime')
        if memento_dt:
            dt = http_date_to_datetime(memento_dt)
            cdx['timestamp'] = datetime_to_timestamp(dt)
        elif cdx.get('memento_url'):
            # if 'memento_url' set and no Memento-Datetime header present
            # then it's an error
            return None

        agg_type = upstream_res.headers.get('WebAgg-Type')
        if agg_type == 'warc':
            cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll'))
            return None, upstream_res.headers, upstream_res

        self.raise_on_self_redirect(params, cdx,
                                    str(upstream_res.status),
                                    upstream_res.headers.get('Location'))

        if upstream_res.version == 11:
            version = '1.1'
        else:
            version = '1.0'

        status = 'HTTP/{version} {status} {reason}\r\n'
        status = status.format(version=version,
                               status=upstream_res.status,
                               reason=upstream_res.reason)

        http_headers_buff = status

        orig_resp = upstream_res._original_response

        try:  #pragma: no cover
            #PY 3
            resp_headers = orig_resp.headers._headers
            for n, v in resp_headers:
                if n.lower() in self.SKIP_HEADERS:
                    continue

                http_headers_buff += n + ': ' + v + '\r\n'
        except:  #pragma: no cover
            #PY 2
            resp_headers = orig_resp.msg.headers
            for n, v in zip(orig_resp.getheaders(), resp_headers):
                if n in self.SKIP_HEADERS:
                    continue

                http_headers_buff += v

        http_headers_buff += '\r\n'
        http_headers_buff = http_headers_buff.encode('latin-1')

        try:
            fp = upstream_res._fp.fp
            if hasattr(fp, 'raw'):  #pragma: no cover
                fp = fp.raw
            remote_ip = fp._sock.getpeername()[0]
        except:  #pragma: no cover
            remote_ip = None

        warc_headers = {}

        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
        if remote_ip:
            warc_headers['WARC-IP-Address'] = remote_ip

        warc_headers['Content-Type'] = 'application/http; msgtype=response'

        self._set_content_len(upstream_res.headers.get('Content-Length', -1),
                              warc_headers,
                              len(http_headers_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res)

    def __str__(self):
        return 'LiveWebLoader'


#=============================================================================
class VideoLoader(BaseLoader):
    CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    def __init__(self):
        try:
            from youtube_dl import YoutubeDL
        except ImportError:
            self.ydl = None
            return

        self.ydl = YoutubeDL(dict(simulate=True,
                                  youtube_include_dash_manifest=False))

        self.ydl.add_default_info_extractors()

    def load_resource(self, cdx, params):
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        if params.get('content_type') != self.CONTENT_TYPE:
            return None

        if not self.ydl:
            return None

        info = self.ydl.extract_info(load_url)
        info_buff = json.dumps(info)
        info_buff = info_buff.encode('utf-8')

        warc_headers = {}

        schema, rest = load_url.split('://', 1)
        target_url = 'metadata://' + rest

        dt = timestamp_to_datetime(cdx['timestamp'])

        warc_headers['WARC-Type'] = 'metadata'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = target_url
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
        warc_headers['Content-Type'] = self.CONTENT_TYPE
        warc_headers['Content-Length'] = str(len(info_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())

        return warc_headers, None, BytesIO(info_buff)
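_set_content_len drives the chunked-vs-length decision in BaseLoader.__call__; a quick check of its behavior (the values below are made up):

loader = BaseLoader()

hdrs = {}
# Valid upstream length: extended by the serialized WARC header size and set.
assert loader._set_content_len('100', hdrs, 20) is True
assert hdrs['Content-Length'] == '120'

# Missing or invalid length: caller falls back to Transfer-Encoding: chunked.
assert loader._set_content_len(None, {}, 20) is False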
0
webagg/test/__init__.py
Normal file
17
webagg/test/live.ini
Normal file
@ -0,0 +1,17 @@
[uwsgi]
if-not-env = PORT
http-socket = :8080
endif =

master = true
buffer-size = 65536
die-on-term = true

if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =

gevent = 100
gevent-monkey-patch =

wsgi = webagg.test.live
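Assuming uwsgi and gevent are installed, this config can be launched with `uwsgi webagg/test/live.ini`, which serves the `webagg.test.live` app below on port 8080 (or on `$PORT` if that variable is set).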
44
webagg/test/live.py
Normal file
@ -0,0 +1,44 @@
from gevent.monkey import patch_all; patch_all()

from webagg.test.testutils import LiveServerTests
from webagg.handlers import DefaultResourceHandler
from webagg.app import ResAggApp
from webagg.indexsource import LiveIndexSource, RedisIndexSource
from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource

def simpleapp():
    app = ResAggApp(debug=True)
    app.add_route('/live',
                  DefaultResourceHandler(SimpleAggregator(
                      {'live': LiveIndexSource()})))

    app.add_route('/replay',
                  DefaultResourceHandler(SimpleAggregator(
                      {'replay': RedisIndexSource('redis://localhost/2/rec:cdxj')}),
                      'redis://localhost/2/rec:warc'))

    app.add_route('/replay-testdata',
                  DefaultResourceHandler(SimpleAggregator(
                      {'test': CacheDirectoryIndexSource('./testdata/')}),
                      './testdata/'))

    return app


application = simpleapp()


if __name__ == "__main__":
#    from bottle import run
#    run(application, server='gevent', port=8080, fast=True)

    from gevent.wsgi import WSGIServer
    server = WSGIServer(('', 8080), application)
    server.serve_forever()
216
webagg/test/test_dir_agg.py
Normal file
@ -0,0 +1,216 @@
import tempfile
import os
import shutil
import json

from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass

from mock import patch

import time

from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
from webagg.aggregator import SimpleAggregator
from webagg.indexsource import MementoIndexSource


#=============================================================================
linkheader = """\
<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
"""


def mock_link_header(*args, **kwargs):
    return linkheader


class TestDirAgg(TempDirTests, BaseTestClass):
    @classmethod
    def setup_class(cls):
        super(TestDirAgg, cls).setup_class()
        coll_A = to_path(cls.root_dir + '/colls/A/indexes')
        coll_B = to_path(cls.root_dir + '/colls/B/indexes')
        coll_C = to_path(cls.root_dir + '/colls/C/indexes')

        os.makedirs(coll_A)
        os.makedirs(coll_B)
        os.makedirs(coll_C)

        dir_prefix = to_path(cls.root_dir)
        dir_path = 'colls/{coll}/indexes'

        shutil.copy(to_path('testdata/example.cdxj'), coll_A)
        shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
        shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)

        with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
            fh.write('foo')

        cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
        cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)

    def test_agg_no_coll_set(self):
        res, errs = self.dir_loader(dict(url='example.com/'))
        assert(to_json_list(res) == [])
        assert(errs == {})

    def test_agg_collA_found(self):
        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})

        exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_collB(self):
        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'B'})

        exp = []

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_collB_found(self):
        res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})

        exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_extra_agg_collB(self):
        agg_source = SimpleAggregator({'dir': self.dir_loader})
        res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})

        exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_all_found_1(self):
        res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})

        exp = [
            {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
            {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
            {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
        ]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_all_found_2(self):
        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})

        exp = [
            {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
            {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
        ]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    @patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
    def test_agg_dir_and_memento(self):
        sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
                   'local': self.dir_loader}
        agg_source = SimpleAggregator(sources)

        res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})

        exp = [
            {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
            {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
            {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
            {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
            {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
        ]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_no_dir_1(self):
        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'X'})

        exp = []

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_no_dir_2(self):
        loader = DirectoryIndexSource(self.root_dir, '')
        res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})

        exp = []

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_dir_sources_1(self):
        res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
        exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                           'colls/B/indexes/iana.cdxj': 'file',
                           'colls/C/indexes/dupes.cdxj': 'file'}
              }

        assert(res == exp)

    def test_agg_dir_sources_2(self):
        res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
        exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                           'colls/C/indexes/dupes.cdxj': 'file'}
              }

        assert(res == exp)

    def test_agg_dir_sources_single_dir(self):
        loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '')
        res = loader.get_source_list({'url': 'example.com/'})

        exp = {'sources': {'example.cdxj': 'file'}}

        assert(res == exp)

    def test_agg_dir_sources_not_found_dir(self):
        loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'Z', 'indexes'), '')
        res = loader.get_source_list({'url': 'example.com/'})

        exp = {'sources': {}}

        assert(res == exp)

    def test_cache_dir_sources_1(self):
        exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                           'colls/B/indexes/iana.cdxj': 'file',
                           'colls/C/indexes/dupes.cdxj': 'file'}
              }

        res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
        assert(res == exp)

        res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
        assert(res == exp)

        new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj')

        with open(new_file, 'a') as fh:
            os.utime(new_file, None)

        res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})

        # New File Included
        exp['sources']['colls/C/indexes/empty.cdxj'] = 'file'
        assert(res == exp)
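The 'param.coll' values in these tests select which collection directory the 'colls/{coll}/indexes' template expands to; roughly, with plain str.format (the real substitution goes through the aggregator's ParamFormatter, and '*' triggers a glob over all collections):

dir_path = 'colls/{coll}/indexes'
print(dir_path.format(coll='A'))  # colls/A/indexes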
463
webagg/test/test_handlers.py
Normal file
@ -0,0 +1,463 @@
#from gevent import monkey; monkey.patch_all(thread=False)

from collections import OrderedDict

from webagg.handlers import DefaultResourceHandler, HandlerSeq

from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
from webagg.aggregator import DirectoryIndexSource

from webagg.app import ResAggApp
from webagg.utils import MementoUtils

from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO
from six.moves.urllib.parse import urlencode

import webtest
from fakeredis import FakeStrictRedis

from .testutils import to_path, FakeRedisTests, BaseTestClass

import json

sources = {
    'local': DirectoryIndexSource(to_path('testdata/'), ''),
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
    'live': LiveIndexSource(),
}


class TestResAgg(FakeRedisTests, BaseTestClass):
    def setup_class(cls):
        super(TestResAgg, cls).setup_class()

        live_source = SimpleAggregator({'live': LiveIndexSource()})
        live_handler = DefaultResourceHandler(live_source)
        app = ResAggApp()
        app.add_route('/live', live_handler)

        source1 = GeventTimeoutAggregator(sources)
        handler1 = DefaultResourceHandler(source1, to_path('testdata/'))
        app.add_route('/many', handler1)

        source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))})
        handler2 = DefaultResourceHandler(source2, to_path('testdata/'))
        app.add_route('/posttest', handler2)

        source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
        handler3 = DefaultResourceHandler(source3, to_path('testdata/'))

        app.add_route('/fallback', HandlerSeq([handler3,
                                               handler2,
                                               live_handler]))

        app.add_route('/seq', HandlerSeq([handler3,
                                          handler2]))

        app.add_route('/allredis', DefaultResourceHandler(source3, 'redis://localhost/2/test:warc'))

        app.add_route('/empty', HandlerSeq([]))
        app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))

        url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(to_path('testdata/url-agnost-example.cdxj'))})
        app.add_route('/urlagnost', DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc'))

        cls.testapp = webtest.TestApp(app)

    def _check_uri_date(self, resp, uri, dt):
        buff = BytesIO(resp.body)
        buff = ChunkedDataReader(buff)
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
        assert status_headers.get_header('WARC-Target-URI') == uri
        if dt == True:
            assert status_headers.get_header('WARC-Date') != ''
        else:
            assert status_headers.get_header('WARC-Date') == dt

    def test_list_routes(self):
        resp = self.testapp.get('/')
        res = resp.json
        assert set(res.keys()) == set(['/empty', '/empty/postreq',
                                       '/fallback', '/fallback/postreq',
                                       '/live', '/live/postreq',
                                       '/many', '/many/postreq',
                                       '/posttest', '/posttest/postreq',
                                       '/seq', '/seq/postreq',
                                       '/allredis', '/allredis/postreq',
                                       '/urlagnost', '/urlagnost/postreq',
                                       '/invalid', '/invalid/postreq'])

        assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']}

    def test_list_handlers(self):
        resp = self.testapp.get('/many')
        assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
        assert 'ResErrors' not in resp.headers

        resp = self.testapp.get('/many/other')
        assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
        assert 'ResErrors' not in resp.headers

    def test_list_errors(self):
        # must specify url for index or resource
        resp = self.testapp.get('/many/index', status=400)
        assert resp.json == {'message': 'The "url" param is required'}
        assert resp.text == resp.headers['ResErrors']

        resp = self.testapp.get('/many/index', status=400)
        assert resp.json == {'message': 'The "url" param is required'}
        assert resp.text == resp.headers['ResErrors']

        resp = self.testapp.get('/many/resource', status=400)
        assert resp.json == {'message': 'The "url" param is required'}
        assert resp.text == resp.headers['ResErrors']

    def test_list_sources(self):
        resp = self.testapp.get('/many/list_sources')
        assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
        assert 'ResErrors' not in resp.headers

    def test_live_index(self):
        resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=json')
        resp.charset = 'utf-8'

        cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')])
        cdxlist[0]['timestamp'] = '2016'
        assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
                            'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])

    def test_live_resource(self):
        headers = {'foo': 'bar'}
        resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_live_post_resource(self):
        resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
                                 OrderedDict([('foo', 'bar')]))

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_select_mem_1(self):
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')

        assert resp.headers['WebAgg-Source-Coll'] == 'rhiz'

        self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'

        assert 'ResErrors' not in resp.headers

    def test_agg_select_mem_2(self):
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')

        assert resp.headers['WebAgg-Source-Coll'] == 'ia'

        self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'

        assert 'ResErrors' not in resp.headers

    def test_agg_select_live(self):
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://vvork.com/', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert 'ResErrors' not in resp.headers

    def test_agg_select_local(self):
        resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')

        assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}

    def test_agg_select_local_postreq(self):
        req_data = """\
GET / HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: iana.org
"""

        resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}

    def test_agg_live_postreq(self):
        req_data = """\
GET /get?foo=bar HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: httpbin.org
"""

        resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}

    def test_agg_post_resolve_postreq(self):
        req_data = """\
POST /post HTTP/1.1
content-length: 16
accept-encoding: gzip, deflate
accept: */*
host: httpbin.org
content-type: application/x-www-form-urlencoded

foo=bar&test=abc"""

        resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'post'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body
        assert b'"test": "abc"' in resp.body
        assert b'"url": "http://httpbin.org/post"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_post_resolve_fallback(self):
        req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])

        resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'post'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body
        assert b'"test": "abc"' in resp.body
        assert b'"url": "http://httpbin.org/post"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_seq_fallback_1(self):
        resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://www.iana.org/', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_seq_fallback_2(self):
        resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')

        assert resp.headers['WebAgg-Source-Coll'] == 'example'

        self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_redis_warc_1(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')

        assert resp.headers['WebAgg-Source-Coll'] == 'example'

    def test_url_agnost(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', './testdata/example-url-agnostic-revisit.warc.gz')
        f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', './testdata/example-url-agnostic-orig.warc.gz')

        resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')

        assert resp.status_int == 200
        assert resp.headers['Link'] == MementoUtils.make_link('http://test@example.com/', 'original')
        assert resp.headers['WebAgg-Source-Coll'] == 'url-agnost'
        assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'

    def test_live_video_loader(self):
        params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self.testapp.get('/live/resource', params=params)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)

        assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'WARC-Type: metadata' in resp.body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body

    def test_live_video_loader_post(self):
        req_data = """\
GET /v/BfBgWtAIbRc HTTP/1.1
accept-encoding: gzip, deflate
accept: */*
host: www.youtube.com\
"""

        params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self.testapp.post('/live/resource/postreq?&' + urlencode(params), req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)

        assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'WARC-Type: metadata' in resp.body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body

    def test_error_redis_file_not_found(self):
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'"

        f.hdel('test:warc', 'example.warc.gz')
        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)

        assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
                             'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}

        f.delete('test:warc')
        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)

        assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
                             'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}

    def test_error_fallback_live_not_found(self):
        resp = self.testapp.get('/fallback/resource?url=http://invalid.url-not-found', status=400)

        assert resp.json == {'message': 'http://invalid.url-not-found/',
                             'errors': {'LiveWebLoader': 'http://invalid.url-not-found/'}}

        assert resp.text == resp.headers['ResErrors']

    def test_agg_local_revisit(self):
        resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')

        assert resp.headers['WebAgg-Source-Coll'] == 'local:dupes.cdxj'

        buff = BytesIO(resp.body)
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
        assert status_headers.get_header('WARC-Target-URI') == 'http://example.com'
        assert status_headers.get_header('WARC-Date') == '2014-01-27T17:12:51Z'
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://example.com'
        assert status_headers.get_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z'

        assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
        assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'<!doctype html>' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_error_invalid_index_output(self):
        resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=foobar', status=400)

        assert resp.json == {'message': 'output=foobar not supported'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_local_not_found(self):
        resp = self.testapp.get('/many/resource?url=http://not-found.error/&sources=local', status=404)

        assert resp.json == {'message': 'No Resource Found'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_empty(self):
        resp = self.testapp.get('/empty/resource?url=http://example.com/', status=404)

        assert resp.json == {'message': 'No Resource Found'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_invalid(self):
        resp = self.testapp.get('/invalid/resource?url=http://example.com/', status=500)

        assert resp.json == {'message': "Internal Error: 'list' object is not callable"}
        assert resp.text == resp.headers['ResErrors']
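Outside of webtest, the same /resource contract can be exercised with any HTTP client; a rough sketch against a locally running instance (the port and route are assumptions, matching the live.py test app above):

import requests
from io import BytesIO

from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader

# Assumes the webagg.test.live app is running on localhost:8080.
r = requests.get('http://localhost:8080/live/resource',
                 params={'url': 'http://httpbin.org/get'})

buff = ChunkedDataReader(BytesIO(r.content))
warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
print(r.headers.get('WebAgg-Source-Coll'))
print(warc_headers.get_header('WARC-Target-URI'))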
219
webagg/test/test_indexsource.py
Normal file
@ -0,0 +1,219 @@
|
||||
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
|
||||
from webagg.indexsource import LiveIndexSource
|
||||
|
||||
from webagg.aggregator import SimpleAggregator
|
||||
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
|
||||
from .testutils import key_ts_res
|
||||
|
||||
|
||||
import pytest
|
||||
|
||||
from fakeredis import FakeStrictRedis
|
||||
from mock import patch
|
||||
|
||||
redismock = patch('redis.StrictRedis', FakeStrictRedis)
|
||||
redismock.start()
|
||||
|
||||
def setup_module():
|
||||
r = FakeStrictRedis.from_url('redis://localhost:6379/2')
|
||||
r.delete('test:rediscdx')
|
||||
with open('testdata/iana.cdxj', 'rb') as fh:
|
||||
for line in fh:
|
||||
r.zadd('test:rediscdx', 0, line.rstrip())
|
||||
|
||||
|
||||
def teardown_module():
|
||||
redismock.stop()
|
||||
|
||||
|
||||
local_sources = [
|
||||
FileIndexSource('testdata/iana.cdxj'),
|
||||
RedisIndexSource('redis://localhost:6379/2/test:rediscdx')
|
||||
]
|
||||
|
||||
|
||||
remote_sources = [
|
||||
RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}',
|
||||
'http://webenact.rhizome.org/all/{timestamp}id_/{url}'),
|
||||
|
||||
MementoIndexSource('http://webenact.rhizome.org/all/{url}',
|
||||
'http://webenact.rhizome.org/all/timemap/*/{url}',
|
||||
'http://webenact.rhizome.org/all/{timestamp}id_/{url}')
|
||||
]
|
||||
|
||||
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}',
|
||||
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
|
||||
|
||||
|
||||
def query_single_source(source, params):
|
||||
string = str(source)
|
||||
return SimpleAggregator({'source': source})(params)
|
||||
|
||||
|
||||
|
||||
# Url Match -- Local Loaders
# ============================================================================
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_local_cdxj_loader(source):
    url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
    res, errs = query_single_source(source, dict(url=url, limit=3))

    expected = """\
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""

    assert(key_ts_res(res) == expected)
    assert(errs == {})


# Closest -- Local Loaders
# ============================================================================
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_local_closest_loader(source):
    url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
    res, errs = query_single_source(source, dict(url=url,
                                                 closest='20140126200930',
                                                 limit=3))

    expected = """\
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz"""

    assert(key_ts_res(res) == expected)
    assert(errs == {})


# Prefix -- Local Loaders
# ============================================================================
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_file_prefix_loader(source):
    res, errs = query_single_source(source, dict(url='http://iana.org/domains/root/*'))

    expected = """\
org,iana)/domains/root/db 20140126200927 iana.warc.gz
org,iana)/domains/root/db 20140126200928 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""

    assert(key_ts_res(res) == expected)
    assert(errs == {})


# Url Match -- Remote Loaders
# ============================================================================
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
def test_remote_loader(source):
    url = 'http://instagram.com/amaliaulman'
    res, errs = query_single_source(source, dict(url=url))

    expected = """\
com,instagram)/amaliaulman 20141014150552 http://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014155217 http://webenact.rhizome.org/all/20141014155217id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/20141014171636id_/http://instagram.com/amaliaulman"""

    assert(key_ts_res(res, 'load_url') == expected)
    assert(errs == {})


# Closest -- Remote Loaders
# ============================================================================
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
def test_remote_closest_loader(source):
    url = 'http://instagram.com/amaliaulman'
    res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))

    expected = """\
com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""

    assert(key_ts_res(res, 'load_url') == expected)
    assert(errs == {})


# Live Index -- No Load!
# ============================================================================
def test_live():
    url = 'http://example.com/'
    source = LiveIndexSource()
    res, errs = query_single_source(source, dict(url=url))

    expected = 'com,example)/ {0} http://example.com/'.format(timestamp_now())

    assert(key_ts_res(res, 'load_url') == expected)
    assert(errs == {})


# Errors -- Not Found All
# ============================================================================
@pytest.mark.parametrize("source", local_sources + remote_sources, ids=["file", "redis", "remote_cdx", "memento"])
def test_all_not_found(source):
    url = 'http://x-not-found-x.notfound/'
    res, errs = query_single_source(source, dict(url=url, limit=3))

    expected = ''
    assert(key_ts_res(res) == expected)
    if source == remote_sources[0]:
        assert('http://x-not-found-x.notfound/' in errs['source'])
    else:
        assert(errs == {})


# ============================================================================
def test_another_remote_not_found():
    source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/')
    url = 'http://x-not-found-x.notfound/'
    res, errs = query_single_source(source, dict(url=url, limit=3))

    expected = ''
    assert(key_ts_res(res) == expected)
    assert(errs['source'] == "NotFoundException('http://www.webarchive.org.uk/wayback/archive/timemap/link/http://x-not-found-x.notfound/',)")


# ============================================================================
def test_file_not_found():
    source = FileIndexSource('testdata/not-found-x')
    url = 'http://x-not-found-x.notfound/'
    res, errs = query_single_source(source, dict(url=url, limit=3))

    expected = ''
    assert(key_ts_res(res) == expected)
    assert(errs['source'] == "NotFoundException('testdata/not-found-x',)"), errs


# ============================================================================
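# The 'param.source.colls' value is substituted into the {colls} placeholder
# of the CDX API filter template below, limiting results to the named
# Archive-It collections.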
def test_ait_filters():
    ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
                                   'http://wayback.archive-it.org/all/{timestamp}id_/{url}')

    cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
    filenames = [cdx['filename'] for cdx in cdxlist]

    prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')

    assert(all([x.startswith(prefix) for x in filenames]))


    cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
    filenames = [cdx['filename'] for cdx in cdxlist]

    prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')

    assert(all([x.startswith(prefix) for x in filenames]))
67
webagg/test/test_inputreq.py
Normal file
@ -0,0 +1,67 @@
|
||||
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from bottle import Bottle, request, response, debug
|
||||
import webtest
|
||||
import traceback
|
||||
|
||||
|
||||
#=============================================================================
|
||||
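# Test app with two routes: one reconstructs the raw HTTP request directly
# from the live WSGI environ, the other parses a full request sent as a
# POST body.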
class InputReqApp(object):
    def __init__(self):
        self.application = Bottle()
        debug(True)

        @self.application.route('/test/<url:re:.*>', 'ANY')
        def direct_input_request(url=''):
            inputreq = DirectWSGIInputRequest(request.environ)
            response['Content-Type'] = 'text/plain; charset=utf-8'
            return inputreq.reconstruct_request(url)

        @self.application.route('/test-postreq', 'POST')
        def post_fullrequest():
            params = dict(request.query)
            inputreq = POSTInputRequest(request.environ)
            response['Content-Type'] = 'text/plain; charset=utf-8'
            return inputreq.reconstruct_request(params.get('url'))


#=============================================================================
class TestInputReq(object):
    def setup(self):
        self.app = InputReqApp()
        self.testapp = webtest.TestApp(self.app.application)

    def test_get_direct(self):
        res = self.testapp.get('/test/http://example.com/', headers={'Foo': 'Bar'})
        assert res.text == '\
GET /test/http://example.com/ HTTP/1.0\r\n\
Host: example.com\r\n\
Foo: Bar\r\n\
\r\n\
'

    def test_post_direct(self):
        res = self.testapp.post('/test/http://example.com/', headers={'Foo': 'Bar'}, params='ABC')
        lines = res.text.split('\r\n')
        assert lines[0] == 'POST /test/http://example.com/ HTTP/1.0'
        assert 'Host: example.com' in lines
        assert 'Content-Length: 3' in lines
        assert 'Content-Type: application/x-www-form-urlencoded' in lines
        assert 'Foo: Bar' in lines

        assert 'ABC' in lines

    def test_post_req(self):
        postdata = '\
GET /example.html HTTP/1.0\r\n\
Foo: Bar\r\n\
\r\n\
'
        res = self.testapp.post('/test-postreq?url=http://example.com/', params=postdata)

        assert res.text == '\
GET /example.html HTTP/1.0\r\n\
Host: example.com\r\n\
Foo: Bar\r\n\
\r\n\
'
241
webagg/test/test_memento_agg.py
Normal file
@ -0,0 +1,241 @@
from gevent import monkey; monkey.patch_all(thread=False)

from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
from webagg.aggregator import BaseAggregator

from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
from .testutils import to_json_list, to_path

import json
import pytest
import time
import six

from webagg.handlers import IndexHandler

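# Sources under test: one local CDXJ file plus several public memento
# endpoints; the aggregators below are exercised with and without timeouts.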
sources = {
    'local': FileIndexSource(to_path('testdata/iana.cdxj')),
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
    'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*')
}


aggs = {'simple': SimpleAggregator(sources),
        'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
       }

agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)}

nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
agg_nf = {'simple': SimpleAggregator(nf),
          'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
         }


@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_1(agg):
    url = 'http://iana.org/'
    res, errs = agg(dict(url=url, closest='20140126000000', limit=5))


    exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},
           {"timestamp": "20140126200624", "filename": "iana.warc.gz", "source": "local"},
           {"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source": "ia"},
           {"timestamp": "20140129175203", "load_url": "http://web.archive.org/web/20140129175203id_/http://iana.org/", "source": "ia"},
           {"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source": "ait"}
          ]

    assert(to_json_list(res) == exp)
    assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
                    'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})


@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_2(agg):
    url = 'http://example.com/'
    res, errs = agg(dict(url=url, closest='20100512', limit=6))

    exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
           {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
           #{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
           {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
           {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
           {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
           {"timestamp": "20100510233601", "load_url": "http://web.archive.org/web/20100510233601id_/http://example.com/", "source": "ia"}]

    assert(to_json_list(res) == exp)
    assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})


@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_3(agg):
    url = 'http://vvork.com/'
    res, errs = agg(dict(url=url, closest='20141001', limit=5))

    exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
           {"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source": "ia"},
           {"timestamp": "20141020161243", "load_url": "http://web.archive.org/web/20141020161243id_/http://vvork.com/", "source": "ia"},
           {"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source": "ia"},
           {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]

    assert(to_json_list(res) == exp)
    assert(errs == {})


@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_4(agg):
    url = 'http://vvork.com/'
    res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))

    exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
           {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]

    assert(to_json_list(res) == exp)
    assert(errs == {})


@pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys()))
def test_mem_agg_not_found(agg):
    url = 'http://vvork.com/'
    res, errs = agg(dict(url=url, closest='20141001', limit=2))

    assert(to_json_list(res) == [])
    assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"})


@pytest.mark.parametrize("agg", list(agg_tm.values()), ids=list(agg_tm.keys()))
def test_mem_agg_timeout(agg):
    url = 'http://vvork.com/'
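    # Monkeypatch load_child_source to inject a delay; with timeout=0.0 every
    # source should then be reported as a 'timeout' error.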
    orig_source = BaseAggregator.load_child_source
    def load_child_source(self, name, source, params):
        time.sleep(0.1)
        return orig_source(self, name, source, params)

    BaseAggregator.load_child_source = load_child_source
    res, errs = agg(dict(url=url, closest='20141001', limit=2))
    BaseAggregator.load_child_source = orig_source

    assert(to_json_list(res) == [])
    assert(errs == {'local': 'timeout',
                    'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'})


def test_handler_output_cdxj():
    agg = GeventTimeoutAggregator(sources, timeout=5.0)
    handler = IndexHandler(agg)
    url = 'http://vvork.com/'
    headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))

    exp = b"""\
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
"""

    assert(headers['Content-Type'] == 'text/x-cdxj')
    assert(b''.join(res) == exp)
    assert(errs == {})


def test_handler_output_json():
    agg = GeventTimeoutAggregator(sources, timeout=5.0)
    handler = IndexHandler(agg)
    url = 'http://vvork.com/'
    headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))

    exp = b"""\
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
{"urlkey": "com,vvork)/", "timestamp": "20131004231540", "url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
"""

    assert(headers['Content-Type'] == 'application/x-ndjson')
    assert(b''.join(res) == exp)
    assert(errs == {})

def test_handler_output_link():
    agg = GeventTimeoutAggregator(sources, timeout=5.0)
    handler = IndexHandler(agg)
    url = 'http://vvork.com/'
    headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))

    exp = b"""\
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
"""
    assert(headers['Content-Type'] == 'application/link')
    assert(b''.join(res) == exp)
    assert(errs == {})


def test_handler_output_link_2():
    agg = GeventTimeoutAggregator(sources, timeout=5.0)
    handler = IndexHandler(agg)
    url = 'http://iana.org/'
    headers, res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))

    exp = b"""\
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
<file://iana.warc.gz:334:2258>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local",
<http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia",
<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
"""
    assert(headers['Content-Type'] == 'application/link')
    assert(b''.join(res) == exp)

    exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
                'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}

    assert(errs == exp_errs)


def test_handler_output_link_3():
    agg = GeventTimeoutAggregator(sources, timeout=5.0)
    handler = IndexHandler(agg)
    url = 'http://foo.bar.non-existent'
    headers, res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))

    exp = b''

    assert(headers['Content-Type'] == 'application/link')
    assert(b''.join(res) == exp)

    exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)",
                'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://foo.bar.non-existent',)",
                'ia': "NotFoundException('http://web.archive.org/web/http://foo.bar.non-existent',)",
                'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://foo.bar.non-existent',)"}

    assert(errs == exp_errs)

def test_handler_output_text():
    agg = GeventTimeoutAggregator(sources, timeout=5.0)
    handler = IndexHandler(agg)
    url = 'http://vvork.com/'
    headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))

    exp = b"""\
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait
"""
    assert(headers['Content-Type'] == 'text/plain')
    assert(b''.join(res) == exp)
    assert(errs == {})


def test_handler_list_sources():
    agg = GeventTimeoutAggregator(sources, timeout=5.0)
    handler = IndexHandler(agg)
    headers, res, errs = handler(dict(mode='list_sources'))

    assert(headers == {})
    assert(res == {'sources': {'bl': 'memento',
                               'ait': 'memento',
                               'ia': 'memento',
                               'rhiz': 'memento',
                               'local': 'file'}})
    assert(errs == {})
45
webagg/test/test_redis_agg.py
Normal file
@ -0,0 +1,45 @@
from webagg.aggregator import RedisMultiKeyIndexSource
from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass

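# The '{user}:{coll}:cdxj' key template is filled from 'param.user' and
# 'param.coll'; coll='*' aggregates results across all matching keys.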
class TestRedisAgg(FakeRedisTests, BaseTestClass):
    @classmethod
    def setup_class(cls):
        super(TestRedisAgg, cls).setup_class()
        cls.add_cdx_to_redis(to_path('testdata/example.cdxj'), 'FOO:example:cdxj')
        cls.add_cdx_to_redis(to_path('testdata/dupes.cdxj'), 'FOO:dupes:cdxj')

        cls.indexloader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj')

    def test_redis_agg_all(self):
        res, errs = self.indexloader({'url': 'example.com/', 'param.user': 'FOO', 'param.coll': '*'})

        exp = [
            {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
            {'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
        ]

        assert(errs == {})
        assert(to_json_list(res) == exp)

    def test_redis_agg_one(self):
        res, errs = self.indexloader({'url': 'example.com/', 'param.user': 'FOO', 'param.coll': 'dupes'})

        exp = [
            {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
        ]

        assert(errs == {})
        assert(to_json_list(res) == exp)

    def test_redis_not_found(self):
        res, errs = self.indexloader({'url': 'example.com/'})

        exp = []

        assert(errs == {})
        assert(to_json_list(res) == exp)
118
webagg/test/test_timeouts.py
Normal file
@ -0,0 +1,118 @@
from gevent import monkey; monkey.patch_all(thread=False)
import time
from webagg.indexsource import FileIndexSource

from webagg.aggregator import SimpleAggregator, TimeoutMixin
from webagg.aggregator import GeventTimeoutAggregator

from .testutils import to_json_list

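# FileIndexSource that sleeps before loading, to simulate a slow source and
# count how many times it is actually queried.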
class TimeoutFileSource(FileIndexSource):
    def __init__(self, filename, timeout):
        super(TimeoutFileSource, self).__init__(filename)
        self.timeout = timeout
        self.calls = 0

    def load_index(self, params):
        self.calls += 1
        print('Sleeping')
        time.sleep(self.timeout)
        return super(TimeoutFileSource, self).load_index(params)

TimeoutAggregator = GeventTimeoutAggregator



def setup_module():
    global sources
    sources = {'slow': TimeoutFileSource('testdata/example.cdxj', 0.2),
               'slower': TimeoutFileSource('testdata/dupes.cdxj', 0.5)
              }



def test_timeout_long_all_pass():
    agg = TimeoutAggregator(sources, timeout=1.0)

    res, errs = agg(dict(url='http://example.com/'))

    exp = [{'source': 'slower', 'timestamp': '20140127171200'},
           {'source': 'slower', 'timestamp': '20140127171251'},
           {'source': 'slow', 'timestamp': '20160225042329'}]

    assert(to_json_list(res, fields=['source', 'timestamp']) == exp)

    assert(errs == {})


def test_timeout_slower_skipped_1():
    agg = GeventTimeoutAggregator(sources, timeout=0.49)

    res, errs = agg(dict(url='http://example.com/'))

    exp = [{'source': 'slow', 'timestamp': '20160225042329'}]

    assert(to_json_list(res, fields=['source', 'timestamp']) == exp)

    assert(errs == {'slower': 'timeout'})


def test_timeout_slower_skipped_2():
    agg = GeventTimeoutAggregator(sources, timeout=0.19)

    res, errs = agg(dict(url='http://example.com/'))

    exp = []

    assert(to_json_list(res, fields=['source', 'timestamp']) == exp)

    assert(errs == {'slower': 'timeout', 'slow': 'timeout'})

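# With t_count=2 and t_duration=2.0, a source that times out twice within the
# duration window is skipped (not queried) until the window expires, as the
# call counts below verify.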
def test_timeout_skipping():
    assert(sources['slow'].calls == 3)
    assert(sources['slower'].calls == 3)

    agg = GeventTimeoutAggregator(sources, timeout=0.49,
                                  t_count=2, t_duration=2.0)

    exp = [{'source': 'slow', 'timestamp': '20160225042329'}]

    res, errs = agg(dict(url='http://example.com/'))
    assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
    assert(sources['slow'].calls == 4)
    assert(sources['slower'].calls == 4)

    assert(errs == {'slower': 'timeout'})

    res, errs = agg(dict(url='http://example.com/'))
    assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
    assert(sources['slow'].calls == 5)
    assert(sources['slower'].calls == 5)

    assert(errs == {'slower': 'timeout'})

    res, errs = agg(dict(url='http://example.com/'))
    assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
    assert(sources['slow'].calls == 6)
    assert(sources['slower'].calls == 5)

    assert(errs == {})

    res, errs = agg(dict(url='http://example.com/'))
    assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
    assert(sources['slow'].calls == 7)
    assert(sources['slower'].calls == 5)

    assert(errs == {})

    time.sleep(2.01)

    res, errs = agg(dict(url='http://example.com/'))
    assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
    assert(sources['slow'].calls == 8)
    assert(sources['slower'].calls == 6)

    assert(errs == {'slower': 'timeout'})
74
webagg/test/test_upstream.py
Normal file
@ -0,0 +1,74 @@
import webtest

from io import BytesIO
from webagg.app import ResAggApp
import requests

from webagg.handlers import DefaultResourceHandler
from webagg.aggregator import SimpleAggregator
from webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource

from pywb.warc.recordloader import ArcWarcRecordLoader

from .testutils import LiveServerTests, BaseTestClass

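# Chains two webagg apps: the test app's '/upstream' routes proxy to the
# live-web '/live' endpoint served by the LiveServerTests fixture.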
class TestUpstream(LiveServerTests, BaseTestClass):
    def setup(self):
        app = ResAggApp()

        base_url = 'http://localhost:{0}'.format(self.server.port)
        app.add_route('/upstream',
            DefaultResourceHandler(SimpleAggregator(
                {'upstream': UpstreamAggIndexSource(base_url + '/live')})
            )
        )

        app.add_route('/upstream_opt',
            DefaultResourceHandler(SimpleAggregator(
                {'upstream_opt': ProxyMementoIndexSource.upstream_resource(base_url + '/live')})
            )
        )

        self.base_url = base_url
        self.testapp = webtest.TestApp(app)


    def test_live_paths(self):
        res = requests.get(self.base_url + '/')
        assert set(res.json().keys()) == {'/live/postreq', '/live'}

    def test_upstream_paths(self):
        res = self.testapp.get('/')
        assert set(res.json.keys()) == {'/upstream/postreq', '/upstream', '/upstream_opt', '/upstream_opt/postreq'}

    def test_live_1(self):
        resp = requests.get(self.base_url + '/live/resource?url=http://httpbin.org/get', stream=True)
        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
        assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
        assert record.status_headers.get_header('Date') != ''

    def test_upstream_1(self):
        resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
        assert resp.headers['WebAgg-Source-Coll'] == 'upstream:live'

        raw = BytesIO(resp.body)

        record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
        assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
        assert record.status_headers.get_header('Date') != ''

    def test_upstream_2(self):
        resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
        assert resp.headers['WebAgg-Source-Coll'] == 'upstream_opt:live', resp.headers

        raw = BytesIO(resp.body)

        record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
        assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
        assert record.status_headers.get_header('Date') != ''
127
webagg/test/testutils.py
Normal file
@ -0,0 +1,127 @@
import json
import os
import tempfile
import shutil

from multiprocessing import Process

from fakeredis import FakeStrictRedis
from mock import patch

from wsgiref.simple_server import make_server

from webagg.aggregator import SimpleAggregator
from webagg.app import ResAggApp
from webagg.handlers import DefaultResourceHandler
from webagg.indexsource import LiveIndexSource


# ============================================================================
def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
    return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist])

def key_ts_res(cdxlist, extra='filename'):
    return '\n'.join([cdx['urlkey'] + ' ' + cdx['timestamp'] + ' ' + cdx[extra] for cdx in cdxlist])

def to_path(path):
    if os.path.sep != '/':
        path = path.replace('/', os.path.sep)

    return path


# ============================================================================
class BaseTestClass(object):
    @classmethod
    def setup_class(cls):
        pass

    @classmethod
    def teardown_class(cls):
        pass


# ============================================================================
PUBSUBS = []

class FakeStrictRedisSharedPubSub(FakeStrictRedis):
    def __init__(self, *args, **kwargs):
        super(FakeStrictRedisSharedPubSub, self).__init__(*args, **kwargs)
        self._pubsubs = PUBSUBS


# ============================================================================
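# Mixin that patches redis.StrictRedis with the shared fakeredis class for
# the duration of the test class, and can preload CDXJ lines into a key.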
class FakeRedisTests(object):
    @classmethod
    def setup_class(cls):
        super(FakeRedisTests, cls).setup_class()
        cls.redismock = patch('redis.StrictRedis', FakeStrictRedisSharedPubSub)
        cls.redismock.start()

    @staticmethod
    def add_cdx_to_redis(filename, key, redis_url='redis://localhost:6379/2'):
        r = FakeStrictRedis.from_url(redis_url)
        with open(filename, 'rb') as fh:
            for line in fh:
                r.zadd(key, 0, line.rstrip())

    @classmethod
    def teardown_class(cls):
        super(FakeRedisTests, cls).teardown_class()
        FakeStrictRedis().flushall()
        cls.redismock.stop()


# ============================================================================
class TempDirTests(object):
    @classmethod
    def setup_class(cls):
        super(TempDirTests, cls).setup_class()
        cls.root_dir = tempfile.mkdtemp()

    @classmethod
    def teardown_class(cls):
        super(TempDirTests, cls).teardown_class()
        shutil.rmtree(cls.root_dir)


# ============================================================================
class LiveServerTests(object):
    @classmethod
    def setup_class(cls):
        super(LiveServerTests, cls).setup_class()
        cls.server = ServerThreadRunner(cls.make_live_app())

    @staticmethod
    def make_live_app():
        app = ResAggApp()
        app.add_route('/live',
            DefaultResourceHandler(SimpleAggregator(
                {'live': LiveIndexSource()})
            )
        )
        return app

    @classmethod
    def teardown_class(cls):
        super(LiveServerTests, cls).teardown_class()
        cls.server.stop()


# ============================================================================
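# Runs a WSGI app in a separate process on an ephemeral port (port 0), so
# tests can make real HTTP requests against it.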
class ServerThreadRunner(object):
    def __init__(self, app):
        self.httpd = make_server('', 0, app)
        self.port = self.httpd.socket.getsockname()[1]

        def run():
            self.httpd.serve_forever()

        self.proc = Process(target=run)
        #self.proc.daemon = True
        self.proc.start()

    def stop(self):
        self.proc.terminate()
200
webagg/utils.py
Normal file
@ -0,0 +1,200 @@
import re
import six
import string
import yaml
import os

from contextlib import closing

from pywb.utils.timeutils import timestamp_to_http_date
from pywb.utils.wbexception import BadRequestException

LINK_SPLIT = re.compile(r',\s*(?=[<])')
LINK_SEG_SPLIT = re.compile(r';\s*')
LINK_URL = re.compile(r'<(.*)>')
LINK_PROP = re.compile(r'([\w]+)="([^"]+)')

BUFF_SIZE = 16384


#=============================================================================
class MementoException(BadRequestException):
    pass


#=============================================================================
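# Helpers for parsing and generating Memento 'Link:' headers and timemaps
# (rel="memento" links with a datetime attribute and a non-standard src
# attribute identifying the originating source).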
class MementoUtils(object):
    @staticmethod
    def parse_links(link_header, def_name='timemap'):
        links = LINK_SPLIT.split(link_header)
        results = {}
        mementos = []

        for link in links:
            props = LINK_SEG_SPLIT.split(link)
            m = LINK_URL.match(props[0])
            if not m:
                raise MementoException('Invalid Link Url: ' + props[0])

            result = dict(url=m.group(1))
            key = ''
            is_mem = False

            for prop in props[1:]:
                m = LINK_PROP.match(prop)
                if not m:
                    raise MementoException('Invalid prop ' + prop)

                name = m.group(1)
                value = m.group(2)

                if name == 'rel':
                    if 'memento' in value:
                        is_mem = True
                        result[name] = value
                    elif value == 'self':
                        key = def_name
                    else:
                        key = value
                else:
                    result[name] = value

            if key:
                results[key] = result
            elif is_mem:
                mementos.append(result)

        results['mementos'] = mementos
        return results

    @staticmethod
    def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'):
        url = cdx.get('load_url')
        if not url:
            url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))

        memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end

        if not datetime:
            datetime = timestamp_to_http_date(cdx['timestamp'])

        return memento.format(url, rel, datetime, cdx.get('source', ''))


    @staticmethod
    def make_timemap(cdx_iter):
        # get first memento as it'll be used for 'from' field
        try:
            first_cdx = six.next(cdx_iter)
            from_date = timestamp_to_http_date(first_cdx['timestamp'])
        except StopIteration:
            first_cdx = None
            return

        # first memento link
        yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)

        prev_cdx = None

        for cdx in cdx_iter:
            if prev_cdx:
                yield MementoUtils.make_timemap_memento_link(prev_cdx)

            prev_cdx = cdx

        # last memento link, if any
        if prev_cdx:
            yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')

    @staticmethod
    def make_link(url, type):
        return '<{0}>; rel="{1}"'.format(url, type)


#=============================================================================
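# string.Formatter that resolves '{key}' first from 'param.{name}.{key}',
# then 'param.{key}', then any keyword passed to format(), defaulting to ''.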
class ParamFormatter(string.Formatter):
    def __init__(self, params, name='', prefix='param.'):
        self.params = params
        self.prefix = prefix
        self.name = name

    def get_value(self, key, args, kwargs):
        # First, try the named param 'param.{name}.{key}'
        if self.name:
            named_key = self.prefix + self.name + '.' + key
            value = self.params.get(named_key)
            if value is not None:
                return value

        # Then, try 'param.{key}'
        named_key = self.prefix + key
        value = self.params.get(named_key)
        if value is not None:
            return value

        # default to just '{key}'
        value = kwargs.get(key, '')
        return value


#=============================================================================
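# Expand a URL template (e.g. one containing {url} or {colls} placeholders)
# against the request params; an existing '_formatter' in params is reused
# if one was already constructed.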
def res_template(template, params, **extra_params):
    formatter = params.get('_formatter')
    if not formatter:
        formatter = ParamFormatter(params)
    res = formatter.format(template, url=params.get('url', ''), **extra_params)

    return res


#=============================================================================
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE):
    with closing(stream):
        if header1:
            yield header1

        if header2:
            yield header2

        while True:
            buff = stream.read(size)
            if not buff:
                break
            yield buff


#=============================================================================
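# Wrap an iterator of byte chunks in HTTP/1.1 chunked transfer encoding:
# each chunk is prefixed with its length in hex plus CRLF, and the stream is
# terminated by a zero-length chunk ('0\r\n\r\n').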
def chunk_encode_iter(orig_iter):
    for chunk in orig_iter:
        if not len(chunk):
            continue
        chunk_len = b'%X\r\n' % len(chunk)
        yield chunk_len
        yield chunk
        yield b'\r\n'

    yield b'0\r\n\r\n'


#=============================================================================
def load_config(main_env_var, main_default_file='',
                overlay_env_var='', overlay_file=''):

    configfile = os.environ.get(main_env_var, main_default_file)

    if configfile:
        # Load config
        with open(configfile, 'rb') as fh:
            config = yaml.load(fh)

    else:
        config = {}

    overlay_configfile = os.environ.get(overlay_env_var, overlay_file)

    if overlay_configfile:
        with open(overlay_configfile, 'rb') as fh:
            config.update(yaml.load(fh))

    return config