mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge remote-tracking branch 'webrec-platform' into pywb for further refactoring!

This commit is contained in:
Ilya Kreymer 2016-11-08 06:55:37 -08:00
commit 88d6b9e097
54 changed files with 6398 additions and 0 deletions

23
Dockerfile Normal file

@@ -0,0 +1,23 @@
#webrecorder/webrecore 1.0
FROM python:3.5.2
RUN pip install gevent uwsgi bottle urllib3 youtube-dl
RUN pip install git+https://github.com/ikreymer/pywb.git@master#egg=pywb-0.33.0
#RUN pip install pywb
RUN pip install git+https://github.com/t0m/pyamf.git@python3
RUN pip install boto webassets
ADD . /webrecore/
WORKDIR /webrecore/
RUN pip install -e ./
RUN useradd -ms /bin/bash -u 1000 apprun
USER apprun

19
docker-compose.yml Normal file

@@ -0,0 +1,19 @@
version: '2'
services:
proxy:
build: ./proxy/
links:
- webagg:webagg
environment:
- "WEBAGG=http://webrecplatform_webagg_1:8080"
ports:
- 9080:9080
volumes:
- ${HOME}/.mitmproxy/:/root/.mitmproxy/
webagg:
build: ./webagg/

0
recorder/__init__.py Normal file

84
recorder/filters.py Normal file

@@ -0,0 +1,84 @@
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date
import re
# ============================================================================
# Header Exclusions
# ============================================================================
class ExcludeNone(object):
def __call__(self, record):
return None
# ============================================================================
class ExcludeSpecificHeaders(object):
def __init__(self, exclude_headers=[]):
self.exclude_headers = [x.lower() for x in exclude_headers]
def __call__(self, record):
return self.exclude_headers
# ============================================================================
# Revisit Policy
# ============================================================================
class WriteRevisitDupePolicy(object):
def __call__(self, cdx, params):
dt = timestamp_to_datetime(cdx['timestamp'])
return ('revisit', cdx['url'], datetime_to_iso_date(dt))
# ============================================================================
class SkipDupePolicy(object):
def __call__(self, cdx, params):
if cdx['url'] == params['url']:
return 'skip'
else:
return 'write'
# ============================================================================
class WriteDupePolicy(object):
def __call__(self, cdx, params):
return 'write'
# ============================================================================
# Skip Record Filters
# ============================================================================
class SkipNothingFilter(object):
def skip_request(self, req_headers):
return False
def skip_response(self, req_headers, resp_headers):
return False
# ============================================================================
class CollectionFilter(SkipNothingFilter):
def __init__(self, accept_colls):
self.rx_accept_colls = re.compile(accept_colls)
def skip_request(self, req_headers):
if req_headers.get('Recorder-Skip') == '1':
return True
return False
def skip_response(self, req_headers, resp_headers):
if not self.rx_accept_colls.match(resp_headers.get('WebAgg-Source-Coll', '')):
return True
return False
# ============================================================================
class SkipRangeRequestFilter(SkipNothingFilter):
def skip_request(self, req_headers):
range_ = req_headers.get('Range')
if range_ and not range_.lower().startswith('bytes=0-'):
return True
return False

293
recorder/recorderapp.py Normal file

@@ -0,0 +1,293 @@
from webagg.utils import StreamIter, chunk_encode_iter, BUFF_SIZE
from webagg.inputrequest import DirectWSGIInputRequest
from recorder.filters import SkipRangeRequestFilter, CollectionFilter
from six.moves.urllib.parse import parse_qsl
import json
import tempfile
from requests.structures import CaseInsensitiveDict
import requests
import traceback
import gevent.queue
import gevent
#==============================================================================
class RecorderApp(object):
def __init__(self, upstream_host, writer, skip_filters=None, **kwargs):
self.upstream_host = upstream_host
self.writer = writer
self.write_queue = gevent.queue.Queue()
gevent.spawn(self._write_loop)
if not skip_filters:
skip_filters = self.create_default_filters(kwargs)
self.skip_filters = skip_filters
@staticmethod
def create_default_filters(kwargs):
skip_filters = [SkipRangeRequestFilter()]
accept_colls = kwargs.get('accept_colls')
if accept_colls:
skip_filters.append(CollectionFilter(accept_colls))
return skip_filters
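    # all WARC writing happens on the greenlet spawned in __init__: RespWrapper.close()
    # enqueues (req_headers, req_payload, resp_headers, resp_payload, params) tuples,
    # so streaming the response back to the client never blocks on disk or index I/O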
def _write_loop(self):
while True:
try:
self._write_one()
except:
traceback.print_exc()
def _write_one(self):
req = None
resp = None
try:
result = self.write_queue.get()
req_head, req_pay, resp_head, resp_pay, params = result
resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
if resp_type == 'response':
req = self.writer.create_req_record(req_head, req_pay)
self.writer.write_req_resp(req, resp, params)
else:
self.writer.write_record(resp, params)
finally:
try:
if req:
req.stream.close()
if resp:
resp.stream.close()
except Exception as e:
traceback.print_exc()
def send_error(self, exc, start_response):
return self.send_message({'error': repr(exc)},
'400 Bad Request',
start_response)
def send_message(self, msg, status, start_response):
message = json.dumps(msg)
headers = [('Content-Type', 'application/json; charset=utf-8'),
('Content-Length', str(len(message)))]
start_response(status, headers)
return [message.encode('utf-8')]
def _put_record(self, request_uri, input_buff, record_type,
headers, params, start_response):
if record_type == 'stream':
if self.writer.write_stream_to_file(params, input_buff):
msg = {'success': 'true'}
else:
msg = {'error_message': 'upload_error'}
return self.send_message(msg, '200 OK',
start_response)
req_stream = ReqWrapper(input_buff, headers)
while True:
buff = req_stream.read()
if not buff:
break
content_type = headers.get('Content-Type')
record = self.writer.create_custom_record(params['url'],
req_stream.out,
record_type,
content_type,
req_stream.headers)
self.writer.write_record(record, params)
msg = {'success': 'true',
'WARC-Date': record.rec_headers.get('WARC-Date')}
return self.send_message(msg,
'200 OK',
start_response)
def _get_params(self, environ):
params = dict(parse_qsl(environ.get('QUERY_STRING')))
return params
def __call__(self, environ, start_response):
try:
return self.handle_call(environ, start_response)
except:
import traceback
traceback.print_exc()
def handle_call(self, environ, start_response):
input_req = DirectWSGIInputRequest(environ)
params = self._get_params(environ)
request_uri = input_req.get_full_request_uri()
input_buff = input_req.get_req_body()
headers = input_req.get_req_headers()
method = input_req.get_req_method()
# write request body as metadata/resource
put_record = params.get('put_record')
if put_record and method in ('PUT', 'POST'):
return self._put_record(request_uri,
input_buff,
put_record,
headers,
params,
start_response)
skipping = any(x.skip_request(headers) for x in self.skip_filters)
if not skipping:
req_stream = ReqWrapper(input_buff, headers)
else:
req_stream = input_buff
data = None
if input_buff:
data = req_stream
try:
res = requests.request(url=self.upstream_host + request_uri,
method=method,
data=data,
headers=headers,
allow_redirects=False,
stream=True)
res.raise_for_status()
except Exception as e:
#traceback.print_exc()
return self.send_error(e, start_response)
start_response('200 OK', list(res.headers.items()))
if not skipping:
resp_stream = RespWrapper(res.raw,
res.headers,
req_stream,
params,
self.write_queue,
self.skip_filters)
else:
resp_stream = res.raw
resp_iter = StreamIter(resp_stream)
if res.headers.get('Transfer-Encoding') == 'chunked':
resp_iter = chunk_encode_iter(resp_iter)
return resp_iter
#==============================================================================
class Wrapper(object):
def __init__(self, stream):
self.stream = stream
self.out = self._create_buffer()
self.interrupted = False
def _create_buffer(self):
return tempfile.SpooledTemporaryFile(max_size=512*1024)
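    # tee semantics: read() passes data through to the caller while also buffering
    # a copy into a spooled temp file (self.out) that becomes the record payload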
def read(self, *args, **kwargs):
try:
buff = self.stream.read(*args, **kwargs)
except Exception as e:
print('INTERRUPT READ')
self.interrupted = True
raise
self.out.write(buff)
return buff
#==============================================================================
class RespWrapper(Wrapper):
def __init__(self, stream, headers, req,
params, queue, skip_filters):
super(RespWrapper, self).__init__(stream)
self.headers = headers
self.req = req
self.params = params
self.queue = queue
self.skip_filters = skip_filters
def close(self):
try:
while True:
if not self.read(BUFF_SIZE):
break
except Exception as e:
print(e)
self.interrupted = True
finally:
try:
self.stream.close()
except Exception as e:
traceback.print_exc()
self._write_to_file()
def _write_to_file(self):
skipping = any(x.skip_response(self.req.headers, self.headers)
for x in self.skip_filters)
if self.interrupted or skipping:
self.out.close()
self.req.out.close()
self.req.close()
return
try:
entry = (self.req.headers, self.req.out,
self.headers, self.out, self.params)
self.queue.put(entry)
self.req.close()
self.req = None
except:
traceback.print_exc()
#==============================================================================
class ReqWrapper(Wrapper):
def __init__(self, stream, req_headers):
super(ReqWrapper, self).__init__(stream)
self.headers = CaseInsensitiveDict(req_headers)
for n in req_headers.keys():
if not n.upper().startswith('WARC-'):
del self.headers[n]
def close(self):
# no need to close wsgi.input
pass
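
Besides proxying and recording live traffic, the app accepts custom records through the put_record query param handled by _put_record above. A hedged sketch against a locally running recorder; host, port, and collection name are assumptions mirroring the tests below:

import requests

# assumes a recorder app (e.g. recorder.test.simplerec below) listening on :8010
resp = requests.put(
    'http://localhost:8010/live/resource/postreq'
    '?url=custom://example.com&put_record=resource&param.recorder.coll=META',
    headers={'Content-Type': 'text/plain'},
    data=b'Some Data')
print(resp.json())  # e.g. {'success': 'true', 'WARC-Date': '...'}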

83
recorder/redisindexer.py Normal file

@@ -0,0 +1,83 @@
from pywb.utils.canonicalize import calc_search_range
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.cdxindexer import write_cdx_index
from pywb.utils.timeutils import iso_date_to_timestamp
from io import BytesIO
import os
from webagg.indexsource import RedisIndexSource
from webagg.aggregator import SimpleAggregator
from webagg.utils import res_template
from recorder.filters import WriteRevisitDupePolicy
#==============================================================================
class WritableRedisIndexer(RedisIndexSource):
def __init__(self, *args, **kwargs):
redis_url = kwargs.get('redis_url')
redis = kwargs.get('redis')
cdx_key_template = kwargs.get('cdx_key_template')
super(WritableRedisIndexer, self).__init__(redis_url,
redis,
cdx_key_template)
name = kwargs.get('name', 'recorder')
self.cdx_lookup = SimpleAggregator({name: self})
self.rel_path_template = kwargs.get('rel_path_template', '')
self.file_key_template = kwargs.get('file_key_template', '')
self.full_warc_prefix = kwargs.get('full_warc_prefix', '')
self.dupe_policy = kwargs.get('dupe_policy', WriteRevisitDupePolicy())
def add_warc_file(self, full_filename, params):
rel_path = res_template(self.rel_path_template, params)
rel_filename = os.path.relpath(full_filename, rel_path)
file_key = res_template(self.file_key_template, params)
full_load_path = self.full_warc_prefix + full_filename
self.redis.hset(file_key, rel_filename, full_load_path)
def add_urls_to_index(self, stream, params, filename, length):
rel_path = res_template(self.rel_path_template, params)
filename = os.path.relpath(filename, rel_path)
cdxout = BytesIO()
write_cdx_index(cdxout, stream, filename,
cdxj=True, append_post=True)
z_key = res_template(self.redis_key_template, params)
cdx_list = cdxout.getvalue().rstrip().split(b'\n')
for cdx in cdx_list:
if cdx:
self.redis.zadd(z_key, 0, cdx)
return cdx_list
def lookup_revisit(self, params, digest, url, iso_dt):
params['url'] = url
params['closest'] = iso_date_to_timestamp(iso_dt)
filters = []
filters.append('!mime:warc/revisit')
if digest and digest != '-':
filters.append('digest:' + digest.split(':')[-1])
params['filter'] = filters
cdx_iter, errs = self.cdx_lookup(params)
for cdx in cdx_iter:
res = self.dupe_policy(cdx, params)
if res:
return res
return None
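
The indexer keeps one Redis sorted set of CDXJ lines per collection plus a hash mapping relative WARC filenames to full load paths. A minimal wiring sketch, assuming a local redis; the URL, templates, and path are illustrative:

from recorder.redisindexer import WritableRedisIndexer
from recorder.filters import SkipDupePolicy

dedup_index = WritableRedisIndexer(
    redis_url='redis://localhost/2/{coll}:cdxj',  # zset of CDXJ lines per collection
    file_key_template='{coll}:warc',              # hash: rel. filename -> full path
    rel_path_template='./warcs/',
    dupe_policy=SkipDupePolicy())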

17
recorder/test/rec.ini Normal file

@@ -0,0 +1,17 @@
[uwsgi]
if-not-env = PORT
http-socket = :8010
endif =
master = true
buffer-size = 65536
die-on-term = true
if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =
gevent = 100
#gevent-early-monkey-patch =
wsgi = recorder.test.simplerec

42
recorder/test/simplerec.py Normal file

@@ -0,0 +1,42 @@
from gevent import monkey; monkey.patch_all()
from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer
from recorder.warcwriter import MultiFileWARCWriter
from recorder.filters import SkipDupePolicy
import atexit
import tempfile
import redis
import shutil
upstream_url = 'http://localhost:8080'
target = tempfile.mkdtemp(prefix='tmprec') + '/'
print('Recording to ' + target)
def rm_target():
print('Removing ' + target)
shutil.rmtree(target)
atexit.register(rm_target)
local_r = redis.StrictRedis.from_url('redis://localhost/2')
local_r.delete('rec:cdxj')
local_r.delete('rec:warc')
#target = './_recordings/'
dedup_index = WritableRedisIndexer(
redis_url='redis://localhost/2/rec:cdxj',
file_key_template='rec:warc',
rel_path_template=target,
dupe_policy=SkipDupePolicy())
recorder_app = RecorderApp(upstream_url,
MultiFileWARCWriter(target, dedup_index=dedup_index),
accept_colls='live')
application = recorder_app
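
This module runs under the rec.ini config above (uwsgi --ini recorder/test/rec.ini). A hedged smoke test, assuming a webagg instance on :8080 and redis db 2 are available:

import requests

# fetch a live page through the recorder; the body streams back to the client
# while the request/response records are queued for WARC writing
resp = requests.get('http://localhost:8010/live/resource?url=http://example.com/')
assert resp.headers.get('WebAgg-Source-Coll') == 'live'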

582
recorder/test/test_recorder.py Normal file

@@ -0,0 +1,582 @@
#from gevent import monkey; monkey.patch_all()
import gevent
from webagg.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
from webagg.test.testutils import FakeRedisTests
import os
import webtest
from pytest import raises
from fakeredis import FakeStrictRedis
from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
from recorder.filters import ExcludeSpecificHeaders
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
from webagg.utils import MementoUtils
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.cdxindexer import write_cdx_index
from pywb.warc.archiveiterator import ArchiveIterator
from six.moves.urllib.parse import quote, unquote, urlencode
from io import BytesIO
import time
import json
general_req_data = "\
GET {path} HTTP/1.1\r\n\
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n\
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36\r\n\
X-Other: foo\r\n\
Host: {host}\r\n\
Cookie: boo=far\r\n\
\r\n"
class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestRecorder, cls).setup_class()
cls.warcs_dir = to_path(cls.root_dir + '/warcs')
os.makedirs(cls.warcs_dir)
cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)
def _get_dedup_index(self, dupe_policy=WriteRevisitDupePolicy(), user=True):
if user:
file_key_template = '{user}:{coll}:warc'
redis_url = 'redis://localhost/2/{user}:{coll}:cdxj'
else:
file_key_template = '{coll}:warc'
redis_url = 'redis://localhost/2/{coll}:cdxj'
dedup_index = WritableRedisIndexer(redis_url=redis_url,
file_key_template=file_key_template,
rel_path_template=self.root_dir + '/warcs/',
dupe_policy=dupe_policy)
return dedup_index
def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
url = 'http://' + host + path
req_url = '/live/resource/postreq?url=' + url + other_params
testapp = webtest.TestApp(recorder_app)
resp = testapp.post(req_url, general_req_data.format(host=host, path=path).encode('utf-8'))
if not recorder_app.write_queue.empty():
recorder_app._write_one()
assert resp.headers['WebAgg-Source-Coll'] == 'live'
if not link_url:
link_url = unquote(url)
assert resp.headers['Link'] == MementoUtils.make_link(link_url, 'original')
assert resp.headers['Memento-Datetime'] != ''
return resp
def _test_all_warcs(self, dirname, num):
coll_dir = to_path(self.root_dir + dirname)
assert os.path.isdir(coll_dir)
files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))]
assert len(files) == num
assert all(x.endswith('.warc.gz') for x in files)
return files, coll_dir
def _load_resp_req(self, base_path):
warcs = os.listdir(base_path)
assert len(warcs) == 1
warc = warcs[0]
stored_resp = None
stored_req = None
with open(os.path.join(base_path, warc), 'rb') as fh:
for rec in ArchiveIterator(fh)():
if rec.rec_type == 'response':
stored_resp = rec
elif rec.rec_type == 'request':
stored_req = rec
assert stored_resp is not None
assert stored_req is not None
return stored_req, stored_resp
def test_record_warc_1(self):
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')))
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/', 1)
def test_record_warc_2(self):
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/', 2)
def test_error_url(self):
recorder_app = RecorderApp(self.upstream_url + '01',
PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')
testapp = webtest.TestApp(recorder_app)
resp = testapp.get('/live/resource?url=http://example.com/', status=400)
assert resp.json['error'] != ''
self._test_all_warcs('/warcs/', 2)
def test_record_cookies_header(self):
base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(base_path), accept_colls='live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
assert b'HTTP/1.1 302' in resp.body
buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
stored_req, stored_resp = self._load_resp_req(base_path)
assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers
assert ('X-Other', 'foo') in stored_req.status_headers.headers
assert ('Cookie', 'boo=far') in stored_req.status_headers.headers
def test_record_cookies_skip_header(self):
warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(warc_path, header_filter=header_filter),
accept_colls='live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
assert b'HTTP/1.1 302' in resp.body
buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
stored_req, stored_resp = self._load_resp_req(warc_path)
assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers
assert ('X-Other', 'foo') in stored_req.status_headers.headers
assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers
def test_record_skip_wrong_coll(self):
recorder_app = RecorderApp(self.upstream_url,
writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/', 2)
def test_record_param_user_coll(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = self._get_dedup_index()
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
self._test_all_warcs('/warcs/', 2)
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 1)
r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 1
cdx = CDXObject(res[0])
assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
assert cdx['mime'] == 'application/json'
assert cdx['offset'] == '0'
assert cdx['filename'].startswith('USER/COLL/')
assert cdx['filename'].endswith('.warc.gz')
warcs = r.hgetall('USER:COLL:warc')
full_path = self.root_dir + '/warcs/' + cdx['filename']
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
def test_record_param_user_coll_same_dir(self):
warc_path = to_path(self.root_dir + '/warcs2/')
dedup_index = self._get_dedup_index()
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(warc_path, dedup_index=dedup_index, key_template='{user}:{coll}'))
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER2&param.recorder.coll=COLL2')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER2&param.recorder.coll=COLL3')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs2', 2)
def test_record_param_user_coll_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = self._get_dedup_index()
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
self._test_all_warcs('/warcs/', 2)
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 2)
# Test Redis CDX
r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2
cdx = CDXObject(res[1])
assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
assert cdx['mime'] == 'warc/revisit'
assert cdx['offset'] == '0'
assert cdx['filename'].startswith('USER/COLL/')
assert cdx['filename'].endswith('.warc.gz')
fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])
warcs = r.hgetall('USER:COLL:warc')
assert len(warcs) == 2
assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')
with open(fullwarc, 'rb') as fh:
decomp = DecompressingBufferedReader(fh)
# Test refers-to headers
status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
assert status_headers.get_header('WARC-Type') == 'revisit'
assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get?foo=bar'
assert status_headers.get_header('WARC-Date') != ''
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
assert status_headers.get_header('WARC-Refers-To-Date') != ''
def test_record_param_user_coll_skip(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
# No new entries written
self._test_all_warcs('/warcs/', 2)
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 2)
# Test Redis CDX
r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2
def test_record_param_user_coll_write_dupe_no_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())
writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
recorder_app = RecorderApp(self.upstream_url, writer)
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 3)
r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 3
mimes = [CDXObject(x)['mime'] for x in res]
assert sorted(mimes) == ['application/json', 'application/json', 'warc/revisit']
assert len(writer.fh_cache) == 0
# Keep Open
def test_record_file_warc_keep_open(self):
path = to_path(self.root_dir + '/warcs/A.warc.gz')
writer = MultiFileWARCWriter(path)
recorder_app = RecorderApp(self.upstream_url, writer)
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert os.path.isfile(path)
assert len(writer.fh_cache) == 1
def test_record_multiple_writes_keep_open(self):
warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
rel_path = self.root_dir + '/warcs/'
dedup_index = self._get_dedup_index(user=False)
writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
recorder_app = RecorderApp(self.upstream_url, writer)
# First Record
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.coll=FOO')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
# Second Record
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?boo=far', '&param.recorder.coll=FOO')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"boo": "far"' in resp.body
self._test_all_warcs('/warcs/FOO/', 1)
# Check two records in WARC
r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2
files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
fullname = coll_dir + files[0]
cdxout = BytesIO()
with open(fullname, 'rb') as fh:
filename = os.path.relpath(fullname, rel_path)
write_cdx_index(cdxout, fh, filename,
cdxj=True, append_post=True, sort=True)
res = [CDXObject(x) for x in res]
cdxres = cdxout.getvalue().strip()
cdxres = cdxres.split(b'\n')
cdxres = [CDXObject(x) for x in cdxres]
assert cdxres == res
assert len(writer.fh_cache) == 1
writer.close_key(self.root_dir + '/warcs/FOO/')
assert len(writer.fh_cache) == 0
writer.close()
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?boo=far', '&param.recorder.coll=FOO')
self._test_all_warcs('/warcs/FOO/', 2)
warcs = r.hgetall('FOO:warc')
assert len(warcs) == 2
def test_record_multiple_writes_rollover_idle(self):
warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')
rel_path = self.root_dir + '/warcs/'
dedup_index = self._get_dedup_index(user=False)
writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
recorder_app = RecorderApp(self.upstream_url, writer)
# First Record
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.coll=GOO')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
# Second Record
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?boo=far', '&param.recorder.coll=GOO')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"boo": "far"' in resp.body
self._test_all_warcs('/warcs/GOO/', 1)
time.sleep(1.0)
writer.close_idle_files()
# Third Record
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?goo=bar', '&param.recorder.coll=GOO')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"goo": "bar"' in resp.body
self._test_all_warcs('/warcs/GOO/', 2)
def test_warcinfo_record(self):
simplewriter = SimpleTempWARCWriter(gzip=False)
params = {'software': 'recorder test',
'format': 'WARC File Format 1.0',
'json-metadata': json.dumps({'foo': 'bar'})}
record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
simplewriter.write_record(record)
buff = simplewriter.get_buffer()
assert isinstance(buff, bytes)
buff = BytesIO(buff)
parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)
assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
buff = parsed_record.stream.read().decode('utf-8')
length = parsed_record.rec_headers.get_header('Content-Length')
assert len(buff) == int(length)
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
assert 'format: WARC File Format 1.0\r\n' in buff
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
def test_record_custom_record(self):
dedup_index = self._get_dedup_index(user=False)
warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
recorder_app = RecorderApp(self.upstream_url,
MultiFileWARCWriter(warc_path, dedup_index=dedup_index))
req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'
buff = b'Some Data'
testapp = webtest.TestApp(recorder_app)
headers = {'content-type': 'text/plain',
'WARC-Custom': 'foo'
}
resp = testapp.put(req_url, headers=headers, params=buff)
assert resp.json['success'] == 'true'
assert resp.json['WARC-Date'] != ''
self._test_all_warcs('/warcs/meta', 1)
r = FakeStrictRedis.from_url('redis://localhost/2')
warcs = r.hgetall('META:warc')
assert len(warcs) == 1
with open(warcs[b'meta/meta.warc.gz'], 'rb') as fh:
decomp = DecompressingBufferedReader(fh)
record = ArcWarcRecordLoader().parse_record_stream(decomp)
status_headers = record.rec_headers
assert len(record.rec_headers.headers) == 9
assert status_headers.get_header('WARC-Type') == 'resource'
assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
assert status_headers.get_header('WARC-Record-ID') != ''
assert status_headers.get_header('WARC-Date') != ''
assert status_headers.get_header('WARC-Block-Digest') != ''
assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
assert status_headers.get_header('Content-Type') == 'text/plain'
assert status_headers.get_header('Content-Length') == str(len(buff))
assert status_headers.get_header('WARC-Custom') == 'foo'
assert record.stream.read() == buff
status_headers = record.status_headers
assert len(record.status_headers.headers) == 2
assert status_headers.get_header('Content-Type') == 'text/plain'
assert status_headers.get_header('Content-Length') == str(len(buff))
def test_record_video_metadata(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = self._get_dedup_index()
writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
recorder_app = RecorderApp(self.upstream_url, writer)
params = {'param.recorder.user': 'USER',
'param.recorder.coll': 'VIDEO',
'content_type': 'application/vnd.youtube-dl_formats+json'
}
resp = self._test_warc_write(recorder_app,
'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')
r = FakeStrictRedis.from_url('redis://localhost/2')
warcs = r.hgetall('USER:VIDEO:warc')
assert len(warcs) == 1
filename = list(warcs.values())[0]
with open(filename, 'rb') as fh:
decomp = DecompressingBufferedReader(fh)
record = ArcWarcRecordLoader().parse_record_stream(decomp)
status_headers = record.rec_headers
assert status_headers.get_header('WARC-Type') == 'metadata'
assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
assert status_headers.get_header('WARC-Block-Digest') != ''
assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')

553
recorder/warcwriter.py Normal file

@@ -0,0 +1,553 @@
import tempfile
import uuid
import base64
import hashlib
import datetime
import zlib
import sys
import os
import six
import shutil
import traceback
from collections import OrderedDict
from socket import gethostname
from io import BytesIO
import fcntl
from pywb.utils.loaders import LimitReader, to_native_str
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.warc.recordloader import ArcWarcRecord
from pywb.warc.recordloader import ArcWarcRecordLoader
from requests.structures import CaseInsensitiveDict
from webagg.utils import ParamFormatter, res_template
from recorder.filters import ExcludeNone
# ============================================================================
class BaseWARCWriter(object):
WARC_RECORDS = {'warcinfo': 'application/warc-fields',
'response': 'application/http; msgtype=response',
'revisit': 'application/http; msgtype=response',
'request': 'application/http; msgtype=request',
'metadata': 'application/warc-fields',
}
REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'
BUFF_SIZE = 8192
FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
def __init__(self, gzip=True, dedup_index=None, name='recorder',
header_filter=ExcludeNone(), *args, **kwargs):
self.gzip = gzip
self.dedup_index = dedup_index
self.rec_source_name = name
self.header_filter = header_filter
self.hostname = gethostname()
self.parser = StatusAndHeadersParser([], verify=False)
def ensure_digest(self, record):
block_digest = record.rec_headers.get('WARC-Block-Digest')
payload_digest = record.rec_headers.get('WARC-Payload-Digest')
if block_digest and payload_digest:
return
block_digester = self._create_digester()
payload_digester = self._create_digester()
pos = record.stream.tell()
if record.status_headers and hasattr(record.status_headers, 'headers_buff'):
block_digester.update(record.status_headers.headers_buff)
while True:
buf = record.stream.read(self.BUFF_SIZE)
if not buf:
break
block_digester.update(buf)
payload_digester.update(buf)
record.stream.seek(pos)
record.rec_headers['WARC-Block-Digest'] = str(block_digester)
record.rec_headers['WARC-Payload-Digest'] = str(payload_digester)
def _create_digester(self):
return Digester('sha1')
def _set_header_buff(self, record):
exclude_list = self.header_filter(record)
buff = record.status_headers.to_bytes(exclude_list)
record.status_headers.headers_buff = buff
def write_req_resp(self, req, resp, params):
url = resp.rec_headers.get('WARC-Target-URI')
dt = resp.rec_headers.get('WARC-Date')
#req.rec_headers['Content-Type'] = req.content_type
req.rec_headers['WARC-Target-URI'] = url
req.rec_headers['WARC-Date'] = dt
resp_id = resp.rec_headers.get('WARC-Record-ID')
if resp_id:
req.rec_headers['WARC-Concurrent-To'] = resp_id
resp = self._check_revisit(resp, params)
if not resp:
print('Skipping due to dedup')
return
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
self._do_write_req_resp(req, resp, params)
def create_req_record(self, req_headers, payload):
len_ = payload.tell()
payload.seek(0)
warc_headers = req_headers
warc_headers['WARC-Type'] = 'request'
if not warc_headers.get('WARC-Record-ID'):
warc_headers['WARC-Record-ID'] = self._make_warc_id()
status_headers = self.parser.parse(payload)
record = ArcWarcRecord('warc', 'request', warc_headers, payload,
status_headers, '', len_)
self._set_header_buff(record)
return record
def read_resp_record(self, resp_headers, payload):
len_ = payload.tell()
payload.seek(0)
warc_headers = self.parser.parse(payload)
warc_headers = CaseInsensitiveDict(warc_headers.headers)
record_type = warc_headers.get('WARC-Type', 'response')
if record_type == 'response':
status_headers = self.parser.parse(payload)
else:
status_headers = None
record = ArcWarcRecord('warc', record_type, warc_headers, payload,
status_headers, '', len_)
if record_type == 'response':
self._set_header_buff(record)
self.ensure_digest(record)
return record_type, record
def create_warcinfo_record(self, filename, info):
warc_headers = {}
warc_headers['WARC-Record-ID'] = self._make_warc_id()
warc_headers['WARC-Type'] = 'warcinfo'
if filename:
warc_headers['WARC-Filename'] = filename
warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
warcinfo = BytesIO()
for n, v in six.iteritems(info):
self._header(warcinfo, n, v)
warcinfo.seek(0)
record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo,
None, '', len(warcinfo.getvalue()))
return record
def create_custom_record(self, uri, payload, record_type, content_type,
warc_headers=None):
len_ = payload.tell()
payload.seek(0)
warc_headers = warc_headers or {}
warc_headers['WARC-Record-ID'] = self._make_warc_id()
warc_headers['WARC-Type'] = record_type
warc_headers['WARC-Target-URI'] = uri
if 'WARC-Date' not in warc_headers:
warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
record = ArcWarcRecord('warc', record_type, warc_headers, payload,
None, content_type, len_)
self.ensure_digest(record)
return record
def _check_revisit(self, record, params):
if not self.dedup_index:
return record
try:
url = record.rec_headers.get('WARC-Target-URI')
digest = record.rec_headers.get('WARC-Payload-Digest')
iso_dt = record.rec_headers.get('WARC-Date')
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
except Exception as e:
traceback.print_exc()
result = None
if result == 'skip':
return None
if isinstance(result, tuple) and result[0] == 'revisit':
record.rec_headers['WARC-Type'] = 'revisit'
record.rec_headers['WARC-Profile'] = self.REVISIT_PROFILE
record.rec_headers['WARC-Refers-To-Target-URI'] = result[1]
record.rec_headers['WARC-Refers-To-Date'] = result[2]
return record
def _write_warc_record(self, out, record):
if self.gzip:
out = GzippingWrapper(out)
self._line(out, b'WARC/1.0')
for n, v in six.iteritems(record.rec_headers):
if n.lower() in ('content-length', 'content-type'):
continue
self._header(out, n, v)
content_type = record.rec_headers.get('Content-Type')
if not content_type:
content_type = record.content_type
if not content_type:
content_type = self.WARC_RECORDS.get(record.rec_headers['WARC-Type'])
if content_type:
self._header(out, 'Content-Type', content_type)
if record.rec_headers['WARC-Type'] == 'revisit':
http_headers_only = True
else:
http_headers_only = False
if record.length:
actual_len = 0
if record.status_headers:
actual_len = len(record.status_headers.headers_buff)
if not http_headers_only:
diff = record.stream.tell() - actual_len
actual_len = record.length - diff
self._header(out, 'Content-Length', str(actual_len))
# add empty line
self._line(out, b'')
# write headers buffer, if any
if record.status_headers:
out.write(record.status_headers.headers_buff)
if not http_headers_only:
out.write(record.stream.read())
# add two lines
self._line(out, b'\r\n')
else:
# add three lines (1 for end of header, 2 for end of record)
self._line(out, b'Content-Length: 0\r\n\r\n')
out.flush()
def _header(self, out, name, value):
if not value:
return
self._line(out, (name + ': ' + str(value)).encode('latin-1'))
def _line(self, out, line):
out.write(line + b'\r\n')
@staticmethod
def _make_warc_id(id_=None):
if not id_:
id_ = uuid.uuid1()
return '<urn:uuid:{0}>'.format(id_)
# ============================================================================
class GzippingWrapper(object):
def __init__(self, out):
self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
self.out = out
def write(self, buff):
#if isinstance(buff, str):
# buff = buff.encode('utf-8')
buff = self.compressor.compress(buff)
self.out.write(buff)
def flush(self):
buff = self.compressor.flush()
self.out.write(buff)
self.out.flush()
# ============================================================================
class Digester(object):
def __init__(self, type_='sha1'):
self.type_ = type_
self.digester = hashlib.new(type_)
def update(self, buff):
self.digester.update(buff)
def __str__(self):
return self.type_ + ':' + to_native_str(base64.b32encode(self.digester.digest()))
# ============================================================================
class MultiFileWARCWriter(BaseWARCWriter):
def __init__(self, dir_template, filename_template=None, max_size=0,
max_idle_secs=1800, *args, **kwargs):
super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
if not filename_template:
dir_template, filename_template = os.path.split(dir_template)
dir_template += os.path.sep
if not filename_template:
filename_template = self.FILE_TEMPLATE
self.dir_template = dir_template
self.key_template = kwargs.get('key_template', self.dir_template)
self.filename_template = filename_template
self.max_size = max_size
if max_idle_secs > 0:
self.max_idle_time = datetime.timedelta(seconds=max_idle_secs)
else:
self.max_idle_time = None
self.fh_cache = {}
def get_new_filename(self, dir_, params):
timestamp = timestamp20_now()
randstr = base64.b32encode(os.urandom(5)).decode('utf-8')
filename = dir_ + res_template(self.filename_template, params,
hostname=self.hostname,
timestamp=timestamp,
random=randstr)
return filename
def allow_new_file(self, filename, params):
return True
def _open_file(self, filename, params):
path, name = os.path.split(filename)
try:
os.makedirs(path)
except:
pass
fh = open(filename, 'a+b')
if self.dedup_index:
self.dedup_index.add_warc_file(filename, params)
return fh
def _close_file(self, fh):
try:
fcntl.flock(fh, fcntl.LOCK_UN)
fh.close()
except Exception as e:
print(e)
def get_dir_key(self, params):
return res_template(self.key_template, params)
def close_key(self, dir_key):
if isinstance(dir_key, dict):
dir_key = self.get_dir_key(dir_key)
result = self.fh_cache.pop(dir_key, None)
if not result:
return
out, filename = result
self._close_file(out)
return filename
def close_file(self, match_filename):
for dir_key, out, filename in self.iter_open_files():
if filename == match_filename:
return self.close_key(dir_key)
def _is_write_resp(self, resp, params):
return True
def _is_write_req(self, req, params):
return True
def write_record(self, record, params=None):
params = params or {}
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
self._do_write_req_resp(None, record, params)
def _do_write_req_resp(self, req, resp, params):
def write_callback(out, filename):
url = resp.rec_headers.get('WARC-Target-URI')
print('Writing req/resp {0} to {1} '.format(url, filename))
if resp and self._is_write_resp(resp, params):
self._write_warc_record(out, resp)
if req and self._is_write_req(req, params):
self._write_warc_record(out, req)
return self._write_to_file(params, write_callback)
def write_stream_to_file(self, params, stream):
def write_callback(out, filename):
print('Writing stream to {0}'.format(filename))
shutil.copyfileobj(stream, out)
return self._write_to_file(params, write_callback)
def _write_to_file(self, params, write_callback):
full_dir = res_template(self.dir_template, params)
dir_key = self.get_dir_key(params)
result = self.fh_cache.get(dir_key)
close_file = False
if result:
out, filename = result
is_new = False
else:
filename = self.get_new_filename(full_dir, params)
if not self.allow_new_file(filename, params):
return False
out = self._open_file(filename, params)
is_new = True
        # default so the rollover check in the finally block is safe even if
        # write_callback raises before new_size is assigned
        new_size = 0
        try:
start = out.tell()
write_callback(out, filename)
out.flush()
new_size = out.tell()
out.seek(start)
if self.dedup_index:
self.dedup_index.add_urls_to_index(out, params,
filename,
new_size - start)
return True
except Exception as e:
traceback.print_exc()
close_file = True
return False
finally:
# check for rollover
if self.max_size and new_size > self.max_size:
close_file = True
if close_file:
self._close_file(out)
if not is_new:
self.fh_cache.pop(dir_key, None)
elif is_new:
fcntl.flock(out, fcntl.LOCK_EX | fcntl.LOCK_NB)
self.fh_cache[dir_key] = (out, filename)
def iter_open_files(self):
for n, v in list(self.fh_cache.items()):
out, filename = v
yield n, out, filename
def close(self):
for dir_key, out, filename in self.iter_open_files():
self._close_file(out)
self.fh_cache = {}
def close_idle_files(self):
if not self.max_idle_time:
return
now = datetime.datetime.now()
for dir_key, out, filename in self.iter_open_files():
try:
mtime = os.path.getmtime(filename)
except:
self.close_key(dir_key)
return
mtime = datetime.datetime.fromtimestamp(mtime)
if (now - mtime) > self.max_idle_time:
print('Closing idle ' + filename)
self.close_key(dir_key)
# ============================================================================
class PerRecordWARCWriter(MultiFileWARCWriter):
def __init__(self, *args, **kwargs):
kwargs['max_size'] = 1
super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
# ============================================================================
class SimpleTempWARCWriter(BaseWARCWriter):
def __init__(self, *args, **kwargs):
super(SimpleTempWARCWriter, self).__init__(*args, **kwargs)
self.out = self._create_buffer()
def _create_buffer(self):
return tempfile.SpooledTemporaryFile(max_size=512*1024)
def _do_write_req_resp(self, req, resp, params):
self._write_warc_record(self.out, resp)
self._write_warc_record(self.out, req)
def write_record(self, record, params=None):
self._write_warc_record(self.out, record)
def get_buffer(self):
pos = self.out.tell()
self.out.seek(0)
buff = self.out.read()
self.out.seek(pos)
return buff
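
SimpleTempWARCWriter builds records into an in-memory spooled buffer, as the warcinfo test above exercises. A short sketch of writing a custom resource record; the URI and payload are illustrative:

from io import BytesIO
from recorder.warcwriter import SimpleTempWARCWriter

writer = SimpleTempWARCWriter(gzip=False)
payload = BytesIO(b'Some Data')
payload.seek(0, 2)  # create_custom_record takes the length from payload.tell()
record = writer.create_custom_record('custom://example.com', payload,
                                     'resource', 'text/plain')
writer.write_record(record)
print(writer.get_buffer().decode('latin-1'))  # a plain WARC/1.0 record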

12
testdata/dupes.cdxj vendored Normal file

@@ -0,0 +1,12 @@
com,example)/ 20140127171200 {"url": "http://example.com", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1046", "offset": "334", "filename": "dupes.warc.gz"}
com,example)/ 20140127171251 {"url": "http://example.com", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "11875", "filename": "dupes.warc.gz"}
org,iana)/ 20140127171238 {"url": "http://iana.org", "mime": "unk", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "343", "offset": "1858", "filename": "dupes.warc.gz"}
org,iana)/ 20140127171238 {"url": "http://www.iana.org/", "mime": "warc/revisit", "digest": "OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB", "length": "536", "offset": "2678", "filename": "dupes.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140127171240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "556", "offset": "10826", "filename": "dupes.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "540", "offset": "9793", "filename": "dupes.warc.gz"}
org,iana)/_css/2013.1/print.css 20140127171239 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "6684", "filename": "dupes.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140127171239 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "4630", "filename": "dupes.warc.gz"}
org,iana)/_img/2013.1/iana-logo-homepage.png 20140127171240 {"url": "http://www.iana.org/_img/2013.1/iana-logo-homepage.png", "mime": "warc/revisit", "digest": "GCW2GM3SIMHEIQYZX25MLSRYVWUCZ7OK", "length": "549", "offset": "8750", "filename": "dupes.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140127171239 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "549", "offset": "7709", "filename": "dupes.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140127171239 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "3696", "filename": "dupes.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140127171239 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "547", "offset": "5658", "filename": "dupes.warc.gz"}
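
Each CDXJ line is a SURT-canonicalized key, a 14-digit timestamp, and a JSON blob; pywb's CDXObject (used throughout the tests above) parses one line at a time:

from pywb.cdx.cdxobject import CDXObject

line = (b'com,example)/ 20140127171251 {"url": "http://example.com", '
        b'"mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", '
        b'"length": "553", "offset": "11875", "filename": "dupes.warc.gz"}')
cdx = CDXObject(line)
print(cdx['urlkey'], cdx['timestamp'], cdx['mime'])
# com,example)/ 20140127171251 warc/revisit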

BIN
testdata/dupes.warc.gz vendored Normal file

Binary file not shown.


1
testdata/example.cdxj vendored Normal file

@@ -0,0 +1 @@
com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example.warc.gz"}

BIN
testdata/example.warc.gz vendored Normal file

Binary file not shown.

171
testdata/iana.cdxj vendored Normal file

@@ -0,0 +1,171 @@
org,iana)/ 20140126200624 {"url": "http://www.iana.org/", "mime": "text/html", "status": "200", "digest": "OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB", "length": "2258", "offset": "334", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "application/octet-stream", "status": "200", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "34054", "offset": "620049", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "546", "offset": "667073", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "534", "offset": "697255", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "547", "offset": "714833", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf", "mime": "warc/revisit", "digest": "LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR", "length": "551", "offset": "768625", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "application/octet-stream", "status": "200", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "117166", "offset": "198285", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "548", "offset": "482544", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "495230", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "536", "offset": "566542", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200738 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "578743", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "535", "offset": "593400", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "554", "offset": "608401", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "550", "offset": "654593", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "553", "offset": "670224", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "699343", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "712719", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "554", "offset": "731718", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "745730", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "757988", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "552", "offset": "771773", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "551", "offset": "783712", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200626 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "application/octet-stream", "status": "200", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "114499", "offset": "83293", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "550", "offset": "446529", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "493141", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "554", "offset": "567576", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200738 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "555", "offset": "580835", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "595503", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "554", "offset": "609468", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "655640", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "669172", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "698287", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "711664", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "730663", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "537", "offset": "743642", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "552", "offset": "755896", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "553", "offset": "769676", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "551", "offset": "784758", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200654 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "application/octet-stream", "status": "200", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "116641", "offset": "329393", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200706 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "494192", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200718 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "565504", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200738 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "539", "offset": "579795", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "555", "offset": "592333", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200816 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "607332", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200826 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "656690", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200912 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "554", "offset": "668113", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200930 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "700397", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "555", "offset": "713774", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201128 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "556", "offset": "732779", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201228 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "538", "offset": "744686", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "537", "offset": "756949", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201249 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "539", "offset": "770730", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 {"url": "https://www.iana.org/_css/2013.1/fonts/OpenSans-Semibold.ttf", "mime": "warc/revisit", "digest": "6HXHVHDNCPXC2ZBKQBWATZZXE5PGCN4S", "length": "554", "offset": "782657", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200625 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "text/css", "status": "200", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "4662", "offset": "50482", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200653 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "534", "offset": "326315", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200706 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "534", "offset": "487982", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200716 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "561375", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200737 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "536", "offset": "574583", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200804 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "538", "offset": "588168", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200816 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "602081", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200825 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "613943", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200912 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "536", "offset": "662904", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126200929 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "693076", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201054 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "526", "offset": "707519", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201127 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "525", "offset": "726489", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201227 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "527", "offset": "738432", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201239 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "526", "offset": "750710", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201248 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "535", "offset": "763424", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/print.css 20140126201307 {"url": "https://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "539", "offset": "777477", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200625 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "text/css", "status": "200", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "8754", "offset": "41238", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200653 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "533", "offset": "328367", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200706 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "539", "offset": "489005", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200716 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "542", "offset": "563417", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200737 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "528", "offset": "572623", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200804 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "527", "offset": "589212", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200816 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "528", "offset": "603125", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200825 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "527", "offset": "614971", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200912 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "531", "offset": "661876", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126200929 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "538", "offset": "691096", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201054 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "706476", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201127 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "725445", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201227 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "543", "offset": "739461", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201239 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "751731", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201248 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "764454", "filename": "iana.warc.gz"}
org,iana)/_css/2013.1/screen.css 20140126201307 {"url": "https://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "537", "offset": "779533", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "image/svg+xml", "status": "200", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "9739", "offset": "447577", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200706 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "553", "offset": "491049", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200718 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "551", "offset": "564454", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200737 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "550", "offset": "576643", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200805 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "591269", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "605204", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200826 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "617954", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200912 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "553", "offset": "664967", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200929 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "550", "offset": "695150", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201054 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "548", "offset": "709577", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "728551", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201228 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "548", "offset": "741538", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201239 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "549", "offset": "753801", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201249 {"url": "http://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "551", "offset": "766525", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201307 {"url": "https://www.iana.org/_img/2013.1/iana-logo-header.svg", "mime": "warc/revisit", "digest": "N6T6ZRHLEHKP2675D7JVKDYKVKYKWQ6X", "length": "552", "offset": "780562", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 {"url": "http://www.iana.org/_img/2013.1/iana-logo-homepage.png", "mime": "image/png", "status": "200", "digest": "GCW2GM3SIMHEIQYZX25MLSRYVWUCZ7OK", "length": "27163", "offset": "55631", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200625 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "image/svg+xml", "status": "200", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "2809", "offset": "4009", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "457816", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200706 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "545", "offset": "492101", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "568628", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200738 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "577695", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200805 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "594444", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200816 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "548", "offset": "606272", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200826 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "545", "offset": "619007", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "666025", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126200930 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "547", "offset": "696207", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201055 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "529", "offset": "710633", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201128 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "549", "offset": "729609", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201228 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "544", "offset": "742593", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "754853", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201249 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "544", "offset": "767580", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/icann-logo.svg 20140126201308 {"url": "https://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "546", "offset": "781613", "filename": "iana.warc.gz"}
org,iana)/_img/2013.1/rir-map.svg 20140126200654 {"url": "http://www.iana.org/_img/2013.1/rir-map.svg", "mime": "image/svg+xml", "status": "200", "digest": "C4LTM7ATRZYZL3W2UCEEX6A26L6PIT4K", "length": "23189", "offset": "458860", "filename": "iana.warc.gz"}
org,iana)/_img/bookmark_icon.ico 20140126200631 {"url": "http://www.iana.org/_img/bookmark_icon.ico", "mime": "application/octet-stream", "status": "200", "digest": "PG3PAWWE72JQ37CXJSPCJNNF7QI3SNX7", "length": "4968", "offset": "315944", "filename": "iana.warc.gz"}
org,iana)/_img/bookmark_icon.ico 20140126201310 {"url": "https://www.iana.org/_img/bookmark_icon.ico", "mime": "warc/revisit", "digest": "PG3PAWWE72JQ37CXJSPCJNNF7QI3SNX7", "length": "548", "offset": "785806", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200625 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "3074", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200653 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "325380", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200706 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "487044", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200716 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "560436", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200737 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "573645", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200804 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "460", "offset": "587215", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200816 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "459", "offset": "601126", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200825 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "615991", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200912 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "660937", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126200929 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "692132", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201054 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "456", "offset": "705534", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201127 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "724500", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201227 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "737484", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201239 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "749770", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201248 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "458", "offset": "762480", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/iana.js 20140126201307 {"url": "https://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "453", "offset": "776543", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200625 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "application/x-javascript", "status": "200", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "33449", "offset": "7311", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200653 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "542", "offset": "327341", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200706 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "529", "offset": "490037", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200716 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "529", "offset": "562402", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200737 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "575613", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200804 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "530", "offset": "590244", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200816 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "604162", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200825 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "616929", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200912 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "663936", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126200929 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "546", "offset": "694112", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201054 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "708544", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201127 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "545", "offset": "727515", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201227 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "740505", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201239 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "545", "offset": "752769", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201248 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "544", "offset": "765491", "filename": "iana.warc.gz"}
org,iana)/_js/2013.1/jquery.js 20140126201307 {"url": "https://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "543", "offset": "778507", "filename": "iana.warc.gz"}
org,iana)/about 20140126200706 {"url": "http://www.iana.org/about", "mime": "text/html", "status": "200", "digest": "6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3", "length": "2962", "offset": "483588", "filename": "iana.warc.gz"}
org,iana)/about/performance/ietf-draft-status 20140126200815 {"url": "http://www.iana.org/about/performance/ietf-draft-status", "mime": "text/html", "status": "302", "digest": "Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ", "length": "584", "offset": "596566", "filename": "iana.warc.gz"}
org,iana)/about/performance/ietf-statistics 20140126200804 {"url": "http://www.iana.org/about/performance/ietf-statistics", "mime": "text/html", "status": "302", "digest": "HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD", "length": "582", "offset": "581890", "filename": "iana.warc.gz"}
org,iana)/dnssec 20140126201306 {"url": "http://www.iana.org/dnssec", "mime": "text/html", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "442", "offset": "772827", "filename": "iana.warc.gz"}
org,iana)/dnssec 20140126201307 {"url": "https://www.iana.org/dnssec", "mime": "text/html", "status": "200", "digest": "PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI", "length": "2278", "offset": "773766", "filename": "iana.warc.gz"}
org,iana)/domains 20140126200825 {"url": "http://www.iana.org/domains", "mime": "text/html", "status": "200", "digest": "7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7", "length": "2912", "offset": "610534", "filename": "iana.warc.gz"}
org,iana)/domains/arpa 20140126201248 {"url": "http://www.iana.org/domains/arpa", "mime": "text/html", "status": "200", "digest": "QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT", "length": "2939", "offset": "759039", "filename": "iana.warc.gz"}
org,iana)/domains/idn-tables 20140126201127 {"url": "http://www.iana.org/domains/idn-tables", "mime": "text/html", "status": "200", "digest": "HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW", "length": "8118", "offset": "715878", "filename": "iana.warc.gz"}
org,iana)/domains/int 20140126201239 {"url": "http://www.iana.org/domains/int", "mime": "text/html", "status": "200", "digest": "X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q", "length": "2482", "offset": "746788", "filename": "iana.warc.gz"}
org,iana)/domains/reserved 20140126201054 {"url": "http://www.iana.org/domains/reserved", "mime": "text/html", "status": "200", "digest": "R5AAEQX5XY5X5DG66B23ODN5DUBWRA27", "length": "3573", "offset": "701457", "filename": "iana.warc.gz"}
org,iana)/domains/root 20140126200912 {"url": "http://www.iana.org/domains/root", "mime": "text/html", "status": "200", "digest": "YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX", "length": "2691", "offset": "657746", "filename": "iana.warc.gz"}
org,iana)/domains/root/db 20140126200927 {"url": "http://www.iana.org/domains/root/db/", "mime": "text/html", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "446", "offset": "671278", "filename": "iana.warc.gz"}
org,iana)/domains/root/db 20140126200928 {"url": "http://www.iana.org/domains/root/db", "mime": "text/html", "status": "200", "digest": "DHXA725IW5VJJFRTWBQT6BEZKRE7H57S", "length": "18365", "offset": "672225", "filename": "iana.warc.gz"}
org,iana)/domains/root/servers 20140126201227 {"url": "http://www.iana.org/domains/root/servers", "mime": "text/html", "status": "200", "digest": "AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU", "length": "3137", "offset": "733840", "filename": "iana.warc.gz"}
org,iana)/numbers 20140126200651 {"url": "http://www.iana.org/numbers", "mime": "text/html", "status": "200", "digest": "HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK", "length": "3498", "offset": "321385", "filename": "iana.warc.gz"}
org,iana)/performance/ietf-draft-status 20140126200815 {"url": "http://www.iana.org/performance/ietf-draft-status", "mime": "text/html", "status": "200", "digest": "T5IQTX6DWV5KABGH454CYEDWKRI5Y23E", "length": "2940", "offset": "597667", "filename": "iana.warc.gz"}
org,iana)/performance/ietf-statistics 20140126200804 {"url": "http://www.iana.org/performance/ietf-statistics", "mime": "text/html", "status": "200", "digest": "XOFML5WNBQMTSULLIIPLSP6U5MX33HN6", "length": "3712", "offset": "582987", "filename": "iana.warc.gz"}
org,iana)/protocols 20140126200715 {"url": "http://www.iana.org/protocols", "mime": "text/html", "status": "200", "digest": "IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT", "length": "63663", "offset": "496277", "filename": "iana.warc.gz"}
org,iana)/time-zones 20140126200737 {"url": "http://www.iana.org/time-zones", "mime": "text/html", "status": "200", "digest": "4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R", "length": "2449", "offset": "569675", "filename": "iana.warc.gz"}
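
Each line above is one CDXJ index entry: a SURT-ordered url key and a 14-digit timestamp, followed by a JSON object locating the capture inside a WARC. A minimal sketch (stdlib only; the sample line is copied verbatim from the index above) of how such a line splits apart:

import json

line = ('org,iana)/time-zones 20140126200737 '
        '{"url": "http://www.iana.org/time-zones", "mime": "text/html", '
        '"status": "200", "digest": "4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R", '
        '"length": "2449", "offset": "569675", "filename": "iana.warc.gz"}')

urlkey, timestamp, fields = line.split(' ', 2)
fields = json.loads(fields)
# fields['offset'] and fields['length'] give the byte range of the record
# inside iana.warc.gz; 'warc/revisit' entries carry a digest instead of a
# status and point back at an earlier capture with an identical payload.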

BIN
testdata/iana.warc.gz vendored Normal file

Binary file not shown.

3
testdata/post-test.cdxj vendored Normal file
View File

@ -0,0 +1,3 @@
org,httpbin)/post?foo=bar&test=abc 20140610000859 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M532K5WS4GY2H4OVZO6HRPOP47A7KDWU", "length": "720", "offset": "0", "filename": "post-test.warc.gz"}
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2", "length": "723", "offset": "1196", "filename": "post-test.warc.gz"}
org,httpbin)/post?data=^&foo=bar 20140610001255 {"url": "http://httpbin.org/post?foo=bar", "mime": "application/json", "status": "200", "digest": "B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ", "length": "723", "offset": "2395", "filename": "post-test.warc.gz"}
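
Note that these lookup keys carry query strings that the original request URLs did not: the POST form body has been folded into the key (with fields sorted) so that different POSTs to the same endpoint index separately; the '^' in the third key appears to stand in for a body that was not plain urlencoded form data. A minimal sketch of the idea (the helper name is hypothetical, not pywb's actual API):

from six.moves.urllib.parse import parse_qsl, urlencode

def append_post_query(url, post_body):
    # sort form fields so equivalent bodies yield identical keys
    fields = sorted(parse_qsl(post_body))
    sep = '&' if '?' in url else '?'
    return url + sep + urlencode(fields)

print(append_post_query('http://httpbin.org/post', 'test=abc&foo=bar'))
# http://httpbin.org/post?foo=bar&test=abc  -> canonicalizes to the first key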

BIN
testdata/post-test.warc.gz vendored Normal file

Binary file not shown.

2
testdata/url-agnost-example.cdxj vendored Normal file
View File

@ -0,0 +1,2 @@
com,example)/ 20130729195151 {"url": "http://test@example.com/", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "591", "offset": "355", "filename": "example-url-agnostic-revisit.warc.gz"}
org,iana,example)/ 20130702195402 {"url": "http://example.iana.org/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1001", "offset": "353", "filename": "example-url-agnostic-orig.warc.gz"}
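
Both entries share the payload digest B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A even though they record different URLs: the first is a url-agnostic revisit whose body must be read from the original capture in the second line. A minimal sketch (stdlib only) of resolving such a revisit by digest:

import json

def parse_cdxj(lines):
    for line in lines:
        urlkey, ts, fields = line.split(' ', 2)
        yield json.loads(fields)

def resolve_revisit(entries, revisit):
    # find an original capture (one with a real status) sharing the digest
    for entry in entries:
        if entry.get('status') == '200' and entry['digest'] == revisit['digest']:
            return entry
    return None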

0
urlrewrite/__init__.py Normal file
View File

153
urlrewrite/cookies.py Normal file
View File

@ -0,0 +1,153 @@
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter
from pywb.utils.timeutils import datetime_to_http_date
from six.moves.http_cookiejar import CookieJar, DefaultCookiePolicy
from six.moves import zip
import redis
import tldextract
import time
import datetime
import six

# =============================================================================
class CookieTracker(object):
def __init__(self, redis, expire_time=120):
self.redis = redis
self.expire_time = expire_time
def get_rewriter(self, url_rewriter, cookie_key):
return DomainCacheCookieRewriter(url_rewriter, self, cookie_key)
def get_cookie_headers(self, url, cookie_key):
subds = self.get_subdomains(url)
if not subds:
return None, None
with redis.utils.pipeline(self.redis) as pi:
for domain in subds:
pi.hgetall(cookie_key + '.' + domain)
all_res = pi.execute()
cookies = []
set_cookies = []
with redis.utils.pipeline(self.redis) as pi:
for res, domain in zip(all_res, subds):
if not res:
continue
for n, v in six.iteritems(res):
n = n.decode('utf-8')
v = v.decode('utf-8')
full = n + '=' + v
cookies.append(full.split(';')[0])
set_cookies.append(('Set-Cookie', full + '; Max-Age=' + str(self.expire_time)))
pi.expire(cookie_key + '.' + domain, self.expire_time)
cookies = ';'.join(cookies)
return cookies, set_cookies
def add_cookie(self, cookie_key, domain, name, value):
if domain[0] != '.':
domain = '.' + domain
with redis.utils.pipeline(self.redis) as pi:
pi.hset(cookie_key + domain, name, value)
pi.expire(cookie_key + domain, self.expire_time)
@staticmethod
def get_subdomains(url):
tld = tldextract.extract(url)
if not tld.subdomain:
return None
main = tld.domain + '.' + tld.suffix
full = tld.subdomain + '.' + main
def get_all_subdomains(main, full):
doms = []
while main != full:
full = full.split('.', 1)[1]
doms.append(full)
return doms
all_subs = get_all_subdomains(main, full)
        return all_subs

# =============================================================================
class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
def __init__(self, url_rewriter, cookie_tracker, cookie_key):
super(DomainCacheCookieRewriter, self).__init__(url_rewriter)
self.cookie_tracker = cookie_tracker
self.cookie_key = cookie_key
def rewrite_cookie(self, name, morsel):
# if domain set, no choice but to expand cookie path to root
domain = morsel.pop('domain', '')
if domain:
#if morsel.get('max-age'):
# morsel['max-age'] = int(morsel['max-age'])
#self.cookiejar.set_cookie(self.morsel_to_cookie(morsel))
#print(morsel, self.cookie_key + domain)
string = morsel.value
if morsel.get('path'):
string += '; Path=' + morsel.get('path')
if morsel.get('httponly'):
string += '; HttpOnly'
if morsel.get('secure'):
string += '; Secure'
self.cookie_tracker.add_cookie(self.cookie_key,
domain,
morsel.key,
string)
# else set cookie to rewritten path
if morsel.get('path'):
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
return morsel
    def get_expire_sec(self, morsel):
        # prefer max-age when present
        if morsel.get('max-age'):
            return int(morsel['max-age'])

        expires = morsel.get('expires')
        if not expires:
            return None

        expires = expires.replace(' UTC', ' GMT')

        # try the two common Expires date formats; give up if neither parses
        parsed = None
        for fmt in ('%a, %d-%b-%Y %H:%M:%S GMT',
                    '%a, %d %b %Y %H:%M:%S GMT'):
            try:
                parsed = time.strptime(expires, fmt)
                break
            except ValueError:
                continue

        if parsed is None:
            return None

        # seconds from now until expiry, adjusting GMT to local epoch time
        return time.mktime(parsed) - time.timezone - time.time()

# ============================================================================
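
A minimal sketch (assuming tldextract is installed; it may fetch the public suffix list on first use) of what CookieTracker.get_subdomains yields, and therefore which per-domain Redis hashes get_cookie_headers probes as '<cookie_key>.<domain>':

from urlrewrite.cookies import CookieTracker

print(CookieTracker.get_subdomains('http://a.b.example.com/page'))
# ['b.example.com', 'example.com'] -- parent domains only; the full host
# itself is excluded, and a bare registered domain returns None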

View File

@ -0,0 +1,99 @@
from gevent.monkey import patch_all; patch_all()
import requests
from pywb.framework.archivalrouter import Route
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject
from io import BytesIO
from rewriteinputreq import RewriteInputRequest
from six.moves.urllib.parse import quote

# ============================================================================
class PlatformRoute(Route):
def apply_filters(self, wbrequest, matcher):
        wbrequest.matchdict = matcher.groupdict()

# ============================================================================
class PlatformHandler(RewriteHandler):
def __init__(self, config):
super(PlatformHandler, self).__init__(config)
self.upstream_url = config.get('upstream_url')
self.loader = ArcWarcRecordLoader()
framed = config.get('framed_replay')
self.content_rewriter = RewriteContent(is_framed_replay=framed)
def render_content(self, wbrequest):
if wbrequest.wb_url.mod == 'vi_':
return self._get_video_info(wbrequest)
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
if ref_wburl_str:
wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url
urlkey = canonicalize(wbrequest.wb_url.url)
url = wbrequest.wb_url.url
inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
self.content_rewriter)
req_data = inputreq.reconstruct_request(url)
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}
if wbrequest.wb_url.is_latest_replay():
closest = 'now'
else:
closest = wbrequest.wb_url.timestamp
upstream_url = self.upstream_url.format(url=quote(url),
closest=closest,
#coll=wbrequest.coll,
**wbrequest.matchdict)
r = requests.post(upstream_url,
data=BytesIO(req_data),
headers=headers,
stream=True,
allow_redirects=False)
r.raise_for_status()
record = self.loader.parse_record_stream(r.raw)
cdx = CDXObject()
cdx['urlkey'] = urlkey
cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
cdx['url'] = url
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
record.status_headers,
record.stream,
head_insert_func,
urlkey,
cdx)
status_headers, gen, is_rw = result
        return self._make_response(wbrequest, *result)

if __name__ == "__main__":
from gevent.wsgi import WSGIServer
from pywb.apps.wayback import application
server = WSGIServer(('', 8090), application)
server.serve_forever()
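
A minimal sketch (the template string and values are illustrative, not taken from a real config) of how render_content expands the configured upstream_url with the requested URL, the closest timestamp, and any route match groups:

from six.moves.urllib.parse import quote

# hypothetical value for the 'upstream_url' config key read in __init__
upstream_url = 'http://webagg:8080/{coll}/resource/postreq?url={url}&closest={closest}'

print(upstream_url.format(url=quote('http://example.com/'),
                          closest='20140126200625',
                          coll='live'))
# http://webagg:8080/live/resource/postreq?url=http%3A//example.com/&closest=20140126200625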

134
urlrewrite/rewriteinputreq.py Normal file
View File

@ -0,0 +1,134 @@
from webagg.inputrequest import DirectWSGIInputRequest
from pywb.utils.loaders import extract_client_cookie
from six import iteritems
from six.moves.urllib.parse import urlsplit
import re

# ============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
    RANGE_ARG_RX = re.compile(r'.*\.googlevideo\.com/videoplayback.*([&?]range=(\d+)-(\d+))')
    RANGE_HEADER = re.compile(r'bytes=(\d+)-(\d+)?')
def __init__(self, env, urlkey, url, rewriter):
super(RewriteInputRequest, self).__init__(env)
self.urlkey = urlkey
self.url = url
self.rewriter = rewriter
self.extra_cookie = None
self.splits = urlsplit(self.url)
def get_full_request_uri(self):
uri = self.splits.path
if self.splits.query:
uri += '?' + self.splits.query
return uri
def get_req_headers(self):
headers = {}
has_cookies = False
for name, value in iteritems(self.env):
if name == 'HTTP_HOST':
name = 'Host'
value = self.splits.netloc
elif name == 'HTTP_ORIGIN':
name = 'Origin'
value = (self.splits.scheme + '://' + self.splits.netloc)
elif name == 'HTTP_X_CSRFTOKEN':
name = 'X-CSRFToken'
cookie_val = extract_client_cookie(self.env, 'csrftoken')
if cookie_val:
value = cookie_val
elif name == 'HTTP_X_PYWB_REQUESTED_WITH':
continue
elif name == 'HTTP_X_FORWARDED_PROTO':
name = 'X-Forwarded-Proto'
value = self.splits.scheme
elif name == 'HTTP_COOKIE':
name = 'Cookie'
value = self._req_cookie_rewrite(value)
has_cookies = True
elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-')
elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
name = name.title().replace('_', '-')
else:
value = None
if value:
headers[name] = value
if not has_cookies:
value = self._req_cookie_rewrite('')
if value:
headers['Cookie'] = value
if self.extra_cookie:
headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')
print('Cookie', headers['Cookie'])
return headers
def _req_cookie_rewrite(self, value):
rule = self.rewriter.ruleset.get_first_match(self.urlkey)
if not rule or not rule.req_cookie_rewrite:
return value
for cr in rule.req_cookie_rewrite:
try:
value = cr['rx'].sub(cr['replace'], value)
except KeyError:
pass
return value
def extract_range(self):
use_206 = False
start = None
end = None
url = self.url
range_h = self.env.get('HTTP_RANGE')
if range_h:
m = self.RANGE_HEADER.match(range_h)
if m:
start = m.group(1)
end = m.group(2)
use_206 = True
else:
m = self.RANGE_ARG_RX.match(url)
if m:
start = m.group(2)
end = m.group(3)
url = url[:m.start(1)] + url[m.end(1):]
use_206 = False
if not start:
return None
start = int(start)
if end:
end = int(end)
else:
end = ''
result = (url, start, end, use_206)
return result
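
A short check (re-declaring the two class patterns above in raw-string form) of how extract_range classifies requests; a plain 'bytes=0-' header yields start=0, no end, and use_206=True, the case the rewriter treats as a whole-file request:

import re

RANGE_HEADER = re.compile(r'bytes=(\d+)-(\d+)?')
RANGE_ARG_RX = re.compile(r'.*\.googlevideo\.com/videoplayback.*([&?]range=(\d+)-(\d+))')

m = RANGE_HEADER.match('bytes=0-')
print(m.group(1), m.group(2))   # '0' None

m = RANGE_ARG_RX.match('https://r1.googlevideo.com/videoplayback?id=x&range=100-200')
print(m.group(2), m.group(3))   # '100' '200' -- the range is carried in the URL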

425
urlrewrite/rewriterapp.py Normal file
View File

@ -0,0 +1,425 @@
import requests
from pywb.rewrite.rewrite_amf import RewriteContentAMF
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.framework.wbrequestresponse import WbResponse
from six.moves.urllib.parse import urlencode
from urlrewrite.rewriteinputreq import RewriteInputRequest
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
from io import BytesIO
import gevent
import json

# ============================================================================
class UpstreamException(WbException):
def __init__(self, status_code, url, details):
super(UpstreamException, self).__init__(url=url, msg=details)
        self.status_code = status_code

# ============================================================================
class RewriterApp(object):
VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
def __init__(self, framed_replay=False, jinja_env=None, config=None):
self.loader = ArcWarcRecordLoader()
config = config or {}
self.paths = config['url_templates']
self.framed_replay = framed_replay
self.frame_mod = ''
self.replay_mod = 'mp_'
frame_type = 'inverse' if framed_replay else False
self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)
if not jinja_env:
jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
self.jinja_env = jinja_env
self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html')
self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html')
self.error_view = BaseInsertView(self.jinja_env, 'error.html')
self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
self.cookie_tracker = None
def call_with_params(self, **kwargs):
def run_app(environ, start_response):
environ['pywb.kwargs'] = kwargs
return self(environ, start_response)
return run_app
def __call__(self, environ, start_response):
wb_url = self.get_wburl(environ)
kwargs = environ.get('pywb.kwargs', {})
try:
response = self.render_content(wb_url, kwargs, environ)
except UpstreamException as ue:
response = self.handle_error(environ, ue)
return response(environ, start_response)
def is_framed_replay(self, wb_url):
return (self.framed_replay and
wb_url.mod == self.frame_mod and
wb_url.is_replay())
def render_content(self, wb_url, kwargs, environ):
wb_url = WbUrl(wb_url)
host_prefix = self.get_host_prefix(environ)
rel_prefix = self.get_rel_prefix(environ)
full_prefix = host_prefix + rel_prefix
resp = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix, kwargs)
if resp is not None:
content_type = 'text/html'
# if not replay outer frame, specify utf-8 charset
if not self.is_framed_replay(wb_url):
content_type += '; charset=utf-8'
return WbResponse.text_response(resp, content_type=content_type)
urlrewriter = UrlRewriter(wb_url,
prefix=full_prefix,
full_prefix=full_prefix,
rel_prefix=rel_prefix)
self.unrewrite_referrer(environ)
urlkey = canonicalize(wb_url.url)
inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
self.content_rewriter)
inputreq.include_post_query(wb_url.url)
mod_url = None
use_206 = False
rangeres = None
readd_range = False
async_record_url = None
if kwargs.get('type') in ('record', 'patch'):
rangeres = inputreq.extract_range()
if rangeres:
mod_url, start, end, use_206 = rangeres
# if bytes=0- Range request,
# simply remove the range and still proxy
if start == 0 and not end and use_206:
wb_url.url = mod_url
inputreq.url = mod_url
del environ['HTTP_RANGE']
readd_range = True
else:
async_record_url = mod_url
skip = async_record_url is not None
setcookie_headers = None
if self.cookie_tracker:
cookie_key = self.get_cookie_key(kwargs)
res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key)
inputreq.extra_cookie, setcookie_headers = res
r = self._do_req(inputreq, wb_url, kwargs, skip)
if r.status_code >= 400:
error = None
try:
error = r.raw.read()
r.raw.close()
except:
pass
if error:
error = error.decode('utf-8')
else:
error = ''
details = dict(args=kwargs, error=error)
raise UpstreamException(r.status_code, url=wb_url.url, details=details)
if async_record_url:
environ.pop('HTTP_RANGE', '')
gevent.spawn(self._do_async_req,
inputreq,
async_record_url,
wb_url,
kwargs,
False)
record = self.loader.parse_record_stream(r.raw)
cdx = CDXObject()
cdx['urlkey'] = urlkey
cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
cdx['url'] = wb_url.url
self._add_custom_params(cdx, r.headers, kwargs)
if readd_range:
content_length = (record.status_headers.
get_header('Content-Length'))
try:
content_length = int(content_length)
record.status_headers.add_range(0, content_length,
content_length)
except (ValueError, TypeError):
pass
if self.is_ajax(environ):
head_insert_func = None
urlrewriter.rewrite_opts['is_ajax'] = True
else:
top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
head_insert_func = (self.head_insert_view.
create_insert_func(wb_url,
full_prefix,
host_prefix,
top_url,
environ,
self.framed_replay))
cookie_rewriter = None
if self.cookie_tracker:
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
cookie_key)
result = self.content_rewriter.rewrite_content(urlrewriter,
record.status_headers,
record.stream,
head_insert_func,
urlkey,
cdx,
cookie_rewriter,
environ)
status_headers, gen, is_rw = result
if setcookie_headers:
status_headers.headers.extend(setcookie_headers)
return WbResponse(status_headers, gen)
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
top_url = full_prefix
top_url += wb_url.to_str(mod='')
return top_url
    def _do_async_req(self, *args):
        count = 0
        r = None
        try:
            r = self._do_req(*args)
            # drain the upstream response fully so the record gets written
            while True:
                buff = r.raw.read(8192)
                count += len(buff)
                if not buff:
                    return
        except Exception:
            import traceback
            traceback.print_exc()
        finally:
            if r is not None:
                try:
                    r.raw.close()
                except Exception:
                    pass
def handle_error(self, environ, ue):
error_html = self.error_view.render_to_string(environ,
err_msg=ue.url,
err_details=ue.msg)
return WbResponse.text_response(error_html, content_type='text/html')
def _do_req(self, inputreq, wb_url, kwargs, skip):
req_data = inputreq.reconstruct_request(wb_url.url)
headers = {'Content-Length': str(len(req_data)),
'Content-Type': 'application/request'}
if skip:
headers['Recorder-Skip'] = '1'
if wb_url.is_latest_replay():
closest = 'now'
else:
closest = wb_url.timestamp
params = {}
params['url'] = wb_url.url
params['closest'] = closest
if wb_url.mod == 'vi_':
params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE
upstream_url = self.get_upstream_url(wb_url, kwargs, params)
r = requests.post(upstream_url,
data=BytesIO(req_data),
headers=headers,
stream=True)
return r
def do_query(self, wb_url, kwargs):
params = {}
params['url'] = wb_url.url
params['output'] = 'json'
params['from'] = wb_url.timestamp
params['to'] = wb_url.end_timestamp
upstream_url = self.get_upstream_url(wb_url, kwargs, params)
upstream_url = upstream_url.replace('/resource/postreq', '/index')
r = requests.get(upstream_url)
return r.text
def handle_query(self, environ, wb_url, kwargs):
res = self.do_query(wb_url, kwargs)
def format_cdx(text):
cdx_lines = text.rstrip().split('\n')
for cdx in cdx_lines:
if not cdx:
continue
cdx = json.loads(cdx)
self.process_query_cdx(cdx, wb_url, kwargs)
yield cdx
prefix = self.get_full_prefix(environ)
params = dict(url=wb_url.url,
prefix=prefix,
cdx_lines=list(format_cdx(res)))
extra_params = self.get_query_params(wb_url, kwargs)
if extra_params:
params.update(extra_params)
return self.query_view.render_to_string(environ, **params)
def process_query_cdx(self, cdx, wb_url, kwargs):
return
def get_query_params(self, wb_url, kwargs):
return None
def get_host_prefix(self, environ):
#return request.urlparts.scheme + '://' + request.urlparts.netloc
url = environ['wsgi.url_scheme'] + '://'
if environ.get('HTTP_HOST'):
url += environ['HTTP_HOST']
else:
url += environ['SERVER_NAME']
if environ['wsgi.url_scheme'] == 'https':
if environ['SERVER_PORT'] != '443':
url += ':' + environ['SERVER_PORT']
else:
if environ['SERVER_PORT'] != '80':
url += ':' + environ['SERVER_PORT']
return url
def get_rel_prefix(self, environ):
#return request.script_name
        return environ.get('SCRIPT_NAME', '') + '/'
def get_full_prefix(self, environ):
return self.get_host_prefix(environ) + self.get_rel_prefix(environ)
def get_wburl(self, environ):
wb_url = environ.get('PATH_INFO', '/')[1:]
if environ.get('QUERY_STRING'):
wb_url += '?' + environ.get('QUERY_STRING')
return wb_url
def unrewrite_referrer(self, environ):
referrer = environ.get('HTTP_REFERER')
if not referrer:
return False
full_prefix = self.get_full_prefix(environ)
if referrer.startswith(full_prefix):
referrer = referrer[len(full_prefix):]
environ['HTTP_REFERER'] = WbUrl(referrer).url
return True
return False
def is_ajax(self, environ):
value = environ.get('HTTP_X_REQUESTED_WITH')
value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH')
if value and value.lower() == 'xmlhttprequest':
return True
return False
def get_base_url(self, wb_url, kwargs):
type = kwargs.get('type')
return self.paths[type]
def get_upstream_url(self, wb_url, kwargs, params):
base_url = self.get_base_url(wb_url, kwargs)
param_str = urlencode(params, True)
if param_str:
base_url += '&' + param_str
return base_url
    def get_cookie_key(self, kwargs):
        raise NotImplementedError()

    def _add_custom_params(self, cdx, headers, kwargs):
        cdx['is_live'] = 'true'
def get_top_frame_params(self, wb_url, kwargs):
return None
def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
if wb_url.is_query():
return self.handle_query(environ, wb_url, kwargs)
if self.is_framed_replay(wb_url):
extra_params = self.get_top_frame_params(wb_url, kwargs)
return self.frame_insert_view.get_top_frame(wb_url,
full_prefix,
host_prefix,
environ,
self.frame_mod,
self.replay_mod,
coll='',
extra_params=extra_params)
return None
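
A minimal wiring sketch (config values are hypothetical, and constructing RewriterApp assumes pywb's bundled templates are importable) showing how call_with_params binds per-route kwargs that __call__ reads back from environ['pywb.kwargs'] and get_base_url uses to pick a template:

config = {'url_templates': {
    'replay': 'http://webagg:8080/replay/resource/postreq?',
    'record': 'http://webagg:8080/live/resource/postreq?',
}}

app = RewriterApp(framed_replay=True, config=config)

# each WSGI callable carries its own kwargs; requests routed through
# record_app will be fetched via the 'record' upstream template
replay_app = app.call_with_params(type='replay')
record_app = app.call_with_params(type='record')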

225
urlrewrite/templateview.py Normal file
View File

@ -0,0 +1,225 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.utils.timeutils import timestamp_now
from six.moves.urllib.parse import urlsplit
from jinja2 import Environment
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
from webassets.ext.jinja2 import AssetsExtension
from webassets.loaders import YAMLLoader
from webassets.env import Resolver
from pkg_resources import resource_filename
import json
import os

# ============================================================================
class FileOnlyPackageLoader(PackageLoader):
def get_source(self, env, template):
dir_, file_ = os.path.split(template)
        return super(FileOnlyPackageLoader, self).get_source(env, file_)

# ============================================================================
class RelEnvironment(Environment):
"""Override join_path() to enable relative template paths."""
def join_path(self, template, parent):
return os.path.join(os.path.dirname(parent), template)
# ============================================================================
class JinjaEnv(object):
def __init__(self, paths=['templates', '.', '/'],
packages=['pywb'],
assets_path=None,
globals=None,
overlay=None,
extensions=None):
self._init_filters()
loader = ChoiceLoader(self._make_loaders(paths, packages))
extensions = extensions or []
if assets_path:
extensions.append(AssetsExtension)
if overlay:
jinja_env = overlay.jinja_env.overlay(loader=loader,
trim_blocks=True,
extensions=extensions)
else:
jinja_env = RelEnvironment(loader=loader,
trim_blocks=True,
extensions=extensions)
jinja_env.filters.update(self.filters)
if globals:
jinja_env.globals.update(globals)
self.jinja_env = jinja_env
# init assets
if assets_path:
assets_loader = YAMLLoader(assets_path)
assets_env = assets_loader.load_environment()
assets_env.resolver = PkgResResolver()
jinja_env.assets_environment = assets_env
def _make_loaders(self, paths, packages):
loaders = []
# add loaders for paths
for path in paths:
loaders.append(FileSystemLoader(path))
# add loaders for all specified packages
for package in packages:
loaders.append(FileOnlyPackageLoader(package))
return loaders
def template_filter(self, param=None):
def deco(func):
name = param or func.__name__
self.filters[name] = func
return func
return deco
def _init_filters(self):
self.filters = {}
@self.template_filter()
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
if format_ == '%s':
return timestamp_to_sec(value)
else:
value = timestamp_to_datetime(value)
return value.strftime(format_)
@self.template_filter('urlsplit')
def get_urlsplit(url):
split = urlsplit(url)
return split
@self.template_filter()
def tojson(obj):
return json.dumps(obj)
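# Usage sketch (illustrative; 'cdx' and the timestamp value are assumed):
# the filters registered above become available in any template rendered
# through this environment, e.g.
#
#   {{ cdx.timestamp | format_ts('%Y-%m-%d') }}  ->  '2016-02-25'
#   {{ cdx.timestamp | format_ts('%s') }}        ->  epoch seconds
#   {{ obj | tojson }}                           ->  JSON-encoded value
#
# (assuming cdx.timestamp is the 14-digit pywb timestamp '20160225042329')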
# ============================================================================
class BaseInsertView(object):
def __init__(self, jenv, insert_file, banner_file=''):
self.jenv = jenv
self.insert_file = insert_file
self.banner_file = banner_file
def render_to_string(self, env, **kwargs):
template = self.jenv.jinja_env.get_template(self.insert_file)
params = env.get('webrec.template_params')
if params:
kwargs.update(params)
return template.render(**kwargs)
# ============================================================================
class HeadInsertView(BaseInsertView):
def create_insert_func(self, wb_url,
wb_prefix,
host_prefix,
top_url,
env,
is_framed,
coll='',
include_ts=True):
url = wb_url.get_url()
include_wombat = not wb_url.is_banner_only
wbrequest = {'host_prefix': host_prefix,
'wb_prefix': wb_prefix,
'wb_url': wb_url,
'coll': coll,
'env': env,
'options': {'is_framed': is_framed},
'rewrite_opts': {}
}
def make_head_insert(rule, cdx):
return (self.render_to_string(env, wbrequest=wbrequest,
cdx=cdx,
top_url=top_url,
include_ts=include_ts,
include_wombat=include_wombat,
banner_html=self.banner_file,
rule=rule))
return make_head_insert
# ============================================================================
class TopFrameView(BaseInsertView):
def get_top_frame(self, wb_url,
wb_prefix,
host_prefix,
env,
frame_mod,
replay_mod,
coll='',
extra_params=None):
embed_url = wb_url.to_str(mod=replay_mod)
if wb_url.timestamp:
timestamp = wb_url.timestamp
else:
timestamp = timestamp_now()
wbrequest = {'host_prefix': host_prefix,
'wb_prefix': wb_prefix,
'wb_url': wb_url,
'coll': coll,
'options': {'frame_mod': frame_mod,
'replay_mod': replay_mod},
}
params = dict(embed_url=embed_url,
wbrequest=wbrequest,
timestamp=timestamp,
url=wb_url.get_url(),
banner_html=self.banner_file)
if extra_params:
params.update(extra_params)
return self.render_to_string(env, **params)
# ============================================================================
class PkgResResolver(Resolver):
def get_pkg_path(self, item):
if not isinstance(item, str):
return None
parts = urlsplit(item)
if parts.scheme == 'pkg' and parts.netloc:
return (parts.netloc, parts.path)
return None
def resolve_source(self, ctx, item):
pkg = self.get_pkg_path(item)
if pkg:
filename = resource_filename(pkg[0], pkg[1])
if filename:
return filename
return super(PkgResResolver, self).resolve_source(ctx, item)
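# Resolution sketch (asset path assumed for illustration): a 'pkg://' URL in
# the webassets YAML is split into (package, path) and located on disk via
# pkg_resources; anything else falls back to the default Resolver.
#
#   resolver = PkgResResolver()
#   resolver.get_pkg_path('pkg://pywb/static/wombat.js')
#   ->  ('pywb', '/static/wombat.js')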

View File

74
urlrewrite/test/simpleapp.py Normal file
View File

@ -0,0 +1,74 @@
from gevent.monkey import patch_all; patch_all()
from bottle import run, Bottle, request, response, debug
from six.moves.urllib.parse import quote
from pywb.utils.loaders import LocalFileLoader
import mimetypes
import redis
from urlrewrite.rewriterapp import RewriterApp
from urlrewrite.cookies import CookieTracker
# ============================================================================
class RWApp(RewriterApp):
def __init__(self, upstream_urls, cookie_key_templ, redis):
config = {}
config['url_templates'] = upstream_urls
self.cookie_key_templ = cookie_key_templ
self.app = Bottle()
self.block_loader = LocalFileLoader()
self.init_routes()
super(RWApp, self).__init__(True, config=config)
self.cookie_tracker = CookieTracker(redis)
self.orig_error_handler = self.app.default_error_handler
self.app.default_error_handler = self.err_handler
def err_handler(self, exc):
print(exc)
import traceback
traceback.print_exc()
return self.orig_error_handler(exc)
def get_cookie_key(self, kwargs):
return self.cookie_key_templ.format(**kwargs)
def init_routes(self):
@self.app.get('/static/__pywb/<filepath:path>')
def server_static(filepath):
data = self.block_loader.load('pywb/static/' + filepath)
guessed = mimetypes.guess_type(filepath)
if guessed[0]:
response.headers['Content-Type'] = guessed[0]
return data
self.app.mount('/live/', self.call_with_params(type='live'))
self.app.mount('/record/', self.call_with_params(type='record'))
self.app.mount('/replay/', self.call_with_params(type='replay'))
@staticmethod
def create_app(replay_port=8080, record_port=8010):
upstream_urls = {'live': 'http://localhost:%s/live/resource/postreq?' % replay_port,
'record': 'http://localhost:%s/live/resource/postreq?' % record_port,
'replay': 'http://localhost:%s/replay/resource/postreq?' % replay_port,
}
r = redis.StrictRedis.from_url('redis://localhost/2')
rwapp = RWApp(upstream_urls, 'cookies:', r)
return rwapp
# ============================================================================
if __name__ == "__main__":
application = RWApp.create_app()
application.app.run(port=8090, server='gevent')
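# Quick-check sketch (assumes the upstream webagg services from create_app()
# and the local redis instance are reachable): with this app on :8090,
#
#   curl http://localhost:8090/live/mp_/http://example.com/
#
# should return the rewritten live page, as exercised in the tests below.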

View File

@ -0,0 +1,43 @@
from webagg.test.testutils import LiveServerTests, BaseTestClass
from webagg.test.testutils import FakeRedisTests
from .simpleapp import RWApp, debug
import os
import webtest
class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestRewriter, cls).setup_class()
#cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)
#cls.upstream_url += '/{type}/resource/postreq?url={url}&closest={closest}'
#cls.app = RWApp(cls.upstream_url)
cls.app = RWApp.create_app(replay_port=cls.server.port)
cls.testapp = webtest.TestApp(cls.app.app)
debug(True)
def test_replay(self):
resp = self.testapp.get('/live/mp_/http://example.com/')
resp.charset = 'utf-8'
assert '"http://localhost:80/live/mp_/http://www.iana.org/domains/example"' in resp.text
assert 'wbinfo.url = "http://example.com/"' in resp.text
def test_top_frame(self):
resp = self.testapp.get('/live/http://example.com/')
resp.charset = 'utf-8'
assert '"http://localhost:80/live/mp_/http://example.com/"' in resp.text
assert 'wbinfo.capture_url = "http://example.com/"' in resp.text
def test_cookie_track_1(self):
resp = self.testapp.get('/live/mp_/https://twitter.com/')
assert resp.headers['set-cookie'] is not None

18
urlrewrite/test/uwsgi.ini Normal file
View File

@ -0,0 +1,18 @@
[uwsgi]
if-not-env = PORT
http-socket = :8090
endif =
master = true
buffer-size = 65536
die-on-term = true
if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =
gevent = 100
wsgi = urlrewrite.test.simpleapp
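# usage sketch (assumed invocation): uwsgi urlrewrite/test/uwsgi.ini
# if a PORT env var is set, the :8090 http-socket default above is skipped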

14
webagg/Dockerfile Normal file
View File

@ -0,0 +1,14 @@
FROM python:3.5
WORKDIR /code/
RUN pip install -U git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.30.0-develop
RUN pip install uwsgi gevent bottle
ADD . /code/webagg/
ADD ./test/ /code/test/
WORKDIR /code/
CMD uwsgi /code/test/live.ini

6
webagg/README.rst Normal file
View File

@ -0,0 +1,6 @@
Resource/Memento Aggregator
===========================
This is a reference implementation of the `Resource/Memento Aggregator <https://github.com/webrecorder/platform-spec/wiki/ResourceMementoAggregator>`_
from the `Webrecorder Platform <https://github.com/webrecorder/platform-spec/wiki>`_.
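
A minimal usage sketch (an assumed setup; see ``webagg/test/live.py`` in this
commit for a fuller example)::

    from webagg.app import ResAggApp
    from webagg.handlers import DefaultResourceHandler
    from webagg.aggregator import SimpleAggregator
    from webagg.indexsource import LiveIndexSource

    app = ResAggApp()
    app.add_route('/live',
                  DefaultResourceHandler(SimpleAggregator({'live': LiveIndexSource()})))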

0
webagg/__init__.py Normal file
View File

287
webagg/aggregator.py Normal file
View File

@ -0,0 +1,287 @@
from gevent.pool import Pool
import gevent
from concurrent import futures
import json
import time
import os
from pywb.utils.timeutils import timestamp_now
from pywb.cdx.cdxops import process_cdx
from pywb.cdx.query import CDXQuery
from heapq import merge
from collections import deque
from itertools import chain
from webagg.indexsource import FileIndexSource, RedisIndexSource
from pywb.utils.wbexception import NotFoundException, WbException
from webagg.utils import ParamFormatter, res_template
import six
import glob
#=============================================================================
class BaseAggregator(object):
def __call__(self, params):
if params.get('closest') == 'now':
params['closest'] = timestamp_now()
content_type = params.get('content_type')
if content_type:
params['filter'] = '=mime:' + content_type
query = CDXQuery(params)
cdx_iter, errs = self.load_index(query.params)
cdx_iter = process_cdx(cdx_iter, query)
return cdx_iter, dict(errs)
def load_child_source(self, name, source, params):
try:
params['_formatter'] = ParamFormatter(params, name)
res = source.load_index(params)
if isinstance(res, tuple):
cdx_iter, err_list = res
else:
cdx_iter = res
err_list = []
except WbException as wbe:
#print('Not found in ' + name)
cdx_iter = iter([])
err_list = [(name, repr(wbe))]
def add_name(cdx, name):
if cdx.get('source'):
cdx['source'] = name + ':' + cdx['source']
else:
cdx['source'] = name
return cdx
return (add_name(cdx, name) for cdx in cdx_iter), err_list
def load_index(self, params):
res_list = self._load_all(params)
iter_list = [res[0] for res in res_list]
err_list = chain(*[res[1] for res in res_list])
#optimization: if only a single entry (or empty) just load directly
if len(iter_list) <= 1:
cdx_iter = iter_list[0] if iter_list else iter([])
else:
cdx_iter = merge(*(iter_list))
return cdx_iter, err_list
def _on_source_error(self, name): #pragma: no cover
pass
def _load_all(self, params): #pragma: no cover
raise NotImplementedError()
def _iter_sources(self, params): #pragma: no cover
raise NotImplementedError()
def get_source_list(self, params):
srcs = self._iter_sources(params)
result = [(name, str(value)) for name, value in srcs]
result = {'sources': dict(result)}
return result
#=============================================================================
class BaseSourceListAggregator(BaseAggregator):
def __init__(self, sources, **kwargs):
self.sources = sources
def get_all_sources(self, params):
return self.sources
def _iter_sources(self, params):
sources = self.get_all_sources(params)
srcs_list = params.get('sources')
if not srcs_list:
return sources.items()
sel_sources = tuple(srcs_list.split(','))
return [(name, sources[name]) for name in sources.keys() if name in sel_sources]
#=============================================================================
class SeqAggMixin(object):
def __init__(self, *args, **kwargs):
super(SeqAggMixin, self).__init__(*args, **kwargs)
def _load_all(self, params):
sources = self._iter_sources(params)
return [self.load_child_source(name, source, params)
for name, source in sources]
#=============================================================================
class SimpleAggregator(SeqAggMixin, BaseSourceListAggregator):
pass
#=============================================================================
class TimeoutMixin(object):
def __init__(self, *args, **kwargs):
super(TimeoutMixin, self).__init__(*args, **kwargs)
self.t_count = kwargs.get('t_count', 3)
self.t_dura = kwargs.get('t_duration', 20)
self.timeouts = {}
def is_timed_out(self, name):
timeout_deq = self.timeouts.get(name)
if not timeout_deq:
return False
the_time = time.time()
for t in list(timeout_deq):
if (the_time - t) > self.t_dura:
timeout_deq.popleft()
if len(timeout_deq) >= self.t_count:
print('Skipping {0}, {1} timeouts in {2} seconds'.
format(name, self.t_count, self.t_dura))
return True
return False
def _iter_sources(self, params):
sources = super(TimeoutMixin, self)._iter_sources(params)
for name, source in sources:
if not self.is_timed_out(name):
yield name, source
def _on_source_error(self, name):
the_time = time.time()
if name not in self.timeouts:
self.timeouts[name] = deque()
self.timeouts[name].append(the_time)
print(name + ' timed out!')
#=============================================================================
class GeventMixin(object):
def __init__(self, *args, **kwargs):
super(GeventMixin, self).__init__(*args, **kwargs)
self.pool = Pool(size=kwargs.get('size'))
self.timeout = kwargs.get('timeout', 5.0)
def _load_all(self, params):
params['_timeout'] = self.timeout
sources = list(self._iter_sources(params))
def do_spawn(name, source):
return self.pool.spawn(self.load_child_source, name, source, params)
jobs = [do_spawn(name, source) for name, source in sources]
gevent.joinall(jobs, timeout=self.timeout)
results = []
for (name, source), job in zip(sources, jobs):
if job.value is not None:
results.append(job.value)
else:
results.append((iter([]), [(name, 'timeout')]))
self._on_source_error(name)
return results
#=============================================================================
class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregator):
pass
#=============================================================================
class BaseDirectoryIndexSource(BaseAggregator):
CDX_EXT = ('.cdx', '.cdxj')
def __init__(self, base_prefix, base_dir=''):
self.base_prefix = base_prefix
self.base_dir = base_dir
def _iter_sources(self, params):
the_dir = res_template(self.base_dir, params)
the_dir = os.path.join(self.base_prefix, the_dir)
try:
sources = list(self._load_files(the_dir))
except Exception:
raise NotFoundException(the_dir)
return sources
def _load_files(self, glob_dir):
for the_dir in glob.iglob(glob_dir):
for result in self._load_files_single_dir(the_dir):
yield result
def _load_files_single_dir(self, the_dir):
for name in os.listdir(the_dir):
filename = os.path.join(the_dir, name)
if filename.endswith(self.CDX_EXT):
print('Adding ' + filename)
rel_path = os.path.relpath(the_dir, self.base_prefix)
if rel_path == '.':
full_name = name
else:
full_name = rel_path + '/' + name
yield full_name, FileIndexSource(filename)
def __str__(self):
return 'file_dir'
#=============================================================================
class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
pass
#=============================================================================
class CacheDirectoryIndexSource(DirectoryIndexSource):
def __init__(self, *args, **kwargs):
super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs)
self.cached_file_list = {}
def _load_files_single_dir(self, the_dir):
try:
stat = os.stat(the_dir)
except Exception:
stat = 0
result = self.cached_file_list.get(the_dir)
if result:
last_stat, files = result
if stat and last_stat == stat:
print('Dir {0} unchanged'.format(the_dir))
return files
files = super(CacheDirectoryIndexSource, self)._load_files_single_dir(the_dir)
files = list(files)
self.cached_file_list[the_dir] = (stat, files)
return files
#=============================================================================
class RedisMultiKeyIndexSource(SeqAggMixin, BaseAggregator, RedisIndexSource):
def _iter_sources(self, params):
redis_key_pattern = res_template(self.redis_key_template, params)
for key in self.redis.scan_iter(match=redis_key_pattern):
key = key.decode('utf-8')
yield key, RedisIndexSource(None, self.redis, key)
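# Usage sketch (file path, source names and timestamp are assumptions;
# MementoIndexSource comes from webagg.indexsource): an aggregator is called
# with CDX query params and returns (cdx_iter, errs), merging all sources.
#
#   agg = GeventTimeoutAggregator(
#       {'local': FileIndexSource('testdata/example.cdxj'),
#        'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/')},
#       timeout=5.0)
#
#   cdx_iter, errs = agg({'url': 'http://example.com/', 'closest': '20160225'})
#   for cdx in cdx_iter:
#       print(cdx['source'], cdx['timestamp'])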

124
webagg/app.py Normal file
View File

@ -0,0 +1,124 @@
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException
import requests
import traceback
import json
from six.moves.urllib.parse import parse_qsl
import six
JSON_CT = 'application/json; charset=utf-8'
#=============================================================================
class ResAggApp(object):
def __init__(self, *args, **kwargs):
self.route_dict = {}
self.debug = kwargs.get('debug', False)
self.url_map = Map()
def list_routes(environ):
return {}, self.route_dict, {}
self.url_map.add(Rule('/', endpoint=list_routes))
def add_route(self, path, handler):
def direct_input_request(environ, mode=''):
params = self.get_query_dict(environ)
params['mode'] = mode
params['_input_req'] = DirectWSGIInputRequest(environ)
return handler(params)
def post_fullrequest(environ, mode=''):
params = self.get_query_dict(environ)
params['mode'] = mode
params['_input_req'] = POSTInputRequest(environ)
return handler(params)
self.url_map.add(Rule(path, endpoint=direct_input_request))
self.url_map.add(Rule(path + '/<path:mode>', endpoint=direct_input_request))
self.url_map.add(Rule(path + '/postreq', endpoint=post_fullrequest))
self.url_map.add(Rule(path + '/<path:mode>/postreq', endpoint=post_fullrequest))
handler_dict = handler.get_supported_modes()
self.route_dict[path] = handler_dict
self.route_dict[path + '/postreq'] = handler_dict
def get_query_dict(self, environ):
query_str = environ.get('QUERY_STRING')
if query_str:
return dict(parse_qsl(query_str))
else:
return {}
def __call__(self, environ, start_response):
urls = self.url_map.bind_to_environ(environ)
try:
endpoint, args = urls.match()
except HTTPException as e:
return e(environ, start_response)
try:
result = endpoint(environ, **args)
out_headers, res, errs = result
if not res:
return self.send_error(errs, start_response)
if isinstance(res, dict):
res = self.json_encode(res, out_headers)
if errs:
if 'last_exc' in errs:
errs['last_exc'] = str(errs['last_exc'])
out_headers['ResErrors'] = json.dumps(errs)
start_response('200 OK', list(out_headers.items()))
return res
except Exception as e:
if self.debug:
traceback.print_exc()
message = 'Internal Error: ' + str(e)
status = 500
return self.send_error({}, start_response,
message=message,
status=status)
def json_encode(self, res, out_headers):
res = json.dumps(res).encode('utf-8')
out_headers['Content-Type'] = JSON_CT
out_headers['Content-Length'] = str(len(res))
return [res]
def send_error(self, errs, start_response,
message='No Resource Found', status=404):
last_exc = errs.pop('last_exc', None)
if last_exc:
if self.debug:
traceback.print_exc()
status = last_exc.status()
message = last_exc.msg
res = {'message': message}
if errs:
res['errors'] = errs
out_headers = {}
res = self.json_encode(res, out_headers)
if six.PY3:
out_headers['ResErrors'] = res[0].decode('utf-8')
else:
out_headers['ResErrors'] = res[0]
message = message.encode('utf-8')
message = str(status) + ' ' + message
start_response(message, list(out_headers.items()))
return res
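# Routing sketch (path value assumed): add_route('/live', handler) registers
# four rules on the werkzeug map --
#
#   /live                   -> handler, direct WSGI input
#   /live/<mode>            -> handler, mode passed through (e.g. 'index')
#   /live/postreq           -> handler, full request parsed from POST body
#   /live/<mode>/postreq    -> as above, with mode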

194
webagg/handlers.py Normal file
View File

@ -0,0 +1,194 @@
from webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
from webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
import six
#=============================================================================
def to_cdxj(cdx_iter, fields):
content_type = 'text/x-cdxj'
return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter)
def to_json(cdx_iter, fields):
content_type = 'application/x-ndjson'
return content_type, (cdx.to_json(fields) for cdx in cdx_iter)
def to_text(cdx_iter, fields):
content_type = 'text/plain'
return content_type, (cdx.to_text(fields) for cdx in cdx_iter)
def to_link(cdx_iter, fields):
content_type = 'application/link'
return content_type, MementoUtils.make_timemap(cdx_iter)
#=============================================================================
class FuzzyMatcher(object):
def __init__(self):
res = load_domain_specific_cdx_rules('pywb/rules.yaml', True)
self.url_canon, self.fuzzy_query = res
def __call__(self, index_source, params):
cdx_iter, errs = index_source(params)
return self.do_fuzzy(cdx_iter, index_source, params), errs
def do_fuzzy(self, cdx_iter, index_source, params):
found = False
for cdx in cdx_iter:
found = True
yield cdx
fuzzy_query_params = None
if not found:
query = CDXQuery(params)
fuzzy_query_params = self.fuzzy_query(query)
if not fuzzy_query_params:
return
fuzzy_query_params.pop('alt_url', '')
new_iter, errs = index_source(fuzzy_query_params)
for cdx in new_iter:
yield cdx
#=============================================================================
class IndexHandler(object):
OUTPUTS = {
'cdxj': to_cdxj,
'json': to_json,
'text': to_text,
'link': to_link,
}
DEF_OUTPUT = 'cdxj'
def __init__(self, index_source, opts=None, *args, **kwargs):
self.index_source = index_source
self.opts = opts or {}
self.fuzzy = FuzzyMatcher()
def get_supported_modes(self):
return dict(modes=['list_sources', 'index'])
def _load_index_source(self, params):
url = params.get('url')
if not url:
errs = dict(last_exc=BadRequestException('The "url" param is required'))
return None, errs
input_req = params.get('_input_req')
if input_req:
params['alt_url'] = input_req.include_post_query(url)
return self.fuzzy(self.index_source, params)
def __call__(self, params):
mode = params.get('mode', 'index')
if mode == 'list_sources':
return {}, self.index_source.get_source_list(params), {}
if mode != 'index':
return {}, self.get_supported_modes(), {}
output = params.get('output', self.DEF_OUTPUT)
fields = params.get('fields')
handler = self.OUTPUTS.get(output)
if not handler:
errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output)))
return None, None, errs
cdx_iter, errs = self._load_index_source(params)
if not cdx_iter:
return None, None, errs
content_type, res = handler(cdx_iter, fields)
out_headers = {'Content-Type': content_type}
def check_str(lines):
for line in lines:
if isinstance(line, six.text_type):
line = line.encode('utf-8')
yield line
return out_headers, check_str(res), errs
#=============================================================================
class ResourceHandler(IndexHandler):
def __init__(self, index_source, resource_loaders):
super(ResourceHandler, self).__init__(index_source)
self.resource_loaders = resource_loaders
def get_supported_modes(self):
res = super(ResourceHandler, self).get_supported_modes()
res['modes'].append('resource')
return res
def __call__(self, params):
if params.get('mode', 'resource') != 'resource':
return super(ResourceHandler, self).__call__(params)
cdx_iter, errs = self._load_index_source(params)
if not cdx_iter:
return None, None, errs
last_exc = None
for cdx in cdx_iter:
for loader in self.resource_loaders:
try:
out_headers, resp = loader(cdx, params)
if resp is not None:
return out_headers, resp, errs
except WbException as e:
last_exc = e
errs[str(loader)] = str(e)
if last_exc:
errs['last_exc'] = last_exc
return None, None, errs
#=============================================================================
class DefaultResourceHandler(ResourceHandler):
def __init__(self, index_source, warc_paths=''):
loaders = [WARCPathLoader(warc_paths, index_source),
LiveWebLoader(),
VideoLoader()
]
super(DefaultResourceHandler, self).__init__(index_source, loaders)
#=============================================================================
class HandlerSeq(object):
def __init__(self, handlers):
self.handlers = handlers
def get_supported_modes(self):
if self.handlers:
return self.handlers[0].get_supported_modes()
else:
return {}
def __call__(self, params):
all_errs = {}
for handler in self.handlers:
out_headers, res, errs = handler(params)
all_errs.update(errs)
if res is not None:
return out_headers, res, all_errs
return None, None, all_errs
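# Fallback sketch (mirrors the test setup later in this commit; index sources
# come from webagg.aggregator / webagg.indexsource): HandlerSeq tries each
# handler in order until one returns a resource.
#
#   local = DefaultResourceHandler(
#       SimpleAggregator({'example': FileIndexSource('testdata/example.cdxj')}),
#       'testdata/')
#   live = DefaultResourceHandler(SimpleAggregator({'live': LiveIndexSource()}))
#
#   app.add_route('/fallback', HandlerSeq([local, live]))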

226
webagg/indexsource.py Normal file
View File

@ -0,0 +1,226 @@
import redis
from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_now
from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxobject import CDXObject
#from webagg.liverec import patched_requests as requests
import requests
from webagg.utils import ParamFormatter, res_template
from webagg.utils import MementoUtils
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
#=============================================================================
class BaseIndexSource(object):
def load_index(self, params): #pragma: no cover
raise NotImplementedError()
#=============================================================================
class FileIndexSource(BaseIndexSource):
def __init__(self, filename):
self.filename_template = filename
def load_index(self, params):
filename = res_template(self.filename_template, params)
try:
fh = open(filename, 'rb')
except IOError:
raise NotFoundException(filename)
def do_load(fh):
with fh:
gen = iter_range(fh, params['key'], params['end_key'])
for line in gen:
yield CDXObject(line)
return do_load(fh)
def __str__(self):
return 'file'
#=============================================================================
class RemoteIndexSource(BaseIndexSource):
def __init__(self, api_url, replay_url, url_field='load_url'):
self.api_url_template = api_url
self.replay_url = replay_url
self.url_field = url_field
def load_index(self, params):
api_url = res_template(self.api_url_template, params)
r = requests.get(api_url, timeout=params.get('_timeout'))
if r.status_code >= 400:
raise NotFoundException(api_url)
lines = r.content.strip().split(b'\n')
def do_load(lines):
for line in lines:
cdx = CDXObject(line)
self._set_load_url(cdx)
yield cdx
return do_load(lines)
def _set_load_url(self, cdx):
cdx[self.url_field] = self.replay_url.format(
timestamp=cdx['timestamp'],
url=cdx['url'])
def __str__(self):
return 'remote'
#=============================================================================
class LiveIndexSource(BaseIndexSource):
def __init__(self, proxy_url='{url}'):
self.proxy_url = proxy_url
def load_index(self, params):
cdx = CDXObject()
cdx['urlkey'] = params.get('key').decode('utf-8')
cdx['timestamp'] = timestamp_now()
cdx['url'] = params['url']
cdx['load_url'] = res_template(self.proxy_url, params)
cdx['is_live'] = 'true'
cdx['mime'] = params.get('content_type', '')
def live():
yield cdx
return live()
def __str__(self):
return 'live'
#=============================================================================
class RedisIndexSource(BaseIndexSource):
def __init__(self, redis_url, redis=None, key_template=None):
if redis_url and not redis:
redis, key_template = self.parse_redis_url(redis_url)
self.redis = redis
self.redis_key_template = key_template
@staticmethod
def parse_redis_url(redis_url):
parts = redis_url.split('/')
key_prefix = ''
if len(parts) > 4:
key_prefix = parts[4]
redis_url = 'redis://' + parts[2] + '/' + parts[3]
redis_key_template = key_prefix
red = redis.StrictRedis.from_url(redis_url)
return red, key_prefix
def load_index(self, params):
return self.load_key_index(self.redis_key_template, params)
def load_key_index(self, key_template, params):
z_key = res_template(key_template, params)
index_list = self.redis.zrangebylex(z_key,
b'[' + params['key'],
b'(' + params['end_key'])
def do_load(index_list):
for line in index_list:
yield CDXObject(line)
return do_load(index_list)
def __str__(self):
return 'redis'
#=============================================================================
class MementoIndexSource(BaseIndexSource):
def __init__(self, timegate_url, timemap_url, replay_url):
self.timegate_url = timegate_url
self.timemap_url = timemap_url
self.replay_url = replay_url
def links_to_cdxobject(self, link_header, def_name):
results = MementoUtils.parse_links(link_header, def_name)
#meta = MementoUtils.meta_field('timegate', results)
#if meta:
# yield meta
#meta = MementoUtils.meta_field('timemap', results)
#if meta:
# yield meta
#meta = MementoUtils.meta_field('original', results)
#if meta:
# yield meta
original = results['original']['url']
key = canonicalize(original)
mementos = results['mementos']
for val in mementos:
dt = val['datetime']
ts = http_date_to_timestamp(dt)
cdx = CDXObject()
cdx['urlkey'] = key
cdx['timestamp'] = ts
cdx['url'] = original
cdx['mem_rel'] = val.get('rel', '')
cdx['memento_url'] = val['url']
load_url = self.replay_url.format(timestamp=cdx['timestamp'],
url=original)
cdx['load_url'] = load_url
yield cdx
def get_timegate_links(self, params, closest):
url = res_template(self.timegate_url, params)
accept_dt = timestamp_to_http_date(closest)
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
if res.status_code >= 400:
raise NotFoundException(url)
return res.headers.get('Link')
def get_timemap_links(self, params):
url = res_template(self.timemap_url, params)
res = requests.get(url, timeout=params.get('_timeout'))
if res.status_code >= 400:
raise NotFoundException(url)
return res.text
def load_index(self, params):
closest = params.get('closest')
if not closest:
links = self.get_timemap_links(params)
def_name = 'timemap'
else:
links = self.get_timegate_links(params, closest)
def_name = 'timegate'
return self.links_to_cdxobject(links, def_name)
@staticmethod
def from_timegate_url(timegate_url, path='link'):
return MementoIndexSource(timegate_url + '{url}',
timegate_url + 'timemap/' + path + '/{url}',
timegate_url + WAYBACK_ORIG_SUFFIX)
def __str__(self):
return 'memento'
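# Usage sketch (timegate URL and timestamp are illustrative): build a memento
# source from a timegate and query it near a given time; each yielded CDX
# carries a 'load_url' pointing at the id_ (unrewritten) capture.
#
#   src = MementoIndexSource.from_timegate_url('http://web.archive.org/web/')
#   for cdx in src.load_index({'url': 'http://example.com/',
#                              'closest': '20100512'}):
#       print(cdx['timestamp'], cdx['load_url'])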

170
webagg/inputrequest.py Normal file
View File

@ -0,0 +1,170 @@
from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.utils.loaders import LimitReader
from pywb.utils.statusandheaders import StatusAndHeadersParser
from six.moves.urllib.parse import urlsplit, quote
from six import iteritems, StringIO
from io import BytesIO
#=============================================================================
class DirectWSGIInputRequest(object):
def __init__(self, env):
self.env = env
def get_req_method(self):
return self.env['REQUEST_METHOD'].upper()
def get_req_protocol(self):
return self.env['SERVER_PROTOCOL']
def get_req_headers(self):
headers = {}
for name, value in iteritems(self.env):
# will be set by requests to match actual host
if name == 'HTTP_HOST':
continue
elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-')
elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
name = name.title().replace('_', '-')
else:
value = None
if value:
headers[name] = value
return headers
def get_req_body(self):
input_ = self.env['wsgi.input']
len_ = self._get_content_length()
enc = self._get_header('Transfer-Encoding')
if len_:
data = LimitReader(input_, int(len_))
elif enc:
data = input_
else:
data = None
return data
def _get_content_type(self):
return self.env.get('CONTENT_TYPE')
def _get_content_length(self):
return self.env.get('CONTENT_LENGTH')
def _get_header(self, name):
return self.env.get('HTTP_' + name.upper().replace('-', '_'))
def include_post_query(self, url):
if not url or self.get_req_method() != 'POST':
return url
mime = self._get_content_type()
#mime = mime.split(';')[0] if mime else ''
length = self._get_content_length()
stream = self.env['wsgi.input']
buffered_stream = BytesIO()
post_query = extract_post_query('POST', mime, length, stream,
buffered_stream=buffered_stream,
environ=self.env)
if post_query:
self.env['wsgi.input'] = buffered_stream
url = append_post_query(url, post_query)
return url
def get_full_request_uri(self):
req_uri = self.env.get('REQUEST_URI')
if req_uri and not self.env.get('SCRIPT_NAME'):
return req_uri
req_uri = quote(self.env.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
query = self.env.get('QUERY_STRING')
if query:
req_uri += '?' + query
return req_uri
def reconstruct_request(self, url=None):
buff = StringIO()
buff.write(self.get_req_method())
buff.write(' ')
buff.write(self.get_full_request_uri())
buff.write(' ')
buff.write(self.get_req_protocol())
buff.write('\r\n')
headers = self.get_req_headers()
if url:
parts = urlsplit(url)
buff.write('Host: ')
buff.write(parts.netloc)
buff.write('\r\n')
for name, value in iteritems(headers):
if name.lower() == 'host':
continue
buff.write(name)
buff.write(': ')
buff.write(value)
buff.write('\r\n')
buff.write('\r\n')
buff = buff.getvalue().encode('latin-1')
body = self.get_req_body()
if body:
buff += body.read()
return buff
#=============================================================================
class POSTInputRequest(DirectWSGIInputRequest):
def __init__(self, env):
self.env = env
parser = StatusAndHeadersParser([], verify=False)
self.status_headers = parser.parse(self.env['wsgi.input'])
def get_req_method(self):
return self.status_headers.protocol
def get_req_headers(self):
headers = {}
for n, v in self.status_headers.headers:
headers[n] = v
return headers
def get_full_request_uri(self):
return self.status_headers.statusline.split(' ', 1)[0]
def get_req_protocol(self):
return self.status_headers.statusline.split(' ', 1)[-1]
def _get_content_type(self):
return self.status_headers.get_header('Content-Type')
def _get_content_length(self):
return self.status_headers.get_header('Content-Length')
def _get_header(self, name):
return self.status_headers.get_header(name)
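if __name__ == '__main__':
    # Demonstration sketch (environ values are assumed, not part of the
    # original module): reconstruct a raw request blob from a minimal
    # WSGI environ, as the /postreq endpoints expect to receive it.
    example_env = {'REQUEST_METHOD': 'GET',
                   'SERVER_PROTOCOL': 'HTTP/1.1',
                   'PATH_INFO': '/get',
                   'QUERY_STRING': 'foo=bar',
                   'HTTP_USER_AGENT': 'example-agent',
                   'wsgi.input': BytesIO()}
    req = DirectWSGIInputRequest(example_env)
    print(req.reconstruct_request(url='http://httpbin.org/get').decode('latin-1'))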

View File

@ -0,0 +1,54 @@
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.wbexception import NotFoundException
from webagg.indexsource import BaseIndexSource, RemoteIndexSource
from webagg.responseloader import LiveWebLoader
from webagg.utils import ParamFormatter, res_template
from pywb.utils.timeutils import timestamp_now
#=============================================================================
class UpstreamAggIndexSource(RemoteIndexSource):
def __init__(self, base_url):
api_url = base_url + '/index?url={url}'
proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
def _set_load_url(self, cdx):
super(UpstreamAggIndexSource, self)._set_load_url(cdx)
cdx['offset'] = '0'
cdx.pop('load_url', '')
#=============================================================================
class ProxyMementoIndexSource(BaseIndexSource):
def __init__(self, proxy_url='{url}'):
self.proxy_url = proxy_url
self.loader = LiveWebLoader()
def load_index(self, params):
cdx = CDXObject()
cdx['urlkey'] = params.get('key').decode('utf-8')
closest = params.get('closest')
cdx['timestamp'] = closest if closest else timestamp_now()
cdx['url'] = params['url']
cdx['load_url'] = res_template(self.proxy_url, params)
cdx['memento_url'] = cdx['load_url']
return self._do_load(cdx, params)
def _do_load(self, cdx, params):
result = self.loader.load_resource(cdx, params)
if not result:
raise NotFoundException('Not a memento: ' + cdx['url'])
cdx['_cached_result'] = result
yield cdx
def __str__(self):
return 'proxy'
@staticmethod
def upstream_resource(base_url):
return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}')

436
webagg/responseloader.py Normal file
View File

@ -0,0 +1,436 @@
from webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
from webagg.utils import ParamFormatter
from webagg.indexsource import RedisIndexSource
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.warc.resolvingloader import ResolvingLoader
from six.moves.urllib.parse import urlsplit, quote, unquote
from io import BytesIO
import uuid
import six
import itertools
import json
from requests.models import PreparedRequest
import urllib3
#=============================================================================
class BaseLoader(object):
def __call__(self, cdx, params):
entry = self.load_resource(cdx, params)
if not entry:
return None, None
warc_headers, other_headers, stream = entry
out_headers = {}
out_headers['WebAgg-Type'] = 'warc'
out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
out_headers['Content-Type'] = 'application/warc-record'
if not warc_headers:
if other_headers:
out_headers['Link'] = other_headers.get('Link')
out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
out_headers['Content-Length'] = other_headers.get('Content-Length')
return out_headers, StreamIter(stream)
out_headers['Link'] = MementoUtils.make_link(
warc_headers.get_header('WARC-Target-URI'),
'original')
memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
warc_headers_buff = warc_headers.to_bytes()
lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
out_headers,
len(warc_headers_buff))
streamiter = StreamIter(stream,
header1=warc_headers_buff,
header2=other_headers)
if not lenset:
out_headers['Transfer-Encoding'] = 'chunked'
streamiter = chunk_encode_iter(streamiter)
return out_headers, streamiter
def _set_content_len(self, content_len_str, headers, existing_len):
# Try to set content-length, if it is available and valid
try:
content_len = int(content_len_str)
except (ValueError, TypeError):
content_len = -1
if content_len >= 0:
content_len += existing_len
headers['Content-Length'] = str(content_len)
return True
return False
def raise_on_self_redirect(self, params, cdx, status_code, location_url):
"""
Check if the response is a 3xx redirect to the same url.
If so, reject this capture to avoid causing a redirect loop.
"""
if cdx.get('is_live'):
return
if not status_code.startswith('3') or status_code == '304':
return
request_url = params['url'].lower()
if not location_url:
return
location_url = location_url.lower()
if location_url.startswith('/'):
host = urlsplit(cdx['url']).netloc
location_url = host + location_url
if request_url == location_url:
msg = 'Self Redirect {0} -> {1}'
msg = msg.format(request_url, location_url)
#print(msg)
raise LiveResourceException(msg)
@staticmethod
def _make_warc_id(id_=None):
if not id_:
id_ = uuid.uuid1()
return '<urn:uuid:{0}>'.format(id_)
#=============================================================================
class PrefixResolver(object):
def __init__(self, template):
self.template = template
def __call__(self, filename, cdx):
full_path = self.template
if hasattr(cdx, '_formatter') and cdx._formatter:
full_path = cdx._formatter.format(full_path)
return full_path + filename
#=============================================================================
class RedisResolver(RedisIndexSource):
def __call__(self, filename, cdx):
redis_key = self.redis_key_template
if hasattr(cdx, '_formatter') and cdx._formatter:
redis_key = cdx._formatter.format(redis_key)
res = None
if '*' in redis_key:
for key in self.redis.scan_iter(redis_key):
#key = key.decode('utf-8')
res = self.redis.hget(key, filename)
if res:
break
else:
res = self.redis.hget(redis_key, filename)
if res and six.PY3:
res = res.decode('utf-8')
return res
#=============================================================================
class WARCPathLoader(BaseLoader):
def __init__(self, paths, cdx_source):
self.paths = paths
if isinstance(paths, six.string_types):
self.paths = [paths]
self.resolvers = [self._make_resolver(path) for path in self.paths]
self.resolve_loader = ResolvingLoader(self.resolvers,
no_record_parse=True)
self.headers_parser = StatusAndHeadersParser([], verify=False)
self.cdx_source = cdx_source
def _make_resolver(self, path):
if hasattr(path, '__call__'):
return path
if path.startswith('redis://'):
return RedisResolver(path)
else:
return PrefixResolver(path)
def load_resource(self, cdx, params):
if cdx.get('_cached_result'):
return cdx.get('_cached_result')
if not cdx.get('filename') or cdx.get('offset') is None:
return None
orig_source = cdx.get('source', '').split(':')[0]
formatter = ParamFormatter(params, orig_source)
cdx._formatter = formatter
def local_index_query(local_params):
for n, v in six.iteritems(params):
if n.startswith('param.'):
local_params[n] = v
cdx_iter, errs = self.cdx_source(local_params)
for cdx in cdx_iter:
cdx._formatter = formatter
yield cdx
return cdx_iter
failed_files = []
headers, payload = (self.resolve_loader.
load_headers_and_payload(cdx,
failed_files,
local_index_query))
status = cdx.get('status')
if not status or status.startswith('3'):
status_headers = self.headers_parser.parse(payload.stream)
self.raise_on_self_redirect(params, cdx,
status_headers.get_statuscode(),
status_headers.get_header('Location'))
http_headers_buff = status_headers.to_bytes()
else:
http_headers_buff = None
warc_headers = payload.rec_headers
if headers != payload:
warc_headers.replace_header('WARC-Refers-To-Target-URI',
payload.rec_headers.get_header('WARC-Target-URI'))
warc_headers.replace_header('WARC-Refers-To-Date',
payload.rec_headers.get_header('WARC-Date'))
warc_headers.replace_header('WARC-Target-URI',
headers.rec_headers.get_header('WARC-Target-URI'))
warc_headers.replace_header('WARC-Date',
headers.rec_headers.get_header('WARC-Date'))
headers.stream.close()
return (warc_headers, http_headers_buff, payload.stream)
def __str__(self):
return 'WARCPathLoader'
#=============================================================================
class LiveWebLoader(BaseLoader):
SKIP_HEADERS = ('link',
'memento-datetime',
'content-location',
'x-archive')
def __init__(self):
self.num_retries = 3
self.num_pools = 10
self.num_conn_per_pool = 10
self.pool = urllib3.PoolManager(num_pools=self.num_pools,
maxsize=self.num_conn_per_pool)
def load_resource(self, cdx, params):
load_url = cdx.get('load_url')
if not load_url:
return None
if params.get('content_type') == VideoLoader.CONTENT_TYPE:
return None
input_req = params['_input_req']
req_headers = input_req.get_req_headers()
dt = timestamp_to_datetime(cdx['timestamp'])
if cdx.get('memento_url'):
req_headers['Accept-Datetime'] = datetime_to_http_date(dt)
method = input_req.get_req_method()
data = input_req.get_req_body()
p = PreparedRequest()
p.prepare_url(load_url, None)
p.prepare_headers(None)
p.prepare_auth(None, load_url)
auth = p.headers.get('Authorization')
if auth:
req_headers['Authorization'] = auth
load_url = p.url
try:
upstream_res = self.pool.urlopen(method=method,
url=load_url,
body=data,
headers=req_headers,
redirect=False,
assert_same_host=False,
preload_content=False,
decode_content=False,
retries=self.num_retries,
timeout=params.get('_timeout'))
except Exception as e:
raise LiveResourceException(load_url)
memento_dt = upstream_res.headers.get('Memento-Datetime')
if memento_dt:
dt = http_date_to_datetime(memento_dt)
cdx['timestamp'] = datetime_to_timestamp(dt)
elif cdx.get('memento_url'):
# if 'memento_url' set and no Memento-Datetime header present
# then its an error
return None
agg_type = upstream_res.headers.get('WebAgg-Type')
if agg_type == 'warc':
cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll'))
return None, upstream_res.headers, upstream_res
self.raise_on_self_redirect(params, cdx,
str(upstream_res.status),
upstream_res.headers.get('Location'))
if upstream_res.version == 11:
version = '1.1'
else:
version = '1.0'
status = 'HTTP/{version} {status} {reason}\r\n'
status = status.format(version=version,
status=upstream_res.status,
reason=upstream_res.reason)
http_headers_buff = status
orig_resp = upstream_res._original_response
try: #pragma: no cover
#PY 3
resp_headers = orig_resp.headers._headers
for n, v in resp_headers:
if n.lower() in self.SKIP_HEADERS:
continue
http_headers_buff += n + ': ' + v + '\r\n'
except: #pragma: no cover
#PY 2
resp_headers = orig_resp.msg.headers
for n, v in zip(orig_resp.getheaders(), resp_headers):
if n in self.SKIP_HEADERS:
continue
http_headers_buff += v
http_headers_buff += '\r\n'
http_headers_buff = http_headers_buff.encode('latin-1')
try:
fp = upstream_res._fp.fp
if hasattr(fp, 'raw'): #pragma: no cover
fp = fp.raw
remote_ip = fp._sock.getpeername()[0]
except: #pragma: no cover
remote_ip = None
warc_headers = {}
warc_headers['WARC-Type'] = 'response'
warc_headers['WARC-Record-ID'] = self._make_warc_id()
warc_headers['WARC-Target-URI'] = cdx['url']
warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
if remote_ip:
warc_headers['WARC-IP-Address'] = remote_ip
warc_headers['Content-Type'] = 'application/http; msgtype=response'
self._set_content_len(upstream_res.headers.get('Content-Length', -1),
warc_headers,
len(http_headers_buff))
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
return (warc_headers, http_headers_buff, upstream_res)
def __str__(self):
return 'LiveWebLoader'
#=============================================================================
class VideoLoader(BaseLoader):
CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
def __init__(self):
try:
from youtube_dl import YoutubeDL as YoutubeDL
except ImportError:
self.ydl = None
return
self.ydl = YoutubeDL(dict(simulate=True,
youtube_include_dash_manifest=False))
self.ydl.add_default_info_extractors()
def load_resource(self, cdx, params):
load_url = cdx.get('load_url')
if not load_url:
return None
if params.get('content_type') != self.CONTENT_TYPE:
return None
if not self.ydl:
return None
info = self.ydl.extract_info(load_url)
info_buff = json.dumps(info)
info_buff = info_buff.encode('utf-8')
warc_headers = {}
schema, rest = load_url.split('://', 1)
target_url = 'metadata://' + rest
dt = timestamp_to_datetime(cdx['timestamp'])
warc_headers['WARC-Type'] = 'metadata'
warc_headers['WARC-Record-ID'] = self._make_warc_id()
warc_headers['WARC-Target-URI'] = target_url
warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
warc_headers['Content-Type'] = self.CONTENT_TYPE
warc_headers['Content-Length'] = str(len(info_buff))
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
return warc_headers, None, BytesIO(info_buff)
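# Wiring sketch (paths assumed; mirrors DefaultResourceHandler earlier in this
# commit): loaders are tried per CDX line until one yields a record.
#
#   loaders = [WARCPathLoader('testdata/', index_source),
#              LiveWebLoader(),
#              VideoLoader()]
#   for loader in loaders:
#       out_headers, resp = loader(cdx, params)
#       if resp is not None:
#           break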

0
webagg/test/__init__.py Normal file
View File

17
webagg/test/live.ini Normal file
View File

@ -0,0 +1,17 @@
[uwsgi]
if-not-env = PORT
http-socket = :8080
endif =
master = true
buffer-size = 65536
die-on-term = true
if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =
gevent = 100
gevent-monkey-patch =
wsgi = webagg.test.live

44
webagg/test/live.py Normal file
View File

@ -0,0 +1,44 @@
from gevent.monkey import patch_all; patch_all()
from webagg.test.testutils import LiveServerTests
from webagg.handlers import DefaultResourceHandler
from webagg.app import ResAggApp
from webagg.indexsource import LiveIndexSource, RedisIndexSource
from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
def simpleapp():
app = ResAggApp(debug=True)
app.add_route('/live',
DefaultResourceHandler(SimpleAggregator(
{'live': LiveIndexSource()})
)
)
app.add_route('/replay',
DefaultResourceHandler(SimpleAggregator(
{'replay': RedisIndexSource('redis://localhost/2/rec:cdxj')}),
'redis://localhost/2/rec:warc'
)
)
app.add_route('/replay-testdata',
DefaultResourceHandler(SimpleAggregator(
{'test': CacheDirectoryIndexSource('./testdata/')}),
'./testdata/'
)
)
return app
application = simpleapp()
if __name__ == "__main__":
# from bottle import run
# run(application, server='gevent', port=8080, fast=True)
from gevent.wsgi import WSGIServer
server = WSGIServer(('', 8080), application)
server.serve_forever()
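# Query sketch (illustrative): with the server above running,
#
#   curl 'http://localhost:8080/live/index?url=http://example.com/&output=json'
#
# returns a single NDJSON CDX line for the live capture, and
# /live/resource?url=... returns the full WARC record.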

216
webagg/test/test_dir_agg.py Normal file
View File

@ -0,0 +1,216 @@
import tempfile
import os
import shutil
import json
from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
from mock import patch
import time
from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
from webagg.aggregator import SimpleAggregator
from webagg.indexsource import MementoIndexSource
#=============================================================================
linkheader = """\
<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
"""
def mock_link_header(*args, **kwargs):
return linkheader
class TestDirAgg(TempDirTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestDirAgg, cls).setup_class()
coll_A = to_path(cls.root_dir + '/colls/A/indexes')
coll_B = to_path(cls.root_dir + '/colls/B/indexes')
coll_C = to_path(cls.root_dir + '/colls/C/indexes')
os.makedirs(coll_A)
os.makedirs(coll_B)
os.makedirs(coll_C)
dir_prefix = to_path(cls.root_dir)
dir_path = 'colls/{coll}/indexes'
shutil.copy(to_path('testdata/example.cdxj'), coll_A)
shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
fh.write('foo')
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
def test_agg_no_coll_set(self):
res, errs = self.dir_loader(dict(url='example.com/'))
assert(to_json_list(res) == [])
assert(errs == {})
def test_agg_collA_found(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_collB(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'B'})
exp = []
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_collB_found(self):
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_extra_agg_collB(self):
agg_source = SimpleAggregator({'dir': self.dir_loader})
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_all_found_1(self):
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
exp = [
{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_all_found_2(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
exp = [
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
]
assert(to_json_list(res) == exp)
assert(errs == {})
@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
def test_agg_dir_and_memento(self):
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'local': self.dir_loader}
agg_source = SimpleAggregator(sources)
res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
exp = [
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_no_dir_1(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'X'})
exp = []
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_no_dir_2(self):
loader = DirectoryIndexSource(self.root_dir, '')
res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})
exp = []
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_dir_sources_1(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
assert(res == exp)
def test_agg_dir_sources_2(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
assert(res == exp)
def test_agg_dir_sources_single_dir(self):
loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '')
res = loader.get_source_list({'url': 'example.com/'})
exp = {'sources': {'example.cdxj': 'file'}}
assert(res == exp)
def test_agg_dir_sources_not_found_dir(self):
loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'Z', 'indexes'), '')
res = loader.get_source_list({'url': 'example.com/'})
exp = {'sources': {}}
assert(res == exp)
def test_cache_dir_sources_1(self):
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
assert(res == exp)
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
assert(res == exp)
new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj')
with open(new_file, 'a') as fh:
os.utime(new_file, None)
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
# New File Included
exp['sources']['colls/C/indexes/empty.cdxj'] = 'file'
assert(res == exp)

View File

@ -0,0 +1,463 @@
#from gevent import monkey; monkey.patch_all(thread=False)
from collections import OrderedDict
from webagg.handlers import DefaultResourceHandler, HandlerSeq
from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
from webagg.aggregator import DirectoryIndexSource
from webagg.app import ResAggApp
from webagg.utils import MementoUtils
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO
from six.moves.urllib.parse import urlencode
import webtest
from fakeredis import FakeStrictRedis
from .testutils import to_path, FakeRedisTests, BaseTestClass
import json
sources = {
'local': DirectoryIndexSource(to_path('testdata/'), ''),
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
'live': LiveIndexSource(),
}
class TestResAgg(FakeRedisTests, BaseTestClass):
def setup_class(cls):
super(TestResAgg, cls).setup_class()
live_source = SimpleAggregator({'live': LiveIndexSource()})
live_handler = DefaultResourceHandler(live_source)
app = ResAggApp()
app.add_route('/live', live_handler)
source1 = GeventTimeoutAggregator(sources)
handler1 = DefaultResourceHandler(source1, to_path('testdata/'))
app.add_route('/many', handler1)
source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))})
handler2 = DefaultResourceHandler(source2, to_path('testdata/'))
app.add_route('/posttest', handler2)
source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
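# '/fallback' tries the example index, then the POST-test index, then the
# live web, in order, until one handler yields a resource.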
app.add_route('/fallback', HandlerSeq([handler3,
handler2,
live_handler]))
app.add_route('/seq', HandlerSeq([handler3,
handler2]))
app.add_route('/allredis', DefaultResourceHandler(source3, 'redis://localhost/2/test:warc'))
app.add_route('/empty', HandlerSeq([]))
app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(to_path('testdata/url-agnost-example.cdxj'))})
app.add_route('/urlagnost', DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc'))
cls.testapp = webtest.TestApp(app)
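# Parse the (chunked) WARC response and verify its target URI and date;
# dt=True asserts only that some WARC-Date is present.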
def _check_uri_date(self, resp, uri, dt):
buff = BytesIO(resp.body)
buff = ChunkedDataReader(buff)
status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
assert status_headers.get_header('WARC-Target-URI') == uri
if dt is True:
assert status_headers.get_header('WARC-Date') != ''
else:
assert status_headers.get_header('WARC-Date') == dt
def test_list_routes(self):
resp = self.testapp.get('/')
res = resp.json
assert set(res.keys()) == set(['/empty', '/empty/postreq',
'/fallback', '/fallback/postreq',
'/live', '/live/postreq',
'/many', '/many/postreq',
'/posttest', '/posttest/postreq',
'/seq', '/seq/postreq',
'/allredis', '/allredis/postreq',
'/urlagnost', '/urlagnost/postreq',
'/invalid', '/invalid/postreq'])
assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']}
def test_list_handlers(self):
resp = self.testapp.get('/many')
assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
assert 'ResErrors' not in resp.headers
resp = self.testapp.get('/many/other')
assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
assert 'ResErrors' not in resp.headers
def test_list_errors(self):
# must specify url for index or resource
resp = self.testapp.get('/many/index', status=400)
assert resp.json == {'message': 'The "url" param is required'}
assert resp.text == resp.headers['ResErrors']
resp = self.testapp.get('/many/index', status=400)
assert resp.json == {'message': 'The "url" param is required'}
assert resp.text == resp.headers['ResErrors']
resp = self.testapp.get('/many/resource', status=400)
assert resp.json == {'message': 'The "url" param is required'}
assert resp.text == resp.headers['ResErrors']
def test_list_sources(self):
resp = self.testapp.get('/many/list_sources')
assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
assert 'ResErrors' not in resp.headers
def test_live_index(self):
resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=json')
resp.charset = 'utf-8'
cdxlist = [json.loads(cdx) for cdx in resp.text.rstrip().split('\n')]
cdxlist[0]['timestamp'] = '2016'
assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
def test_live_resource(self):
headers = {'foo': 'bar'}
resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers)
assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert 'ResErrors' not in resp.headers
def test_live_post_resource(self):
resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
OrderedDict([('foo', 'bar')]))
assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://httpbin.org/post', True)
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_select_mem_1(self):
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')
assert resp.headers['WebAgg-Source-Coll'] == 'rhiz'
self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z')
assert b'HTTP/1.1 200 OK' in resp.body
assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original')
assert resp.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'
assert 'ResErrors' not in resp.headers
def test_agg_select_mem_2(self):
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')
assert resp.headers['WebAgg-Source-Coll'] == 'ia'
self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z')
assert b'HTTP/1.1 200 OK' in resp.body
assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
assert resp.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'
assert 'ResErrors' not in resp.headers
def test_agg_select_live(self):
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://vvork.com/', True)
assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert 'ResErrors' not in resp.headers
def test_agg_select_local(self):
resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'
self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
def test_agg_select_local_postreq(self):
req_data = """\
GET / HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: iana.org
"""
resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)
assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'
self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
def test_agg_live_postreq(self):
req_data = """\
GET /get?foo=bar HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: httpbin.org
"""
resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}
def test_agg_post_resolve_postreq(self):
req_data = """\
POST /post HTTP/1.1
content-length: 16
accept-encoding: gzip, deflate
accept: */*
host: httpbin.org
content-type: application/x-www-form-urlencoded
foo=bar&test=abc"""
resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)
assert resp.headers['WebAgg-Source-Coll'] == 'post'
self._check_uri_date(resp, 'http://httpbin.org/post', True)
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert b'"test": "abc"' in resp.body
assert b'"url": "http://httpbin.org/post"' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_post_resolve_fallback(self):
req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])
resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)
assert resp.headers['WebAgg-Source-Coll'] == 'post'
self._check_uri_date(resp, 'http://httpbin.org/post', True)
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert b'"test": "abc"' in resp.body
assert b'"url": "http://httpbin.org/post"' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_seq_fallback_1(self):
resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')
assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://www.iana.org/', True)
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
assert b'HTTP/1.1 200 OK' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_seq_fallback_2(self):
resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')
assert resp.headers['WebAgg-Source-Coll'] == 'example'
self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z')
assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original')
assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'
assert b'HTTP/1.1 200 OK' in resp.body
assert 'ResErrors' not in resp.headers
def test_redis_warc_1(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')
assert resp.headers['WebAgg-Source-Coll'] == 'example'
def test_url_agnost(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', './testdata/example-url-agnostic-revisit.warc.gz')
f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', './testdata/example-url-agnostic-orig.warc.gz')
resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')
assert resp.status_int == 200
assert resp.headers['Link'] == MementoUtils.make_link('http://test@example.com/', 'original')
assert resp.headers['WebAgg-Source-Coll'] == 'url-agnost'
assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
def test_live_video_loader(self):
params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
'content_type': 'application/vnd.youtube-dl_formats+json'
}
resp = self.testapp.get('/live/resource', params=params)
assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)
assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'WARC-Type: metadata' in resp.body
assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body
def test_live_video_loader_post(self):
req_data = """\
GET /v/BfBgWtAIbRc HTTP/1.1
accept-encoding: gzip, deflate
accept: */*
host: www.youtube.com\
"""
params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
'content_type': 'application/vnd.youtube-dl_formats+json'
}
resp = self.testapp.post('/live/resource/postreq?' + urlencode(params), req_data)
assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)
assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'WARC-Type: metadata' in resp.body
assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body
def test_error_redis_file_not_found(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'"
f.hdel('test:warc', 'example.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}
f.delete('test:warc')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}
def test_error_fallback_live_not_found(self):
resp = self.testapp.get('/fallback/resource?url=http://invalid.url-not-found', status=400)
assert resp.json == {'message': 'http://invalid.url-not-found/',
'errors': {'LiveWebLoader': 'http://invalid.url-not-found/'}}
assert resp.text == resp.headers['ResErrors']
def test_agg_local_revisit(self):
resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')
assert resp.headers['WebAgg-Source-Coll'] == 'local:dupes.cdxj'
buff = BytesIO(resp.body)
status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
assert status_headers.get_header('WARC-Target-URI') == 'http://example.com'
assert status_headers.get_header('WARC-Date') == '2014-01-27T17:12:51Z'
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://example.com'
assert status_headers.get_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z'
assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
assert b'HTTP/1.1 200 OK' in resp.body
assert b'<!doctype html>' in resp.body
assert 'ResErrors' not in resp.headers
def test_error_invalid_index_output(self):
resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=foobar', status=400)
assert resp.json == {'message': 'output=foobar not supported'}
assert resp.text == resp.headers['ResErrors']
def test_error_local_not_found(self):
resp = self.testapp.get('/many/resource?url=http://not-found.error/&sources=local', status=404)
assert resp.json == {'message': 'No Resource Found'}
assert resp.text == resp.headers['ResErrors']
def test_error_empty(self):
resp = self.testapp.get('/empty/resource?url=http://example.com/', status=404)
assert resp.json == {'message': 'No Resource Found'}
assert resp.text == resp.headers['ResErrors']
def test_error_invalid(self):
resp = self.testapp.get('/invalid/resource?url=http://example.com/', status=500)
assert resp.json == {'message': "Internal Error: 'list' object is not callable"}
assert resp.text == resp.headers['ResErrors']

View File

@ -0,0 +1,219 @@
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
from webagg.indexsource import LiveIndexSource
from webagg.aggregator import SimpleAggregator
from pywb.utils.timeutils import timestamp_now
from .testutils import key_ts_res
import pytest
from fakeredis import FakeStrictRedis
from mock import patch
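# Patch redis.StrictRedis with fakeredis for this whole module so the
# RedisIndexSource tests run without a real Redis server.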
redismock = patch('redis.StrictRedis', FakeStrictRedis)
redismock.start()
def setup_module():
r = FakeStrictRedis.from_url('redis://localhost:6379/2')
r.delete('test:rediscdx')
with open('testdata/iana.cdxj', 'rb') as fh:
for line in fh:
r.zadd('test:rediscdx', 0, line.rstrip())
def teardown_module():
redismock.stop()
local_sources = [
FileIndexSource('testdata/iana.cdxj'),
RedisIndexSource('redis://localhost:6379/2/test:rediscdx')
]
remote_sources = [
RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}',
'http://webenact.rhizome.org/all/{timestamp}id_/{url}'),
MementoIndexSource('http://webenact.rhizome.org/all/{url}',
'http://webenact.rhizome.org/all/timemap/*/{url}',
'http://webenact.rhizome.org/all/{timestamp}id_/{url}')
]
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}',
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
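# Wrap a single index source in a SimpleAggregator so each result row is
# tagged with a 'source' field, then run the query against it.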
def query_single_source(source, params):
str(source)  # result unused; exercises the source's __str__
return SimpleAggregator({'source': source})(params)
# Url Match -- Local Loaders
# ============================================================================
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_local_cdxj_loader(source):
url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
res, errs = query_single_source(source, dict(url=url, limit=3))
expected = """\
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""
assert(key_ts_res(res) == expected)
assert(errs == {})
# Closest -- Local Loaders
# ============================================================================
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_local_closest_loader(source):
url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
res, errs = query_single_source(source, dict(url=url,
closest='20140126200930',
limit=3))
expected = """\
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz"""
assert(key_ts_res(res) == expected)
assert(errs == {})
# Prefix -- Local Loaders
# ============================================================================
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_prefix_loader(source):
res, errs = query_single_source(source, dict(url='http://iana.org/domains/root/*'))
expected = """\
org,iana)/domains/root/db 20140126200927 iana.warc.gz
org,iana)/domains/root/db 20140126200928 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
assert(key_ts_res(res) == expected)
assert(errs == {})
# Url Match -- Remote Loaders
# ============================================================================
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
def test_remote_loader(source):
url = 'http://instagram.com/amaliaulman'
res, errs = query_single_source(source, dict(url=url))
expected = """\
com,instagram)/amaliaulman 20141014150552 http://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014155217 http://webenact.rhizome.org/all/20141014155217id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/20141014171636id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {})
# Closest -- Remote Loaders
# ============================================================================
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
def test_remote_closest_loader(source):
url = 'http://instagram.com/amaliaulman'
res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
expected = """\
com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {})
# Live Index -- No Load!
# ============================================================================
def test_live():
url = 'http://example.com/'
source = LiveIndexSource()
res, errs = query_single_source(source, dict(url=url))
expected = 'com,example)/ {0} http://example.com/'.format(timestamp_now())
assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {})
# Errors -- Not Found All
# ============================================================================
@pytest.mark.parametrize("source", local_sources + remote_sources, ids=["file", "redis", "remote_cdx", "memento"])
def test_all_not_found(source):
url = 'http://x-not-found-x.notfound/'
res, errs = query_single_source(source, dict(url=url, limit=3))
expected = ''
assert(key_ts_res(res) == expected)
if source == remote_sources[0]:
assert('http://x-not-found-x.notfound/' in errs['source'])
else:
assert(errs == {})
# ============================================================================
def test_another_remote_not_found():
source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/')
url = 'http://x-not-found-x.notfound/'
res, errs = query_single_source(source, dict(url=url, limit=3))
expected = ''
assert(key_ts_res(res) == expected)
assert(errs['source'] == "NotFoundException('http://www.webarchive.org.uk/wayback/archive/timemap/link/http://x-not-found-x.notfound/',)")
# ============================================================================
def test_file_not_found():
source = FileIndexSource('testdata/not-found-x')
url = 'http://x-not-found-x.notfound/'
res, errs = query_single_source(source, dict(url=url, limit=3))
expected = ''
assert(key_ts_res(res) == expected)
assert(errs['source'] == "NotFoundException('testdata/not-found-x',)"), errs
# ============================================================================
def test_ait_filters():
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
filenames = [cdx['filename'] for cdx in cdxlist]
prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')
assert(all([x.startswith(prefix) for x in filenames]))
cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
filenames = [cdx['filename'] for cdx in cdxlist]
prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')
assert(all([x.startswith(prefix) for x in filenames]))

View File

@ -0,0 +1,67 @@
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from bottle import Bottle, request, response, debug
import webtest
import traceback
#=============================================================================
class InputReqApp(object):
def __init__(self):
self.application = Bottle()
debug(True)
@self.application.route('/test/<url:re:.*>', 'ANY')
def direct_input_request(url=''):
inputreq = DirectWSGIInputRequest(request.environ)
response['Content-Type'] = 'text/plain; charset=utf-8'
return inputreq.reconstruct_request(url)
@self.application.route('/test-postreq', 'POST')
def post_fullrequest():
params = dict(request.query)
inputreq = POSTInputRequest(request.environ)
response['Content-Type'] = 'text/plain; charset=utf-8'
return inputreq.reconstruct_request(params.get('url'))
#=============================================================================
class TestInputReq(object):
def setup(self):
self.app = InputReqApp()
self.testapp = webtest.TestApp(self.app.application)
def test_get_direct(self):
res = self.testapp.get('/test/http://example.com/', headers={'Foo': 'Bar'})
assert res.text == '\
GET /test/http://example.com/ HTTP/1.0\r\n\
Host: example.com\r\n\
Foo: Bar\r\n\
\r\n\
'
def test_post_direct(self):
res = self.testapp.post('/test/http://example.com/', headers={'Foo': 'Bar'}, params='ABC')
lines = res.text.split('\r\n')
assert lines[0] == 'POST /test/http://example.com/ HTTP/1.0'
assert 'Host: example.com' in lines
assert 'Content-Length: 3' in lines
assert 'Content-Type: application/x-www-form-urlencoded' in lines
assert 'Foo: Bar' in lines
assert 'ABC' in lines
def test_post_req(self):
postdata = '\
GET /example.html HTTP/1.0\r\n\
Foo: Bar\r\n\
\r\n\
'
res = self.testapp.post('/test-postreq?url=http://example.com/', params=postdata)
assert res.text == '\
GET /example.html HTTP/1.0\r\n\
Host: example.com\r\n\
Foo: Bar\r\n\
\r\n\
'

View File

@ -0,0 +1,241 @@
from gevent import monkey; monkey.patch_all(thread=False)
from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
from webagg.aggregator import BaseAggregator
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
from .testutils import to_json_list, to_path
import json
import pytest
import time
import six
from webagg.handlers import IndexHandler
sources = {
'local': FileIndexSource(to_path('testdata/iana.cdxj')),
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*')
}
aggs = {'simple': SimpleAggregator(sources),
'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
}
agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)}
nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
agg_nf = {'simple': SimpleAggregator(nf),
'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
}
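# Each test runs against both the sequential SimpleAggregator and the
# concurrent GeventTimeoutAggregator and expects identical merged results.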
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_1(agg):
url = 'http://iana.org/'
res, errs = agg(dict(url=url, closest='20140126000000', limit=5))
exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},
{"timestamp": "20140126200624", "filename": "iana.warc.gz", "source": "local"},
{"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source": "ia"},
{"timestamp": "20140129175203", "load_url": "http://web.archive.org/web/20140129175203id_/http://iana.org/", "source": "ia"},
{"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source": "ait"}
]
assert(to_json_list(res) == exp)
assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_2(agg):
url = 'http://example.com/'
res, errs = agg(dict(url=url, closest='20100512', limit=6))
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
#{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
{"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
{"timestamp": "20100510233601", "load_url": "http://web.archive.org/web/20100510233601id_/http://example.com/", "source": "ia"}]
assert(to_json_list(res) == exp)
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_3(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=5))
exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
{"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source": "ia"},
{"timestamp": "20141020161243", "load_url": "http://web.archive.org/web/20141020161243id_/http://vvork.com/", "source": "ia"},
{"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source": "ia"},
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
assert(to_json_list(res) == exp)
assert(errs == {})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_4(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
assert(to_json_list(res) == exp)
assert(errs == {})
@pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys()))
def test_mem_agg_not_found(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=2))
assert(to_json_list(res) == [])
assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"})
@pytest.mark.parametrize("agg", list(agg_tm.values()), ids=list(agg_tm.keys()))
def test_mem_agg_timeout(agg):
url = 'http://vvork.com/'
orig_source = BaseAggregator.load_child_source
def load_child_source(self, name, source, params):
time.sleep(0.1)
return orig_source(self, name, source, params)
BaseAggregator.load_child_source = load_child_source
res, errs = agg(dict(url=url, closest='20141001', limit=2))
BaseAggregator.load_child_source = orig_source
assert(to_json_list(res) == [])
assert(errs == {'local': 'timeout',
'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'})
def test_handler_output_cdxj():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
exp = b"""\
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
"""
assert(headers['Content-Type'] == 'text/x-cdxj')
assert(b''.join(res) == exp)
assert(errs == {})
def test_handler_output_json():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
exp = b"""\
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
{"urlkey": "com,vvork)/", "timestamp": "20131004231540", "url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
"""
assert(headers['Content-Type'] == 'application/x-ndjson')
assert(b''.join(res) == exp)
assert(errs == {})
def test_handler_output_link():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
exp = b"""\
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
"""
assert(headers['Content-Type'] == 'application/link')
assert(b''.join(res) == exp)
assert(errs == {})
def test_handler_output_link_2():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://iana.org/'
headers, res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
exp = b"""\
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
<file://iana.warc.gz:334:2258>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local",
<http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia",
<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
"""
assert(headers['Content-Type'] == 'application/link')
assert(b''.join(res) == exp)
exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
assert(errs == exp_errs)
def test_handler_output_link_3():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://foo.bar.non-existent'
headers, res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
exp = b''
assert(headers['Content-Type'] == 'application/link')
assert(b''.join(res) == exp)
exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)",
'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://foo.bar.non-existent',)",
'ia': "NotFoundException('http://web.archive.org/web/http://foo.bar.non-existent',)",
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://foo.bar.non-existent',)"}
assert(errs == exp_errs)
def test_handler_output_text():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
exp = b"""\
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait
"""
assert(headers['Content-Type'] == 'text/plain')
assert(b''.join(res) == exp)
assert(errs == {})
def test_handler_list_sources():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
headers, res, errs = handler(dict(mode='list_sources'))
assert(headers == {})
assert(res == {'sources': {'bl': 'memento',
'ait': 'memento',
'ia': 'memento',
'rhiz': 'memento',
'local': 'file'}})
assert(errs == {})

View File

@ -0,0 +1,45 @@
from webagg.aggregator import RedisMultiKeyIndexSource
from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass
class TestRedisAgg(FakeRedisTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestRedisAgg, cls).setup_class()
cls.add_cdx_to_redis(to_path('testdata/example.cdxj'), 'FOO:example:cdxj')
cls.add_cdx_to_redis(to_path('testdata/dupes.cdxj'), 'FOO:dupes:cdxj')
cls.indexloader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj')
def test_redis_agg_all(self):
res, errs = self.indexloader({'url': 'example.com/', 'param.user': 'FOO', 'param.coll': '*'})
exp = [
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
]
assert(errs == {})
assert(to_json_list(res) == exp)
def test_redis_agg_one(self):
res, errs = self.indexloader({'url': 'example.com/', 'param.user': 'FOO', 'param.coll': 'dupes'})
exp = [
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
]
assert(errs == {})
assert(to_json_list(res) == exp)
def test_redis_not_found(self):
res, errs = self.indexloader({'url': 'example.com/'})
exp = []
assert(errs == {})
assert(to_json_list(res) == exp)

View File

@ -0,0 +1,118 @@
from gevent import monkey; monkey.patch_all(thread=False)
import time
from webagg.indexsource import FileIndexSource
from webagg.aggregator import SimpleAggregator, TimeoutMixin
from webagg.aggregator import GeventTimeoutAggregator
from .testutils import to_json_list
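# FileIndexSource that sleeps before loading its index, used to exercise
# the aggregator's per-source timeout and temporary skip logic.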
class TimeoutFileSource(FileIndexSource):
def __init__(self, filename, timeout):
super(TimeoutFileSource, self).__init__(filename)
self.timeout = timeout
self.calls = 0
def load_index(self, params):
self.calls += 1
print('Sleeping')
time.sleep(self.timeout)
return super(TimeoutFileSource, self).load_index(params)
TimeoutAggregator = GeventTimeoutAggregator
def setup_module():
global sources
sources = {'slow': TimeoutFileSource('testdata/example.cdxj', 0.2),
'slower': TimeoutFileSource('testdata/dupes.cdxj', 0.5)
}
def test_timeout_long_all_pass():
agg = TimeoutAggregator(sources, timeout=1.0)
res, errs = agg(dict(url='http://example.com/'))
exp = [{'source': 'slower', 'timestamp': '20140127171200'},
{'source': 'slower', 'timestamp': '20140127171251'},
{'source': 'slow', 'timestamp': '20160225042329'}]
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {})
def test_timeout_slower_skipped_1():
agg = GeventTimeoutAggregator(sources, timeout=0.49)
res, errs = agg(dict(url='http://example.com/'))
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {'slower': 'timeout'})
def test_timeout_slower_skipped_2():
agg = GeventTimeoutAggregator(sources, timeout=0.19)
res, errs = agg(dict(url='http://example.com/'))
exp = []
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {'slower': 'timeout', 'slow': 'timeout'})
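# After t_count timeouts within t_duration seconds, a source is temporarily
# skipped (not called at all) until the window expires.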
def test_timeout_skipping():
assert(sources['slow'].calls == 3)
assert(sources['slower'].calls == 3)
agg = GeventTimeoutAggregator(sources, timeout=0.49,
t_count=2, t_duration=2.0)
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
res, errs = agg(dict(url='http://example.com/'))
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 4)
assert(sources['slower'].calls == 4)
assert(errs == {'slower': 'timeout'})
res, errs = agg(dict(url='http://example.com/'))
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 5)
assert(sources['slower'].calls == 5)
assert(errs == {'slower': 'timeout'})
res, errs = agg(dict(url='http://example.com/'))
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 6)
assert(sources['slower'].calls == 5)
assert(errs == {})
res, errs = agg(dict(url='http://example.com/'))
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 7)
assert(sources['slower'].calls == 5)
assert(errs == {})
time.sleep(2.01)
res, errs = agg(dict(url='http://example.com/'))
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 8)
assert(sources['slower'].calls == 6)
assert(errs == {'slower': 'timeout'})

View File

@ -0,0 +1,74 @@
import webtest
from io import BytesIO
from webagg.app import ResAggApp
import requests
from webagg.handlers import DefaultResourceHandler
from webagg.aggregator import SimpleAggregator
from webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
from pywb.warc.recordloader import ArcWarcRecordLoader
from .testutils import LiveServerTests, BaseTestClass
class TestUpstream(LiveServerTests, BaseTestClass):
def setup(self):
app = ResAggApp()
base_url = 'http://localhost:{0}'.format(self.server.port)
app.add_route('/upstream',
DefaultResourceHandler(SimpleAggregator(
{'upstream': UpstreamAggIndexSource(base_url + '/live')})
)
)
app.add_route('/upstream_opt',
DefaultResourceHandler(SimpleAggregator(
{'upstream_opt': ProxyMementoIndexSource.upstream_resource(base_url + '/live')})
)
)
self.base_url = base_url
self.testapp = webtest.TestApp(app)
def test_live_paths(self):
res = requests.get(self.base_url + '/')
assert set(res.json().keys()) == {'/live/postreq', '/live'}
def test_upstream_paths(self):
res = self.testapp.get('/')
assert set(res.json.keys()) == {'/upstream/postreq', '/upstream', '/upstream_opt', '/upstream_opt/postreq'}
def test_live_1(self):
resp = requests.get(self.base_url + '/live/resource?url=http://httpbin.org/get', stream=True)
assert resp.headers['WebAgg-Source-Coll'] == 'live'
record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != ''
def test_upstream_1(self):
resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
assert resp.headers['WebAgg-Source-Coll'] == 'upstream:live'
raw = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != ''
def test_upstream_2(self):
resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
assert resp.headers['WebAgg-Source-Coll'] == 'upstream_opt:live', resp.headers
raw = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != ''

127
webagg/test/testutils.py Normal file
View File

@ -0,0 +1,127 @@
import json
import os
import tempfile
import shutil
from multiprocessing import Process
from fakeredis import FakeStrictRedis
from mock import patch
from wsgiref.simple_server import make_server
from webagg.aggregator import SimpleAggregator
from webagg.app import ResAggApp
from webagg.handlers import DefaultResourceHandler
from webagg.indexsource import LiveIndexSource
# ============================================================================
def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
return [json.loads(cdx.to_json(fields)) for cdx in cdxlist]
def key_ts_res(cdxlist, extra='filename'):
return '\n'.join([cdx['urlkey'] + ' ' + cdx['timestamp'] + ' ' + cdx[extra] for cdx in cdxlist])
def to_path(path):
if os.path.sep != '/':
path = path.replace('/', os.path.sep)
return path
# ============================================================================
class BaseTestClass(object):
@classmethod
def setup_class(cls):
pass
@classmethod
def teardown_class(cls):
pass
# ============================================================================
PUBSUBS = []
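# FakeStrictRedis whose pubsub state is shared across instances via the
# module-level PUBSUBS list, mimicking connections to one Redis server.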
class FakeStrictRedisSharedPubSub(FakeStrictRedis):
def __init__(self, *args, **kwargs):
super(FakeStrictRedisSharedPubSub, self).__init__(*args, **kwargs)
self._pubsubs = PUBSUBS
# ============================================================================
class FakeRedisTests(object):
@classmethod
def setup_class(cls):
super(FakeRedisTests, cls).setup_class()
cls.redismock = patch('redis.StrictRedis', FakeStrictRedisSharedPubSub)
cls.redismock.start()
@staticmethod
def add_cdx_to_redis(filename, key, redis_url='redis://localhost:6379/2'):
r = FakeStrictRedis.from_url(redis_url)
with open(filename, 'rb') as fh:
for line in fh:
r.zadd(key, 0, line.rstrip())
@classmethod
def teardown_class(cls):
super(FakeRedisTests, cls).teardown_class()
FakeStrictRedis().flushall()
cls.redismock.stop()
# ============================================================================
class TempDirTests(object):
@classmethod
def setup_class(cls):
super(TempDirTests, cls).setup_class()
cls.root_dir = tempfile.mkdtemp()
@classmethod
def teardown_class(cls):
super(TempDirTests, cls).teardown_class()
shutil.rmtree(cls.root_dir)
# ============================================================================
class LiveServerTests(object):
@classmethod
def setup_class(cls):
super(LiveServerTests, cls).setup_class()
cls.server = ServerThreadRunner(cls.make_live_app())
@staticmethod
def make_live_app():
app = ResAggApp()
app.add_route('/live',
DefaultResourceHandler(SimpleAggregator(
{'live': LiveIndexSource()})
)
)
return app
@classmethod
def teardown_class(cls):
super(LiveServerTests, cls).teardown_class()
cls.server.stop()
# ============================================================================
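# Run a WSGI app in a child process on an ephemeral port so tests can issue
# real HTTP requests against it.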
class ServerThreadRunner(object):
def __init__(self, app):
self.httpd = make_server('', 0, app)
self.port = self.httpd.socket.getsockname()[1]
def run():
self.httpd.serve_forever()
self.proc = Process(target=run)
#self.proc.daemon = True
self.proc.start()
def stop(self):
self.proc.terminate()

200
webagg/utils.py Normal file
View File

@ -0,0 +1,200 @@
import re
import six
import string
import yaml
import os
from contextlib import closing
from pywb.utils.timeutils import timestamp_to_http_date
from pywb.utils.wbexception import BadRequestException
LINK_SPLIT = re.compile(r',\s*(?=[<])')
LINK_SEG_SPLIT = re.compile(r';\s*')
LINK_URL = re.compile(r'<(.*)>')
LINK_PROP = re.compile(r'([\w]+)="([^"]+)')
BUFF_SIZE = 16384
#=============================================================================
class MementoException(BadRequestException):
pass
#=============================================================================
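# Helpers for parsing and generating Memento "Link" headers (RFC 7089).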
class MementoUtils(object):
@staticmethod
def parse_links(link_header, def_name='timemap'):
links = LINK_SPLIT.split(link_header)
results = {}
mementos = []
for link in links:
props = LINK_SEG_SPLIT.split(link)
m = LINK_URL.match(props[0])
if not m:
raise MementoException('Invalid Link Url: ' + props[0])
result = dict(url=m.group(1))
key = ''
is_mem = False
for prop in props[1:]:
m = LINK_PROP.match(prop)
if not m:
raise MementoException('Invalid prop ' + prop)
name = m.group(1)
value = m.group(2)
if name == 'rel':
if 'memento' in value:
is_mem = True
result[name] = value
elif value == 'self':
key = def_name
else:
key = value
else:
result[name] = value
if key:
results[key] = result
elif is_mem:
mementos.append(result)
results['mementos'] = mementos
return results
@staticmethod
def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'):
url = cdx.get('load_url')
if not url:
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end
if not datetime:
datetime = timestamp_to_http_date(cdx['timestamp'])
return memento.format(url, rel, datetime, cdx.get('source', ''))
@staticmethod
def make_timemap(cdx_iter):
# get first memento as it'll be used for 'from' field
try:
first_cdx = six.next(cdx_iter)
from_date = timestamp_to_http_date(first_cdx['timestamp'])
except StopIteration:
first_cdx = None
return
# first memento link
yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)
prev_cdx = None
for cdx in cdx_iter:
if prev_cdx:
yield MementoUtils.make_timemap_memento_link(prev_cdx)
prev_cdx = cdx
# last memento link, if any
if prev_cdx:
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')
@staticmethod
def make_link(url, type):
return '<{0}>; rel="{1}"'.format(url, type)
#=============================================================================
class ParamFormatter(string.Formatter):
def __init__(self, params, name='', prefix='param.'):
self.params = params
self.prefix = prefix
self.name = name
def get_value(self, key, args, kwargs):
# First, try the named param 'param.{name}.{key}'
if self.name:
named_key = self.prefix + self.name + '.' + key
value = self.params.get(named_key)
if value is not None:
return value
# Then, try 'param.{key}'
named_key = self.prefix + key
value = self.params.get(named_key)
if value is not None:
return value
# default to just '{key}'
value = kwargs.get(key, '')
return value
#=============================================================================
def res_template(template, params, **extra_params):
formatter = params.get('_formatter')
if not formatter:
formatter = ParamFormatter(params)
res = formatter.format(template, url=params.get('url', ''), **extra_params)
return res
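# Example (hypothetical values): with params={'param.src.coll': 'A',
# 'param.coll': 'B'}, res_template('{coll}/index', params) yields 'B/index',
# while a formatter created with name='src' would resolve '{coll}' to 'A'.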
#=============================================================================
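# Stream a file-like object in BUFF_SIZE chunks, optionally preceded by up
# to two header blocks; the stream is closed when iteration completes.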
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE):
with closing(stream):
if header1:
yield header1
if header2:
yield header2
while True:
buff = stream.read(size)
if not buff:
break
yield buff
#=============================================================================
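# Wrap an iterator of byte chunks in HTTP/1.1 chunked transfer encoding:
# each chunk is prefixed with its length in hex, and the stream ends with
# the zero-length terminator '0\r\n\r\n'.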
def chunk_encode_iter(orig_iter):
for chunk in orig_iter:
if not len(chunk):
continue
chunk_len = b'%X\r\n' % len(chunk)
yield chunk_len
yield chunk
yield b'\r\n'
yield b'0\r\n\r\n'
#=============================================================================
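# Load a YAML config from the path named by main_env_var (or the default
# file), then shallow-merge an optional overlay config over it.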
def load_config(main_env_var, main_default_file='',
overlay_env_var='', overlay_file=''):
configfile = os.environ.get(main_env_var, main_default_file)
if configfile:
# Load config
with open(configfile, 'rb') as fh:
config = yaml.load(fh)
else:
config = {}
overlay_configfile = os.environ.get(overlay_env_var, overlay_file)
if overlay_configfile:
with open(overlay_configfile, 'rb') as fh:
config.update(yaml.load(fh))
return config