1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

add utils, responseloader and liverec

This commit is contained in:
Ilya Kreymer 2016-02-19 17:27:19 -08:00
parent baa02add69
commit 37198767ed
3 changed files with 485 additions and 0 deletions

245
liverec.py Normal file
View File

@ -0,0 +1,245 @@
from io import BytesIO
try:
import httplib
except ImportError:
import http.client as httplib
orig_connection = httplib.HTTPConnection
from contextlib import contextmanager
import ssl
from array import array
from time import sleep
BUFF_SIZE = 8192
# ============================================================================
class RecordingStream(object):
def __init__(self, fp, recorder):
self.fp = fp
self.recorder = recorder
self.incomplete = False
if hasattr(self.fp, 'unread'):
self.unread = self.fp.unread
if hasattr(self.fp, 'tell'):
self.tell = self.fp.tell
def read(self, *args, **kwargs):
buff = self.fp.read(*args, **kwargs)
self.recorder.write_response_buff(buff)
return buff
def readinto(self, buff):
res = self.fp.readinto(buff)
self.recorder.write_response_buff(buff)
return res
def readline(self, maxlen=None):
line = self.fp.readline(maxlen)
self.recorder.write_response_header_line(line)
return line
def flush(self):
self.fp.flush()
def close(self):
try:
self.recorder.finish_response(self.incomplete)
except Exception as e:
import traceback
traceback.print_exc()
res = self.fp.close()
return res
# ============================================================================
class RecordingHTTPResponse(httplib.HTTPResponse):
def __init__(self, recorder, *args, **kwargs):
httplib.HTTPResponse.__init__(self, *args, **kwargs)
self.fp = RecordingStream(self.fp, recorder)
def mark_incomplete(self):
self.fp.incomplete = True
# ============================================================================
class RecordingHTTPConnection(httplib.HTTPConnection):
global_recorder_maker = None
def __init__(self, *args, **kwargs):
orig_connection.__init__(self, *args, **kwargs)
if not self.global_recorder_maker:
self.recorder = None
else:
self.recorder = self.global_recorder_maker()
def make_recording_response(*args, **kwargs):
return RecordingHTTPResponse(self.recorder, *args, **kwargs)
self.response_class = make_recording_response
def send(self, data):
if not self.recorder:
orig_connection.send(self, data)
return
if hasattr(data,'read') and not isinstance(data, array):
url = None
while True:
buff = data.read(self.BUFF_SIZE)
if not buff:
break
orig_connection.send(self, buff)
self.recorder.write_request(url, buff)
else:
orig_connection.send(self, data)
self.recorder.write_request(self, data)
def get_url(self, data):
try:
buff = BytesIO(data)
line = buff.readline()
path = line.split(' ', 2)[1]
host = self.host
port = self.port
scheme = 'https' if isinstance(self.sock, ssl.SSLSocket) else 'http'
url = scheme + '://' + host
if (scheme == 'https' and port != '443') and (scheme == 'http' and port != '80'):
url += ':' + port
url += path
except Exception as e:
raise
return url
def request(self, *args, **kwargs):
#if self.recorder:
# self.recorder.start_request(self)
res = orig_connection.request(self, *args, **kwargs)
if self.recorder:
self.recorder.finish_request(self.sock)
return res
# ============================================================================
class BaseRecorder(object):
def write_request(self, conn, buff):
#url = conn.get_url()
pass
def write_response_header_line(self, line):
pass
def write_response_buff(self, buff):
pass
def finish_request(self, socket):
pass
def finish_response(self, incomplete=False):
pass
#=================================================================
class ReadFullyStream(object):
def __init__(self, stream):
self.stream = stream
def read(self, *args, **kwargs):
try:
return self.stream.read(*args, **kwargs)
except:
self.mark_incomplete()
raise
def readline(self, *args, **kwargs):
try:
return self.stream.readline(*args, **kwargs)
except:
self.mark_incomplete()
raise
def mark_incomplete(self):
if (hasattr(self.stream, '_fp') and
hasattr(self.stream._fp, 'mark_incomplete')):
self.stream._fp.mark_incomplete()
def close(self):
try:
while True:
buff = self.stream.read(BUFF_SIZE)
sleep(0)
if not buff:
break
except Exception as e:
import traceback
traceback.print_exc()
self.mark_incomplete()
finally:
self.stream.close()
# ============================================================================
httplib.HTTPConnection = RecordingHTTPConnection
# ============================================================================
class DefaultRecorderMaker(object):
def __call__(self):
return BaseRecorder()
class FixedRecorder(object):
def __init__(self, recorder):
self.recorder = recorder
def __call__(self):
return self.recorder
@contextmanager
def record_requests(url, recorder_maker):
RecordingHTTPConnection.global_recorder_maker = recorder_maker
yield
RecordingHTTPConnection.global_recorder_maker = None
@contextmanager
def orig_requests():
httplib.HTTPConnection = orig_connection
yield
httplib.HTTPConnection = RecordingHTTPConnection
import requests as patched_requests
def request(url, method='GET', recorder=None, recorder_maker=None, session=patched_requests, **kwargs):
if kwargs.get('skip_recording'):
recorder_maker = None
elif recorder:
recorder_maker = FixedRecorder(recorder)
elif not recorder_maker:
recorder_maker = DefaultRecorderMaker()
with record_requests(url, recorder_maker):
kwargs['allow_redirects'] = False
r = session.request(method=method,
url=url,
**kwargs)
return r

118
responseloader.py Normal file
View File

@ -0,0 +1,118 @@
from liverec import BaseRecorder
from liverec import request as remote_request
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.utils.timeutils import timestamp_to_datetime
from io import BytesIO
from bottle import response
import uuid
#=============================================================================
def incr_reader(stream, header=None, size=8192):
if header:
yield header
while True:
data = stream.read(size)
if data:
yield data
else:
break
#=============================================================================
class WARCPathPrefixLoader(object):
def __init__(self, prefix):
self.prefix = prefix
self.record_loader = ArcWarcRecordLoader()
def __call__(self, cdx):
filename = cdx.get('filename')
offset = cdx.get('offset')
length = cdx.get('length', -1)
if filename is None or offset is None:
raise Exception
record = self.record_loader.load(self.prefix + filename,
offset,
length,
no_record_parse=True)
for n, v in record.rec_headers.headers:
response.headers[n] = v
return incr_reader(record.stream)
#=============================================================================
class HeaderRecorder(BaseRecorder):
def __init__(self, skip_list=None):
self.buff = BytesIO()
self.skip_list = skip_list
self.skipped = []
def write_response_header_line(self, line):
if self.accept_header(line):
self.buff.write(line)
def get_header(self):
return self.buff.getvalue()
def accept_header(self, line):
if self.skip_list and line.lower().startswith(self.skip_list):
self.skipped.append(line)
return False
return True
#=============================================================================
class LiveWebLoader(object):
SKIP_HEADERS = (b'link',
b'memento-datetime',
b'content-location',
b'x-archive',
b'set-cookie')
def __call__(self, cdx):
load_url = cdx.get('load_url')
if not load_url:
raise Exception
recorder = HeaderRecorder(self.SKIP_HEADERS)
upstream_res = remote_request(load_url, recorder=recorder, stream=True,
headers={'Accept-Encoding': 'identity'})
response.headers['Content-Type'] = 'application/http; msgtype=response'
response.headers['WARC-Type'] = 'response'
response.headers['WARC-Record-ID'] = self._make_warc_id()
response.headers['WARC-Target-URI'] = cdx['url']
response.headers['WARC-Date'] = self._make_date(cdx['timestamp'])
# Try to set content-length, if it is available and valid
try:
content_len = int(upstream_res.headers.get('content-length', 0))
if content_len > 0:
content_len += len(recorder.get_header())
response.headers['Content-Length'] = content_len
except:
pass
return incr_reader(upstream_res.raw, header=recorder.get_header())
@staticmethod
def _make_date(ts):
return timestamp_to_datetime(ts).strftime('%Y-%m-%dT%H:%M:%SZ')
@staticmethod
def _make_warc_id(id_=None):
if not id_:
id_ = uuid.uuid1()
return '<urn:uuid:{0}>'.format(id_)

122
utils.py Normal file
View File

@ -0,0 +1,122 @@
import re, json
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import timestamp_to_sec, http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject
LINK_SPLIT = re.compile(',\s*(?=[<])')
LINK_SEG_SPLIT = re.compile(';\s*')
LINK_URL = re.compile('<(.*)>')
LINK_PROP = re.compile('([\w]+)="([^"]+)')
#=================================================================
class MementoUtils(object):
@staticmethod
def parse_links(link_header, def_name='timemap'):
links = LINK_SPLIT.split(link_header)
results = {}
mementos = []
for link in links:
props = LINK_SEG_SPLIT.split(link)
m = LINK_URL.match(props[0])
if not m:
raise Exception('Invalid Link Url: ' + props[0])
result = dict(url=m.group(1))
key = ''
is_mem = False
for prop in props[1:]:
m = LINK_PROP.match(prop)
if not m:
raise Exception('Invalid prop ' + prop)
name = m.group(1)
value = m.group(2)
if name == 'rel':
if 'memento' in value:
is_mem = True
result[name] = value
elif value == 'self':
key = def_name
else:
key = value
else:
result[name] = value
if key:
results[key] = result
elif is_mem:
mementos.append(result)
results['mementos'] = mementos
return results
@staticmethod
def links_to_json(link_header, def_name='timemap', sort=False):
results = MementoUtils.parse_links(link_header, def_name)
#meta = MementoUtils.meta_field('timegate', results)
#if meta:
# yield meta
#meta = MementoUtils.meta_field('timemap', results)
#if meta:
# yield meta
#meta = MementoUtils.meta_field('original', results)
#if meta:
# yield meta
original = results['original']['url']
key = canonicalize(original)
mementos = results['mementos']
if sort:
mementos = sorted(mementos)
def link_iter():
for val in mementos:
dt = val.get('datetime')
if not dt:
continue
ts = http_date_to_timestamp(dt)
line = CDXObject()
line['urlkey'] = key
line['timestamp'] = ts
line['url'] = original
line['mem_rel'] = val.get('rel', '')
line['memento_url'] = val['url']
yield line
return original, link_iter
@staticmethod
def meta_field(name, results):
v = results.get(name)
if v:
c = CDXObject()
c['key'] = '@' + name
c['url'] = v['url']
return c
#=================================================================
def cdx_sort_closest(closest, cdx_json):
closest_sec = timestamp_to_sec(closest)
def get_key(cdx):
sec = timestamp_to_sec(cdx['timestamp'])
return abs(closest_sec - sec)
cdx_sorted = sorted(cdx_json, key=get_key)
return cdx_sorted