1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 16:14:48 +01:00
pywb/utils.py

123 lines
3.3 KiB
Python
Raw Normal View History

2016-02-19 17:27:19 -08:00
import re, json
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import timestamp_to_sec, http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject
LINK_SPLIT = re.compile(',\s*(?=[<])')
LINK_SEG_SPLIT = re.compile(';\s*')
LINK_URL = re.compile('<(.*)>')
LINK_PROP = re.compile('([\w]+)="([^"]+)')
#=================================================================
class MementoUtils(object):
@staticmethod
def parse_links(link_header, def_name='timemap'):
links = LINK_SPLIT.split(link_header)
results = {}
mementos = []
for link in links:
props = LINK_SEG_SPLIT.split(link)
m = LINK_URL.match(props[0])
if not m:
raise Exception('Invalid Link Url: ' + props[0])
result = dict(url=m.group(1))
key = ''
is_mem = False
for prop in props[1:]:
m = LINK_PROP.match(prop)
if not m:
raise Exception('Invalid prop ' + prop)
name = m.group(1)
value = m.group(2)
if name == 'rel':
if 'memento' in value:
is_mem = True
result[name] = value
elif value == 'self':
key = def_name
else:
key = value
else:
result[name] = value
if key:
results[key] = result
elif is_mem:
mementos.append(result)
results['mementos'] = mementos
return results
@staticmethod
def links_to_json(link_header, def_name='timemap', sort=False):
results = MementoUtils.parse_links(link_header, def_name)
#meta = MementoUtils.meta_field('timegate', results)
#if meta:
# yield meta
#meta = MementoUtils.meta_field('timemap', results)
#if meta:
# yield meta
#meta = MementoUtils.meta_field('original', results)
#if meta:
# yield meta
original = results['original']['url']
key = canonicalize(original)
mementos = results['mementos']
if sort:
mementos = sorted(mementos)
def link_iter():
for val in mementos:
dt = val.get('datetime')
if not dt:
continue
ts = http_date_to_timestamp(dt)
line = CDXObject()
line['urlkey'] = key
line['timestamp'] = ts
line['url'] = original
line['mem_rel'] = val.get('rel', '')
line['memento_url'] = val['url']
yield line
return original, link_iter
@staticmethod
def meta_field(name, results):
v = results.get(name)
if v:
c = CDXObject()
c['key'] = '@' + name
c['url'] = v['url']
return c
#=================================================================
def cdx_sort_closest(closest, cdx_json):
closest_sec = timestamp_to_sec(closest)
def get_key(cdx):
sec = timestamp_to_sec(cdx['timestamp'])
return abs(closest_sec - sec)
cdx_sorted = sorted(cdx_json, key=get_key)
return cdx_sorted