# Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-15 16:14:48 +01:00)
# 123 lines, 3.3 KiB, Python
import re, json
|
|
from pywb.utils.canonicalize import canonicalize
|
|
from pywb.utils.timeutils import timestamp_to_sec, http_date_to_timestamp
|
|
from pywb.cdx.cdxobject import CDXObject
|
|
|
|
|
|
# Splits a Link header / link-format document into individual links.  A comma
# only counts as a separator when the next non-blank character is '<', so the
# comma inside an HTTP date ("Thu, 26 Dec ...") does not split a link.
LINK_SPLIT = re.compile(r',\s*(?=[<])')

# Separates the parameters that follow a link target.
LINK_SEG_SPLIT = re.compile(r';\s*')

# Link target enclosed in angle brackets.  '>' is excluded from the target so
# the pattern can be matched against a whole link (target plus parameters)
# without running past the closing bracket.
LINK_URL = re.compile(r'<([^>]*)>')

# A name="value" parameter; the closing quote is not required for a match.
LINK_PROP = re.compile(r'([\w]+)="([^"]+)')


#=================================================================
class MementoUtils(object):
    """Helpers for parsing Memento-style ``Link`` headers / timemaps."""

    @staticmethod
    def parse_links(link_header, def_name='timemap'):
        """Parse a ``Link`` header value into a dict of link descriptions.

        Each link is turned into a dict holding ``url`` plus every
        ``name="value"`` parameter other than ``rel``.  Where the link ends
        up depends on its ``rel`` value:

        * ``rel`` containing ``memento``: ``rel`` is kept in the dict and the
          dict is appended to the ``'mementos'`` list;
        * ``rel="self"``: stored under ``def_name``;
        * any other ``rel``: stored under the ``rel`` value itself.

        Links without a ``rel`` parameter are dropped.

        :param link_header: text of the header (or link-format document);
            surrounding whitespace is ignored
        :param def_name: key used for the ``rel="self"`` link
        :returns: dict mapping keys to link dicts, always including a
            ``'mementos'`` list (possibly empty)
        :raises ValueError: if a link does not start with ``<url>`` or a
            parameter is not of the form ``name="value"``
        """
        results = {}
        mementos = []

        for link in LINK_SPLIT.split(link_header.strip()):
            # Take the target off the front of the link *before* splitting on
            # ';' so that a ';' inside the url (e.g. ';jsessionid=...') is
            # not mistaken for a parameter separator.
            m = LINK_URL.match(link)
            if not m:
                raise ValueError('Invalid Link Url: ' + link)

            result = dict(url=m.group(1))
            key = ''
            is_mem = False

            # The first segment is whatever sits between '>' and the first
            # ';' (normally empty); only the following ones are parameters.
            props = LINK_SEG_SPLIT.split(link[m.end():])

            for prop in props[1:]:
                m = LINK_PROP.match(prop)
                if not m:
                    raise ValueError('Invalid prop ' + prop)

                name = m.group(1)
                value = m.group(2)

                if name != 'rel':
                    result[name] = value
                elif 'memento' in value:
                    # e.g. "memento", "first memento", "last memento"
                    is_mem = True
                    result[name] = value
                elif value == 'self':
                    key = def_name
                else:
                    key = value

            if key:
                results[key] = result
            elif is_mem:
                mementos.append(result)

        results['mementos'] = mementos
        return results

    @staticmethod
    def links_to_json(link_header, def_name='timemap', sort=False):
        """Convert a ``Link`` header into CDX records, one per dated memento.

        :param link_header: header text, parsed with :meth:`parse_links`
        :param def_name: passed through to :meth:`parse_links`
        :param sort: if true, records are produced in order of their
            converted ``datetime``; mementos without a ``datetime`` are
            skipped either way
        :returns: tuple ``(original, link_iter)`` where ``original`` is the
            url of the ``rel="original"`` link and ``link_iter`` is a
            generator *function*: call it to iterate over ``CDXObject``
            records with ``urlkey``, ``timestamp``, ``url``, ``mem_rel`` and
            ``memento_url`` set
        :raises KeyError: if the header has no ``rel="original"`` link
        """
        results = MementoUtils.parse_links(link_header, def_name)

        # NOTE: emitting '@timegate' / '@timemap' / '@original' meta records
        # (see meta_field) ahead of the mementos is currently disabled.

        original = results['original']['url']
        key = canonicalize(original)

        mementos = results['mementos']
        if sort:
            # The entries are plain dicts, which cannot be ordered directly
            # (TypeError on Python 3), so sort on the converted datetime.
            # NOTE(review): assumes http_date_to_timestamp() returns values
            # that compare chronologically -- confirm.
            mementos = sorted(
                (val for val in mementos if val.get('datetime')),
                key=lambda val: http_date_to_timestamp(val['datetime']))

        def link_iter():
            for val in mementos:
                dt = val.get('datetime')
                if not dt:
                    continue

                ts = http_date_to_timestamp(dt)
                line = CDXObject()
                line['urlkey'] = key
                line['timestamp'] = ts
                line['url'] = original
                line['mem_rel'] = val.get('rel', '')
                line['memento_url'] = val['url']
                yield line

        return original, link_iter

    @staticmethod
    def meta_field(name, results):
        """Build a meta record for the link stored under ``name``.

        :param name: key to look up in ``results`` (e.g. ``'timegate'``)
        :param results: dict as returned by :meth:`parse_links`
        :returns: a ``CDXObject`` with ``key`` set to ``'@' + name`` and
            ``url`` set to the link's url, or ``None`` if ``results`` has no
            (non-empty) entry for ``name``
        """
        v = results.get(name)
        if not v:
            return None

        c = CDXObject()
        c['key'] = '@' + name
        c['url'] = v['url']
        return c
|
|
|
|
|
|
|
|
|
|
#=================================================================
|
|
def cdx_sort_closest(closest, cdx_json):
    """Return the records of ``cdx_json`` ordered by distance from ``closest``.

    The distance of a record is the absolute difference between
    ``timestamp_to_sec(closest)`` and ``timestamp_to_sec`` of the record's
    ``'timestamp'`` field, so the nearest capture comes first.

    :param closest: target timestamp, in the form accepted by
        ``timestamp_to_sec``
    :param cdx_json: iterable of records indexable by ``'timestamp'``
    :returns: a new sorted list; the input is not modified
    """
    # Convert the target once; only the per-record timestamp varies.
    target_sec = timestamp_to_sec(closest)

    return sorted(
        cdx_json,
        key=lambda cdx: abs(target_sec - timestamp_to_sec(cdx['timestamp'])))
|
|
|
|
|
|
|