mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-25 23:47:47 +01:00
recording support: now available for dynamic collections via config. - config.yaml: a 'recorder: live' entry enables the /record/ subpath, which records to any dynamic collection (it can record from any collection, though usually live). - autoindex refactor: simplified, standalone AutoIndexer() that indexes any changed WARC files into autoindex.cdxj. - Windows autoindex support: also check for a changed file size, as the last-modified time may not be changing. - manager: remove autoindex, which is now part of the main CLI. - tests: updated test_auto_colls for the autoindex changes. - tests: add record/replay tests for recording and replay.
108 lines
3.0 KiB
Python
108 lines
3.0 KiB
Python
import gevent
|
|
import time
|
|
import re
|
|
import os
|
|
import logging
|
|
|
|
from pywb.manager.manager import CollectionsManager
|
|
|
|
|
|
#=============================================================================
|
|
class AutoIndexer(object):
    """Poll every collection directory and index new or changed archives.

    For each sub-directory of the collections root, the archive directory
    reported by the manager is walked, and every file whose name matches
    ``EXT_RX`` and that looks changed is merged into that collection's
    ``AUTO_INDEX_FILE`` via ``manager.index_merge``.
    """

    # Raw string: '\.' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions. The pattern is unchanged.
    EXT_RX = re.compile(r'.*\.w?arc(\.gz)?$')

    # Name of the index file written inside each collection's indexes dir.
    AUTO_INDEX_FILE = 'autoindex.cdxj'

    def __init__(self, interval=30, keep_running=True):
        """Set up the indexer.

        :param interval: value passed to ``time.sleep`` between two scans;
            a falsy value makes ``run`` perform a single scan and return.
        :param keep_running: loop flag checked by ``run`` before each scan.
        """
        self.manager = CollectionsManager('', must_exist=False)

        self.root_path = self.manager.colls_dir

        self.keep_running = keep_running

        self.interval = interval

        # path -> size seen at the previous tracked check (see is_newer_than)
        self.last_size = {}

    def is_newer_than(self, path1, path2, track=False):
        """Return True if ``path1`` should be treated as newer than ``path2``.

        The modification times are compared first. If either path cannot be
        stat'ed (e.g. the index file does not exist yet), ``path1`` counts as
        newer.

        :param track: when true, the size of ``path1`` is additionally
            compared with the size recorded by the previous tracked call; a
            different size also counts as newer, which covers the case where
            the modification time does not change. The current size is then
            remembered in ``self.last_size``.
        :return: bool. With ``track`` set, False is returned if ``path1``
            can no longer be stat'ed, as there is nothing left to index.
        """
        try:
            newer = os.path.getmtime(path1) > os.path.getmtime(path2)
        except OSError:
            newer = True

        if not track:
            return newer

        try:
            size = os.path.getsize(path1)
        except OSError:
            # File vanished between the directory listing and this stat:
            # forget it instead of letting the error stop the polling loop.
            self.last_size.pop(path1, None)
            return False

        previous = self.last_size.get(path1)
        if previous is not None and size != previous:
            newer = True

        self.last_size[path1] = size

        return newer

    def do_index(self, files):
        """Merge ``files`` into the current collection's auto index."""
        logging.info('Auto-Indexing... %s', files)
        self.manager.index_merge(files, self.AUTO_INDEX_FILE)
        logging.info('...Done')

    def _find_changed(self, archive_dir, index_file):
        """Return archive files under ``archive_dir`` newer than the index."""
        changed = []
        for dirpath, _dirnames, filenames in os.walk(archive_dir):
            for filename in filenames:
                if not self.EXT_RX.match(filename):
                    continue

                full_filename = os.path.join(dirpath, filename)

                if self.is_newer_than(full_filename, index_file, True):
                    changed.append(full_filename)

        return changed

    def check_path(self):
        """Scan all collections once and index whatever has changed."""
        for coll in os.listdir(self.root_path):
            coll_dir = os.path.join(self.root_path, coll)
            if not os.path.isdir(coll_dir):
                continue

            # Point the manager at this collection so that archive_dir and
            # indexes_dir below refer to it.
            self.manager.change_collection(coll)

            archive_dir = self.manager.archive_dir

            if not os.path.isdir(archive_dir):
                continue

            indexes_dir = self.manager.indexes_dir
            index_file = os.path.join(indexes_dir, self.AUTO_INDEX_FILE)

            if os.path.isfile(index_file):
                # NOTE(review): this skips the collection when the archive
                # directory is *newer* than the index, which looks inverted
                # for a "nothing changed" shortcut -- confirm the intended
                # direction. Behaviour is kept as-is. The shortcut is never
                # taken on Windows (os.name == 'nt').
                if os.name != 'nt' and self.is_newer_than(archive_dir,
                                                          index_file):
                    continue
            else:
                try:
                    os.makedirs(indexes_dir)
                except OSError as exc:
                    # An already existing directory is expected and fine;
                    # only report a directory that is really missing.
                    if not os.path.isdir(indexes_dir):
                        logging.warning('Could not create index dir %s: %s',
                                        indexes_dir, exc)

            logging.info('Collection Possibly Changed: %s', coll)

            to_index = self._find_changed(archive_dir, index_file)

            if to_index:
                self.do_index(to_index)

    def run(self):
        """Scan repeatedly until stopped, sleeping ``interval`` in between.

        Returns after a single scan when ``interval`` is falsy, and returns
        quietly on ``KeyboardInterrupt``.
        """
        try:
            while self.keep_running:
                self.check_path()
                if not self.interval:
                    break

                time.sleep(self.interval)
        except KeyboardInterrupt:  # pragma: no cover
            return

    def start(self):
        """Run the polling loop in a gevent greenlet (kept in ``self.ge``)."""
        self.ge = gevent.spawn(self.run)

    def stop(self):
        """Ask the polling loop to finish after the scan in progress."""
        self.interval = 0
        self.keep_running = False
|
|
|