From 5fd49f35ee31f1840a8711bec7989534dfa28a75 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 22 Mar 2016 11:31:08 -0700 Subject: [PATCH] zipnum: when using .loc file, resolve shard paths relative to the .loc file, not from working directory, fixes #173 --- pywb/cdx/cdxserver.py | 4 ++-- pywb/cdx/zipnum.py | 11 ++++++++++- sample_archive/zipcdx/zipnum-bad.loc | 4 ++-- sample_archive/zipcdx/zipnum-sample.loc | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index bfdf5741..f0869d0f 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -175,8 +175,8 @@ class CDXServer(BaseCDXServer): if filename.endswith(('.summary', '.idx')): return ZipNumCluster(filename, config) - # no warning for .loc - if not filename.endswith('.loc'): + # no warning for .loc or .gz (zipnum) + if not filename.endswith(('.loc', '.gz')): logging.warn('skipping unrecognized URI: %s', filename) return None diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index f44a6b6a..9a51ae7f 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -54,11 +54,20 @@ class LocMapResolver(object): # update loc file mtime self.loc_mtime = new_mtime + local_dir = os.path.dirname(self.loc_filename) + + def res_path(pathname): + if '://' not in pathname: + pathname = os.path.join(local_dir, pathname) + return pathname + logging.debug('Loading loc from: ' + self.loc_filename) with open(self.loc_filename, 'r') as fh: for line in fh: parts = line.rstrip().split('\t') - self.loc_map[parts[0]] = parts[1:] + + paths = [res_path(pathname) for pathname in parts[1:]] + self.loc_map[parts[0]] = paths def __call__(self, part, query): return self.loc_map[part] diff --git a/sample_archive/zipcdx/zipnum-bad.loc b/sample_archive/zipcdx/zipnum-bad.loc index d113a330..8c6bd330 100644 --- a/sample_archive/zipcdx/zipnum-bad.loc +++ b/sample_archive/zipcdx/zipnum-bad.loc @@ -1,3 +1,3 @@ -bar ./sample_archive/invalid +bar invalid foo2 -zipnum ./sample_archive/x-bad-path-to-ignore-x ./sample_archive/zipcdx/zipnum-sample.cdx.gz +zipnum x-bad-path-to-ignore-x zipnum-sample.cdx.gz diff --git a/sample_archive/zipcdx/zipnum-sample.loc b/sample_archive/zipcdx/zipnum-sample.loc index 8a4d1210..c0f24a88 100644 --- a/sample_archive/zipcdx/zipnum-sample.loc +++ b/sample_archive/zipcdx/zipnum-sample.loc @@ -1 +1 @@ -zipnum ./sample_archive/x-bad-path-to-ignore-x ./sample_archive/zipcdx/zipnum-sample.cdx.gz +zipnum x-bad-path-to-ignore-x zipnum-sample.cdx.gz