mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
manager: add cdx -> cdxj migration tool (#80), which converts all cdx files in a directory to cdxj and removes the original files.
- The migration also recanonicalizes the urlkey to SURT form.
- Add a migration test using a non-SURT, 9-field cdx (created from samples).
- cdxindexer: fix the multi-warc -> multi-cdx indexing options.
This commit is contained in:
parent
c5b5c8ee4b
commit
b43a7f94f3
@ -7,6 +7,7 @@ from pywb.utils.loaders import load_yaml_config
|
|||||||
from pywb.utils.timeutils import timestamp20_now
|
from pywb.utils.timeutils import timestamp20_now
|
||||||
from pywb.warc.cdxindexer import main as cdxindexer_main
|
from pywb.warc.cdxindexer import main as cdxindexer_main
|
||||||
from pywb.webapp.pywb_init import DEFAULT_CONFIG
|
from pywb.webapp.pywb_init import DEFAULT_CONFIG
|
||||||
|
from migrate import MigrateCDX
|
||||||
|
|
||||||
from distutils.util import strtobool
|
from distutils.util import strtobool
|
||||||
from pkg_resources import resource_string
|
from pkg_resources import resource_string
|
||||||
@ -30,6 +31,8 @@ simplify the creation and management of web archive collections
|
|||||||
It may be used via cmdline to setup and maintain the
|
It may be used via cmdline to setup and maintain the
|
||||||
directory structure expected by pywb
|
directory structure expected by pywb
|
||||||
"""
|
"""
|
||||||
|
DEF_INDEX_FILE = 'index.cdxj'
|
||||||
|
|
||||||
def __init__(self, coll_name, root_dir='collections', must_exist=True):
|
def __init__(self, coll_name, root_dir='collections', must_exist=True):
|
||||||
self.root_dir = root_dir
|
self.root_dir = root_dir
|
||||||
self.default_config = load_yaml_config('pywb/default_config.yaml')
|
self.default_config = load_yaml_config('pywb/default_config.yaml')
|
||||||
@ -95,7 +98,7 @@ directory structure expected by pywb
|
|||||||
self._index_merge_warcs(full_paths)
|
self._index_merge_warcs(full_paths)
|
||||||
|
|
||||||
def reindex(self):
|
def reindex(self):
|
||||||
cdx_file = os.path.join(self.cdx_dir, 'index.cdxj')
|
cdx_file = os.path.join(self.cdx_dir, self.DEF_INDEX_FILE)
|
||||||
logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file)
|
logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file)
|
||||||
self._cdx_index(cdx_file, [self.warc_dir])
|
self._cdx_index(cdx_file, [self.warc_dir])
|
||||||
|
|
||||||
@ -128,7 +131,7 @@ directory structure expected by pywb
|
|||||||
self._index_merge_warcs(filtered_warcs)
|
self._index_merge_warcs(filtered_warcs)
|
||||||
|
|
||||||
def _index_merge_warcs(self, new_warcs):
|
def _index_merge_warcs(self, new_warcs):
|
||||||
cdx_file = os.path.join(self.cdx_dir, 'index.cdx')
|
cdx_file = os.path.join(self.cdx_dir, self.DEF_INDEX_FILE)
|
||||||
|
|
||||||
# no existing file, just reindex all
|
# no existing file, just reindex all
|
||||||
if not os.path.isfile(cdx_file):
|
if not os.path.isfile(cdx_file):
|
||||||
@ -276,6 +279,27 @@ directory structure expected by pywb
|
|||||||
os.remove(full_path)
|
os.remove(full_path)
|
||||||
print('Removed template file "{0}"'.format(full_path))
|
print('Removed template file "{0}"'.format(full_path))
|
||||||
|
|
||||||
|
def migrate_cdxj(self, path, force=False):
    """ Convert the .cdx index files found under ``path`` to .cdxj

    Prints a notice and returns when there is nothing to convert.
    Unless ``force`` is set, the user is asked to confirm first;
    any answer that is not recognized as a "yes" (including
    unparseable input) cancels the migration.

    :param path: directory searched for .cdx files
    :param force: if true, skip the confirmation prompt
    """
    migrator = MigrateCDX(path)
    num_files = migrator.count_cdx()

    if not num_files:
        print('Index files up-to-date, nothing to migrate')
        return

    if not force:
        prompt = 'Migrate {0} index files? (y/n)'.format(num_files)
        answer = get_input(prompt)

        try:
            confirmed = strtobool(answer)
        except ValueError:
            # anything strtobool can not parse counts as "no"
            confirmed = False

        if not confirmed:
            return

    migrator.convert_to_cdxj()
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
def main(args=None):
|
def main(args=None):
|
||||||
description = """
|
description = """
|
||||||
@ -374,6 +398,15 @@ Create manage file based web archive collections
|
|||||||
template.add_argument('--list', action='store_true')
|
template.add_argument('--list', action='store_true')
|
||||||
template.set_defaults(func=do_add_template)
|
template.set_defaults(func=do_add_template)
|
||||||
|
|
||||||
|
def do_migrate(r):
    """ Handler for the 'migrate' sub-command

    Runs the cdx -> cdxj migration on r.path, skipping the
    confirmation prompt when r.force is set.
    """
    # no specific collection is involved, so use an empty name
    # and do not require that it exists
    manager = CollectionsManager('', must_exist=False)
    manager.migrate_cdxj(r.path, r.force)
|
||||||
|
|
||||||
|
template = subparsers.add_parser('migrate')
|
||||||
|
template.add_argument('path', default='./', nargs='?')
|
||||||
|
template.add_argument('-f', '--force', action='store_true')
|
||||||
|
template.set_defaults(func=do_migrate)
|
||||||
|
|
||||||
r = parser.parse_args(args=args)
|
r = parser.parse_args(args=args)
|
||||||
r.func(r)
|
r.func(r)
|
||||||
|
|
||||||
|
46
pywb/manager/migrate.py
Normal file
46
pywb/manager/migrate.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
from pywb.cdx.cdxobject import CDXObject, URLKEY, ORIGINAL
|
||||||
|
from pywb.warc.cdxindexer import CDXJ
|
||||||
|
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
class MigrateCDX(object):
    """ Convert the .cdx index files under a directory tree to .cdxj

    Each converted index is written next to its source file with the
    same name plus a trailing 'j' ('x.cdx' -> 'x.cdxj'), after which
    the source .cdx file is removed.
    """
    def __init__(self, dir_):
        """
        :param dir_: root directory searched (recursively) for .cdx files
        """
        self.cdx_dir = dir_

    def iter_cdx_files(self):
        """ Yield the full path of every file ending in '.cdx'
        found anywhere under the root directory
        """
        for root, _, files in os.walk(self.cdx_dir):
            for filename in files:
                if filename.endswith('.cdx'):
                    yield os.path.join(root, filename)

    def count_cdx(self):
        """ Return the number of .cdx files still to be migrated
        """
        return sum(1 for _ in self.iter_cdx_files())

    def convert_to_cdxj(self):
        """ Rewrite every .cdx file as .cdxj and delete the original

        The output is first written to '<name>.cdxj.tmp' and only moved
        into place once the whole file has been converted. If conversion
        of a file fails, its partial temp file is removed and the error
        is re-raised, leaving the original .cdx untouched.
        """
        cdxj_writer = CDXJ()

        for filename in self.iter_cdx_files():
            outfile = filename + 'j'
            tmpfile = outfile + '.tmp'

            print('Converting {0} -> {1}'.format(filename, outfile))

            try:
                with open(tmpfile, 'w+b') as out:
                    with open(filename) as fh:
                        for line in fh:
                            # skip the ' CDX ...' format header line
                            if line.startswith(' CDX'):
                                continue

                            cdx = CDXObject(line)
                            # recompute the url key from the original url
                            cdx[URLKEY] = canonicalize(cdx[ORIGINAL])
                            cdxj_writer.write_cdx_line(out, cdx,
                                                       cdx['filename'])
            except BaseException:
                # do not leave a half-written temp file behind
                # (also covers an interrupted run, e.g. Ctrl-C)
                if os.path.exists(tmpfile):
                    os.remove(tmpfile)
                raise

            shutil.move(tmpfile, outfile)
            os.remove(filename)
|
||||||
|
|
||||||
|
|
@ -198,7 +198,7 @@ class ArchiveIndexEntryMixin(object):
|
|||||||
MIME_RE = re.compile('[; ]')
|
MIME_RE = re.compile('[; ]')
|
||||||
|
|
||||||
def reset_entry(self):
|
def reset_entry(self):
|
||||||
self['key'] = ''
|
self['urlkey'] = ''
|
||||||
|
|
||||||
def extract_mime(self, mime, def_mime='unk'):
|
def extract_mime(self, mime, def_mime='unk'):
|
||||||
""" Utility function to extract mimetype only
|
""" Utility function to extract mimetype only
|
||||||
@ -238,8 +238,8 @@ class ArchiveIndexEntryMixin(object):
|
|||||||
post_query = other.get('_post_query')
|
post_query = other.get('_post_query')
|
||||||
if post_query:
|
if post_query:
|
||||||
url = append_post_query(self['url'], post_query)
|
url = append_post_query(self['url'], post_query)
|
||||||
self['key'] = canonicalize(url, surt_ordered)
|
self['urlkey'] = canonicalize(url, surt_ordered)
|
||||||
other['key'] = self['key']
|
other['urlkey'] = self['urlkey']
|
||||||
|
|
||||||
referer = other.record.status_headers.get_header('referer')
|
referer = other.record.status_headers.get_header('referer')
|
||||||
if referer:
|
if referer:
|
||||||
@ -303,8 +303,8 @@ class DefaultRecordIter(object):
|
|||||||
if not entry:
|
if not entry:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if entry.get('url') and not entry.get('key'):
|
if entry.get('url') and not entry.get('urlkey'):
|
||||||
entry['key'] = canonicalize(entry['url'], surt_ordered)
|
entry['urlkey'] = canonicalize(entry['url'], surt_ordered)
|
||||||
|
|
||||||
compute_digest = False
|
compute_digest = False
|
||||||
|
|
||||||
@ -370,7 +370,7 @@ class DefaultRecordIter(object):
|
|||||||
|
|
||||||
if record.rec_type == 'warcinfo':
|
if record.rec_type == 'warcinfo':
|
||||||
entry['url'] = record.rec_headers.get_header('WARC-Filename')
|
entry['url'] = record.rec_headers.get_header('WARC-Filename')
|
||||||
entry['key'] = entry['url']
|
entry['urlkey'] = entry['url']
|
||||||
entry['_warcinfo'] = record.stream.read(record.length)
|
entry['_warcinfo'] = record.stream.read(record.length)
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ class BaseCDXWriter(object):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def write(self, entry, filename):
|
def write(self, entry, filename):
|
||||||
if not entry.get('url') or not entry.get('key'):
|
if not entry.get('url') or not entry.get('urlkey'):
|
||||||
return
|
return
|
||||||
|
|
||||||
if entry.record.rec_type == 'warcinfo':
|
if entry.record.rec_type == 'warcinfo':
|
||||||
@ -45,7 +45,7 @@ class CDXJ(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def write_cdx_line(self, out, entry, filename):
|
def write_cdx_line(self, out, entry, filename):
|
||||||
out.write(entry['key'])
|
out.write(entry['urlkey'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['timestamp'])
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
@ -53,7 +53,7 @@ class CDXJ(object):
|
|||||||
outdict = OrderedDict()
|
outdict = OrderedDict()
|
||||||
|
|
||||||
for n, v in entry.iteritems():
|
for n, v in entry.iteritems():
|
||||||
if n in ('key', 'timestamp'):
|
if n in ('urlkey', 'timestamp'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if n.startswith('_'):
|
if n.startswith('_'):
|
||||||
@ -75,7 +75,7 @@ class CDX09(object):
|
|||||||
self.out.write(' CDX N b a m s k r V g\n')
|
self.out.write(' CDX N b a m s k r V g\n')
|
||||||
|
|
||||||
def write_cdx_line(self, out, entry, filename):
|
def write_cdx_line(self, out, entry, filename):
|
||||||
out.write(entry['key'])
|
out.write(entry['urlkey'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['timestamp'])
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
@ -99,7 +99,7 @@ class CDX11(object):
|
|||||||
self.out.write(' CDX N b a m s k r M S V g\n')
|
self.out.write(' CDX N b a m s k r M S V g\n')
|
||||||
|
|
||||||
def write_cdx_line(self, out, entry, filename):
|
def write_cdx_line(self, out, entry, filename):
|
||||||
out.write(entry['key'])
|
out.write(entry['urlkey'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['timestamp'])
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
@ -218,8 +218,10 @@ def write_multi_cdx_index(output, inputs, **options):
|
|||||||
|
|
||||||
with open(outpath, 'wb') as outfile:
|
with open(outpath, 'wb') as outfile:
|
||||||
with open(fullpath, 'rb') as infile:
|
with open(fullpath, 'rb') as infile:
|
||||||
return write_cdx_index(outfile, infile, filename,
|
writer = write_cdx_index(outfile, infile, filename,
|
||||||
**options)
|
**options)
|
||||||
|
|
||||||
|
return writer
|
||||||
|
|
||||||
# write to one cdx file
|
# write to one cdx file
|
||||||
else:
|
else:
|
||||||
|
@ -10,6 +10,8 @@ from io import BytesIO
|
|||||||
from pywb.webapp.pywb_init import create_wb_router
|
from pywb.webapp.pywb_init import create_wb_router
|
||||||
from pywb.manager.manager import main
|
from pywb.manager.manager import main
|
||||||
|
|
||||||
|
from pywb.warc.cdxindexer import main as cdxindexer_main
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
from pywb.framework.wsgi_wrappers import init_app
|
from pywb.framework.wsgi_wrappers import init_app
|
||||||
|
|
||||||
@ -392,6 +394,44 @@ class TestManagedColls(object):
|
|||||||
assert '- nested' in output
|
assert '- nested' in output
|
||||||
assert '- test' in output
|
assert '- test' in output
|
||||||
|
|
||||||
|
def test_migrate(self):
    """ Create non-surt cdx, then convert to cdxj

    Checks that declining the prompt leaves the directory untouched,
    that accepting it replaces every .cdx with a matching .cdxj, and
    that a second run finds nothing left to migrate.
    """
    migrate_dir = os.path.join(self.root_dir, '_migrate')

    os.mkdir(migrate_dir)

    cdxindexer_main(['-u', migrate_dir, self._get_sample_warc('')])

    # try one file with -9
    cdxindexer_main(['-u', '-9', migrate_dir,
                     self._get_sample_warc('example.warc.gz')])

    # os.listdir() order is arbitrary, so always compare sorted listings
    cdxs = sorted(os.listdir(migrate_dir))
    assert all(x.endswith('.cdx') for x in cdxs)

    # an unrecognized answer is treated as "no": nothing may change
    @patch('pywb.manager.manager.get_input', lambda x: 'blah')
    def do_migrate_no():
        main(['migrate', migrate_dir])

    do_migrate_no()
    assert sorted(os.listdir(migrate_dir)) == cdxs

    @patch('pywb.manager.manager.get_input', lambda x: 'y')
    def do_migrate_yes():
        main(['migrate', migrate_dir])

    do_migrate_yes()
    cdxjs = sorted(os.listdir(migrate_dir))

    assert len(cdxs) == len(cdxjs)
    assert all(x.endswith('.cdxj') for x in cdxjs)
    # every original index was replaced by '<name>.cdxj'
    assert cdxjs == sorted(x + 'j' for x in cdxs)

    with open(os.path.join(migrate_dir, 'iana.cdxj')) as fh:
        assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",')

    # Nothing else to migrate
    main(['migrate', migrate_dir])
    assert sorted(os.listdir(migrate_dir)) == cdxjs
|
||||||
|
|
||||||
def test_err_template_remove(self):
|
def test_err_template_remove(self):
|
||||||
""" Test various error conditions for templates:
|
""" Test various error conditions for templates:
|
||||||
invalid template name, no collection for collection template
|
invalid template name, no collection for collection template
|
||||||
|
Loading…
x
Reference in New Issue
Block a user