From b43a7f94f33ab7f73d5f8120830267a833ffd4ab Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 19 Mar 2015 20:52:00 -0700 Subject: [PATCH] manager: add cdx -> cdxj migration tool #80, which will convert all cdxs in a directory to cdxj, removing original files migration will also recanonicalize the urlkey to surt form add migration test using non-surt, 9-field cdx (created from samples) cdxindexer: fix multi warc->multi cdx indexing options --- pywb/manager/manager.py | 37 +++++++++++++++++++++++++++-- pywb/manager/migrate.py | 46 ++++++++++++++++++++++++++++++++++++ pywb/warc/archiveiterator.py | 12 +++++----- pywb/warc/cdxindexer.py | 16 +++++++------ tests/test_auto_colls.py | 40 +++++++++++++++++++++++++++++++ 5 files changed, 136 insertions(+), 15 deletions(-) create mode 100644 pywb/manager/migrate.py diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index c7c28bf2..fb2f0240 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -7,6 +7,7 @@ from pywb.utils.loaders import load_yaml_config from pywb.utils.timeutils import timestamp20_now from pywb.warc.cdxindexer import main as cdxindexer_main from pywb.webapp.pywb_init import DEFAULT_CONFIG +from migrate import MigrateCDX from distutils.util import strtobool from pkg_resources import resource_string @@ -30,6 +31,8 @@ simplify the creation and management of web archive collections It may be used via cmdline to setup and maintain the directory structure expected by pywb """ + DEF_INDEX_FILE = 'index.cdxj' + def __init__(self, coll_name, root_dir='collections', must_exist=True): self.root_dir = root_dir self.default_config = load_yaml_config('pywb/default_config.yaml') @@ -95,7 +98,7 @@ directory structure expected by pywb self._index_merge_warcs(full_paths) def reindex(self): - cdx_file = os.path.join(self.cdx_dir, 'index.cdxj') + cdx_file = os.path.join(self.cdx_dir, self.DEF_INDEX_FILE) logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file) self._cdx_index(cdx_file, [self.warc_dir]) @@ -128,7 +131,7 @@ directory structure expected by pywb self._index_merge_warcs(filtered_warcs) def _index_merge_warcs(self, new_warcs): - cdx_file = os.path.join(self.cdx_dir, 'index.cdx') + cdx_file = os.path.join(self.cdx_dir, self.DEF_INDEX_FILE) # no existing file, just reindex all if not os.path.isfile(cdx_file): @@ -276,6 +279,27 @@ directory structure expected by pywb os.remove(full_path) print('Removed template file "{0}"'.format(full_path)) + def migrate_cdxj(self, path, force=False): + migrate = MigrateCDX(path) + count = migrate.count_cdx() + if count == 0: + print('Index files up-to-date, nothing to migrate') + return + + msg = 'Migrate {0} index files? (y/n)'.format(count) + if not force: + res = get_input(msg) + try: + res = strtobool(res) + except ValueError: + res = False + + if not res: + return + + migrate.convert_to_cdxj() + + #============================================================================= def main(args=None): description = """ @@ -374,6 +398,15 @@ Create manage file based web archive collections template.add_argument('--list', action='store_true') template.set_defaults(func=do_add_template) + def do_migrate(r): + m = CollectionsManager('', must_exist=False) + m.migrate_cdxj(r.path, r.force) + + template = subparsers.add_parser('migrate') + template.add_argument('path', default='./', nargs='?') + template.add_argument('-f', '--force', action='store_true') + template.set_defaults(func=do_migrate) + r = parser.parse_args(args=args) r.func(r) diff --git a/pywb/manager/migrate.py b/pywb/manager/migrate.py new file mode 100644 index 00000000..8359fdc5 --- /dev/null +++ b/pywb/manager/migrate.py @@ -0,0 +1,46 @@ +from pywb.utils.canonicalize import canonicalize +from pywb.cdx.cdxobject import CDXObject, URLKEY, ORIGINAL +from pywb.warc.cdxindexer import CDXJ + +import os +import shutil + + +#============================================================================= +class MigrateCDX(object): + def __init__(self, dir_): + self.cdx_dir = dir_ + + def iter_cdx_files(self): + for root, dirs, files in os.walk(self.cdx_dir): + for filename in files: + if filename.endswith('.cdx'): + full_path = os.path.join(root, filename) + yield full_path + + def count_cdx(self): + count = 0 + for x in self.iter_cdx_files(): + count += 1 + return count + + def convert_to_cdxj(self): + cdxj_writer = CDXJ() + for filename in self.iter_cdx_files(): + outfile = filename + 'j' + + print('Converting {0} -> {1}'.format(filename, outfile)) + + with open(outfile + '.tmp', 'w+b') as out: + with open(filename) as fh: + for line in fh: + if line.startswith(' CDX'): + continue + cdx = CDXObject(line) + cdx[URLKEY] = canonicalize(cdx[ORIGINAL]) + cdxj_writer.write_cdx_line(out, cdx, cdx['filename']) + + shutil.move(outfile + '.tmp', outfile) + os.remove(filename) + + diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index c72eae62..c33ec313 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -198,7 +198,7 @@ class ArchiveIndexEntryMixin(object): MIME_RE = re.compile('[; ]') def reset_entry(self): - self['key'] = '' + self['urlkey'] = '' def extract_mime(self, mime, def_mime='unk'): """ Utility function to extract mimetype only @@ -238,8 +238,8 @@ class ArchiveIndexEntryMixin(object): post_query = other.get('_post_query') if post_query: url = append_post_query(self['url'], post_query) - self['key'] = canonicalize(url, surt_ordered) - other['key'] = self['key'] + self['urlkey'] = canonicalize(url, surt_ordered) + other['urlkey'] = self['urlkey'] referer = other.record.status_headers.get_header('referer') if referer: @@ -303,8 +303,8 @@ class DefaultRecordIter(object): if not entry: continue - if entry.get('url') and not entry.get('key'): - entry['key'] = canonicalize(entry['url'], surt_ordered) + if entry.get('url') and not entry.get('urlkey'): + entry['urlkey'] = canonicalize(entry['url'], surt_ordered) compute_digest = False @@ -370,7 +370,7 @@ class DefaultRecordIter(object): if record.rec_type == 'warcinfo': entry['url'] = record.rec_headers.get_header('WARC-Filename') - entry['key'] = entry['url'] + entry['urlkey'] = entry['url'] entry['_warcinfo'] = record.stream.read(record.length) return entry diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 4d7c5837..cf889e74 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -27,7 +27,7 @@ class BaseCDXWriter(object): return self def write(self, entry, filename): - if not entry.get('url') or not entry.get('key'): + if not entry.get('url') or not entry.get('urlkey'): return if entry.record.rec_type == 'warcinfo': @@ -45,7 +45,7 @@ class CDXJ(object): pass def write_cdx_line(self, out, entry, filename): - out.write(entry['key']) + out.write(entry['urlkey']) out.write(' ') out.write(entry['timestamp']) out.write(' ') @@ -53,7 +53,7 @@ class CDXJ(object): outdict = OrderedDict() for n, v in entry.iteritems(): - if n in ('key', 'timestamp'): + if n in ('urlkey', 'timestamp'): continue if n.startswith('_'): @@ -75,7 +75,7 @@ class CDX09(object): self.out.write(' CDX N b a m s k r V g\n') def write_cdx_line(self, out, entry, filename): - out.write(entry['key']) + out.write(entry['urlkey']) out.write(' ') out.write(entry['timestamp']) out.write(' ') @@ -99,7 +99,7 @@ class CDX11(object): self.out.write(' CDX N b a m s k r M S V g\n') def write_cdx_line(self, out, entry, filename): - out.write(entry['key']) + out.write(entry['urlkey']) out.write(' ') out.write(entry['timestamp']) out.write(' ') @@ -218,8 +218,10 @@ def write_multi_cdx_index(output, inputs, **options): with open(outpath, 'wb') as outfile: with open(fullpath, 'rb') as infile: - return write_cdx_index(outfile, infile, filename, - **options) + writer = write_cdx_index(outfile, infile, filename, + **options) + + return writer # write to one cdx file else: diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index a43ad3fb..cae4d6b0 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -10,6 +10,8 @@ from io import BytesIO from pywb.webapp.pywb_init import create_wb_router from pywb.manager.manager import main +from pywb.warc.cdxindexer import main as cdxindexer_main + from pywb import get_test_dir from pywb.framework.wsgi_wrappers import init_app @@ -392,6 +394,44 @@ class TestManagedColls(object): assert '- nested' in output assert '- test' in output + def test_migrate(self): + """ Create non-surt cdx, then convert to cdxj + """ + migrate_dir = os.path.join(self.root_dir, '_migrate') + + os.mkdir(migrate_dir) + + cdxindexer_main(['-u', migrate_dir, self._get_sample_warc('')]) + + # try one file with -9 + cdxindexer_main(['-u', '-9', migrate_dir, self._get_sample_warc('example.warc.gz')]) + + cdxs = os.listdir(migrate_dir) + assert all(x.endswith('.cdx') for x in cdxs) + + @patch('pywb.manager.manager.get_input', lambda x: 'blah') + def do_migrate_no(): + main(['migrate', migrate_dir]) + + do_migrate_no() + assert os.listdir(migrate_dir) == cdxs + + @patch('pywb.manager.manager.get_input', lambda x: 'y') + def do_migrate_yes(): + main(['migrate', migrate_dir]) + + do_migrate_yes() + cdxjs = os.listdir(migrate_dir) + + assert len(cdxs) == len(cdxjs) + assert all(x.endswith('.cdxj') for x in cdxjs) + + with open(os.path.join(migrate_dir, 'iana.cdxj')) as fh: + assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",') + + # Nothing else to migrate + main(['migrate', migrate_dir]) + def test_err_template_remove(self): """ Test various error conditions for templates: invalid template name, no collection for collection template