1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

manager: add cdx -> cdxj migration tool (#80), which converts all cdx files in a directory to cdxj and removes the original files

migration will also recanonicalize the urlkey to surt form
add migration test using non-surt, 9-field cdx (created from samples)
cdxindexer: fix multi warc->multi cdx indexing options
This commit is contained in:
Ilya Kreymer 2015-03-19 20:52:00 -07:00
parent c5b5c8ee4b
commit b43a7f94f3
5 changed files with 136 additions and 15 deletions

View File

@ -7,6 +7,7 @@ from pywb.utils.loaders import load_yaml_config
from pywb.utils.timeutils import timestamp20_now from pywb.utils.timeutils import timestamp20_now
from pywb.warc.cdxindexer import main as cdxindexer_main from pywb.warc.cdxindexer import main as cdxindexer_main
from pywb.webapp.pywb_init import DEFAULT_CONFIG from pywb.webapp.pywb_init import DEFAULT_CONFIG
from migrate import MigrateCDX
from distutils.util import strtobool from distutils.util import strtobool
from pkg_resources import resource_string from pkg_resources import resource_string
@ -30,6 +31,8 @@ simplify the creation and management of web archive collections
It may be used via cmdline to setup and maintain the It may be used via cmdline to setup and maintain the
directory structure expected by pywb directory structure expected by pywb
""" """
DEF_INDEX_FILE = 'index.cdxj'
def __init__(self, coll_name, root_dir='collections', must_exist=True): def __init__(self, coll_name, root_dir='collections', must_exist=True):
self.root_dir = root_dir self.root_dir = root_dir
self.default_config = load_yaml_config('pywb/default_config.yaml') self.default_config = load_yaml_config('pywb/default_config.yaml')
@ -95,7 +98,7 @@ directory structure expected by pywb
self._index_merge_warcs(full_paths) self._index_merge_warcs(full_paths)
def reindex(self): def reindex(self):
cdx_file = os.path.join(self.cdx_dir, 'index.cdxj') cdx_file = os.path.join(self.cdx_dir, self.DEF_INDEX_FILE)
logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file) logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file)
self._cdx_index(cdx_file, [self.warc_dir]) self._cdx_index(cdx_file, [self.warc_dir])
@ -128,7 +131,7 @@ directory structure expected by pywb
self._index_merge_warcs(filtered_warcs) self._index_merge_warcs(filtered_warcs)
def _index_merge_warcs(self, new_warcs): def _index_merge_warcs(self, new_warcs):
cdx_file = os.path.join(self.cdx_dir, 'index.cdx') cdx_file = os.path.join(self.cdx_dir, self.DEF_INDEX_FILE)
# no existing file, just reindex all # no existing file, just reindex all
if not os.path.isfile(cdx_file): if not os.path.isfile(cdx_file):
@ -276,6 +279,27 @@ directory structure expected by pywb
os.remove(full_path) os.remove(full_path)
print('Removed template file "{0}"'.format(full_path)) print('Removed template file "{0}"'.format(full_path))
def migrate_cdxj(self, path, force=False):
    """ Convert the .cdx index files found under ``path`` to .cdxj

    :param path: directory handed to MigrateCDX to search for index files
    :param force: if True, skip the interactive confirmation prompt

    Prints a notice and returns if there is nothing to migrate.
    Without ``force``, any answer that strtobool() rejects or
    treats as false cancels the migration.
    """
    migrator = MigrateCDX(path)
    num_files = migrator.count_cdx()

    if not num_files:
        print('Index files up-to-date, nothing to migrate')
        return

    if not force:
        answer = get_input('Migrate {0} index files? (y/n)'.format(num_files))

        # unrecognized input (ValueError from strtobool) counts as "no"
        try:
            confirmed = strtobool(answer)
        except ValueError:
            confirmed = False

        if not confirmed:
            return

    migrator.convert_to_cdxj()
#============================================================================= #=============================================================================
def main(args=None): def main(args=None):
description = """ description = """
@ -374,6 +398,15 @@ Create manage file based web archive collections
template.add_argument('--list', action='store_true') template.add_argument('--list', action='store_true')
template.set_defaults(func=do_add_template) template.set_defaults(func=do_add_template)
def do_migrate(r):
    """ Handler for the 'migrate' sub-command: run the cdx -> cdxj
    migration on r.path, honoring the r.force flag
    """
    # an empty collection name with must_exist=False: only the
    # migrate_cdxj() helper of the manager is used here
    manager = CollectionsManager('', must_exist=False)
    manager.migrate_cdxj(r.path, r.force)
template = subparsers.add_parser('migrate')
template.add_argument('path', default='./', nargs='?')
template.add_argument('-f', '--force', action='store_true')
template.set_defaults(func=do_migrate)
r = parser.parse_args(args=args) r = parser.parse_args(args=args)
r.func(r) r.func(r)

46
pywb/manager/migrate.py Normal file
View File

@ -0,0 +1,46 @@
from pywb.utils.canonicalize import canonicalize
from pywb.cdx.cdxobject import CDXObject, URLKEY, ORIGINAL
from pywb.warc.cdxindexer import CDXJ
import os
import shutil
#=============================================================================
class MigrateCDX(object):
    """ Convert legacy .cdx index files found under a directory
    (searched recursively) into .cdxj files.

    Each converted line gets its urlkey recomputed from the original
    url via canonicalize(). After a file is converted successfully,
    the source .cdx file is deleted.
    """

    def __init__(self, dir_):
        """
        :param dir_: root directory to search for .cdx files
        """
        self.cdx_dir = dir_

    def iter_cdx_files(self):
        """ Yield the full path of every file ending in '.cdx'
        anywhere under the root directory.

        Files ending in '.cdxj' (including ones produced by
        convert_to_cdxj) do not match and are never yielded.
        """
        for root, _dirs, files in os.walk(self.cdx_dir):
            for filename in files:
                if filename.endswith('.cdx'):
                    yield os.path.join(root, filename)

    def count_cdx(self):
        """ Return the number of .cdx files that would be migrated
        """
        return sum(1 for _ in self.iter_cdx_files())

    def convert_to_cdxj(self):
        """ Rewrite each .cdx file as '<name>.cdxj', then remove the
        original.

        Output is first written to '<name>.cdxj.tmp' and only moved
        into place once the whole file has been converted. If the
        conversion of a file fails, its temp file is removed and the
        original .cdx is left untouched; the error is propagated.
        """
        cdxj_writer = CDXJ()

        for filename in self.iter_cdx_files():
            outfile = filename + 'j'
            tmpfile = outfile + '.tmp'

            print('Converting {0} -> {1}'.format(filename, outfile))

            converted = False
            try:
                with open(tmpfile, 'w+b') as out:
                    with open(filename) as fh:
                        for line in fh:
                            # skip the ' CDX ...' format header and any
                            # blank lines, neither is an index entry
                            if line.startswith(' CDX') or not line.strip():
                                continue

                            cdx = CDXObject(line)
                            # recompute the key from the original url
                            cdx[URLKEY] = canonicalize(cdx[ORIGINAL])
                            cdxj_writer.write_cdx_line(out, cdx,
                                                       cdx['filename'])
                converted = True
            finally:
                # don't leave a partial temp file behind on failure
                if not converted and os.path.exists(tmpfile):
                    os.remove(tmpfile)

            shutil.move(tmpfile, outfile)
            os.remove(filename)

View File

@ -198,7 +198,7 @@ class ArchiveIndexEntryMixin(object):
MIME_RE = re.compile('[; ]') MIME_RE = re.compile('[; ]')
def reset_entry(self): def reset_entry(self):
self['key'] = '' self['urlkey'] = ''
def extract_mime(self, mime, def_mime='unk'): def extract_mime(self, mime, def_mime='unk'):
""" Utility function to extract mimetype only """ Utility function to extract mimetype only
@ -238,8 +238,8 @@ class ArchiveIndexEntryMixin(object):
post_query = other.get('_post_query') post_query = other.get('_post_query')
if post_query: if post_query:
url = append_post_query(self['url'], post_query) url = append_post_query(self['url'], post_query)
self['key'] = canonicalize(url, surt_ordered) self['urlkey'] = canonicalize(url, surt_ordered)
other['key'] = self['key'] other['urlkey'] = self['urlkey']
referer = other.record.status_headers.get_header('referer') referer = other.record.status_headers.get_header('referer')
if referer: if referer:
@ -303,8 +303,8 @@ class DefaultRecordIter(object):
if not entry: if not entry:
continue continue
if entry.get('url') and not entry.get('key'): if entry.get('url') and not entry.get('urlkey'):
entry['key'] = canonicalize(entry['url'], surt_ordered) entry['urlkey'] = canonicalize(entry['url'], surt_ordered)
compute_digest = False compute_digest = False
@ -370,7 +370,7 @@ class DefaultRecordIter(object):
if record.rec_type == 'warcinfo': if record.rec_type == 'warcinfo':
entry['url'] = record.rec_headers.get_header('WARC-Filename') entry['url'] = record.rec_headers.get_header('WARC-Filename')
entry['key'] = entry['url'] entry['urlkey'] = entry['url']
entry['_warcinfo'] = record.stream.read(record.length) entry['_warcinfo'] = record.stream.read(record.length)
return entry return entry

View File

@ -27,7 +27,7 @@ class BaseCDXWriter(object):
return self return self
def write(self, entry, filename): def write(self, entry, filename):
if not entry.get('url') or not entry.get('key'): if not entry.get('url') or not entry.get('urlkey'):
return return
if entry.record.rec_type == 'warcinfo': if entry.record.rec_type == 'warcinfo':
@ -45,7 +45,7 @@ class CDXJ(object):
pass pass
def write_cdx_line(self, out, entry, filename): def write_cdx_line(self, out, entry, filename):
out.write(entry['key']) out.write(entry['urlkey'])
out.write(' ') out.write(' ')
out.write(entry['timestamp']) out.write(entry['timestamp'])
out.write(' ') out.write(' ')
@ -53,7 +53,7 @@ class CDXJ(object):
outdict = OrderedDict() outdict = OrderedDict()
for n, v in entry.iteritems(): for n, v in entry.iteritems():
if n in ('key', 'timestamp'): if n in ('urlkey', 'timestamp'):
continue continue
if n.startswith('_'): if n.startswith('_'):
@ -75,7 +75,7 @@ class CDX09(object):
self.out.write(' CDX N b a m s k r V g\n') self.out.write(' CDX N b a m s k r V g\n')
def write_cdx_line(self, out, entry, filename): def write_cdx_line(self, out, entry, filename):
out.write(entry['key']) out.write(entry['urlkey'])
out.write(' ') out.write(' ')
out.write(entry['timestamp']) out.write(entry['timestamp'])
out.write(' ') out.write(' ')
@ -99,7 +99,7 @@ class CDX11(object):
self.out.write(' CDX N b a m s k r M S V g\n') self.out.write(' CDX N b a m s k r M S V g\n')
def write_cdx_line(self, out, entry, filename): def write_cdx_line(self, out, entry, filename):
out.write(entry['key']) out.write(entry['urlkey'])
out.write(' ') out.write(' ')
out.write(entry['timestamp']) out.write(entry['timestamp'])
out.write(' ') out.write(' ')
@ -218,8 +218,10 @@ def write_multi_cdx_index(output, inputs, **options):
with open(outpath, 'wb') as outfile: with open(outpath, 'wb') as outfile:
with open(fullpath, 'rb') as infile: with open(fullpath, 'rb') as infile:
return write_cdx_index(outfile, infile, filename, writer = write_cdx_index(outfile, infile, filename,
**options) **options)
return writer
# write to one cdx file # write to one cdx file
else: else:

View File

@ -10,6 +10,8 @@ from io import BytesIO
from pywb.webapp.pywb_init import create_wb_router from pywb.webapp.pywb_init import create_wb_router
from pywb.manager.manager import main from pywb.manager.manager import main
from pywb.warc.cdxindexer import main as cdxindexer_main
from pywb import get_test_dir from pywb import get_test_dir
from pywb.framework.wsgi_wrappers import init_app from pywb.framework.wsgi_wrappers import init_app
@ -392,6 +394,44 @@ class TestManagedColls(object):
assert '- nested' in output assert '- nested' in output
assert '- test' in output assert '- test' in output
def test_migrate(self):
    """ Create non-surt cdx, then convert to cdxj
    """
    migrate_dir = os.path.join(self.root_dir, '_migrate')

    os.mkdir(migrate_dir)

    cdxindexer_main(['-u', migrate_dir, self._get_sample_warc('')])

    # try one file with -9
    cdxindexer_main(['-u', '-9', migrate_dir, self._get_sample_warc('example.warc.gz')])

    # os.listdir() returns entries in arbitrary order, so keep a
    # sorted snapshot for the comparison below
    cdxs = sorted(os.listdir(migrate_dir))
    assert all(x.endswith('.cdx') for x in cdxs)

    # an unrecognized answer at the prompt must cancel the migration
    with patch('pywb.manager.manager.get_input', lambda x: 'blah'):
        main(['migrate', migrate_dir])

    assert sorted(os.listdir(migrate_dir)) == cdxs

    # answering 'y' performs the migration
    with patch('pywb.manager.manager.get_input', lambda x: 'y'):
        main(['migrate', migrate_dir])

    cdxjs = os.listdir(migrate_dir)

    # every .cdx was replaced by exactly one .cdxj
    assert len(cdxs) == len(cdxjs)
    assert all(x.endswith('.cdxj') for x in cdxjs)

    # the key of the first entry is now in surt form
    with open(os.path.join(migrate_dir, 'iana.cdxj')) as fh:
        assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",')

    # Nothing else to migrate
    main(['migrate', migrate_dir])
def test_err_template_remove(self): def test_err_template_remove(self):
""" Test various error conditions for templates: """ Test various error conditions for templates:
invalid template name, no collection for collection template invalid template name, no collection for collection template