diff --git a/pywb/utils/test/test_statusandheaders.py b/pywb/utils/test/test_statusandheaders.py index e52caa5e..ea835e32 100644 --- a/pywb/utils/test/test_statusandheaders.py +++ b/pywb/utils/test/test_statusandheaders.py @@ -1,5 +1,6 @@ """ ->>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1)) +>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1)) +>>> st1 StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), ('Some', 'Value'), ('Multi-Line', 'Value1 Also This')]) @@ -7,6 +8,18 @@ StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Con >>> StatusAndHeadersParser(['Other']).parse(BytesIO(status_headers_1)) Traceback (most recent call last): StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK + +# test equality op +>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1)) +True + +# remove header +>>> st1.remove_header('some') +True + +# already removed +>>> st1.remove_header('Some') +False """ diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py index dad9fb18..e6e14c1f 100644 --- a/pywb/warc/archiveindexer.py +++ b/pywb/warc/archiveindexer.py @@ -273,6 +273,7 @@ class SortedCDXWriter(CDXWriter): self.out.write(''.join(self.sortlist)) +#================================================================= class MultiFileMixin(object): def start_all(self): super(MultiFileMixin, self).start() @@ -338,14 +339,18 @@ def index_to_file(inputs, output, sort): def remove_ext(filename): for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'): if filename.endswith(ext): - return filename[:-len(ext)] + filename = filename[:-len(ext)] + break + return filename +def cdx_filename(filename): + return remove_ext(filename) + '.cdx' def index_to_dir(inputs, output, sort): for fullpath, filename in iter_file_or_dir(inputs): - outpath = remove_ext(filename) + '.cdx' + outpath = cdx_filename(filename) outpath = os.path.join(output, outpath) with open(outpath, 'w') as outfile: @@ -356,26 +361,49 @@ def index_to_dir(inputs, output, sort): out=outfile).make_index() -def main(): - description = 'description' - epilog = 'epilog' +def main(args=None): + description = """ +Generate .cdx index files for WARCs and ARCs +Compressed (.warc.gz / .arc.gz) or uncompressed (.warc / .arc) formats +are supported. +""" - sort_help = 'sort help' - output_help = 'output help' - input_help = 'input help' + epilog = """ +Some examples: + +* Create "example.cdx" index from example.warc.gz +{0} ./cdx/example.cdx ./warcs/example.warc.gz + +* Create "combined.cdx", a combined, sorted index of all warcs in ./warcs/ +{0} --sort combined.cdx ./warcs/ + +* Create a sorted cdx per file in ./cdx/ for each archive file in ./warcs/ +{0} --sort ./cdx/ ./warcs/ +""".format(os.path.basename(sys.argv[0])) + + sort_help = """ +sort the output to each file before writing to create a total ordering +""" + + output_help = """output file or directory. +- If directory, each input file is written to a seperate output file + with a .cdx extension +- If output is '-', output is written to stdout +""" + + input_help = """input file or directory +- If directory, all archive files from that directory are read +""" parser = ArgumentParser(description=description, epilog=epilog, formatter_class=RawTextHelpFormatter) - parser.add_argument('--sort', action='store_true', help=sort_help) + parser.add_argument('-s', '--sort', action='store_true', help=sort_help) parser.add_argument('output', help=output_help) parser.add_argument('inputs', nargs='+', help=input_help) - cmd = parser.parse_args() - #print cmd - #return - + cmd = parser.parse_args(args=args) if cmd.output != '-' and os.path.isdir(cmd.output): index_to_dir(cmd.inputs, cmd.output, cmd.sort) else: diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index b323d679..0e470424 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -1,5 +1,6 @@ r""" +#================================================================= # warc.gz >>> print_cdx_index('example.warc.gz') CDX N b a m s k r M S V g @@ -37,14 +38,40 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/ CDX N b a m s k r M S V g com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 202 bad.arc + +# Test CLI interface -- (check for num lines) +#================================================================= + +# test sort, multiple inputs +>>> cli_lines(['--sort', '-', TEST_WARC_DIR]) +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz +org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz +200 + +# test writing to stdout +>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +4 + +# test writing to temp dir +>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz') +example.cdx +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +4 """ from pywb import get_test_dir -from pywb.warc.archiveindexer import ArchiveIndexer +from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename from io import BytesIO import sys +import os +import shutil +import tempfile + TEST_CDX_DIR = get_test_dir() + 'cdx/' TEST_WARC_DIR = get_test_dir() + 'warcs/' @@ -79,3 +106,49 @@ def test_sorted_warc_gz(): assert_cdx_match('example.cdx', 'example.warc.gz', sort=True) assert_cdx_match('dupes.cdx', 'dupes.warc.gz', sort=True) assert_cdx_match('iana.cdx', 'iana.warc.gz', sort=True) + +def cli_lines(cmds): + buff = BytesIO() + orig = sys.stdout + sys.stdout = buff + main(cmds) + sys.stdout = orig + lines = buff.getvalue().rstrip().split('\n') + + # print first, last, num lines + print (lines[1]) + print (lines[-1]) + print len(lines) + +def cli_lines_with_dir(input_): + try: + lines = None + tmp_dir = None + tmp_dir = tempfile.mkdtemp() + + main([tmp_dir, input_]) + + filename = cdx_filename(os.path.basename(input_)) + + print filename + + with open(os.path.join(tmp_dir, filename), 'r') as fh: + lines = fh.read(8192).rstrip().split('\n') + + finally: + try: + if tmp_dir: + shutil.rmtree(tmp_dir) + except OSError as exc: + if exc.errno != 2: + raise + + if not lines: + return + + # print first, last, num lines + print (lines[1]) + print (lines[-1]) + print len(lines) + + diff --git a/setup.py b/setup.py index f1cf3e02..c6ecb656 100755 --- a/setup.py +++ b/setup.py @@ -72,8 +72,8 @@ setup( 'pyyaml', ], tests_require=[ - 'WebTest', 'pytest', + 'WebTest', 'pytest-cov', 'fakeredis', 'mock', @@ -84,6 +84,7 @@ setup( [console_scripts] wayback = pywb.apps.wayback:main cdx-server = pywb.apps.cdx_server:main + cdx-indexer = pywb.warc.archiveindexer:main """, zip_safe=False, classifiers=[