mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Add CLI interface for archiveindexer, exposed as 'cdx-indexer'.
Add tests for the CLI interface, plus additional tests for statusheaders.
This commit is contained in:
parent
732df1a172
commit
90f4833df3
@ -1,5 +1,6 @@
|
|||||||
"""
|
"""
|
||||||
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1))
|
>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1))
|
||||||
|
>>> st1
|
||||||
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
|
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
|
||||||
('Some', 'Value'),
|
('Some', 'Value'),
|
||||||
('Multi-Line', 'Value1 Also This')])
|
('Multi-Line', 'Value1 Also This')])
|
||||||
@ -7,6 +8,18 @@ StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Con
|
|||||||
>>> StatusAndHeadersParser(['Other']).parse(BytesIO(status_headers_1))
|
>>> StatusAndHeadersParser(['Other']).parse(BytesIO(status_headers_1))
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
|
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
|
||||||
|
|
||||||
|
# test equality op
|
||||||
|
>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1))
|
||||||
|
True
|
||||||
|
|
||||||
|
# remove header
|
||||||
|
>>> st1.remove_header('some')
|
||||||
|
True
|
||||||
|
|
||||||
|
# already removed
|
||||||
|
>>> st1.remove_header('Some')
|
||||||
|
False
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -273,6 +273,7 @@ class SortedCDXWriter(CDXWriter):
|
|||||||
self.out.write(''.join(self.sortlist))
|
self.out.write(''.join(self.sortlist))
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
class MultiFileMixin(object):
|
class MultiFileMixin(object):
|
||||||
def start_all(self):
|
def start_all(self):
|
||||||
super(MultiFileMixin, self).start()
|
super(MultiFileMixin, self).start()
|
||||||
@ -338,14 +339,18 @@ def index_to_file(inputs, output, sort):
|
|||||||
def remove_ext(filename):
|
def remove_ext(filename):
|
||||||
for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'):
|
for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'):
|
||||||
if filename.endswith(ext):
|
if filename.endswith(ext):
|
||||||
return filename[:-len(ext)]
|
filename = filename[:-len(ext)]
|
||||||
|
break
|
||||||
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
def cdx_filename(filename):
|
||||||
|
return remove_ext(filename) + '.cdx'
|
||||||
|
|
||||||
def index_to_dir(inputs, output, sort):
|
def index_to_dir(inputs, output, sort):
|
||||||
for fullpath, filename in iter_file_or_dir(inputs):
|
for fullpath, filename in iter_file_or_dir(inputs):
|
||||||
|
|
||||||
outpath = remove_ext(filename) + '.cdx'
|
outpath = cdx_filename(filename)
|
||||||
outpath = os.path.join(output, outpath)
|
outpath = os.path.join(output, outpath)
|
||||||
|
|
||||||
with open(outpath, 'w') as outfile:
|
with open(outpath, 'w') as outfile:
|
||||||
@ -356,26 +361,49 @@ def index_to_dir(inputs, output, sort):
|
|||||||
out=outfile).make_index()
|
out=outfile).make_index()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main(args=None):
|
||||||
description = 'description'
|
description = """
|
||||||
epilog = 'epilog'
|
Generate .cdx index files for WARCs and ARCs
|
||||||
|
Compressed (.warc.gz / .arc.gz) or uncompressed (.warc / .arc) formats
|
||||||
|
are supported.
|
||||||
|
"""
|
||||||
|
|
||||||
sort_help = 'sort help'
|
epilog = """
|
||||||
output_help = 'output help'
|
Some examples:
|
||||||
input_help = 'input help'
|
|
||||||
|
* Create "example.cdx" index from example.warc.gz
|
||||||
|
{0} ./cdx/example.cdx ./warcs/example.warc.gz
|
||||||
|
|
||||||
|
* Create "combined.cdx", a combined, sorted index of all warcs in ./warcs/
|
||||||
|
{0} --sort combined.cdx ./warcs/
|
||||||
|
|
||||||
|
* Create a sorted cdx per file in ./cdx/ for each archive file in ./warcs/
|
||||||
|
{0} --sort ./cdx/ ./warcs/
|
||||||
|
""".format(os.path.basename(sys.argv[0]))
|
||||||
|
|
||||||
|
sort_help = """
|
||||||
|
sort the output to each file before writing to create a total ordering
|
||||||
|
"""
|
||||||
|
|
||||||
|
output_help = """output file or directory.
|
||||||
|
- If directory, each input file is written to a seperate output file
|
||||||
|
with a .cdx extension
|
||||||
|
- If output is '-', output is written to stdout
|
||||||
|
"""
|
||||||
|
|
||||||
|
input_help = """input file or directory
|
||||||
|
- If directory, all archive files from that directory are read
|
||||||
|
"""
|
||||||
|
|
||||||
parser = ArgumentParser(description=description,
|
parser = ArgumentParser(description=description,
|
||||||
epilog=epilog,
|
epilog=epilog,
|
||||||
formatter_class=RawTextHelpFormatter)
|
formatter_class=RawTextHelpFormatter)
|
||||||
|
|
||||||
parser.add_argument('--sort', action='store_true', help=sort_help)
|
parser.add_argument('-s', '--sort', action='store_true', help=sort_help)
|
||||||
parser.add_argument('output', help=output_help)
|
parser.add_argument('output', help=output_help)
|
||||||
parser.add_argument('inputs', nargs='+', help=input_help)
|
parser.add_argument('inputs', nargs='+', help=input_help)
|
||||||
|
|
||||||
cmd = parser.parse_args()
|
cmd = parser.parse_args(args=args)
|
||||||
#print cmd
|
|
||||||
#return
|
|
||||||
|
|
||||||
if cmd.output != '-' and os.path.isdir(cmd.output):
|
if cmd.output != '-' and os.path.isdir(cmd.output):
|
||||||
index_to_dir(cmd.inputs, cmd.output, cmd.sort)
|
index_to_dir(cmd.inputs, cmd.output, cmd.sort)
|
||||||
else:
|
else:
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
r"""
|
r"""
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
# warc.gz
|
# warc.gz
|
||||||
>>> print_cdx_index('example.warc.gz')
|
>>> print_cdx_index('example.warc.gz')
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
@ -37,14 +38,40 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/
|
|||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
|
com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
|
||||||
com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 202 bad.arc
|
com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 202 bad.arc
|
||||||
|
|
||||||
|
# Test CLI interface -- (check for num lines)
|
||||||
|
#=================================================================
|
||||||
|
|
||||||
|
# test sort, multiple inputs
|
||||||
|
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||||
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||||
|
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||||
|
200
|
||||||
|
|
||||||
|
# test writing to stdout
|
||||||
|
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||||
|
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||||
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
|
4
|
||||||
|
|
||||||
|
# test writing to temp dir
|
||||||
|
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
|
||||||
|
example.cdx
|
||||||
|
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||||
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
|
4
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
from pywb.warc.archiveindexer import ArchiveIndexer
|
from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
|
||||||
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
||||||
TEST_WARC_DIR = get_test_dir() + 'warcs/'
|
TEST_WARC_DIR = get_test_dir() + 'warcs/'
|
||||||
|
|
||||||
@ -79,3 +106,49 @@ def test_sorted_warc_gz():
|
|||||||
assert_cdx_match('example.cdx', 'example.warc.gz', sort=True)
|
assert_cdx_match('example.cdx', 'example.warc.gz', sort=True)
|
||||||
assert_cdx_match('dupes.cdx', 'dupes.warc.gz', sort=True)
|
assert_cdx_match('dupes.cdx', 'dupes.warc.gz', sort=True)
|
||||||
assert_cdx_match('iana.cdx', 'iana.warc.gz', sort=True)
|
assert_cdx_match('iana.cdx', 'iana.warc.gz', sort=True)
|
||||||
|
|
||||||
|
def cli_lines(cmds):
|
||||||
|
buff = BytesIO()
|
||||||
|
orig = sys.stdout
|
||||||
|
sys.stdout = buff
|
||||||
|
main(cmds)
|
||||||
|
sys.stdout = orig
|
||||||
|
lines = buff.getvalue().rstrip().split('\n')
|
||||||
|
|
||||||
|
# print first, last, num lines
|
||||||
|
print (lines[1])
|
||||||
|
print (lines[-1])
|
||||||
|
print len(lines)
|
||||||
|
|
||||||
|
def cli_lines_with_dir(input_):
|
||||||
|
try:
|
||||||
|
lines = None
|
||||||
|
tmp_dir = None
|
||||||
|
tmp_dir = tempfile.mkdtemp()
|
||||||
|
|
||||||
|
main([tmp_dir, input_])
|
||||||
|
|
||||||
|
filename = cdx_filename(os.path.basename(input_))
|
||||||
|
|
||||||
|
print filename
|
||||||
|
|
||||||
|
with open(os.path.join(tmp_dir, filename), 'r') as fh:
|
||||||
|
lines = fh.read(8192).rstrip().split('\n')
|
||||||
|
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
if tmp_dir:
|
||||||
|
shutil.rmtree(tmp_dir)
|
||||||
|
except OSError as exc:
|
||||||
|
if exc.errno != 2:
|
||||||
|
raise
|
||||||
|
|
||||||
|
if not lines:
|
||||||
|
return
|
||||||
|
|
||||||
|
# print first, last, num lines
|
||||||
|
print (lines[1])
|
||||||
|
print (lines[-1])
|
||||||
|
print len(lines)
|
||||||
|
|
||||||
|
|
||||||
|
3
setup.py
3
setup.py
@ -72,8 +72,8 @@ setup(
|
|||||||
'pyyaml',
|
'pyyaml',
|
||||||
],
|
],
|
||||||
tests_require=[
|
tests_require=[
|
||||||
'WebTest',
|
|
||||||
'pytest',
|
'pytest',
|
||||||
|
'WebTest',
|
||||||
'pytest-cov',
|
'pytest-cov',
|
||||||
'fakeredis',
|
'fakeredis',
|
||||||
'mock',
|
'mock',
|
||||||
@ -84,6 +84,7 @@ setup(
|
|||||||
[console_scripts]
|
[console_scripts]
|
||||||
wayback = pywb.apps.wayback:main
|
wayback = pywb.apps.wayback:main
|
||||||
cdx-server = pywb.apps.cdx_server:main
|
cdx-server = pywb.apps.cdx_server:main
|
||||||
|
cdx-indexer = pywb.warc.archiveindexer:main
|
||||||
""",
|
""",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
classifiers=[
|
classifiers=[
|
||||||
|
Loading…
x
Reference in New Issue
Block a user