mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
indexing: refactor cdxindexer interface to better allow custom writers
record loader: skip whois: and dns: records, better skipping of arc headers (todo: need more unit tests)
This commit is contained in:
parent
3965fad4dd
commit
6761f5697f
@ -171,7 +171,7 @@ class ArchiveIndexEntry(object):
|
||||
self.digest = digest
|
||||
|
||||
def merge_request_data(self, other, options):
|
||||
surt_ordered = options.get('surt_ordered')
|
||||
surt_ordered = options.get('surt_ordered', True)
|
||||
|
||||
if other.record.rec_type != 'request':
|
||||
return False
|
||||
@ -354,7 +354,7 @@ def create_index_iter(fh, **options):
|
||||
|
||||
entry_iter = create_record_iter(aiter, options)
|
||||
|
||||
if options.get('append_post'):
|
||||
if options.get('append_post') == True:
|
||||
entry_iter = join_request_records(entry_iter, options)
|
||||
|
||||
for entry in entry_iter:
|
||||
|
@ -79,44 +79,6 @@ def iter_file_or_dir(inputs):
|
||||
yield os.path.join(input_, filename), filename
|
||||
|
||||
|
||||
#=================================================================
|
||||
def index_to_file(inputs, output, sort,
                  surt_ordered, include_all, append_post, cdx09):
    """Index every input archive into one CDX stream.

    :param inputs: paths handed to ``iter_file_or_dir`` to enumerate archives
    :param output: destination file path, or ``'-'`` to write to stdout
    :param sort: if true use ``SortedCDXWriter``, otherwise ``CDXWriter``
    :param surt_ordered: forwarded to ``write_index``
    :param include_all: forwarded to ``write_index``
    :param append_post: forwarded to ``write_index``
    :param cdx09: forwarded as the second argument of the writer class
    """
    to_stdout = (output == '-')
    outfile = sys.stdout if to_stdout else open(output, 'w')

    writer_cls = SortedCDXWriter if sort else CDXWriter

    try:
        with writer_cls(outfile, cdx09) as writer:
            for fullpath, filename in iter_file_or_dir(inputs):
                with open(fullpath, 'r') as infile:
                    write_index(writer, filename, infile,
                                surt_ordered, append_post, include_all)
    finally:
        # Close only a file opened here; stdout belongs to the interpreter.
        # Closing happens after the writer context has exited, so anything
        # the writer emits on exit still reaches the file.
        if not to_stdout:
            outfile.close()
|
||||
|
||||
#=================================================================
|
||||
def index_to_dir(inputs, output, sort,
                 surt_ordered, include_all, append_post, cdx09):
    """Write one CDX file per input archive into the directory ``output``.

    Each archive yielded by ``iter_file_or_dir(inputs)`` is indexed into
    ``os.path.join(output, cdx_filename(filename))``.

    :param sort: if true use ``SortedCDXWriter``, otherwise ``CDXWriter``
    :param surt_ordered: forwarded to ``write_index``
    :param include_all: forwarded to ``write_index``
    :param append_post: forwarded to ``write_index``
    :param cdx09: forwarded as the second argument of the writer class
    """
    writer_cls = SortedCDXWriter if sort else CDXWriter

    for archive_path, archive_name in iter_file_or_dir(inputs):
        cdx_path = os.path.join(output, cdx_filename(archive_name))

        # output file first, then the writer wrapping it, then the input
        with open(cdx_path, 'w') as cdx_out:
            with writer_cls(cdx_out, cdx09) as cdx_writer:
                with open(archive_path, 'r') as archive_in:
                    write_index(cdx_writer, archive_name, archive_in,
                                surt_ordered, append_post, include_all)
|
||||
|
||||
#=================================================================
|
||||
def remove_ext(filename):
|
||||
for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'):
|
||||
@ -133,16 +95,57 @@ def cdx_filename(filename):
|
||||
|
||||
|
||||
#=================================================================
|
||||
def write_index(writer, filename, infile,
                surt_ordered, append_post, include_all):
    """Index ``infile`` and pass every entry to ``writer.write``.

    ``surt_ordered``, ``append_post`` and ``include_all`` are forwarded to
    ``create_index_iter`` as keyword options; ``filename`` is handed to the
    writer alongside each entry.
    """
    entry_iter = create_index_iter(infile,
                                   surt_ordered=surt_ordered,
                                   append_post=append_post,
                                   include_all=include_all)

    for entry in entry_iter:
        writer.write(entry, filename)


#=================================================================
def write_multi_cdx_index(output, inputs, **options):
    """Index all ``inputs`` into a directory of CDX files or one CDX file.

    :param output: an existing directory (one CDX file is written per input
        archive), a file path (all entries go to that file), or ``'-'``
        (all entries go to stdout)
    :param inputs: paths handed to ``iter_file_or_dir`` to enumerate archives
    :param options: keyword options forwarded to ``write_cdx_index`` /
        ``create_index_iter``; ``sort`` and ``cdx09`` are also read here to
        choose and construct the writer for the single-file case
    """
    # write one cdx per dir
    if output != '-' and os.path.isdir(output):
        for fullpath, filename in iter_file_or_dir(inputs):
            outpath = cdx_filename(filename)
            outpath = os.path.join(output, outpath)

            with open(outpath, 'w') as outfile:
                with open(fullpath, 'r') as infile:
                    write_cdx_index(outfile, infile, filename, **options)

        return

    # write to one cdx file
    to_stdout = (output == '-')
    outfile = sys.stdout if to_stdout else open(output, 'w')

    writer_cls = SortedCDXWriter if options.get('sort') else CDXWriter

    try:
        with writer_cls(outfile, options.get('cdx09')) as writer:
            for fullpath, filename in iter_file_or_dir(inputs):
                with open(fullpath, 'r') as infile:
                    entry_iter = create_index_iter(infile, **options)

                    for entry in entry_iter:
                        writer.write(entry, filename)
    finally:
        # Close only a file opened here; stdout belongs to the interpreter.
        # This runs after the writer context has exited, so output the
        # writer emits on exit is not lost.
        if not to_stdout:
            outfile.close()
|
||||
|
||||
|
||||
#=================================================================
|
||||
def write_cdx_index(outfile, infile, filename, **options):
    """Index a single archive stream into ``outfile``.

    The writer class is ``options['writer_cls']`` when that is set to a
    truthy value; otherwise ``SortedCDXWriter`` if ``options['sort']`` is
    truthy, else ``CDXWriter``. It is constructed with ``outfile`` and
    ``options.get('cdx09')`` and used as a context manager.

    All ``options`` are forwarded to ``create_index_iter``; each entry it
    yields is written together with ``filename``.

    :return: the object bound by the ``with`` statement, i.e. whatever the
        writer's ``__enter__`` returned
    """
    chosen_cls = options.get('writer_cls')

    # fall back to the built-in writers only when no custom class was given
    if not chosen_cls:
        chosen_cls = SortedCDXWriter if options.get('sort') else CDXWriter

    with chosen_cls(outfile, options.get('cdx09')) as writer:
        for entry in create_index_iter(infile, **options):
            writer.write(entry, filename)

    return writer
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -225,12 +228,13 @@ form query to url key. (Only applies to form url encoded posts)"""
|
||||
parser.add_argument('inputs', nargs='+', help=input_help)
|
||||
|
||||
cmd = parser.parse_args(args=args)
|
||||
if cmd.output != '-' and os.path.isdir(cmd.output):
|
||||
index_to_dir(cmd.inputs, cmd.output, cmd.sort,
|
||||
not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
|
||||
else:
|
||||
index_to_file(cmd.inputs, cmd.output, cmd.sort,
|
||||
not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
|
||||
|
||||
write_multi_cdx_index(cmd.output, cmd.inputs,
|
||||
sort=cmd.sort,
|
||||
surt_ordered=not cmd.unsurt,
|
||||
include_all=cmd.allrecords,
|
||||
append_post=cmd.postappend,
|
||||
cdx09=cmd.cdx09)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -94,21 +94,29 @@ class ArcWarcRecordLoader:
|
||||
known_format))
|
||||
|
||||
if the_format == 'arc':
|
||||
if rec_headers.get_header('uri').startswith('filedesc://'):
|
||||
rec_type = 'arc_header'
|
||||
length = 0
|
||||
else:
|
||||
rec_type = 'response'
|
||||
length = rec_headers.get_header('length')
|
||||
rec_type = 'response'
|
||||
uri = rec_headers.get_header('uri')
|
||||
length = rec_headers.get_header('length')
|
||||
sub_len = rec_headers.total_len
|
||||
|
||||
elif the_format == 'warc':
|
||||
rec_type = rec_headers.get_header('WARC-Type')
|
||||
uri = rec_headers.get_header('WARC-Target-URI')
|
||||
length = rec_headers.get_header('Content-Length')
|
||||
sub_len = 0
|
||||
|
||||
if rec_type == 'response' and uri:
|
||||
if uri.startswith('filedesc://'):
|
||||
rec_type = 'arc_header'
|
||||
elif uri.startswith('dns:'):
|
||||
rec_type = 'dns_response'
|
||||
elif uri.startswith('whois:'):
|
||||
rec_type = 'whois_response'
|
||||
|
||||
is_err = False
|
||||
|
||||
try:
|
||||
length = int(length)
|
||||
length = int(length) - sub_len
|
||||
if length < 0:
|
||||
is_err = True
|
||||
except ValueError:
|
||||
@ -139,8 +147,7 @@ class ArcWarcRecordLoader:
|
||||
|
||||
status_headers = StatusAndHeaders('200 OK', content_type)
|
||||
|
||||
elif (rec_type == 'warcinfo' or
|
||||
rec_type == 'arc_header'):
|
||||
elif (rec_type in ('warcinfo', 'arc_header', 'dns_response', 'whois_response')):
|
||||
# no extra parsing of body for these
|
||||
status_headers = StatusAndHeaders('204 No Content', [])
|
||||
|
||||
@ -182,7 +189,7 @@ class ArcWarcRecordLoader:
|
||||
return 'arc', rec_headers
|
||||
except StatusAndHeadersParserException as se:
|
||||
if known_format == 'arc':
|
||||
msg = 'Invalid WARC record, first line: '
|
||||
msg = 'Invalid ARC record, first line: '
|
||||
else:
|
||||
msg = 'Unknown archive format, first line: '
|
||||
raise ArchiveLoadFailed(msg + str(se.statusline))
|
||||
@ -194,16 +201,15 @@ class ARCHeadersParser:
|
||||
self.headernames = headernames
|
||||
|
||||
def parse(self, stream, headerline=None):
|
||||
|
||||
total_read = 0
|
||||
|
||||
# if headerline passed in, use that
|
||||
if headerline is None:
|
||||
headerline = stream.readline()
|
||||
|
||||
total_read = len(headerline)
|
||||
header_len = len(headerline)
|
||||
|
||||
if total_read == 0:
|
||||
if header_len == 0:
|
||||
raise EOFError()
|
||||
|
||||
headerline = headerline.rstrip()
|
||||
@ -212,8 +218,10 @@ class ARCHeadersParser:
|
||||
|
||||
# if arc header, consume next two lines
|
||||
if headerline.startswith('filedesc://'):
|
||||
stream.readline() # skip version
|
||||
stream.readline() # skip header spec, use preset one
|
||||
version = stream.readline() # skip version
|
||||
spec = stream.readline() # skip header spec, use preset one
|
||||
total_read += len(version)
|
||||
total_read += len(spec)
|
||||
|
||||
parts = headerline.split(' ')
|
||||
|
||||
|
@ -69,7 +69,7 @@ org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7
|
||||
org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
|
||||
|
||||
# post append
|
||||
>>> print_cdx_index('post-test.warc.gz', append_post_query=True)
|
||||
>>> print_cdx_index('post-test.warc.gz', append_post=True)
|
||||
CDX N b a m s k r M S V g
|
||||
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
|
||||
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
|
||||
@ -86,7 +86,7 @@ org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar applica
|
||||
org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
|
||||
|
||||
# post append + requests included
|
||||
>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post_query=True)
|
||||
>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post=True)
|
||||
CDX N b a m s k r M S V g
|
||||
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
|
||||
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
|
||||
@ -135,7 +135,7 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex
|
||||
from pywb import get_test_dir
|
||||
|
||||
#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
|
||||
from pywb.warc.cdxindexer import write_index, main, cdx_filename, CDXWriter, SortedCDXWriter
|
||||
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
|
||||
|
||||
from io import BytesIO
|
||||
import sys
|
||||
@ -157,19 +157,11 @@ def read_fully(cdx):
|
||||
curr.write(b)
|
||||
return curr.getvalue()
|
||||
|
||||
def cdx_index(warc, sort=False,
|
||||
include_all=False, append_post_query=False):
|
||||
def cdx_index(warc, **options):
|
||||
buff = BytesIO()
|
||||
|
||||
if sort:
|
||||
writer_cls = SortedCDXWriter
|
||||
else:
|
||||
writer_cls = CDXWriter
|
||||
|
||||
with writer_cls(buff) as writer:
|
||||
with open(TEST_WARC_DIR + warc) as fh:
|
||||
write_index(writer, warc, fh,
|
||||
True, append_post_query, include_all)
|
||||
with open(TEST_WARC_DIR + warc) as fh:
|
||||
write_cdx_index(buff, fh, warc, **options)
|
||||
|
||||
return buff.getvalue()
|
||||
|
||||
@ -177,7 +169,7 @@ def print_cdx_index(*args, **kwargs):
|
||||
sys.stdout.write(cdx_index(*args, **kwargs))
|
||||
|
||||
def assert_cdx_match(cdx, warc, sort=False):
|
||||
assert read_fully(cdx) == cdx_index(warc, sort)
|
||||
assert read_fully(cdx) == cdx_index(warc, sort=sort)
|
||||
|
||||
def test_sorted_warc_gz():
|
||||
assert_cdx_match('example.cdx', 'example.warc.gz', sort=True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user