1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

indexing: refactor cdxindexer interface to better allow custom writers

record loader: skip whois: and dns: records; improve skipping of ARC headers
(TODO: add more unit tests)
This commit is contained in:
Ilya Kreymer 2014-06-24 17:08:10 -07:00
parent 3965fad4dd
commit 6761f5697f
4 changed files with 88 additions and 84 deletions

View File

@ -171,7 +171,7 @@ class ArchiveIndexEntry(object):
self.digest = digest
def merge_request_data(self, other, options):
surt_ordered = options.get('surt_ordered')
surt_ordered = options.get('surt_ordered', True)
if other.record.rec_type != 'request':
return False
@ -354,7 +354,7 @@ def create_index_iter(fh, **options):
entry_iter = create_record_iter(aiter, options)
if options.get('append_post'):
if options.get('append_post') == True:
entry_iter = join_request_records(entry_iter, options)
for entry in entry_iter:

View File

@ -79,44 +79,6 @@ def iter_file_or_dir(inputs):
yield os.path.join(input_, filename), filename
#=================================================================
def index_to_file(inputs, output, sort,
                  surt_ordered, include_all, append_post, cdx09):
    """Index every input archive into one CDX output.

    :param inputs: paths handed to ``iter_file_or_dir``, which yields
        ``(fullpath, filename)`` pairs.
    :param output: path of the file to write, or ``'-'`` for stdout.
    :param sort: if truthy, use ``SortedCDXWriter``; otherwise ``CDXWriter``.
    :param surt_ordered: forwarded unchanged to ``write_index``.
    :param include_all: forwarded unchanged to ``write_index``.
    :param append_post: forwarded unchanged to ``write_index``.
    :param cdx09: passed as the second argument to the writer class.
    """
    writer_cls = SortedCDXWriter if sort else CDXWriter

    # stdout is borrowed, not owned: only a file opened here gets closed
    to_stdout = (output == '-')
    outfile = sys.stdout if to_stdout else open(output, 'w')

    try:
        with writer_cls(outfile, cdx09) as writer:
            for fullpath, filename in iter_file_or_dir(inputs):
                # archive files are binary (typically gzip-compressed), so
                # read them in binary mode to avoid newline translation
                with open(fullpath, 'rb') as infile:
                    write_index(writer, filename, infile,
                                surt_ordered, append_post, include_all)
    finally:
        # previously the output handle was never closed explicitly
        if not to_stdout:
            outfile.close()
#=================================================================
def index_to_dir(inputs, output, sort,
                 surt_ordered, include_all, append_post, cdx09):
    """Write one CDX file per input archive into the ``output`` directory.

    :param inputs: paths handed to ``iter_file_or_dir``, which yields
        ``(fullpath, filename)`` pairs.
    :param output: directory that receives the generated files; each file
        name comes from ``cdx_filename(filename)``.
    :param sort: if truthy, use ``SortedCDXWriter``; otherwise ``CDXWriter``.
    :param surt_ordered: forwarded unchanged to ``write_index``.
    :param include_all: forwarded unchanged to ``write_index``.
    :param append_post: forwarded unchanged to ``write_index``.
    :param cdx09: passed as the second argument to the writer class.
    """
    writer_cls = SortedCDXWriter if sort else CDXWriter

    for fullpath, filename in iter_file_or_dir(inputs):
        outpath = os.path.join(output, cdx_filename(filename))

        with open(outpath, 'w') as outfile:
            with writer_cls(outfile, cdx09) as writer:
                # archive files are binary (typically gzip-compressed), so
                # read them in binary mode to avoid newline translation
                with open(fullpath, 'rb') as infile:
                    write_index(writer, filename, infile,
                                surt_ordered, append_post, include_all)
#=================================================================
def remove_ext(filename):
for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'):
@ -133,16 +95,57 @@ def cdx_filename(filename):
#=================================================================
def write_index(writer, filename, infile,
surt_ordered, append_post, include_all):
def write_multi_cdx_index(output, inputs, **options):
entry_iter = create_index_iter(infile,
surt_ordered=surt_ordered,
append_post=append_post,
include_all=include_all)
# write one cdx per dir
if output != '-' and os.path.isdir(output):
for fullpath, filename in iter_file_or_dir(inputs):
outpath = cdx_filename(filename)
outpath = os.path.join(output, outpath)
for entry in entry_iter:
writer.write(entry, filename)
with open(outpath, 'w') as outfile:
with open(fullpath, 'r') as infile:
write_cdx_index(outfile, infile, filename, **options)
# write to one cdx file
else:
if output == '-':
outfile = sys.stdout
else:
outfile = open(output, 'w')
if options.get('sort'):
writer_cls = SortedCDXWriter
else:
writer_cls = CDXWriter
with writer_cls(outfile, options.get('cdx09')) as writer:
for fullpath, filename in iter_file_or_dir(inputs):
with open(fullpath, 'r') as infile:
entry_iter = create_index_iter(infile, **options)
for entry in entry_iter:
writer.write(entry, filename)
#=================================================================
def write_cdx_index(outfile, infile, filename, **options):
    """Index a single archive stream and write its entries to ``outfile``.

    Recognised ``options`` keys used here:

    * ``writer_cls`` -- writer class to use; when absent or falsy the
      class is chosen from ``sort``.
    * ``sort`` -- truthy selects ``SortedCDXWriter``, else ``CDXWriter``.
    * ``cdx09`` -- passed as the second argument to the writer class.

    All options are also forwarded unchanged to ``create_index_iter``.

    :return: the object bound by the writer's ``with`` statement.
    """
    # an explicitly supplied writer class wins; otherwise ``sort`` decides
    writer_cls = options.get('writer_cls')
    if not writer_cls:
        writer_cls = SortedCDXWriter if options.get('sort') else CDXWriter

    with writer_cls(outfile, options.get('cdx09')) as cdx_writer:
        for cdx_entry in create_index_iter(infile, **options):
            cdx_writer.write(cdx_entry, filename)

    return cdx_writer
#=================================================================
@ -225,12 +228,13 @@ form query to url key. (Only applies to form url encoded posts)"""
parser.add_argument('inputs', nargs='+', help=input_help)
cmd = parser.parse_args(args=args)
if cmd.output != '-' and os.path.isdir(cmd.output):
index_to_dir(cmd.inputs, cmd.output, cmd.sort,
not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
else:
index_to_file(cmd.inputs, cmd.output, cmd.sort,
not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
write_multi_cdx_index(cmd.output, cmd.inputs,
sort=cmd.sort,
surt_ordered=not cmd.unsurt,
include_all=cmd.allrecords,
append_post=cmd.postappend,
cdx09=cmd.cdx09)
if __name__ == '__main__':

View File

@ -94,21 +94,29 @@ class ArcWarcRecordLoader:
known_format))
if the_format == 'arc':
if rec_headers.get_header('uri').startswith('filedesc://'):
rec_type = 'arc_header'
length = 0
else:
rec_type = 'response'
length = rec_headers.get_header('length')
rec_type = 'response'
uri = rec_headers.get_header('uri')
length = rec_headers.get_header('length')
sub_len = rec_headers.total_len
elif the_format == 'warc':
rec_type = rec_headers.get_header('WARC-Type')
uri = rec_headers.get_header('WARC-Target-URI')
length = rec_headers.get_header('Content-Length')
sub_len = 0
if rec_type == 'response' and uri:
if uri.startswith('filedesc://'):
rec_type = 'arc_header'
elif uri.startswith('dns:'):
rec_type = 'dns_response'
elif uri.startswith('whois:'):
rec_type = 'whois_response'
is_err = False
try:
length = int(length)
length = int(length) - sub_len
if length < 0:
is_err = True
except ValueError:
@ -139,8 +147,7 @@ class ArcWarcRecordLoader:
status_headers = StatusAndHeaders('200 OK', content_type)
elif (rec_type == 'warcinfo' or
rec_type == 'arc_header'):
elif (rec_type in ('warcinfo', 'arc_header', 'dns_response', 'whois_response')):
# no extra parsing of body for these
status_headers = StatusAndHeaders('204 No Content', [])
@ -182,7 +189,7 @@ class ArcWarcRecordLoader:
return 'arc', rec_headers
except StatusAndHeadersParserException as se:
if known_format == 'arc':
msg = 'Invalid WARC record, first line: '
msg = 'Invalid ARC record, first line: '
else:
msg = 'Unknown archive format, first line: '
raise ArchiveLoadFailed(msg + str(se.statusline))
@ -194,16 +201,15 @@ class ARCHeadersParser:
self.headernames = headernames
def parse(self, stream, headerline=None):
total_read = 0
# if headerline passed in, use that
if headerline is None:
headerline = stream.readline()
total_read = len(headerline)
header_len = len(headerline)
if total_read == 0:
if header_len == 0:
raise EOFError()
headerline = headerline.rstrip()
@ -212,8 +218,10 @@ class ARCHeadersParser:
# if arc header, consume next two lines
if headerline.startswith('filedesc://'):
stream.readline() # skip version
stream.readline() # skip header spec, use preset one
version = stream.readline() # skip version
spec = stream.readline() # skip header spec, use preset one
total_read += len(version)
total_read += len(spec)
parts = headerline.split(' ')

View File

@ -69,7 +69,7 @@ org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7
org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
# post append
>>> print_cdx_index('post-test.warc.gz', append_post_query=True)
>>> print_cdx_index('post-test.warc.gz', append_post=True)
CDX N b a m s k r M S V g
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
@ -86,7 +86,7 @@ org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar applica
org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
# post append + requests included
>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post_query=True)
>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post=True)
CDX N b a m s k r M S V g
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
@ -135,7 +135,7 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex
from pywb import get_test_dir
#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
from pywb.warc.cdxindexer import write_index, main, cdx_filename, CDXWriter, SortedCDXWriter
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
from io import BytesIO
import sys
@ -157,19 +157,11 @@ def read_fully(cdx):
curr.write(b)
return curr.getvalue()
def cdx_index(warc, sort=False,
include_all=False, append_post_query=False):
def cdx_index(warc, **options):
buff = BytesIO()
if sort:
writer_cls = SortedCDXWriter
else:
writer_cls = CDXWriter
with writer_cls(buff) as writer:
with open(TEST_WARC_DIR + warc) as fh:
write_index(writer, warc, fh,
True, append_post_query, include_all)
with open(TEST_WARC_DIR + warc) as fh:
write_cdx_index(buff, fh, warc, **options)
return buff.getvalue()
@ -177,7 +169,7 @@ def print_cdx_index(*args, **kwargs):
sys.stdout.write(cdx_index(*args, **kwargs))
def assert_cdx_match(cdx, warc, sort=False):
assert read_fully(cdx) == cdx_index(warc, sort)
assert read_fully(cdx) == cdx_index(warc, sort=sort)
def test_sorted_warc_gz():
assert_cdx_match('example.cdx', 'example.warc.gz', sort=True)