diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index e2c678ed..1654a696 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -171,7 +171,7 @@ class ArchiveIndexEntry(object): self.digest = digest def merge_request_data(self, other, options): - surt_ordered = options.get('surt_ordered') + surt_ordered = options.get('surt_ordered', True) if other.record.rec_type != 'request': return False @@ -354,7 +354,7 @@ def create_index_iter(fh, **options): entry_iter = create_record_iter(aiter, options) - if options.get('append_post'): + if options.get('append_post') == True: entry_iter = join_request_records(entry_iter, options) for entry in entry_iter: diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 0ba0900b..d40b3a2b 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -79,44 +79,6 @@ def iter_file_or_dir(inputs): yield os.path.join(input_, filename), filename -#================================================================= -def index_to_file(inputs, output, sort, - surt_ordered, include_all, append_post, cdx09): - if output == '-': - outfile = sys.stdout - else: - outfile = open(output, 'w') - - if sort: - writer_cls = SortedCDXWriter - else: - writer_cls = CDXWriter - - with writer_cls(outfile, cdx09) as writer: - for fullpath, filename in iter_file_or_dir(inputs): - with open(fullpath, 'r') as infile: - write_index(writer, filename, infile, - surt_ordered, append_post, include_all) - -#================================================================= -def index_to_dir(inputs, output, sort, - surt_ordered, include_all, append_post, cdx09): - - if sort: - writer_cls = SortedCDXWriter - else: - writer_cls = CDXWriter - - for fullpath, filename in iter_file_or_dir(inputs): - outpath = cdx_filename(filename) - outpath = os.path.join(output, outpath) - - with open(outpath, 'w') as outfile: - with writer_cls(outfile, cdx09) as writer: - with open(fullpath, 'r') as infile: - write_index(writer, filename, infile, - surt_ordered, append_post, include_all) - #================================================================= def remove_ext(filename): for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'): @@ -133,16 +95,57 @@ def cdx_filename(filename): #================================================================= -def write_index(writer, filename, infile, - surt_ordered, append_post, include_all): +def write_multi_cdx_index(output, inputs, **options): - entry_iter = create_index_iter(infile, - surt_ordered=surt_ordered, - append_post=append_post, - include_all=include_all) + # write one cdx per dir + if output != '-' and os.path.isdir(output): + for fullpath, filename in iter_file_or_dir(inputs): + outpath = cdx_filename(filename) + outpath = os.path.join(output, outpath) - for entry in entry_iter: - writer.write(entry, filename) + with open(outpath, 'w') as outfile: + with open(fullpath, 'r') as infile: + write_cdx_index(outfile, infile, filename, **options) + + # write to one cdx file + else: + if output == '-': + outfile = sys.stdout + else: + outfile = open(output, 'w') + + if options.get('sort'): + writer_cls = SortedCDXWriter + else: + writer_cls = CDXWriter + + with writer_cls(outfile, options.get('cdx09')) as writer: + for fullpath, filename in iter_file_or_dir(inputs): + with open(fullpath, 'r') as infile: + entry_iter = create_index_iter(infile, **options) + + for entry in entry_iter: + writer.write(entry, filename) + + +#================================================================= +def write_cdx_index(outfile, infile, filename, **options): + writer_cls = options.get('writer_cls') + + if writer_cls: + pass + elif options.get('sort'): + writer_cls = SortedCDXWriter + else: + writer_cls = CDXWriter + + with writer_cls(outfile, options.get('cdx09')) as writer: + entry_iter = create_index_iter(infile, **options) + + for entry in entry_iter: + writer.write(entry, filename) + + return writer #================================================================= @@ -225,12 +228,13 @@ form query to url key. (Only applies to form url encoded posts)""" parser.add_argument('inputs', nargs='+', help=input_help) cmd = parser.parse_args(args=args) - if cmd.output != '-' and os.path.isdir(cmd.output): - index_to_dir(cmd.inputs, cmd.output, cmd.sort, - not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09) - else: - index_to_file(cmd.inputs, cmd.output, cmd.sort, - not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09) + + write_multi_cdx_index(cmd.output, cmd.inputs, + sort=cmd.sort, + surt_ordered=not cmd.unsurt, + include_all=cmd.allrecords, + append_post=cmd.postappend, + cdx09=cmd.cdx09) if __name__ == '__main__': diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 98355a5d..ed904ac7 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -94,21 +94,29 @@ class ArcWarcRecordLoader: known_format)) if the_format == 'arc': - if rec_headers.get_header('uri').startswith('filedesc://'): - rec_type = 'arc_header' - length = 0 - else: - rec_type = 'response' - length = rec_headers.get_header('length') + rec_type = 'response' + uri = rec_headers.get_header('uri') + length = rec_headers.get_header('length') + sub_len = rec_headers.total_len elif the_format == 'warc': rec_type = rec_headers.get_header('WARC-Type') + uri = rec_headers.get_header('WARC-Target-URI') length = rec_headers.get_header('Content-Length') + sub_len = 0 + + if rec_type == 'response' and uri: + if uri.startswith('filedesc://'): + rec_type = 'arc_header' + elif uri.startswith('dns:'): + rec_type = 'dns_response' + elif uri.startswith('whois:'): + rec_type = 'whois_response' is_err = False try: - length = int(length) + length = int(length) - sub_len if length < 0: is_err = True except ValueError: @@ -139,8 +147,7 @@ class ArcWarcRecordLoader: status_headers = StatusAndHeaders('200 OK', content_type) - elif (rec_type == 'warcinfo' or - rec_type == 'arc_header'): + elif (rec_type in ('warcinfo', 'arc_header', 'dns_response', 'whois_response')): # no extra parsing of body for these status_headers = StatusAndHeaders('204 No Content', []) @@ -182,7 +189,7 @@ class ArcWarcRecordLoader: return 'arc', rec_headers except StatusAndHeadersParserException as se: if known_format == 'arc': - msg = 'Invalid WARC record, first line: ' + msg = 'Invalid ARC record, first line: ' else: msg = 'Unknown archive format, first line: ' raise ArchiveLoadFailed(msg + str(se.statusline)) @@ -194,16 +201,15 @@ class ARCHeadersParser: self.headernames = headernames def parse(self, stream, headerline=None): - total_read = 0 # if headerline passed in, use that if headerline is None: headerline = stream.readline() - total_read = len(headerline) + header_len = len(headerline) - if total_read == 0: + if header_len == 0: raise EOFError() headerline = headerline.rstrip() @@ -212,8 +218,10 @@ class ARCHeadersParser: # if arc header, consume next two lines if headerline.startswith('filedesc://'): - stream.readline() # skip version - stream.readline() # skip header spec, use preset one + version = stream.readline() # skip version + spec = stream.readline() # skip header spec, use preset one + total_read += len(version) + total_read += len(spec) parts = headerline.split(' ') diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index b13c71b5..172d923e 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -69,7 +69,7 @@ org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7 org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz # post append ->>> print_cdx_index('post-test.warc.gz', append_post_query=True) +>>> print_cdx_index('post-test.warc.gz', append_post=True) CDX N b a m s k r M S V g org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz @@ -86,7 +86,7 @@ org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar applica org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz # post append + requests included ->>> print_cdx_index('post-test.warc.gz', include_all=True, append_post_query=True) +>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post=True) CDX N b a m s k r M S V g org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz @@ -135,7 +135,7 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex from pywb import get_test_dir #from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename -from pywb.warc.cdxindexer import write_index, main, cdx_filename, CDXWriter, SortedCDXWriter +from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename from io import BytesIO import sys @@ -157,19 +157,11 @@ def read_fully(cdx): curr.write(b) return curr.getvalue() -def cdx_index(warc, sort=False, - include_all=False, append_post_query=False): +def cdx_index(warc, **options): buff = BytesIO() - if sort: - writer_cls = SortedCDXWriter - else: - writer_cls = CDXWriter - - with writer_cls(buff) as writer: - with open(TEST_WARC_DIR + warc) as fh: - write_index(writer, warc, fh, - True, append_post_query, include_all) + with open(TEST_WARC_DIR + warc) as fh: + write_cdx_index(buff, fh, warc, **options) return buff.getvalue() @@ -177,7 +169,7 @@ def print_cdx_index(*args, **kwargs): sys.stdout.write(cdx_index(*args, **kwargs)) def assert_cdx_match(cdx, warc, sort=False): - assert read_fully(cdx) == cdx_index(warc, sort) + assert read_fully(cdx) == cdx_index(warc, sort=sort) def test_sorted_warc_gz(): assert_cdx_match('example.cdx', 'example.warc.gz', sort=True)