indexing: refactor cdxindexer interface to better allow custom writers

record loader: skip whois: and dns: records, better skipping of arc headers (todo: need more unit tests)
2025-03-15 00:03:28 +01:00 · 2014-06-24 17:08:10 -07:00 · 2014-06-24 17:08:10 -07:00 · 6761f5697f
commit 6761f5697f
parent 3965fad4dd
4 changed files with 88 additions and 84 deletions
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -171,7 +171,7 @@ class ArchiveIndexEntry(object):
            self.digest = digest

    def merge_request_data(self, other, options):
-        surt_ordered = options.get('surt_ordered')
+        surt_ordered = options.get('surt_ordered', True)

        if other.record.rec_type != 'request':
            return False
@@ -354,7 +354,7 @@ def create_index_iter(fh, **options):

    entry_iter = create_record_iter(aiter, options)

-    if options.get('append_post'):
+    if options.get('append_post') == True:
        entry_iter = join_request_records(entry_iter, options)

    for entry in entry_iter:
--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@@ -79,44 +79,6 @@ def iter_file_or_dir(inputs):
                yield os.path.join(input_, filename), filename


-#=================================================================
-def index_to_file(inputs, output, sort,
-                  surt_ordered, include_all, append_post, cdx09):
-    if output == '-':
-        outfile = sys.stdout
-    else:
-        outfile = open(output, 'w')
-
-    if sort:
-        writer_cls = SortedCDXWriter
-    else:
-        writer_cls = CDXWriter
-
-    with writer_cls(outfile, cdx09) as writer:
-        for fullpath, filename in iter_file_or_dir(inputs):
-            with open(fullpath, 'r') as infile:
-                write_index(writer, filename, infile,
-                            surt_ordered, append_post, include_all)
-
-#=================================================================
-def index_to_dir(inputs, output, sort,
-                 surt_ordered, include_all, append_post, cdx09):
-
-    if sort:
-        writer_cls = SortedCDXWriter
-    else:
-        writer_cls = CDXWriter
-
-    for fullpath, filename in iter_file_or_dir(inputs):
-        outpath = cdx_filename(filename)
-        outpath = os.path.join(output, outpath)
-
-        with open(outpath, 'w') as outfile:
-            with writer_cls(outfile, cdx09) as writer:
-                with open(fullpath, 'r') as infile:
-                    write_index(writer, filename, infile,
-                                surt_ordered, append_post, include_all)
-
 #=================================================================
 def remove_ext(filename):
    for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'):
@@ -133,16 +95,57 @@ def cdx_filename(filename):


 #=================================================================
-def write_index(writer, filename, infile,
-                surt_ordered, append_post, include_all):
+def write_multi_cdx_index(output, inputs, **options):
+    """Write a cdx index for every file from iter_file_or_dir(inputs).
+
+    If output is an existing dir, one cdx per archive is written into it;
+    otherwise one combined cdx goes to the output file ('-' is stdout).
+    options are forwarded to write_cdx_index() / create_index_iter().
+    """

-    entry_iter = create_index_iter(infile,
-                                   surt_ordered=surt_ordered,
-                                   append_post=append_post,
-                                   include_all=include_all)
+    # one cdx per input archive, inside the output dir
+    if output != '-' and os.path.isdir(output):
+        for fullpath, filename in iter_file_or_dir(inputs):
+            outpath = os.path.join(output, cdx_filename(filename))

-    for entry in entry_iter:
-        writer.write(entry, filename)
+            with open(outpath, 'w') as outfile:
+                with open(fullpath, 'r') as infile:
+                    write_cdx_index(outfile, infile, filename, **options)
+        return
+
+    # one combined cdx; writer is picked the same way as in write_cdx_index()
+    default_cls = SortedCDXWriter if options.get('sort') else CDXWriter
+    writer_cls = options.get('writer_cls') or default_cls
+    outfile = sys.stdout if output == '-' else open(output, 'w')
+    try:
+        with writer_cls(outfile, options.get('cdx09')) as writer:
+            for fullpath, filename in iter_file_or_dir(inputs):
+                with open(fullpath, 'r') as infile:
+                    for entry in create_index_iter(infile, **options):
+                        writer.write(entry, filename)
+    finally:
+        if outfile is not sys.stdout:
+            outfile.close()  # close only the file opened here, never stdout
+
+
+#=================================================================
+def write_cdx_index(outfile, infile, filename, **options):
+    """Write a cdx line to outfile for each entry indexed from infile.
+
+    The writer class is options['writer_cls'] when truthy, else
+    SortedCDXWriter if options['sort'] is truthy, else CDXWriter.
+    Returns the writer after its context has been exited.
+    """
+    if options.get('sort'):
+        default_cls = SortedCDXWriter
+    else:
+        default_cls = CDXWriter
+    writer_cls = options.get('writer_cls') or default_cls
+
+    with writer_cls(outfile, options.get('cdx09')) as writer:
+        for entry in create_index_iter(infile, **options):
+            writer.write(entry, filename)
+    return writer


 #=================================================================
@@ -225,12 +228,13 @@ form query to url key. (Only applies to form url encoded posts)"""
    parser.add_argument('inputs', nargs='+', help=input_help)

    cmd = parser.parse_args(args=args)
-    if cmd.output != '-' and os.path.isdir(cmd.output):
-        index_to_dir(cmd.inputs, cmd.output, cmd.sort,
-                     not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
-    else:
-        index_to_file(cmd.inputs, cmd.output, cmd.sort,
-                      not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
+
+    write_multi_cdx_index(cmd.output, cmd.inputs,
+                          sort=cmd.sort,
+                          surt_ordered=not cmd.unsurt,
+                          include_all=cmd.allrecords,
+                          append_post=cmd.postappend,
+                          cdx09=cmd.cdx09)


 if __name__ == '__main__':
--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@@ -94,21 +94,29 @@ class ArcWarcRecordLoader:
                                                               known_format))

        if the_format == 'arc':
-            if rec_headers.get_header('uri').startswith('filedesc://'):
-                rec_type = 'arc_header'
-                length = 0
-            else:
-                rec_type = 'response'
-                length = rec_headers.get_header('length')
+            rec_type = 'response'
+            uri = rec_headers.get_header('uri')
+            length = rec_headers.get_header('length')
+            sub_len = rec_headers.total_len

        elif the_format == 'warc':
            rec_type = rec_headers.get_header('WARC-Type')
+            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
+            sub_len = 0
+
+        if rec_type == 'response' and uri:
+            if uri.startswith('filedesc://'):
+                rec_type = 'arc_header'
+            elif uri.startswith('dns:'):
+                rec_type = 'dns_response'
+            elif uri.startswith('whois:'):
+                rec_type = 'whois_response'

        is_err = False

        try:
-            length = int(length)
+            length = int(length) - sub_len
            if length < 0:
                is_err = True
        except ValueError:
@@ -139,8 +147,7 @@ class ArcWarcRecordLoader:

            status_headers = StatusAndHeaders('200 OK', content_type)

-        elif (rec_type == 'warcinfo' or
-              rec_type == 'arc_header'):
+        elif (rec_type in ('warcinfo', 'arc_header', 'dns_response', 'whois_response')):
            # no extra parsing of body for these
            status_headers = StatusAndHeaders('204 No Content', [])

@@ -182,7 +189,7 @@ class ArcWarcRecordLoader:
            return 'arc', rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
-                msg = 'Invalid WARC record, first line: '
+                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))
@@ -194,16 +201,15 @@ class ARCHeadersParser:
        self.headernames = headernames

    def parse(self, stream, headerline=None):
-
        total_read = 0

        # if headerline passed in, use that
        if headerline is None:
            headerline = stream.readline()

-        total_read = len(headerline)
+        header_len = len(headerline)

-        if total_read == 0:
+        if header_len == 0:
            raise EOFError()

        headerline = headerline.rstrip()
@@ -212,8 +218,10 @@ class ARCHeadersParser:

        # if arc header, consume next two lines
        if headerline.startswith('filedesc://'):
-            stream.readline()  # skip version
-            stream.readline()  # skip header spec, use preset one
+            version = stream.readline()  # skip version
+            spec = stream.readline()  # skip header spec, use preset one
+            total_read += len(version)
+            total_read += len(spec)

        parts = headerline.split(' ')

--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -69,7 +69,7 @@ org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7
 org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz

 # post append
->>> print_cdx_index('post-test.warc.gz', append_post_query=True)
+>>> print_cdx_index('post-test.warc.gz', append_post=True)
 CDX N b a m s k r M S V g
 org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
 org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
@@ -86,7 +86,7 @@ org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar applica
 org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz

 # post append + requests included
->>> print_cdx_index('post-test.warc.gz', include_all=True, append_post_query=True)
+>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post=True)
 CDX N b a m s k r M S V g
 org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
 org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
@@ -135,7 +135,7 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex
 from pywb import get_test_dir

 #from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
-from pywb.warc.cdxindexer import write_index, main, cdx_filename, CDXWriter, SortedCDXWriter
+from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename

 from io import BytesIO
 import sys
@@ -157,19 +157,11 @@ def read_fully(cdx):
            curr.write(b)
    return curr.getvalue()

-def cdx_index(warc, sort=False,
-              include_all=False, append_post_query=False):
+def cdx_index(warc, **options):
    buff = BytesIO()

-    if sort:
-        writer_cls = SortedCDXWriter
-    else:
-        writer_cls = CDXWriter
-
-    with writer_cls(buff) as writer:
-        with open(TEST_WARC_DIR + warc) as fh:
-            write_index(writer, warc, fh,
-                        True, append_post_query, include_all)
+    with open(TEST_WARC_DIR + warc) as fh:
+        write_cdx_index(buff, fh,  warc, **options)

    return buff.getvalue()

@@ -177,7 +169,7 @@ def print_cdx_index(*args, **kwargs):
    sys.stdout.write(cdx_index(*args, **kwargs))

 def assert_cdx_match(cdx, warc, sort=False):
-    assert read_fully(cdx) == cdx_index(warc, sort)
+    assert read_fully(cdx) == cdx_index(warc, sort=sort)

 def test_sorted_warc_gz():
    assert_cdx_match('example.cdx', 'example.warc.gz', sort=True)