cdx indexing: add support for 9-field cdx output

request merge: store the referer if available, and only merge when the record's WARC-Concurrent-To matches the previous record's WARC-Record-ID
2025-03-24 06:59:52 +01:00 · 2014-06-19 16:51:23 -07:00 · 2014-06-19 16:51:23 -07:00 · 3965fad4dd
commit 3965fad4dd
parent 694b97e67f
3 changed files with 53 additions and 34 deletions
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@ -170,22 +170,26 @@ class ArchiveIndexEntry(object):
        if digest:
            self.digest = digest
-    def add_post_query(self, other, options):
+    def merge_request_data(self, other, options):
        surt_ordered = options.get('surt_ordered')
        if other.record.rec_type != 'request':
            return False
        if not hasattr(other, 'post_query'):
            return False
        # two requests, not correct
        if self.record.rec_type == 'request':
            return False
-        url = append_post_query(self.url, other.post_query)
+        # merge POST/PUT body query
-        self.key = canonicalize(url, surt_ordered)
+        if hasattr(other, 'post_query'):
-        other.key = self.key
+            url = append_post_query(self.url, other.post_query)
            self.key = canonicalize(url, surt_ordered)
            other.key = self.key
        referer = other.record.status_headers.get_header('referer')
        if referer:
            self.referer = referer
        return True
@ -244,13 +248,14 @@ def join_request_records(entry_iter, options):
        # check for url match
        if (entry.url != prev_entry.url):
            pass
        # check for concurrency also
        #elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
        #    prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
        #    pass
-        elif (entry.add_post_query(prev_entry, options) or
+        # check for concurrency also
-            prev_entry.add_post_query(entry, options)):
+        elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
            prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
            pass
        elif (entry.merge_request_data(prev_entry, options) or
            prev_entry.merge_request_data(entry, options)):
            yield prev_entry
            yield entry
            prev_entry = None
--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@ -10,11 +10,16 @@ from archiveiterator import create_index_iter
 #=================================================================
 class CDXWriter(object):
-    def __init__(self, out):
+    def __init__(self, out, cdx09=False):
        self.out = out
        self.cdx09 = cdx09
    def __enter__(self):
-        self.out.write(' CDX N b a m s k r M S V g\n')
+        if not self.cdx09:
            self.out.write(' CDX N b a m s k r M S V g\n')
        else:
            self.out.write(' CDX N b a m s k r V g\n')
        return self
    def write(self, entry, filename):
@ -23,8 +28,7 @@ class CDXWriter(object):
    def __exit__(self, *args):
        return False
-    @staticmethod
+    def write_cdx_line(self, out, entry, filename):
    def write_cdx_line(out, entry, filename):
        out.write(entry.key)
        out.write(' ')
        out.write(entry.timestamp)
@ -36,9 +40,12 @@ class CDXWriter(object):
        out.write(entry.status)
        out.write(' ')
        out.write(entry.digest)
-        out.write(' - - ')
+        if self.cdx09:
-        out.write(entry.length)
+            out.write(' - ')
-        out.write(' ')
+        else:
            out.write(' - - ')
            out.write(entry.length)
            out.write(' ')
        out.write(entry.offset)
        out.write(' ')
        out.write(filename)
@ -47,9 +54,9 @@ class CDXWriter(object):
 #=================================================================
 class SortedCDXWriter(CDXWriter):
-    def __init__(self, out):
+    def __enter__(self):
        super(SortedCDXWriter, self).__init__(out)
        self.sortlist = []
        return super(SortedCDXWriter, self).__enter__()
    def write(self, entry, filename):
        outbuff = BytesIO()
@ -74,7 +81,7 @@ def iter_file_or_dir(inputs):
 #=================================================================
 def index_to_file(inputs, output, sort,
-                  surt_ordered, include_all, append_post_query):
+                  surt_ordered, include_all, append_post, cdx09):
    if output == '-':
        outfile = sys.stdout
    else:
@ -85,15 +92,15 @@ def index_to_file(inputs, output, sort,
    else:
        writer_cls = CDXWriter
-    with writer_cls(outfile) as writer:
+    with writer_cls(outfile, cdx09) as writer:
        for fullpath, filename in iter_file_or_dir(inputs):
            with open(fullpath, 'r') as infile:
                write_index(writer, filename, infile,
-                            surt_ordered, append_post_query, include_all)
+                            surt_ordered, append_post, include_all)
 #=================================================================
 def index_to_dir(inputs, output, sort,
-                 surt_ordered, include_all, append_post_query):
+                 surt_ordered, include_all, append_post, cdx09):
    if sort:
        writer_cls = SortedCDXWriter
@ -101,15 +108,14 @@ def index_to_dir(inputs, output, sort,
        writer_cls = CDXWriter
    for fullpath, filename in iter_file_or_dir(inputs):
        outpath = cdx_filename(filename)
        outpath = os.path.join(output, outpath)
        with open(outpath, 'w') as outfile:
-            with writer_cls(outfile) as writer:
+            with writer_cls(outfile, cdx09) as writer:
                with open(fullpath, 'r') as infile:
                    write_index(writer, filename, infile,
-                                surt_ordered, append_post_query, include_all)
+                                surt_ordered, append_post, include_all)
 #=================================================================
 def remove_ext(filename):
@ -168,6 +174,10 @@ sort the output to each file before writing to create a total ordering
 Convert SURT (Sort-friendly URI Reordering Transform) back to regular
 urls for the cdx key. Default is to use SURT keys.
 Not-recommended for new cdx, use only for backwards-compatibility.
 """
    cdx09_help = """
 Use older 9-field cdx format, default is 11-cdx field
 """
    output_help = """output file or directory.
@ -207,16 +217,20 @@ form query to url key. (Only applies to form url encoded posts)"""
                        action='store_true',
                        help=unsurt_help)
    parser.add_argument('-9', '--cdx09',
                        action='store_true',
                        help=cdx09_help)
    parser.add_argument('output', nargs='?', default='-', help=output_help)
    parser.add_argument('inputs', nargs='+', help=input_help)
    cmd = parser.parse_args(args=args)
    if cmd.output != '-' and os.path.isdir(cmd.output):
        index_to_dir(cmd.inputs, cmd.output, cmd.sort,
-                     not cmd.unsurt, cmd.allrecords, cmd.postappend)
+                     not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
    else:
        index_to_file(cmd.inputs, cmd.output, cmd.sort,
-                      not cmd.unsurt, cmd.allrecords, cmd.postappend)
+                      not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
 if __name__ == '__main__':
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@ -107,9 +107,9 @@ org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWP
 204
 # test sort, multiple inputs, all records + post query
->>> cli_lines(['--sort', '-a', '-p', TEST_WARC_DIR])
+>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
-com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
+com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
-org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
+org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
 395
 # test writing to stdout