mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cdx indexing: add support for 9-field cdx output,
request merge: store referer if available, check for record id matching
This commit is contained in:
parent
694b97e67f
commit
3965fad4dd
@ -170,22 +170,26 @@ class ArchiveIndexEntry(object):
|
|||||||
if digest:
|
if digest:
|
||||||
self.digest = digest
|
self.digest = digest
|
||||||
|
|
||||||
def add_post_query(self, other, options):
|
def merge_request_data(self, other, options):
|
||||||
surt_ordered = options.get('surt_ordered')
|
surt_ordered = options.get('surt_ordered')
|
||||||
|
|
||||||
if other.record.rec_type != 'request':
|
if other.record.rec_type != 'request':
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if not hasattr(other, 'post_query'):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# two requests, not correct
|
# two requests, not correct
|
||||||
if self.record.rec_type == 'request':
|
if self.record.rec_type == 'request':
|
||||||
return False
|
return False
|
||||||
|
|
||||||
url = append_post_query(self.url, other.post_query)
|
# merge POST/PUT body query
|
||||||
self.key = canonicalize(url, surt_ordered)
|
if hasattr(other, 'post_query'):
|
||||||
other.key = self.key
|
url = append_post_query(self.url, other.post_query)
|
||||||
|
self.key = canonicalize(url, surt_ordered)
|
||||||
|
other.key = self.key
|
||||||
|
|
||||||
|
referer = other.record.status_headers.get_header('referer')
|
||||||
|
if referer:
|
||||||
|
self.referer = referer
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
@ -244,13 +248,14 @@ def join_request_records(entry_iter, options):
|
|||||||
# check for url match
|
# check for url match
|
||||||
if (entry.url != prev_entry.url):
|
if (entry.url != prev_entry.url):
|
||||||
pass
|
pass
|
||||||
# check for concurrency also
|
|
||||||
#elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
|
|
||||||
# prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
|
|
||||||
# pass
|
|
||||||
|
|
||||||
elif (entry.add_post_query(prev_entry, options) or
|
# check for concurrency also
|
||||||
prev_entry.add_post_query(entry, options)):
|
elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
|
||||||
|
prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
|
||||||
|
pass
|
||||||
|
|
||||||
|
elif (entry.merge_request_data(prev_entry, options) or
|
||||||
|
prev_entry.merge_request_data(entry, options)):
|
||||||
yield prev_entry
|
yield prev_entry
|
||||||
yield entry
|
yield entry
|
||||||
prev_entry = None
|
prev_entry = None
|
||||||
|
@ -10,11 +10,16 @@ from archiveiterator import create_index_iter
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXWriter(object):
|
class CDXWriter(object):
|
||||||
def __init__(self, out):
|
def __init__(self, out, cdx09=False):
|
||||||
self.out = out
|
self.out = out
|
||||||
|
self.cdx09 = cdx09
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
self.out.write(' CDX N b a m s k r M S V g\n')
|
if not self.cdx09:
|
||||||
|
self.out.write(' CDX N b a m s k r M S V g\n')
|
||||||
|
else:
|
||||||
|
self.out.write(' CDX N b a m s k r V g\n')
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def write(self, entry, filename):
|
def write(self, entry, filename):
|
||||||
@ -23,8 +28,7 @@ class CDXWriter(object):
|
|||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@staticmethod
|
def write_cdx_line(self, out, entry, filename):
|
||||||
def write_cdx_line(out, entry, filename):
|
|
||||||
out.write(entry.key)
|
out.write(entry.key)
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.timestamp)
|
out.write(entry.timestamp)
|
||||||
@ -36,9 +40,12 @@ class CDXWriter(object):
|
|||||||
out.write(entry.status)
|
out.write(entry.status)
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.digest)
|
out.write(entry.digest)
|
||||||
out.write(' - - ')
|
if self.cdx09:
|
||||||
out.write(entry.length)
|
out.write(' - ')
|
||||||
out.write(' ')
|
else:
|
||||||
|
out.write(' - - ')
|
||||||
|
out.write(entry.length)
|
||||||
|
out.write(' ')
|
||||||
out.write(entry.offset)
|
out.write(entry.offset)
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(filename)
|
out.write(filename)
|
||||||
@ -47,9 +54,9 @@ class CDXWriter(object):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class SortedCDXWriter(CDXWriter):
|
class SortedCDXWriter(CDXWriter):
|
||||||
def __init__(self, out):
|
def __enter__(self):
|
||||||
super(SortedCDXWriter, self).__init__(out)
|
|
||||||
self.sortlist = []
|
self.sortlist = []
|
||||||
|
return super(SortedCDXWriter, self).__enter__()
|
||||||
|
|
||||||
def write(self, entry, filename):
|
def write(self, entry, filename):
|
||||||
outbuff = BytesIO()
|
outbuff = BytesIO()
|
||||||
@ -74,7 +81,7 @@ def iter_file_or_dir(inputs):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def index_to_file(inputs, output, sort,
|
def index_to_file(inputs, output, sort,
|
||||||
surt_ordered, include_all, append_post_query):
|
surt_ordered, include_all, append_post, cdx09):
|
||||||
if output == '-':
|
if output == '-':
|
||||||
outfile = sys.stdout
|
outfile = sys.stdout
|
||||||
else:
|
else:
|
||||||
@ -85,15 +92,15 @@ def index_to_file(inputs, output, sort,
|
|||||||
else:
|
else:
|
||||||
writer_cls = CDXWriter
|
writer_cls = CDXWriter
|
||||||
|
|
||||||
with writer_cls(outfile) as writer:
|
with writer_cls(outfile, cdx09) as writer:
|
||||||
for fullpath, filename in iter_file_or_dir(inputs):
|
for fullpath, filename in iter_file_or_dir(inputs):
|
||||||
with open(fullpath, 'r') as infile:
|
with open(fullpath, 'r') as infile:
|
||||||
write_index(writer, filename, infile,
|
write_index(writer, filename, infile,
|
||||||
surt_ordered, append_post_query, include_all)
|
surt_ordered, append_post, include_all)
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def index_to_dir(inputs, output, sort,
|
def index_to_dir(inputs, output, sort,
|
||||||
surt_ordered, include_all, append_post_query):
|
surt_ordered, include_all, append_post, cdx09):
|
||||||
|
|
||||||
if sort:
|
if sort:
|
||||||
writer_cls = SortedCDXWriter
|
writer_cls = SortedCDXWriter
|
||||||
@ -101,15 +108,14 @@ def index_to_dir(inputs, output, sort,
|
|||||||
writer_cls = CDXWriter
|
writer_cls = CDXWriter
|
||||||
|
|
||||||
for fullpath, filename in iter_file_or_dir(inputs):
|
for fullpath, filename in iter_file_or_dir(inputs):
|
||||||
|
|
||||||
outpath = cdx_filename(filename)
|
outpath = cdx_filename(filename)
|
||||||
outpath = os.path.join(output, outpath)
|
outpath = os.path.join(output, outpath)
|
||||||
|
|
||||||
with open(outpath, 'w') as outfile:
|
with open(outpath, 'w') as outfile:
|
||||||
with writer_cls(outfile) as writer:
|
with writer_cls(outfile, cdx09) as writer:
|
||||||
with open(fullpath, 'r') as infile:
|
with open(fullpath, 'r') as infile:
|
||||||
write_index(writer, filename, infile,
|
write_index(writer, filename, infile,
|
||||||
surt_ordered, append_post_query, include_all)
|
surt_ordered, append_post, include_all)
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def remove_ext(filename):
|
def remove_ext(filename):
|
||||||
@ -168,6 +174,10 @@ sort the output to each file before writing to create a total ordering
|
|||||||
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
|
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
|
||||||
urls for the cdx key. Default is to use SURT keys.
|
urls for the cdx key. Default is to use SURT keys.
|
||||||
Not-recommended for new cdx, use only for backwards-compatibility.
|
Not-recommended for new cdx, use only for backwards-compatibility.
|
||||||
|
"""
|
||||||
|
|
||||||
|
cdx09_help = """
|
||||||
|
Use older 9-field cdx format, default is 11-cdx field
|
||||||
"""
|
"""
|
||||||
|
|
||||||
output_help = """output file or directory.
|
output_help = """output file or directory.
|
||||||
@ -207,16 +217,20 @@ form query to url key. (Only applies to form url encoded posts)"""
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
help=unsurt_help)
|
help=unsurt_help)
|
||||||
|
|
||||||
|
parser.add_argument('-9', '--cdx09',
|
||||||
|
action='store_true',
|
||||||
|
help=cdx09_help)
|
||||||
|
|
||||||
parser.add_argument('output', nargs='?', default='-', help=output_help)
|
parser.add_argument('output', nargs='?', default='-', help=output_help)
|
||||||
parser.add_argument('inputs', nargs='+', help=input_help)
|
parser.add_argument('inputs', nargs='+', help=input_help)
|
||||||
|
|
||||||
cmd = parser.parse_args(args=args)
|
cmd = parser.parse_args(args=args)
|
||||||
if cmd.output != '-' and os.path.isdir(cmd.output):
|
if cmd.output != '-' and os.path.isdir(cmd.output):
|
||||||
index_to_dir(cmd.inputs, cmd.output, cmd.sort,
|
index_to_dir(cmd.inputs, cmd.output, cmd.sort,
|
||||||
not cmd.unsurt, cmd.allrecords, cmd.postappend)
|
not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
|
||||||
else:
|
else:
|
||||||
index_to_file(cmd.inputs, cmd.output, cmd.sort,
|
index_to_file(cmd.inputs, cmd.output, cmd.sort,
|
||||||
not cmd.unsurt, cmd.allrecords, cmd.postappend)
|
not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -107,9 +107,9 @@ org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWP
|
|||||||
204
|
204
|
||||||
|
|
||||||
# test sort, multiple inputs, all records + post query
|
# test sort, multiple inputs, all records + post query
|
||||||
>>> cli_lines(['--sort', '-a', '-p', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
||||||
395
|
395
|
||||||
|
|
||||||
# test writing to stdout
|
# test writing to stdout
|
||||||
|
Loading…
x
Reference in New Issue
Block a user