Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-15 00:03:28 +01:00)
cdx indexing: add support for 9-field cdx output,
request merge: store referer if available, check for record id matching
parent 694b97e67f
commit 3965fad4dd
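
For orientation before the hunks: the 11-field and 9-field CDX outputs differ only in the M (meta tags / robot flags) and S (compressed record length) columns. A rough legend for the header letters used by the writer below, based on the common CDX field abbreviations rather than anything defined in this commit:

    # Standard CDX legend letters, listed here for reference only (assumed,
    # not part of this diff).
    CDX_FIELDS = {
        'N': 'massaged (canonicalized) url key',
        'b': 'timestamp',
        'a': 'original url',
        'm': 'mime type',
        's': 'http status code',
        'k': 'digest (checksum)',
        'r': 'redirect',
        'M': 'meta tags / robot flags',   # dropped in 9-field output
        'S': 'compressed record length',  # dropped in 9-field output
        'V': 'compressed archive offset',
        'g': 'archive filename',
    }
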
@@ -170,22 +170,26 @@ class ArchiveIndexEntry(object):
         if digest:
             self.digest = digest
 
-    def add_post_query(self, other, options):
+    def merge_request_data(self, other, options):
         surt_ordered = options.get('surt_ordered')
 
         if other.record.rec_type != 'request':
             return False
 
-        if not hasattr(other, 'post_query'):
-            return False
-
         # two requests, not correct
         if self.record.rec_type == 'request':
             return False
 
-        url = append_post_query(self.url, other.post_query)
-        self.key = canonicalize(url, surt_ordered)
-        other.key = self.key
+        # merge POST/PUT body query
+        if hasattr(other, 'post_query'):
+            url = append_post_query(self.url, other.post_query)
+            self.key = canonicalize(url, surt_ordered)
+            other.key = self.key
+
+        referer = other.record.status_headers.get_header('referer')
+        if referer:
+            self.referer = referer
 
         return True
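
What merge_request_data() accomplishes when a matching request record is found: the decoded POST/PUT body is folded into the URL before canonicalization, so both entries share one key, and the request's Referer is kept on the response entry. A minimal sketch; append_post_query() and canonicalize() here are simplified stand-ins for pywb's own helpers, not the real implementations:

    # Simplified stand-ins -- pywb's real helpers also handle SURT ordering,
    # encoding and various edge cases.
    def append_post_query(url, post_query):
        # fold the decoded POST body into the query string so POSTs to the
        # same endpoint with different bodies get distinct cdx keys
        sep = '&' if '?' in url else '?'
        return url + sep + post_query

    def canonicalize(url, surt_ordered=True):
        # placeholder canonicalization; the real code can emit SURT keys
        return url.lower()

    url = 'http://example.com/api/save'
    post_query = 'user=alice&action=update'
    key = canonicalize(append_post_query(url, post_query))
    # key -> 'http://example.com/api/save?user=alice&action=update'
    # the response entry and the request entry then share this key, and the
    # request's Referer header (if any) is copied onto the response entry
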
@@ -244,13 +248,14 @@ def join_request_records(entry_iter, options):
         # check for url match
         if (entry.url != prev_entry.url):
             pass
-        # check for concurrency also
-        #elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
-        #      prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
-        #    pass
-
-        elif (entry.add_post_query(prev_entry, options) or
-              prev_entry.add_post_query(entry, options)):
+        # check for concurrency also
+        elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
+              prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
+            pass
+
+        elif (entry.merge_request_data(prev_entry, options) or
+              prev_entry.merge_request_data(entry, options)):
             yield prev_entry
             yield entry
             prev_entry = None
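
The concurrency check enabled above only lets a request/response pair merge when the request actually points at that response record. A small sketch of the rule, with plain dicts standing in for the rec_headers objects:

    # Plain dicts stand in for the WARC record headers used above.
    response_headers = {'WARC-Record-ID': '<urn:uuid:11111111-aaaa>'}
    request_headers = {'WARC-Concurrent-To': '<urn:uuid:11111111-aaaa>'}

    def records_paired(resp, req):
        # a request belongs to a response only if its WARC-Concurrent-To
        # value names that response's WARC-Record-ID
        return req.get('WARC-Concurrent-To') == resp.get('WARC-Record-ID')

    assert records_paired(response_headers, request_headers)
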
@@ -10,11 +10,16 @@ from archiveiterator import create_index_iter
 
 #=================================================================
 class CDXWriter(object):
-    def __init__(self, out):
+    def __init__(self, out, cdx09=False):
         self.out = out
+        self.cdx09 = cdx09
 
     def __enter__(self):
-        self.out.write(' CDX N b a m s k r M S V g\n')
+        if not self.cdx09:
+            self.out.write(' CDX N b a m s k r M S V g\n')
+        else:
+            self.out.write(' CDX N b a m s k r V g\n')
+
         return self
 
     def write(self, entry, filename):
@@ -23,8 +28,7 @@ class CDXWriter(object):
     def __exit__(self, *args):
         return False
 
-    @staticmethod
-    def write_cdx_line(out, entry, filename):
+    def write_cdx_line(self, out, entry, filename):
         out.write(entry.key)
         out.write(' ')
         out.write(entry.timestamp)
@@ -36,9 +40,12 @@ class CDXWriter(object):
         out.write(entry.status)
         out.write(' ')
         out.write(entry.digest)
-        out.write(' - - ')
-        out.write(entry.length)
-        out.write(' ')
+        if self.cdx09:
+            out.write(' - ')
+        else:
+            out.write(' - - ')
+            out.write(entry.length)
+            out.write(' ')
         out.write(entry.offset)
         out.write(' ')
         out.write(filename)
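
Combined with the header choice in __enter__ above, the two formats share their leading fields and differ only at the tail. A rough sketch of the resulting layouts (sample values are illustrative, not taken from this commit):

    # entry fields assumed for illustration
    key, ts, url = 'com,example)/', '20130729195151', 'http://example.com/'
    mime, status, digest = 'text/html', '200', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
    length, offset, filename = '591', '355', 'example.warc.gz'

    line11 = ' '.join([key, ts, url, mime, status, digest, '-', '-', length, offset, filename])
    line09 = ' '.join([key, ts, url, mime, status, digest, '-', offset, filename])
    # line11 -> 11 space-separated fields (redirect and meta flags left as '-')
    # line09 -> 9 fields: the meta flags and record length columns are omitted
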
@@ -47,9 +54,9 @@ class CDXWriter(object):
 
 #=================================================================
 class SortedCDXWriter(CDXWriter):
-    def __init__(self, out):
-        super(SortedCDXWriter, self).__init__(out)
+    def __enter__(self):
         self.sortlist = []
+        return super(SortedCDXWriter, self).__enter__()
 
     def write(self, entry, filename):
         outbuff = BytesIO()
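
Moving the sortlist setup from __init__ to __enter__ ties the sort buffer's lifetime to the with-block rather than to the object. A minimal sketch of the same pattern, illustrative only and not pywb code:

    class BufferingWriter(object):
        # buffer state is (re)created on __enter__, so the same instance can
        # be reused across several with-blocks without carrying stale entries
        def __enter__(self):
            self.buffered = []
            return self

        def __exit__(self, *args):
            for line in sorted(self.buffered):
                print(line)
            return False

    with BufferingWriter() as w:
        w.buffered.append('b second')
        w.buffered.append('a first')
    # prints the lines in sorted order when the block exits
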
@@ -74,7 +81,7 @@ def iter_file_or_dir(inputs):
 
 #=================================================================
 def index_to_file(inputs, output, sort,
-                  surt_ordered, include_all, append_post_query):
+                  surt_ordered, include_all, append_post, cdx09):
     if output == '-':
         outfile = sys.stdout
     else:
@@ -85,15 +92,15 @@ def index_to_file(inputs, output, sort,
     else:
         writer_cls = CDXWriter
 
-    with writer_cls(outfile) as writer:
+    with writer_cls(outfile, cdx09) as writer:
         for fullpath, filename in iter_file_or_dir(inputs):
             with open(fullpath, 'r') as infile:
                 write_index(writer, filename, infile,
-                            surt_ordered, append_post_query, include_all)
+                            surt_ordered, append_post, include_all)
 
 #=================================================================
 def index_to_dir(inputs, output, sort,
-                 surt_ordered, include_all, append_post_query):
+                 surt_ordered, include_all, append_post, cdx09):
 
     if sort:
         writer_cls = SortedCDXWriter
@@ -101,15 +108,14 @@ def index_to_dir(inputs, output, sort,
         writer_cls = CDXWriter
 
     for fullpath, filename in iter_file_or_dir(inputs):
 
         outpath = cdx_filename(filename)
        outpath = os.path.join(output, outpath)
 
         with open(outpath, 'w') as outfile:
-            with writer_cls(outfile) as writer:
+            with writer_cls(outfile, cdx09) as writer:
                 with open(fullpath, 'r') as infile:
                     write_index(writer, filename, infile,
-                                surt_ordered, append_post_query, include_all)
+                                surt_ordered, append_post, include_all)
 
 #=================================================================
 def remove_ext(filename):
@@ -168,6 +174,10 @@ sort the output to each file before writing to create a total ordering
 Convert SURT (Sort-friendly URI Reordering Transform) back to regular
 urls for the cdx key. Default is to use SURT keys.
 Not-recommended for new cdx, use only for backwards-compatibility.
 """
 
+cdx09_help = """
+Use older 9-field cdx format, default is 11-cdx field
+"""
+
 output_help = """output file or directory.
@@ -207,16 +217,20 @@ form query to url key. (Only applies to form url encoded posts)"""
                         action='store_true',
                         help=unsurt_help)
 
+    parser.add_argument('-9', '--cdx09',
+                        action='store_true',
+                        help=cdx09_help)
+
     parser.add_argument('output', nargs='?', default='-', help=output_help)
     parser.add_argument('inputs', nargs='+', help=input_help)
 
     cmd = parser.parse_args(args=args)
     if cmd.output != '-' and os.path.isdir(cmd.output):
         index_to_dir(cmd.inputs, cmd.output, cmd.sort,
-                     not cmd.unsurt, cmd.allrecords, cmd.postappend)
+                     not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
     else:
         index_to_file(cmd.inputs, cmd.output, cmd.sort,
-                      not cmd.unsurt, cmd.allrecords, cmd.postappend)
+                      not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
 
 
 if __name__ == '__main__':
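
With the flag wired through, the indexer can be driven with the new option from Python or from the shell. A hedged usage sketch: the module path, main() name, and cdx-indexer script name are assumptions, while the argument order (output before inputs) follows the add_argument calls above; 'out.cdx' and 'path/to/warcs' are placeholders:

    # Usage sketch -- import location and entry-point name are assumptions,
    # not taken from this diff.
    from pywb.warc.cdxindexer import main   # assumed location of this module

    # write sorted 9-field cdx for every warc under path/to/warcs into out.cdx
    main(['--sort', '-9', 'out.cdx', 'path/to/warcs'])

    # a roughly equivalent shell form, assuming a cdx-indexer console script:
    #   cdx-indexer --sort -9 out.cdx path/to/warcs
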
@@ -107,9 +107,9 @@ org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWP
 204
 
 # test sort, multiple inputs, all records + post query
->>> cli_lines(['--sort', '-a', '-p', TEST_WARC_DIR])
-com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
-org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
+>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
+com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
+org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
 395
 
 # test writing to stdout