1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx indexing: add support for 9-field cdx output,

request merge: store referer if available, check for record id matching
This commit is contained in:
Ilya Kreymer 2014-06-19 16:51:23 -07:00
parent 694b97e67f
commit 3965fad4dd
3 changed files with 53 additions and 34 deletions

View File

@ -170,22 +170,26 @@ class ArchiveIndexEntry(object):
if digest:
self.digest = digest
def add_post_query(self, other, options):
def merge_request_data(self, other, options):
surt_ordered = options.get('surt_ordered')
if other.record.rec_type != 'request':
return False
if not hasattr(other, 'post_query'):
return False
# both records are requests, so there is no response to merge into
if self.record.rec_type == 'request':
return False
url = append_post_query(self.url, other.post_query)
self.key = canonicalize(url, surt_ordered)
other.key = self.key
# merge POST/PUT body query
if hasattr(other, 'post_query'):
url = append_post_query(self.url, other.post_query)
self.key = canonicalize(url, surt_ordered)
other.key = self.key
referer = other.record.status_headers.get_header('referer')
if referer:
self.referer = referer
return True
@ -244,13 +248,14 @@ def join_request_records(entry_iter, options):
# check for url match
if (entry.url != prev_entry.url):
pass
# check for concurrency also
#elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
# prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
# pass
elif (entry.add_post_query(prev_entry, options) or
prev_entry.add_post_query(entry, options)):
# also require WARC-Concurrent-To to match the previous record's WARC-Record-ID
elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
pass
elif (entry.merge_request_data(prev_entry, options) or
prev_entry.merge_request_data(entry, options)):
yield prev_entry
yield entry
prev_entry = None

View File

@ -10,11 +10,16 @@ from archiveiterator import create_index_iter
#=================================================================
class CDXWriter(object):
def __init__(self, out):
def __init__(self, out, cdx09=False):
self.out = out
self.cdx09 = cdx09
def __enter__(self):
self.out.write(' CDX N b a m s k r M S V g\n')
if not self.cdx09:
self.out.write(' CDX N b a m s k r M S V g\n')
else:
self.out.write(' CDX N b a m s k r V g\n')
return self
def write(self, entry, filename):
@ -23,8 +28,7 @@ class CDXWriter(object):
def __exit__(self, *args):
return False
@staticmethod
def write_cdx_line(out, entry, filename):
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(' ')
out.write(entry.timestamp)
@ -36,9 +40,12 @@ class CDXWriter(object):
out.write(entry.status)
out.write(' ')
out.write(entry.digest)
out.write(' - - ')
out.write(entry.length)
out.write(' ')
if self.cdx09:
out.write(' - ')
else:
out.write(' - - ')
out.write(entry.length)
out.write(' ')
out.write(entry.offset)
out.write(' ')
out.write(filename)
@ -47,9 +54,9 @@ class CDXWriter(object):
#=================================================================
class SortedCDXWriter(CDXWriter):
def __init__(self, out):
super(SortedCDXWriter, self).__init__(out)
def __enter__(self):
self.sortlist = []
return super(SortedCDXWriter, self).__enter__()
def write(self, entry, filename):
outbuff = BytesIO()
@ -74,7 +81,7 @@ def iter_file_or_dir(inputs):
#=================================================================
def index_to_file(inputs, output, sort,
surt_ordered, include_all, append_post_query):
surt_ordered, include_all, append_post, cdx09):
if output == '-':
outfile = sys.stdout
else:
@ -85,15 +92,15 @@ def index_to_file(inputs, output, sort,
else:
writer_cls = CDXWriter
with writer_cls(outfile) as writer:
with writer_cls(outfile, cdx09) as writer:
for fullpath, filename in iter_file_or_dir(inputs):
with open(fullpath, 'r') as infile:
write_index(writer, filename, infile,
surt_ordered, append_post_query, include_all)
surt_ordered, append_post, include_all)
#=================================================================
def index_to_dir(inputs, output, sort,
surt_ordered, include_all, append_post_query):
surt_ordered, include_all, append_post, cdx09):
if sort:
writer_cls = SortedCDXWriter
@ -101,15 +108,14 @@ def index_to_dir(inputs, output, sort,
writer_cls = CDXWriter
for fullpath, filename in iter_file_or_dir(inputs):
outpath = cdx_filename(filename)
outpath = os.path.join(output, outpath)
with open(outpath, 'w') as outfile:
with writer_cls(outfile) as writer:
with writer_cls(outfile, cdx09) as writer:
with open(fullpath, 'r') as infile:
write_index(writer, filename, infile,
surt_ordered, append_post_query, include_all)
surt_ordered, append_post, include_all)
#=================================================================
def remove_ext(filename):
@ -168,6 +174,10 @@ sort the output to each file before writing to create a total ordering
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
urls for the cdx key. Default is to use SURT keys.
Not-recommended for new cdx, use only for backwards-compatibility.
"""
cdx09_help = """
Use older 9-field cdx format, default is 11-cdx field
"""
output_help = """output file or directory.
@ -207,16 +217,20 @@ form query to url key. (Only applies to form url encoded posts)"""
action='store_true',
help=unsurt_help)
parser.add_argument('-9', '--cdx09',
action='store_true',
help=cdx09_help)
parser.add_argument('output', nargs='?', default='-', help=output_help)
parser.add_argument('inputs', nargs='+', help=input_help)
cmd = parser.parse_args(args=args)
if cmd.output != '-' and os.path.isdir(cmd.output):
index_to_dir(cmd.inputs, cmd.output, cmd.sort,
not cmd.unsurt, cmd.allrecords, cmd.postappend)
not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
else:
index_to_file(cmd.inputs, cmd.output, cmd.sort,
not cmd.unsurt, cmd.allrecords, cmd.postappend)
not cmd.unsurt, cmd.allrecords, cmd.postappend, cmd.cdx09)
if __name__ == '__main__':

View File

@ -107,9 +107,9 @@ org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWP
204
# test sort, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
395
# test writing to stdout