mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
warc indexing: in include_all mode, pass 'warcinfo' records to the writer, giving it the option to handle or ignore them
This commit is contained in:
parent
377ea33bc8
commit
1980b66127
@ -149,6 +149,11 @@ class ArchiveIterator(object):
|
|||||||
class ArchiveIndexEntry(object):
|
class ArchiveIndexEntry(object):
|
||||||
MIME_RE = re.compile('[; ]')
|
MIME_RE = re.compile('[; ]')
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.url = None
|
||||||
|
self.key = None
|
||||||
|
self.digest = '-'
|
||||||
|
|
||||||
def extract_mime(self, mime, def_mime='unk'):
|
def extract_mime(self, mime, def_mime='unk'):
|
||||||
""" Utility function to extract mimetype only
|
""" Utility function to extract mimetype only
|
||||||
from a full content type, removing charset settings
|
from a full content type, removing charset settings
|
||||||
@ -195,7 +200,6 @@ class ArchiveIndexEntry(object):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def create_record_iter(arcv_iter, options):
|
def create_record_iter(arcv_iter, options):
|
||||||
|
|
||||||
append_post = options.get('append_post')
|
append_post = options.get('append_post')
|
||||||
include_all = options.get('include_all')
|
include_all = options.get('include_all')
|
||||||
|
|
||||||
@ -206,7 +210,7 @@ def create_record_iter(arcv_iter, options):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if record.format == 'warc':
|
if record.format == 'warc':
|
||||||
if (record.rec_type == 'request' and
|
if (record.rec_type in ('request', 'warcinfo') and
|
||||||
not include_all and
|
not include_all and
|
||||||
not append_post):
|
not append_post):
|
||||||
continue
|
continue
|
||||||
@ -221,12 +225,13 @@ def create_record_iter(arcv_iter, options):
|
|||||||
if not entry:
|
if not entry:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
entry.key = canonicalize(entry.url, options.get('surt_ordered', True))
|
if entry.url and not entry.key:
|
||||||
|
entry.key = canonicalize(entry.url, options.get('surt_ordered', True))
|
||||||
|
|
||||||
compute_digest = False
|
compute_digest = False
|
||||||
|
|
||||||
if (entry.digest == '-' and
|
if (entry.digest == '-' and
|
||||||
record.rec_type not in ('revisit', 'request')):
|
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
||||||
|
|
||||||
compute_digest = True
|
compute_digest = True
|
||||||
|
|
||||||
@ -284,12 +289,15 @@ def parse_warc_record(record):
|
|||||||
""" Parse warc record
|
""" Parse warc record
|
||||||
"""
|
"""
|
||||||
|
|
||||||
url = record.rec_headers.get_header('WARC-Target-Uri')
|
|
||||||
if not url:
|
|
||||||
return None
|
|
||||||
|
|
||||||
entry = ArchiveIndexEntry()
|
entry = ArchiveIndexEntry()
|
||||||
entry.url = url
|
|
||||||
|
if record.rec_type == 'warcinfo':
|
||||||
|
entry.url = record.rec_headers.get_header('WARC-Filename')
|
||||||
|
entry.key = entry.url
|
||||||
|
entry.warcinfo = record.stream.read(record.length)
|
||||||
|
return entry
|
||||||
|
|
||||||
|
entry.url = record.rec_headers.get_header('WARC-Target-Uri')
|
||||||
|
|
||||||
# timestamp
|
# timestamp
|
||||||
entry.timestamp = iso_date_to_timestamp(record.rec_headers.
|
entry.timestamp = iso_date_to_timestamp(record.rec_headers.
|
||||||
@ -366,29 +374,8 @@ def create_index_iter(fh, **options):
|
|||||||
entry_iter = join_request_records(entry_iter, options)
|
entry_iter = join_request_records(entry_iter, options)
|
||||||
|
|
||||||
for entry in entry_iter:
|
for entry in entry_iter:
|
||||||
if (entry.record.rec_type == 'request' and
|
if (entry.record.rec_type in ('request', 'warcinfo') and
|
||||||
not options.get('include_all')):
|
not options.get('include_all')):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import sys
|
|
||||||
filename = sys.argv[1]
|
|
||||||
|
|
||||||
with open(filename) as fh:
|
|
||||||
ait = ArchiveIterator(fh)
|
|
||||||
options = dict(surt_ordered=True, append_post=True)
|
|
||||||
|
|
||||||
out = sys.stdout
|
|
||||||
|
|
||||||
entry_iter = create_record_iter(ait, options)
|
|
||||||
entry_iter = join_request_records(entry_iter, options)
|
|
||||||
|
|
||||||
cdx_write(out, entry_iter, options, filename)
|
|
||||||
|
|
||||||
#for record in ait.iter_records():
|
|
||||||
# result = ait.read_to_end(record.stream)
|
|
||||||
# print record.rec_type, result
|
|
||||||
|
@ -23,12 +23,18 @@ class CDXWriter(object):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def write(self, entry, filename):
|
def write(self, entry, filename):
|
||||||
|
if not entry.url or not entry.key:
|
||||||
|
return
|
||||||
|
|
||||||
self.write_cdx_line(self.out, entry, filename)
|
self.write_cdx_line(self.out, entry, filename)
|
||||||
|
|
||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def write_cdx_line(self, out, entry, filename):
|
def write_cdx_line(self, out, entry, filename):
|
||||||
|
if entry.record.rec_type == 'warcinfo':
|
||||||
|
return
|
||||||
|
|
||||||
out.write(entry.key)
|
out.write(entry.key)
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.timestamp)
|
out.write(entry.timestamp)
|
||||||
@ -62,7 +68,9 @@ class SortedCDXWriter(CDXWriter):
|
|||||||
outbuff = BytesIO()
|
outbuff = BytesIO()
|
||||||
self.write_cdx_line(outbuff, entry, filename)
|
self.write_cdx_line(outbuff, entry, filename)
|
||||||
|
|
||||||
insort(self.sortlist, outbuff.getvalue())
|
line = outbuff.getvalue()
|
||||||
|
if line:
|
||||||
|
insort(self.sortlist, line)
|
||||||
|
|
||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
self.out.write(''.join(self.sortlist))
|
self.out.write(''.join(self.sortlist))
|
||||||
|
@ -15,7 +15,8 @@ from pywb.utils.wbexception import WbException
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
|
ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
|
||||||
'format, rec_type, rec_headers, ' +
|
'format, rec_type, rec_headers, ' +
|
||||||
'stream, status_headers content_type')
|
'stream, status_headers, ' +
|
||||||
|
'content_type, length')
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -148,7 +149,7 @@ class ArcWarcRecordLoader:
|
|||||||
|
|
||||||
return ArcWarcRecord(the_format, rec_type,
|
return ArcWarcRecord(the_format, rec_type,
|
||||||
rec_headers, stream, status_headers,
|
rec_headers, stream, status_headers,
|
||||||
content_type)
|
content_type, length)
|
||||||
|
|
||||||
def _detect_type_load_headers(self, stream,
|
def _detect_type_load_headers(self, stream,
|
||||||
statusline=None, known_format=None):
|
statusline=None, known_format=None):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user