mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'develop' into binary-parse
This commit is contained in:
commit
f2bfc96002
@ -195,13 +195,23 @@ class ArchiveIndexEntry(object):
|
||||
|
||||
#=================================================================
|
||||
def create_record_iter(arcv_iter, options):
|
||||
|
||||
append_post = options.get('append_post')
|
||||
include_all = options.get('include_all')
|
||||
|
||||
for record in arcv_iter.iter_records():
|
||||
entry = None
|
||||
|
||||
if not include_all and (record.status_headers.get_statuscode() == '-'):
|
||||
continue
|
||||
|
||||
if record.format == 'warc':
|
||||
if (record.rec_type == 'request' and
|
||||
not options.get('append_post') and
|
||||
not options.get('include_all')):
|
||||
not include_all and
|
||||
not append_post):
|
||||
continue
|
||||
|
||||
elif (not include_all and record.content_type == 'application/warc-fields'):
|
||||
continue
|
||||
|
||||
entry = parse_warc_record(record)
|
||||
@ -236,6 +246,7 @@ def create_record_iter(arcv_iter, options):
|
||||
|
||||
yield entry
|
||||
|
||||
|
||||
#=================================================================
|
||||
def join_request_records(entry_iter, options):
|
||||
prev_entry = None
|
||||
@ -264,13 +275,10 @@ def join_request_records(entry_iter, options):
|
||||
yield prev_entry
|
||||
prev_entry = entry
|
||||
|
||||
|
||||
if prev_entry:
|
||||
yield prev_entry
|
||||
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
def parse_warc_record(record):
|
||||
""" Parse warc record
|
||||
@ -354,7 +362,7 @@ def create_index_iter(fh, **options):
|
||||
|
||||
entry_iter = create_record_iter(aiter, options)
|
||||
|
||||
if options.get('append_post') == True:
|
||||
if options.get('append_post'):
|
||||
entry_iter = join_request_records(entry_iter, options)
|
||||
|
||||
for entry in entry_iter:
|
||||
|
@ -15,7 +15,7 @@ from pywb.utils.wbexception import WbException
|
||||
#=================================================================
|
||||
ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
|
||||
'format, rec_type, rec_headers, ' +
|
||||
'stream, status_headers')
|
||||
'stream, status_headers content_type')
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -147,7 +147,8 @@ class ArcWarcRecordLoader:
|
||||
status_headers = StatusAndHeaders('- OK', content_type_header)
|
||||
|
||||
return ArcWarcRecord(the_format, rec_type,
|
||||
rec_headers, stream, status_headers)
|
||||
rec_headers, stream, status_headers,
|
||||
content_type)
|
||||
|
||||
def _detect_type_load_headers(self, stream,
|
||||
statusline=None, known_format=None):
|
||||
|
@ -43,16 +43,22 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||
|
||||
# wget warc (w/ metadata)
|
||||
# wget warc, just responses
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
||||
|
||||
# wget warc include all w/ metadata
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
|
||||
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||
|
||||
# bad arcs -- test error edge cases
|
||||
>>> print_cdx_index('bad.arc')
|
||||
>>> print_cdx_index('bad.arc', include_all=True)
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
|
||||
com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
|
||||
@ -104,7 +110,7 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
|
||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||
206
|
||||
200
|
||||
|
||||
# test sort, multiple inputs, all records + post query
|
||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||
|
Loading…
x
Reference in New Issue
Block a user