mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge branch 'develop' into binary-parse
This commit is contained in:
commit
f2bfc96002
@ -195,13 +195,23 @@ class ArchiveIndexEntry(object):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def create_record_iter(arcv_iter, options):
|
def create_record_iter(arcv_iter, options):
|
||||||
|
|
||||||
|
append_post = options.get('append_post')
|
||||||
|
include_all = options.get('include_all')
|
||||||
|
|
||||||
for record in arcv_iter.iter_records():
|
for record in arcv_iter.iter_records():
|
||||||
entry = None
|
entry = None
|
||||||
|
|
||||||
|
if not include_all and (record.status_headers.get_statuscode() == '-'):
|
||||||
|
continue
|
||||||
|
|
||||||
if record.format == 'warc':
|
if record.format == 'warc':
|
||||||
if (record.rec_type == 'request' and
|
if (record.rec_type == 'request' and
|
||||||
not options.get('append_post') and
|
not include_all and
|
||||||
not options.get('include_all')):
|
not append_post):
|
||||||
|
continue
|
||||||
|
|
||||||
|
elif (not include_all and record.content_type == 'application/warc-fields'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
entry = parse_warc_record(record)
|
entry = parse_warc_record(record)
|
||||||
@ -236,6 +246,7 @@ def create_record_iter(arcv_iter, options):
|
|||||||
|
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def join_request_records(entry_iter, options):
|
def join_request_records(entry_iter, options):
|
||||||
prev_entry = None
|
prev_entry = None
|
||||||
@ -264,13 +275,10 @@ def join_request_records(entry_iter, options):
|
|||||||
yield prev_entry
|
yield prev_entry
|
||||||
prev_entry = entry
|
prev_entry = entry
|
||||||
|
|
||||||
|
|
||||||
if prev_entry:
|
if prev_entry:
|
||||||
yield prev_entry
|
yield prev_entry
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def parse_warc_record(record):
|
def parse_warc_record(record):
|
||||||
""" Parse warc record
|
""" Parse warc record
|
||||||
@ -354,7 +362,7 @@ def create_index_iter(fh, **options):
|
|||||||
|
|
||||||
entry_iter = create_record_iter(aiter, options)
|
entry_iter = create_record_iter(aiter, options)
|
||||||
|
|
||||||
if options.get('append_post') == True:
|
if options.get('append_post'):
|
||||||
entry_iter = join_request_records(entry_iter, options)
|
entry_iter = join_request_records(entry_iter, options)
|
||||||
|
|
||||||
for entry in entry_iter:
|
for entry in entry_iter:
|
||||||
|
@ -15,7 +15,7 @@ from pywb.utils.wbexception import WbException
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
|
ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
|
||||||
'format, rec_type, rec_headers, ' +
|
'format, rec_type, rec_headers, ' +
|
||||||
'stream, status_headers')
|
'stream, status_headers content_type')
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -147,7 +147,8 @@ class ArcWarcRecordLoader:
|
|||||||
status_headers = StatusAndHeaders('- OK', content_type_header)
|
status_headers = StatusAndHeaders('- OK', content_type_header)
|
||||||
|
|
||||||
return ArcWarcRecord(the_format, rec_type,
|
return ArcWarcRecord(the_format, rec_type,
|
||||||
rec_headers, stream, status_headers)
|
rec_headers, stream, status_headers,
|
||||||
|
content_type)
|
||||||
|
|
||||||
def _detect_type_load_headers(self, stream,
|
def _detect_type_load_headers(self, stream,
|
||||||
statusline=None, known_format=None):
|
statusline=None, known_format=None):
|
||||||
|
@ -43,16 +43,22 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
|
|||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||||
|
|
||||||
# wget warc (w/ metadata)
|
# wget warc, just responses
|
||||||
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
||||||
|
|
||||||
|
# wget warc include all w/ metadata
|
||||||
|
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
|
||||||
|
CDX N b a m s k r M S V g
|
||||||
|
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
|
||||||
|
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
||||||
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
|
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
|
||||||
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||||
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||||
|
|
||||||
# bad arcs -- test error edge cases
|
# bad arcs -- test error edge cases
|
||||||
>>> print_cdx_index('bad.arc')
|
>>> print_cdx_index('bad.arc', include_all=True)
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
|
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
|
||||||
com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
|
com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
|
||||||
@ -104,7 +110,7 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
|
|||||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||||
206
|
200
|
||||||
|
|
||||||
# test sort, multiple inputs, all records + post query
|
# test sort, multiple inputs, all records + post query
|
||||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user