diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 1654a696..86d5eef4 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -195,13 +195,23 @@ class ArchiveIndexEntry(object): #================================================================= def create_record_iter(arcv_iter, options): + + append_post = options.get('append_post') + include_all = options.get('include_all') + for record in arcv_iter.iter_records(): entry = None + if not include_all and (record.status_headers.get_statuscode() == '-'): + continue + if record.format == 'warc': if (record.rec_type == 'request' and - not options.get('append_post') and - not options.get('include_all')): + not include_all and + not append_post): + continue + + elif (not include_all and record.content_type == 'application/warc-fields'): continue entry = parse_warc_record(record) @@ -236,6 +246,7 @@ def create_record_iter(arcv_iter, options): yield entry + #================================================================= def join_request_records(entry_iter, options): prev_entry = None @@ -264,13 +275,10 @@ def join_request_records(entry_iter, options): yield prev_entry prev_entry = entry - if prev_entry: yield prev_entry - - #================================================================= def parse_warc_record(record): """ Parse warc record @@ -354,7 +362,7 @@ def create_index_iter(fh, **options): entry_iter = create_record_iter(aiter, options) - if options.get('append_post') == True: + if options.get('append_post'): entry_iter = join_request_records(entry_iter, options) for entry in entry_iter: diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index dee29020..974bd359 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -15,7 +15,7 @@ from pywb.utils.wbexception import WbException #================================================================= ArcWarcRecord = collections.namedtuple('ArcWarcRecord', 'format, rec_type, rec_headers, ' + - 'stream, status_headers') + 'stream, status_headers content_type') #================================================================= @@ -147,7 +147,8 @@ class ArcWarcRecordLoader: status_headers = StatusAndHeaders('- OK', content_type_header) return ArcWarcRecord(the_format, rec_type, - rec_headers, stream, status_headers) + rec_headers, stream, status_headers, + content_type) def _detect_type_load_headers(self, stream, statusline=None, known_format=None): diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index e9c7a892..88a3d3ff 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -43,16 +43,22 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ CDX N b a m s k r M S V g com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc -# wget warc (w/ metadata) +# wget warc, just responses >>> print_cdx_index('example-wget-1-14.warc.gz') CDX N b a m s k r M S V g com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz + +# wget warc include all w/ metadata +>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True) + CDX N b a m s k r M S V g +com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz +com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz # bad arcs -- test error edge cases ->>> print_cdx_index('bad.arc') +>>> print_cdx_index('bad.arc', include_all=True) CDX N b a m s k r M S V g com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc @@ -104,7 +110,7 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz -206 +200 # test sort, multiple inputs, all records + post query >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])