1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'develop' into binary-parse

This commit is contained in:
Ilya Kreymer 2014-06-28 11:04:43 -07:00
commit f2bfc96002
3 changed files with 26 additions and 11 deletions

View File

@@ -195,13 +195,23 @@ class ArchiveIndexEntry(object):
#=================================================================
def create_record_iter(arcv_iter, options):
append_post = options.get('append_post')
include_all = options.get('include_all')
for record in arcv_iter.iter_records():
entry = None
if not include_all and (record.status_headers.get_statuscode() == '-'):
continue
if record.format == 'warc':
if (record.rec_type == 'request' and
not options.get('append_post') and
not options.get('include_all')):
not include_all and
not append_post):
continue
elif (not include_all and record.content_type == 'application/warc-fields'):
continue
entry = parse_warc_record(record)
@@ -236,6 +246,7 @@ def create_record_iter(arcv_iter, options):
yield entry
#=================================================================
def join_request_records(entry_iter, options):
prev_entry = None
@@ -264,13 +275,10 @@ def join_request_records(entry_iter, options):
yield prev_entry
prev_entry = entry
if prev_entry:
yield prev_entry
#=================================================================
def parse_warc_record(record):
""" Parse warc record
@@ -354,7 +362,7 @@ def create_index_iter(fh, **options):
entry_iter = create_record_iter(aiter, options)
if options.get('append_post') == True:
if options.get('append_post'):
entry_iter = join_request_records(entry_iter, options)
for entry in entry_iter:

View File

@@ -15,7 +15,7 @@ from pywb.utils.wbexception import WbException
#=================================================================
ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
'format, rec_type, rec_headers, ' +
'stream, status_headers')
'stream, status_headers content_type')
#=================================================================
@@ -147,7 +147,8 @@ class ArcWarcRecordLoader:
status_headers = StatusAndHeaders('- OK', content_type_header)
return ArcWarcRecord(the_format, rec_type,
rec_headers, stream, status_headers)
rec_headers, stream, status_headers,
content_type)
def _detect_type_load_headers(self, stream,
statusline=None, known_format=None):

View File

@@ -43,16 +43,22 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
# wget warc (w/ metadata)
# wget warc, just responses
>>> print_cdx_index('example-wget-1-14.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
# wget warc include all w/ metadata
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# bad arcs -- test error edge cases
>>> print_cdx_index('bad.arc')
>>> print_cdx_index('bad.arc', include_all=True)
CDX N b a m s k r M S V g
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
@@ -104,7 +110,7 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
206
200
# test sort, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])