1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

recordloader: for request/response/revisit records, parse HTTP headers only when the record URI starts with http: or https:

This commit is contained in:
Ilya Kreymer 2016-05-04 11:19:42 -07:00
parent af920d77a0
commit 2795802c77

View File

@ -51,6 +51,9 @@ class ArcWarcRecordLoader(object):
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
HTTP_SCHEMES = ('http:', 'https:')
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
verify_http=True):
if not loader:
@ -151,13 +154,13 @@ class ArcWarcRecordLoader(object):
status_headers = StatusAndHeaders(msg, [])
# response record or non-empty revisit: parse HTTP status and headers!
elif (rec_type in ('response', 'revisit') and
not uri.startswith(('dns:', 'whois:'))):
elif (rec_type in ('response', 'revisit')
and uri.startswith(self.HTTP_SCHEMES)):
status_headers = self.http_parser.parse(stream)
# request record: parse request
elif ((rec_type == 'request') and
not uri.startswith(('dns:', 'whois:'))):
elif ((rec_type == 'request')
and uri.startswith(self.HTTP_SCHEMES)):
status_headers = self.http_req_parser.parse(stream)
# everything else: create a no-status entry, set content-type