From 2795802c77b210bb6159f4c5b0f44ac2eae12d37 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 4 May 2016 11:19:42 -0700 Subject: [PATCH] recordloader: for request/response/revisit records, only parse urls starting with http:/https: as http --- pywb/warc/recordloader.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 25803b77..06a3c79e 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -51,6 +51,9 @@ class ArcWarcRecordLoader(object): NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource') + NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') + HTTP_SCHEMES = ('http:', 'https:') + def __init__(self, loader=None, cookie_maker=None, block_size=8192, verify_http=True): if not loader: @@ -151,13 +154,13 @@ class ArcWarcRecordLoader(object): status_headers = StatusAndHeaders(msg, []) # response record or non-empty revisit: parse HTTP status and headers! - elif (rec_type in ('response', 'revisit') and - not uri.startswith(('dns:', 'whois:'))): + elif (rec_type in ('response', 'revisit') + and uri.startswith(self.HTTP_SCHEMES)): status_headers = self.http_parser.parse(stream) # request record: parse request - elif ((rec_type == 'request') and - not uri.startswith(('dns:', 'whois:'))): + elif ((rec_type == 'request') + and uri.startswith(self.HTTP_SCHEMES)): status_headers = self.http_req_parser.parse(stream) # everything else: create a no-status entry, set content-type