From 312bd715685b9e81a61ce1963c559e67db687feb Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Wed, 19 Feb 2014 00:13:15 -0800
Subject: [PATCH] automatic record (warc/arc) format detection and
 decompression if needed. no need to rely on file type listing

---
 pywb/utils/statusandheaders.py |  7 +++-
 pywb/warc/README.md            | 14 ++++---
 pywb/warc/recordloader.py      | 73 ++++++++++++++++++----------------
 pywb/warc/test/test_loading.py | 26 ++++++++++--
 4 files changed, 74 insertions(+), 46 deletions(-)

diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py
index 85fd241e..01bb6614 100644
--- a/pywb/utils/statusandheaders.py
+++ b/pywb/utils/statusandheaders.py
@@ -72,7 +72,7 @@ class StatusAndHeadersParser(object):
 
         if not protocol_status:
             msg = 'Expected Status Line - Found: ' + statusline
-            raise StatusAndHeadersParserException(msg)
+            raise StatusAndHeadersParserException(msg, statusline)
 
         headers = []
 
@@ -104,4 +104,7 @@ class StatusAndHeadersParserException(Exception):
     """
     status + headers parsing exception
     """
-    pass
+    def __init__(self, msg, statusline):  # statusline: the offending line
+        super(StatusAndHeadersParserException, self).__init__(msg)
+        self.statusline = statusline  # kept so handlers can inspect/reuse it
+
diff --git a/pywb/warc/README.md b/pywb/warc/README.md
index f3a4bad4..91cc3036 100644
--- a/pywb/warc/README.md
+++ b/pywb/warc/README.md
@@ -1,17 +1,20 @@
 ### pywb.warc
 
 This is the WARC/ARC record loading component of pywb wayback tool suite.
-
-
-This package provides the following facilities:
+The package provides the following facilities:
 
 * Resolve relative WARC/ARC filenames to a full path based on configurable resolvers
 
 * Resolve 'revisit' records from provided index to find a full record with headers and payload content
 
-* Load WARC and ARC records either locally or via http using http 1.1 range requests
+* Load WARC/ARC records either locally or via http using http 1.1 range requests
 
 
+When loading archived content, the record format (WARC vs ARC) is detected
+automatically, and compressed ARCs/WARCs are decompressed as needed.
+No assumption is made about the format based on filename, content type
+or any other external parameter: only the content itself is examined.
+
 ### Tests
 
 This package will includes a test suite for loading a variety of WARC and ARC records.
@@ -26,5 +29,4 @@ Tests so far:
 
 TODO:
 
-* Different url revisit record resolving (TODO)
-* File type detection (no .warc, .arc extensions)
+* Different url revisit record resolving
diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py
index 5937202c..05973f6b 100644
--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@@ -4,6 +4,7 @@ import collections
 
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.statusandheaders import StatusAndHeadersParser
+from pywb.utils.statusandheaders import StatusAndHeadersParserException
 
 from pywb.utils.loaders import FileLoader, HttpLoader
 from pywb.utils.bufferedreaders import BufferedReader
@@ -31,17 +32,6 @@ class ArcWarcRecordLoader:
     ARC_HEADERS = ["uri", "ip-address", "creation-date",
                    "content-type", "length"]
 
-    # Since loading a range request,
-    # can only determine gzip-ness based on file extension
-    # (BufferedReader will however default to non-gzip if
-    # decompression fails)
-    FORMAT_MAP = {
-        '.warc.gz': ('warc', True),
-        '.arc.gz':  ('arc',  True),
-        '.warc':    ('warc', False),
-        '.arc':     ('arc',  False),
-    }
-
     @staticmethod
     def create_default_loaders(cookie_maker=None):
         http = HttpLoader(cookie_maker)
@@ -74,21 +64,6 @@ class ArcWarcRecordLoader:
         if not loader:
             raise ArchiveLoadFailed('Unknown Protocol', url)
 
-        the_format = None
-
-        for ext, iformat in self.FORMAT_MAP.iteritems():
-            if url.endswith(ext):
-                the_format = iformat
-                break
-
-        if the_format is None:
-            raise ArchiveLoadFailed('Unknown file format', url)
-
-        (a_format, is_gzip) = the_format
-
-        #decomp = utils.create_decompressor() if is_gzip else None
-        decomp_type = 'gzip' if is_gzip else None
-
         try:
             length = int(length)
         except:
@@ -96,15 +71,17 @@ class ArcWarcRecordLoader:
 
         raw = loader.load(url, long(offset), length)
 
+        decomp_type = 'gzip'
+
         stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
 
-        if a_format == 'arc':
-            rec_headers = self.arc_parser.parse(stream)
+        (the_format, rec_headers) = self._load_headers(stream)
+
+        if the_format == 'arc':
             rec_type = 'response'
             empty = (rec_headers.get_header('length') == 0)
 
-        elif a_format == 'warc':
-            rec_headers = self.warc_parser.parse(stream)
+        elif the_format == 'warc':
             rec_type = rec_headers.get_header('WARC-Type')
             empty = (rec_headers.get_header('Content-Length') == '0')
 
@@ -131,17 +108,44 @@ class ArcWarcRecordLoader:
             #(statusline, http_headers) = self.parse_http_headers(stream)
             status_headers = self.http_parser.parse(stream)
 
-        return ArcWarcRecord((a_format, rec_type),
+        return ArcWarcRecord((the_format, rec_type),
                              rec_headers, stream, status_headers)
 
+    def _load_headers(self, stream):
+        """
+        Detect the record format by parsing: try WARC first, then ARC.
+        Returns (format, headers); raises ArchiveLoadFailed if both fail.
+        """
+
+        # try as warc first; on failure keep the line the warc parser
+        # reported so the arc parser can be handed that same line
+        try:
+            rec_headers = self.warc_parser.parse(stream)
+            return 'warc', rec_headers
+        except StatusAndHeadersParserException as se:
+            statusline = se.statusline
+
+        # now try as arc, passing along the line from the warc attempt
+        try:
+            rec_headers = self.arc_parser.parse(stream, statusline)
+            return 'arc', rec_headers
+        except StatusAndHeadersParserException:
+            # the arc parser's exception carries the expected header names
+            # (a list), so report the line seen above and never concatenate
+            msg = 'Unknown archive format, first line: {0}'
+            raise ArchiveLoadFailed(msg.format(statusline))
+
 
 #=================================================================
 class ARCHeadersParser:
     def __init__(self, headernames):
         self.headernames = headernames
 
-    def parse(self, stream):
-        headerline = stream.readline().rstrip()
+    def parse(self, stream, headerline=None):
+
+        # if headerline passed in, use that
+        if not headerline:
+            headerline = stream.readline().rstrip()
 
         parts = headerline.split()
 
@@ -149,7 +153,8 @@ class ARCHeadersParser:
 
         if len(parts) != len(headernames):
             msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
-            raise ArchiveLoadFailed(msg.format(headernames, parts))
+            msg = msg.format(headernames, parts)
+            raise StatusAndHeadersParserException(msg, headernames)
 
         headers = []
 
diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py
index e1a40950..47176e3e 100644
--- a/pywb/warc/test/test_loading.py
+++ b/pywb/warc/test/test_loading.py
@@ -2,7 +2,7 @@
 """
 Test loading different types of records from a variety of formats
 
-# Load response record from WARC
+# Load response record from compressed WARC
 >>> load_test_archive('example.warc.gz', '333', '1043')
 (('warc', 'response'),
  StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
@@ -26,7 +26,7 @@ Test loading different types of records from a variety of formats
   ('Content-Length', '1270'),
   ('Connection', 'close')]))
 
-# Load revisit record from WARC
+# Load revisit record from compressed WARC
 >>> load_test_archive('example.warc.gz', '1864', '553')
 (('warc', 'revisit'),
  StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
@@ -59,7 +59,7 @@ Test loading different types of records from a variety of formats
 # Print parsed http headers + 2 lines of content
 # ==============================================================================
 
-# Test loading from ARC based on cdx line
+# Test loading from compressed ARC based on cdx line
 >>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
 StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
   ('Cache-Control', 'max-age=604800'),
@@ -75,6 +75,7 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
 <!doctype html>
 <html>
 
+# Uncompressed arc
 >>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
 StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
   ('Cache-Control', 'max-age=604800'),
@@ -91,7 +92,7 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
 <html>
 
 
-# Test loading from WARC based on cdx line
+# Test loading from compressed WARC based on cdx line
 >>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
 StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
   ('Cache-Control', 'max-age=604800'),
@@ -108,6 +109,23 @@ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Acc
 <!doctype html>
 <html>
 
+# Uncompressed WARC
+>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 460 example.warc')
+StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+  ('Cache-Control', 'max-age=604800'),
+  ('Content-Type', 'text/html'),
+  ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+  ('Etag', '"359670651"'),
+  ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
+  ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
+  ('Server', 'ECS (sjc/4FCE)'),
+  ('X-Cache', 'HIT'),
+  ('x-ec-custom-error', '1'),
+  ('Content-Length', '1270'),
+  ('Connection', 'close')])
+<!doctype html>
+<html>
+
 # Test cdx w/ revisit
 >>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz')
 StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),