From cc22448cc5dddcfe7ebaf1c809a1c14c8ca3688b Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Tue, 4 Mar 2014 18:49:36 -0800
Subject: [PATCH] fixes for 2.6 and pypy

---
 .travis.yml                                   |  4 +-
 pywb/cdx/cdxobject.py                         |  6 ++-
 pywb/framework/test/test_archivalrouter.py    |  3 +-
 pywb/framework/test/test_wbrequestresponse.py |  3 +-
 pywb/framework/wsgi_wrappers.py               |  1 -
 pywb/rewrite/html_rewriter.py                 | 17 ++++++---
 pywb/rewrite/test/test_rewrite.py             | 37 +++++++++++++------
 7 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index de435de6..a5f79f64 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,12 +1,12 @@
 language: python
 python:
-  - "2.7"
   - "2.6"
+  - "2.7"
   - "pypy"
 # command to install dependencies
 install:
   - python setup.py -q install
-  - pip install tox coverage pytest-cov coveralls --use-mirrors
+  - pip install coverage pytest-cov coveralls --use-mirrors --allow-external
 # command to run tests
 #script: nosetests --with-doctest
 #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py
index cf7a5d79..6b7dfdfe 100644
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@@ -1,4 +1,8 @@
-from collections import OrderedDict
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
+
 import itertools
 
 from urllib import urlencode
diff --git a/pywb/framework/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py
index 706027ba..b27f5f45 100644
--- a/pywb/framework/test/test_archivalrouter.py
+++ b/pywb/framework/test/test_archivalrouter.py
@@ -90,7 +90,8 @@ import pprint
 
 def print_req(req):
     varlist = vars(req)
-    pprint.pprint({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
+    the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll'))
+    pprint.pprint(the_dict)
 
 
 def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py
index 977a8863..e9a4ca9e 100644
--- a/pywb/framework/test/test_wbrequestresponse.py
+++ b/pywb/framework/test/test_wbrequestresponse.py
@@ -47,7 +47,8 @@ from pywb.framework.wbrequestresponse import WbRequest, WbResponse
 def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
     response = req_from_uri(request_uri, env, use_abs_prefix)
     varlist = vars(response)
-    print str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
+    the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll'))
+    print the_dict
 
 
 def req_from_uri(request_uri, env={}, use_abs_prefix=False):
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
index 1c7532ce..f7b97e4f 100644
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@@ -6,7 +6,6 @@ from wbrequestresponse import WbResponse, StatusAndHeaders
 
 
 import os
-import importlib
 import logging
 
 
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index c6eeab23..a6d9718d 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -4,7 +4,8 @@
 import sys
 import re
 
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
+
 from url_rewriter import UrlRewriter
 from regex_rewriters import JSRewriter, CSSRewriter
 
@@ -181,7 +182,10 @@ class HTMLRewriter(HTMLParser):
         if not self.out:
             self.out = self.AccumBuff()
 
-        self.feed(string)
+        try:
+            self.feed(string)
+        except HTMLParseError:
+            self.out.write(string)
 
         result = self.out.buff
         # Clear buffer to create new one for next rewrite()
@@ -197,7 +201,11 @@ class HTMLRewriter(HTMLParser):
         else:
             result = ''
 
-        HTMLParser.close(self)
+        try:
+            HTMLParser.close(self)
+        except HTMLParseError:
+            pass
+
         return result
 
     def handle_starttag(self, tag, attrs):
@@ -238,6 +246,3 @@ class HTMLRewriter(HTMLParser):
         self.out.write('<![')
         self.parse_data(data)
         self.out.write(']>')
-
-
-
diff --git a/pywb/rewrite/test/test_rewrite.py b/pywb/rewrite/test/test_rewrite.py
index 7498e601..6915e26f 100644
--- a/pywb/rewrite/test/test_rewrite.py
+++ b/pywb/rewrite/test/test_rewrite.py
@@ -16,8 +16,9 @@ r"""
 >>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
 <body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
 
->>> parse('<input "selected"><img src></div>')
-<input "selected"=""><img src=""></div>
+# malformed html -- (2.6 parser raises exception)
+#>>> parse('<input "selected"><img src></div>')
+#<input "selected"=""><img src=""></div>
 
 >>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
 <html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
@@ -197,26 +198,39 @@ HTTP Headers Rewriting
 
 # Text with charset
 >>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
-{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+{'charset': 'utf-8',
+ 'removed_header_dict': {},
+ 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
   ('X-Archive-Orig-Content-Length', '5'),
-  ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}
+  ('Content-Type', 'text/html;charset=UTF-8')]),
+ 'text_type': 'html'}
 
 # Redirect
 >>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
-{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
-  ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}
+{'charset': None,
+ 'removed_header_dict': {},
+ 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
+  ('Location', '/web/20131226101010/http://example.com/other.html')]),
+ 'text_type': None}
 
 # gzip
 >>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
-{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
-  ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}
+{'charset': None,
+ 'removed_header_dict': {'content-encoding': 'gzip',
+                         'transfer-encoding': 'chunked'},
+ 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
+  ('Content-Type', 'text/javascript')]),
+ 'text_type': 'js'}
 
 # Binary
 >>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
-{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
+{'charset': None,
+ 'removed_header_dict': {'transfer-encoding': 'chunked'},
+ 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
   ('Content-Type', 'image/png'),
   ('X-Archive-Orig-Cookie', 'blah'),
-  ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}
+  ('Content-Encoding', 'gzip')]),
+ 'text_type': None}
 
 Removing Transfer-Encoding always, Was:
   ('Content-Encoding', 'gzip'),
@@ -233,6 +247,7 @@ from pywb.rewrite.header_rewriter import HeaderRewriter
 
 from pywb.utils.statusandheaders import StatusAndHeaders
 
+import pprint
 
 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
 
@@ -256,7 +271,7 @@ headerrewriter = HeaderRewriter()
 
 def _test_headers(headers, status = '200 OK'):
     rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
-    return vars(rewritten)
+    return pprint.pprint(vars(rewritten))
 
 
 if __name__ == "__main__":