From 1eb0f96f92a5430d598931c876a61c68877a22d8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 10 Jan 2015 14:06:15 -0800 Subject: [PATCH] windows support work: fix loaders to use pathname2url to convert to file:/// url, use urlopen to open file paths fix some tests to use universal line breaks --- pywb/rewrite/rewrite_live.py | 17 ++++++++++++++--- pywb/utils/loaders.py | 10 +++++----- pywb/utils/test/test_bufferedreaders.py | 4 ++-- pywb/utils/test/test_loaders.py | 4 +++- pywb/warc/test/test_indexing.py | 6 +++--- setup.py | 8 ++++---- 6 files changed, 31 insertions(+), 18 deletions(-) diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 3ebbe68d..8378c5d0 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -6,8 +6,10 @@ import requests import datetime import mimetypes import logging +import os -from urlparse import urlsplit +from urlparse import urlsplit, urljoin +from urllib import pathname2url from pywb.utils.loaders import is_http, LimitReader, BlockLoader from pywb.utils.loaders import extract_client_cookie @@ -180,16 +182,25 @@ class LiveRewriter(object): if url.startswith('//'): url = 'http:' + url + if is_http(url): + is_remote = True + else: + is_remote = False + if not url.startswith('file:'): + url = os.path.abspath(url) + url = urljoin('file:', pathname2url(url)) + print(url) + # explicit urlkey may be passed in (say for testing) if not urlkey: urlkey = canonicalize(url) - if is_http(url): + if is_remote: (status_headers, stream) = self.fetch_http(url, urlkey, env, req_headers, follow_redirects, ignore_proxies) - else: + else: (status_headers, stream) = self.fetch_local_file(url) if timestamp is None: diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index d54f4908..6ef1355e 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -129,13 +129,13 @@ class BlockLoader(object): # if starting with . or /, can only be a file path.. file_only = url.startswith(('/', '.')) - if url.startswith('file://'): - url = url[len('file://'):] - file_only = True - try: # first, try as file - afile = open(url, 'rb') + if url.startswith('file://'): + file_only = True + afile = urllib.urlopen(url) + else: + afile = open(url, 'rb') except IOError: if file_only: diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py index cd5f3787..0a249981 100644 --- a/pywb/utils/test/test_bufferedreaders.py +++ b/pywb/utils/test/test_bufferedreaders.py @@ -3,11 +3,11 @@ r""" #================================================================= # DecompressingBufferedReader readline() ->>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() +>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rU')).readline() ' CDX N b a m s k r M S V g\n' # detect not compressed ->>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline() +>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rU'), decomp_type = 'gzip').readline() ' CDX N b a m s k r M S V g\n' # decompress with on the fly compression, default gzip compression diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 1da5d71e..020becca 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -25,7 +25,7 @@ True 100 # no length specified, read full amount requested ->>> len(BlockLoader().load('file://' + test_cdx_dir + 'example.cdx', 0, -1).read(400)) +>>> len(BlockLoader().load('file:' + pathname2url(test_cdx_dir + 'example.cdx'), 0, -1).read(400)) 400 # HMAC Cookie Maker @@ -65,6 +65,8 @@ from io import BytesIO from pywb.utils.loaders import BlockLoader, HMACCookieMaker from pywb.utils.loaders import LimitReader, extract_client_cookie +from urllib import pathname2url + from pywb import get_test_dir test_cdx_dir = get_test_dir() + 'cdx/' diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index c8584c8d..51e82d96 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -160,7 +160,7 @@ TEST_CDX_DIR = get_test_dir() + 'cdx/' TEST_WARC_DIR = get_test_dir() + 'warcs/' def read_fully(cdx): - with open(TEST_CDX_DIR + cdx) as fh: + with open(TEST_CDX_DIR + cdx, 'rU') as fh: curr = BytesIO() while True: b = fh.read() @@ -172,7 +172,7 @@ def read_fully(cdx): def cdx_index(warc, **options): buff = BytesIO() - with open(TEST_WARC_DIR + warc) as fh: + with open(TEST_WARC_DIR + warc, 'rU') as fh: write_cdx_index(buff, fh, warc, **options) return buff.getvalue() @@ -213,7 +213,7 @@ def cli_lines_with_dir(input_): print filename - with open(os.path.join(tmp_dir, filename), 'r') as fh: + with open(os.path.join(tmp_dir, filename), 'rU') as fh: lines = fh.read(8192).rstrip().split('\n') finally: diff --git a/setup.py b/setup.py index 1ecd998c..8d8b0ab8 100755 --- a/setup.py +++ b/setup.py @@ -58,10 +58,10 @@ setup( 'pywb': ['static/flowplayer/*', 'static/*.*', 'ui/*', '*.yaml'], }, data_files=[ - ('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), - ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')), - ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), - ('sample_archive/text_content/', + ('sample_archive/cdx', glob.glob('sample_archive/cdx/*')), + ('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')), + ('sample_archive/warcs', glob.glob('sample_archive/warcs/*')), + ('sample_archive/text_content', glob.glob('sample_archive/text_content/*')), ], install_requires=[