diff --git a/__init__.py b/__init__.py deleted file mode 100644 index 769c3cc7..00000000 --- a/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -#Allow importing - diff --git a/pywb/__init__.py b/pywb/__init__.py index 1cdc4fe6..ef7c1d7f 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,3 +1,4 @@ -#Allow importing - +import os +def get_test_dir(): + return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/' diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index dbb092ac..eecd3d9e 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -3,8 +3,8 @@ import re import wbexceptions from wbrequestresponse import WbRequest, WbResponse -from url_rewriter import UrlRewriter -from wburl import WbUrl +from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.rewrite.wburl import WbUrl #================================================================= # ArchivalRouter -- route WB requests in archival mode @@ -45,20 +45,6 @@ class ArchivalRouter: # of request uri (excluding first '/') #================================================================= class Route: - """ - # route with relative path - >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False) - {'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'} - - # route with absolute path, running at script /my_pywb - >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True) - {'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'} - - - # not matching route -- skipped - >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) - """ - # match upto next / or ? 
or end SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)' @@ -127,57 +113,6 @@ class Route: # ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings #================================================================= class ReferRedirect: - - """ - >>> ReferRedirect('http://localhost:8080/').match_prefixs - ['http://localhost:8080/'] - - >>> ReferRedirect(['http://example:9090/']).match_prefixs - ['http://example:9090/'] - - >>> test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') - 'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html' - - >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') - 'http://localhost:8080/coll/20131010/http://example.com/other.html' - - >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') - 'http://localhost:8080/coll/20131010/http://example.com/other.html' - - # Custom collection - >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123') - 'http://localhost:8080/complex/123/20131010/http://example.com/other.html' - - # With timestamp included - >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') - 'http://localhost:8080/coll/20131010/http://example.com/other.html' - - # With timestamp included - >>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html') - 'http://localhost:8080/coll/20131010/http://example.com/path/other.html' - - # Wrong Host - >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') - False - - # Right Host - >>> test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080') - 'http://example.com:8080/coll/20131010/http://example.com/other.html' - - # With custom SCRIPT_NAME - >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra') - 'http://localhost:8080/extra/coll/20131010/http://example.com/other.html' - - # With custom SCRIPT_NAME + timestamp - >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra') - 'http://localhost:8080/extra/coll/20131010/http://example.com/other.html' - - # With custom SCRIPT_NAME, bad match - >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr') - False - - """ - def __init__(self, match_prefixs): if isinstance(match_prefixs, list): self.match_prefixs = match_prefixs @@ -240,31 +175,3 @@ class ReferRedirect: final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', '')) return WbResponse.redir_response(final_url) - - -import utils -if __name__ == "__main__" or utils.enable_doctests(): - - import handlers - - def test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None): - env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': 
script_name} - - if http_host: - env['HTTP_HOST'] = http_host - - routes = [Route(coll, handlers.BaseHandler())] - - redir = ReferRedirect(match_host) - #req = WbRequest.from_uri(request_uri, env) - rep = redir(env, routes) - if not rep: - return False - - return rep.status_headers.get_header('Location') - - - import doctest - doctest.testmod() - - diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py deleted file mode 100644 index fd690b34..00000000 --- a/pywb/archiveloader.py +++ /dev/null @@ -1,461 +0,0 @@ -import itertools -import utils -import urllib2 -import StringIO -import urlparse -import collections -import wbexceptions - -from wbrequestresponse import StatusAndHeaders - -#================================================================= -# load a reader from http -#================================================================= - -class HttpLoader: - """ - Load content over http with range request and optional signature - """ - def __init__(self, hmac = None, hmac_duration = 30): - self.hmac = hmac - self.hmac_duration = hmac_duration - - def load(self, url, offset, length): - if length > 0: - range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1) - else: - range_header = 'bytes={0}-'.format(offset) - - headers = {} - headers['Range'] = range_header - - if self.hmac: - headers['Cookie'] = self.hmac(self.hmac_duration) - - request = urllib2.Request(url, headers = headers) - return urllib2.urlopen(request) - - -#================================================================= -# load a reader from local filesystem -#================================================================= -class FileLoader: - """ - Load content from local file-system - - # Ensure attempt to read more than 100 bytes, only reads 100 bytes - >>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read('400')) - 100 - - """ - - def load(self, url, offset, length): - if url.startswith('file://'): - url = url[len('file://'):] - - afile = open(url, 'rb') - afile.seek(offset) - - if length > 0: - return LimitReader(afile, length) - else: - return afile - -#================================================================= -# A reader which will not read past the specified limit -#================================================================= -class LimitReader: - """ - >>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26) - 'abcdefghji' - - >>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26) - 'abcdefgh' - - >>> test_multiple_reads(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) - 'efghji' - - """ - - def __init__(self, stream, limit): - self.stream = stream - self.limit = limit - - if not self.limit: - self.limit = 1 - - - def read(self, length = None): - length = min(length, self.limit) if length else self.limit - buff = self.stream.read(length) - self.limit -= len(buff) - return buff - - - def readline(self, length = None): - length = min(length, self.limit) if length else self.limit - buff = self.stream.readline(length) - self.limit -= len(buff) - return buff - - def close(self): - self.stream.close() - - -#================================================================= -WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers') - -#================================================================= - -class ArchiveLoader: - """ - >>> load_test_archive('example.warc.gz', '333', '1043') - (('warc', 'response'), - StatusAndHeaders(protocol 
= 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'), - ('WARC-Record-ID', ''), - ('WARC-Date', '2014-01-03T03:03:21Z'), - ('Content-Length', '1610'), - ('Content-Type', 'application/http; msgtype=response'), - ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'), - ('WARC-Target-URI', 'http://example.com?example=1'), - ('WARC-Warcinfo-ID', '')]), - StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), - ('Cache-Control', 'max-age=604800'), - ('Content-Type', 'text/html'), - ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), - ('Etag', '"359670651"'), - ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'), - ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), - ('Server', 'ECS (sjc/4FCE)'), - ('X-Cache', 'HIT'), - ('x-ec-custom-error', '1'), - ('Content-Length', '1270'), - ('Connection', 'close')])) - - - >>> load_test_archive('example.warc.gz', '1864', '553') - (('warc', 'revisit'), - StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'), - ('WARC-Record-ID', ''), - ('WARC-Date', '2014-01-03T03:03:41Z'), - ('Content-Length', '340'), - ('Content-Type', 'application/http; msgtype=response'), - ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'), - ('WARC-Target-URI', 'http://example.com?example=1'), - ('WARC-Warcinfo-ID', ''), - ( 'WARC-Profile', - 'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'), - ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'), - ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]), - StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), - ('Cache-Control', 'max-age=604800'), - ('Content-Type', 'text/html'), - ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'), - ('Etag', '"359670651"'), - ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'), - ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), - ('Server', 'ECS (sjc/4FCE)'), - ('X-Cache', 'HIT'), - ('x-ec-custom-error', '1'), - ('Content-Length', '1270'), - ('Connection', 'close')])) - """ - - # Standard ARC headers - ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"] - - # Since loading a range request, can only determine gzip-ness based on file extension - FORMAT_MAP = { - '.warc.gz': ('warc', True), - '.arc.gz': ('arc', True), - '.warc': ('warc', False), - '.arc': ('arc', False), - } - - @staticmethod - def create_default_loaders(hmac = None): - http = HttpLoader(hmac) - file = FileLoader() - return { - 'http': http, - 'https': http, - 'file': file, - '': file - } - - - def __init__(self, loaders = {}, hmac = None, chunk_size = 8192): - self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders(hmac) - self.chunk_size = chunk_size - - self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS) - self.warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18']) - self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1']) - - def load(self, url, offset, length): - url_parts = urlparse.urlsplit(url) - - loader = self.loaders.get(url_parts.scheme) - if not loader: - raise wbexceptions.UnknownLoaderProtocolException(url) - - the_format = None - - for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems(): - if url.endswith(ext): - the_format = iformat - break - - if the_format is None: - raise wbexceptions.UnknownArchiveFormatException(url) - - (a_format, is_gzip) = the_format - - decomp = utils.create_decompressor() if is_gzip else None - - try: - length = int(length) - except: 
- length = -1 - - - raw = loader.load(url, long(offset), length) - - stream = LineReader(raw, length, self.chunk_size, decomp) - - if a_format == 'arc': - rec_headers = self.arc_parser.parse(stream) - rec_type = 'response' - empty = (rec_headers.get_header('length') == 0) - - elif a_format == 'warc': - rec_headers = self.warc_parser.parse(stream) - rec_type = rec_headers.get_header('WARC-Type') - empty = (rec_headers.get_header('Content-Length') == '0') - - # special case: empty w/arc record (hopefully a revisit) - if empty: - status_headers = StatusAndHeaders('204 No Content', []) - - # special case: warc records that are not expected to have http headers - # attempt to add 200 status and content-type - elif rec_type == 'metadata' or rec_type == 'resource': - status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.get_header('Content-Type'))]) - - # special case: http 0.9 response, no status or headers - #elif rec_type == 'response': - # content_type = rec_headers.get_header('Content-Type') - # if content_type and (';version=0.9' in content_type): - # status_headers = StatusAndHeaders('200 OK', []) - - # response record: parse HTTP status and headers! - else: - #(statusline, http_headers) = self.parse_http_headers(stream) - status_headers = self.http_parser.parse(stream) - - return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers) - - -#================================================================= -class StatusAndHeadersParser: - def __init__(self, statuslist): - self.statuslist = statuslist - - def parse(self, stream): - statusline = stream.readline().rstrip() - - protocol_status = utils.split_prefix(statusline, self.statuslist) - - if not protocol_status: - raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline) - - headers = [] - - line = stream.readline().rstrip() - while line and line != '\r\n': - name, value = line.split(':', 1) - header = (name, value.strip()) - headers.append(header) - line = stream.readline().rstrip() - - return StatusAndHeaders(statusline = protocol_status[1].strip(), headers = headers, protocol = protocol_status[0]) - -#================================================================= -class ARCHeadersParser: - def __init__(self, headernames): - self.headernames = headernames - - - def parse(self, stream): - headerline = stream.readline().rstrip() - - parts = headerline.split() - - headernames = self.headernames - - if len(parts) != len(headernames): - raise wbexceptions.InvalidArchiveRecordException('Wrong # of heaeders, expected arc headers {0}, Found {1}'.format(headernames, parts)) - - headers = [] - - for name, value in itertools.izip(headernames, parts): - headers.append((name, value)) - - return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0') - -#================================================================= -class LineReader: - def __init__(self, stream, max_len = 0, chunk_size = 1024, decomp = None): - self.stream = stream - self.chunk_size = chunk_size - self.decomp = decomp - self.buff = None - self.num_read = 0 - self.max_len = max_len - - def _fillbuff(self, chunk_size = None): - if not chunk_size: - chunk_size = self.chunk_size - - if not self.buff or self.buff.pos >= self.buff.len: - to_read = min(self.max_len - self.num_read, self.chunk_size) if (self.max_len > 0) else self.chunk_size - data = self.stream.read(to_read) - self._process_read(data) - - def _process_read(self, data): - if self.decomp and data: - try: - data = 
self.decomp.decompress(data) - except Exception: - # if first read attempt, assume non-gzipped stream - if self.num_read == 0: - self.decomp = False - # otherwise (partly decompressed), something is wrong - else: - raise - - self.num_read += len(data) - self.buff = StringIO.StringIO(data) - - - def read(self, length = None): - self._fillbuff() - return self.buff.read(length) - - def readline(self, length = None): - self._fillbuff() - return self.buff.readline(length) - - def close(self): - if self.stream: - self.stream.close() - self.stream = None - - -class ChunkedDataException(Exception): - pass - - -class ChunkedLineReader(LineReader): - r""" - Properly formatted chunked data: - >>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n")); c.read()+c.read() - '1234' - - Non-chunked data: - >>> ChunkedLineReader(StringIO.StringIO("xyz123!@#")).read() - 'xyz123!@#' - - Starts like chunked data, but isn't: - >>> c=ChunkedLineReader(StringIO.StringIO("1\r\nxyz123!@#")); c.read()+c.read() - '1\r\nx123!@#' - - Chunked data cut off part way through: - >>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));c.read()+c.read() - '123412' - """ - - all_chunks_read = False - not_chunked = False - raise_chunked_data_exceptions = False # if False, we'll use best-guess fallback for parse errors - - def _fillbuff(self, chunk_size = None): - if self.not_chunked: - return LineReader._fillbuff(self, chunk_size) - - if self.all_chunks_read: - return - - if not self.buff or self.buff.pos >= self.buff.len: - length_header = self.stream.readline(64) - data = '' - - try: - # decode length header - try: - chunk_size = int(length_header.strip().split(';')[0], 16) - except ValueError: - raise ChunkedDataException("Couldn't decode length header '%s'" % length_header) - - if chunk_size: - # read chunk - while len(data) < chunk_size: - new_data = self.stream.read(chunk_size - len(data)) - - # if we unexpectedly run out of data, either raise an exception or just stop reading, assuming file was cut off - if not new_data: - if self.raise_chunked_data_exceptions: - raise ChunkedDataException("Ran out of data before end of chunk") - else: - chunk_size = len(data) - self.all_chunks_read = True - - data += new_data - - # if we successfully read a block without running out, it should end in \r\n - if not self.all_chunks_read: - clrf = self.stream.read(2) - if clrf != '\r\n': - raise ChunkedDataException("Chunk terminator not found.") - - if self.decomp: - data = self.decomp.decompress(data) - else: - # chunk_size 0 indicates end of file - self.all_chunks_read = True - data = '' - - self._process_read(data) - except ChunkedDataException: - if self.raise_chunked_data_exceptions: - raise - # Can't parse the data as chunked. 
- # It's possible that non-chunked data is set with a Transfer-Encoding: chunked - # Treat this as non-chunk encoded from here on - self._process_read(length_header + data) - self.not_chunked = True - - -#================================================================= -import utils -if __name__ == "__main__" or utils.enable_doctests(): - import os - import pprint - - testloader = ArchiveLoader() - - def load_test_archive(test_file, offset, length): - path = utils.test_data_dir() + 'warcs/' + test_file - - archive = testloader.load(path, offset, length) - pprint.pprint((archive.type, archive.rec_headers, archive.status_headers)) - - def test_multiple_reads(reader, inc_reads): - result = None - for x in inc_reads: - result = reader.read(x) - return result - - import doctest - doctest.testmod() - diff --git a/pywb/binsearch/binsearch.py b/pywb/binsearch/binsearch.py deleted file mode 100644 index 2d7646d9..00000000 --- a/pywb/binsearch/binsearch.py +++ /dev/null @@ -1,123 +0,0 @@ -from collections import deque -import os -import itertools - -#================================================================= -# Binary Search over a text file -#================================================================= -class FileReader: - """ - A very simple file-like object wrapper that knows it's size - getsize() method returns the filesize - """ - def __init__(self, filename): - self.fh = open(filename, 'rb') - self.filename = filename - self.size = os.path.getsize(filename) - - def getsize(self): - return self.size - - def readline(self): - return self.fh.readline() - - def seek(self, offset): - return self.fh.seek(offset) - - def close(self): - return self.fh.close() - - -#================================================================= -def binsearch_offset(reader, key, compare_func=cmp, block_size=8192): - """ - Find offset of the full line which matches a given 'key' using binary search - If key is not found, the offset is of the line after the key - - File is subdivided into block_size (default 8192) sized blocks - Optional compare_func may be specified - """ - min = 0 - max = reader.getsize() / block_size - - while (max - min > 1): - mid = min + ((max - min) / 2) - reader.seek(mid * block_size) - - if mid > 0: - reader.readline() # skip partial line - - line = reader.readline() - - if compare_func(key, line) > 0: - min = mid - else: - max = mid - - return (min * block_size) - - -def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192): - """ - Perform a binsearch for a specified key down to block_size (8192) sized blocks, - followed by linear search within the block to find first matching line. - - When performing linear search, keep track of up to N previous lines before - first matching line. 
- """ - min = binsearch_offset(reader, key, compare_func, block_size) - - reader.seek(min) - - if min > 0: - reader.readline() # skip partial line - - if prev_size > 1: - prev_deque = deque(maxlen = prev_size) - - line = None - - while True: - line = reader.readline() - if not line: - break - if compare_func(line, key) >= 0: - break - - if prev_size == 1: - prev = line - elif prev_size > 1: - prev_deque.append(line) - - def gen_iter(line): - if prev_size == 1: - yield prev.rstrip() - elif prev_size > 1: - for i in prev_deque: - yield i.rstrip() - - while line: - yield line.rstrip() - line = reader.readline() - - return gen_iter(line) - - -# Iterate over prefix matches -def iter_prefix(reader, key): - """ - Creates an iterator which iterates over prefix matches for a key in a sorted text file - A line matches as long as it starts with key - """ - - return itertools.takewhile(lambda line: line.startswith(key), search(reader, key)) - - -def iter_exact(reader, key, token=' '): - """ - Create an iterator which iterates over exact matches for a key in a sorted text file - Key is terminated by a token (default ' ') - """ - - return iter_prefix(reader, key + token) - diff --git a/pywb/cdx/README.md b/pywb/cdx/README.md new file mode 100644 index 00000000..26a41eb1 --- /dev/null +++ b/pywb/cdx/README.md @@ -0,0 +1,36 @@ +## PyWb CDX v0.2 + +[![Build Status](https://travis-ci.org/ikreymer/pywb_cdx.png?branch=master)](https://travis-ci.org/ikreymer/pywb_cdx) + + +This package contains the CDX processing suite of the pywb wayback tool suite. + +The CDX Server loads, filters and transforms cdx from multiple sources in response +to a given query. + +### Installation and Tests + +`pip install -r requirements` -- to install + +`python run-tests.py` -- to run all tests + + +### Sample App + +A very simple reference WSGI app is included. + +Run: `python -m pywb_cdx.wsgi_cdxserver` to start the app, keyboard interrupt to stop. 
+
+The default [config.yaml](config.yaml) points to the sample data directory
+and uses port 8090
+
+### CDX Server API Reference
+
+Goal is to provide compatibility with this feature set and more:
+https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
+
+TODO
+
+
+
+
diff --git a/pywb/binsearch/__init__.py b/pywb/cdx/__init__.py
similarity index 100%
rename from pywb/binsearch/__init__.py
rename to pywb/cdx/__init__.py
diff --git a/pywb/cdxserver/cdxobject.py b/pywb/cdx/cdxobject.py
similarity index 61%
rename from pywb/cdxserver/cdxobject.py
rename to pywb/cdx/cdxobject.py
index 804e3492..ac3975b2 100644
--- a/pywb/cdxserver/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@@ -1,25 +1,31 @@
 from collections import OrderedDict
 import itertools
 
+
 #=================================================================
 class CDXObject(OrderedDict):
     CDX_FORMATS = [
         # Public CDX Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "length"],
 
         # CDX 11 Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "robotflags", "length", "offset", "filename"],
 
         # CDX 9 Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "offset", "filename"],
 
         # CDX 11 Format + 3 revisit resolve fields
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
-         "orig.length","orig.offset","orig.filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "robotflags", "length", "offset", "filename",
+         "orig.length", "orig.offset", "orig.filename"],
 
         # CDX 9 Format + 3 revisit resolve fields
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
-         "orig.length","orig.offset","orig.filename"]
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "offset", "filename",
+         "orig.length", "orig.offset", "orig.filename"]
     ]
 
     def __init__(self, cdxline):
@@ -53,5 +59,3 @@ class CDXObject(OrderedDict):
         li = itertools.imap(lambda (n, val): val, self.items())
 
         return ' '.join(li)
-
-
diff --git a/pywb/cdxserver/cdxops.py b/pywb/cdx/cdxops.py
similarity index 79%
rename from pywb/cdxserver/cdxops.py
rename to pywb/cdx/cdxops.py
index 28d94a07..d18cc8fa 100644
--- a/pywb/cdxserver/cdxops.py
+++ b/pywb/cdx/cdxops.py
@@ -1,8 +1,6 @@
 from cdxobject import CDXObject
+from pywb.utils.timeutils import timestamp_to_sec
 
-from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
-
-import timeutils
 import bisect
 import itertools
 import re
@@ -11,7 +9,6 @@
 from heapq import merge
 from collections import deque
 
-
 #=================================================================
 def cdx_text_out(cdx, fields):
     if not fields:
@@ -26,30 +23,31 @@ def cdx_load(sources, params):
 
     cdx_iter = make_cdx_iter(cdx_iter)
 
-    resolve_revisits = params.get('resolve_revisits', False)
-    if resolve_revisits:
-        cdx_iter = cdx_resolve_revisits(cdx_iter)
+    if not params.get('proxy_all'):
+        resolve_revisits = params.get('resolve_revisits', False)
+        if resolve_revisits:
+            cdx_iter = cdx_resolve_revisits(cdx_iter)
-    filters = params.get('filter', None)
-    if filters:
-        cdx_iter = cdx_filter(cdx_iter, filters)
+        filters = params.get('filter', None)
+        if filters:
+            cdx_iter = cdx_filter(cdx_iter, filters)
 
-    collapse_time = params.get('collapse_time', None)
-    if collapse_time:
-        cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
+        collapse_time = params.get('collapse_time', None)
+        if collapse_time:
+            cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
 
-    limit = int(params.get('limit', 1000000))
+        limit = int(params.get('limit', 1000000))
 
-    reverse = params.get('reverse', False)
-    if reverse:
-        cdx_iter = cdx_reverse(cdx_iter, limit)
+        reverse = params.get('reverse', False)
+        if reverse:
+            cdx_iter = cdx_reverse(cdx_iter, limit)
 
-    closest_to = params.get('closest_to', None)
-    if closest_to:
-        cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
+        closest_to = params.get('closest', None)
+        if closest_to:
+            cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
 
-    if limit:
-        cdx_iter = cdx_limit(cdx_iter, limit)
+        if limit:
+            cdx_iter = cdx_limit(cdx_iter, limit)
 
     # output raw cdx objects
     if params.get('output') == 'raw':
@@ -73,6 +71,7 @@ def load_cdx_streams(sources, params):
     merged_stream = merge(*(source_iters))
     return merged_stream
 
+
 #=================================================================
 # convert text cdx stream to CDXObject
 def make_cdx_iter(text_iter):
@@ -98,7 +97,7 @@
         return [last] if last else []
 
-    reverse_cdxs = deque(maxlen = limit)
+    reverse_cdxs = deque(maxlen=limit)
 
     for cdx in cdx_iter:
         reverse_cdxs.appendleft(cdx)
@@ -142,14 +141,13 @@
     filters = map(Filter, filter_strings)
 
     for cdx in cdx_iter:
-        if all (x(cdx) for x in filters):
+        if all(x(cdx) for x in filters):
             yield cdx
 
-
 #=================================================================
 # collapse by timestamp and status code
-def cdx_collapse_time_status(cdx_iter, timelen = 10):
+def cdx_collapse_time_status(cdx_iter, timelen=10):
     timelen = int(timelen)
 
     last_token = None
@@ -163,16 +161,15 @@
 
             yield cdx
 
-
 #=================================================================
 # sort CDXCaptureResult by closest to timestamp
-def cdx_sort_closest(closest, cdx_iter, limit = 10):
+def cdx_sort_closest(closest, cdx_iter, limit=10):
     closest_cdx = []
 
-    closest_sec = timeutils.timestamp_to_sec(closest)
+    closest_sec = timestamp_to_sec(closest)
 
     for cdx in cdx_iter:
-        sec = timeutils.timestamp_to_sec(cdx['timestamp'])
+        sec = timestamp_to_sec(cdx['timestamp'])
         key = abs(closest_sec - sec)
 
         # create tuple to sort by key
@@ -186,22 +183,22 @@
         if len(closest_cdx) > limit:
             closest_cdx.pop()
 
     return itertools.imap(lambda x: x[1], closest_cdx)
 
-
#=================================================================
 # resolve revisits
 
 # Fields to append from cdx original to revisit
 ORIG_TUPLE = ['length', 'offset', 'filename']
 
+
 def cdx_resolve_revisits(cdx_iter):
     originals = {}
 
     for cdx in cdx_iter:
-        is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
+        is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
+                      (cdx['filename'] == '-'))
 
         digest = cdx['digest']
 
@@ -210,7 +207,6 @@
         if not original_cdx and not is_revisit:
             originals[digest] = cdx
 
-
         if original_cdx and is_revisit:
             fill_orig = lambda field: original_cdx[field]
             # Transfer mimetype and statuscode
@@ -224,5 +220,3 @@
                 cdx['orig.' + field] = fill_orig(field)
 
         yield cdx
-
-
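Note: the refactored `cdx_load` above applies each query operation as a chained
generator filter over the merged source stream. A minimal sketch of driving the
pipeline directly, outside the WSGI app (the params dict here is illustrative;
`CDXServer.load_cdx` normally builds it from a request, and `key` must already
be a canonicalized surt key):

    import sys
    from pywb import get_test_dir
    from pywb.cdx.cdxops import cdx_load
    from pywb.cdx.cdxsource import CDXFile

    # one local .cdx source; 'key' is the canonicalized (surt) lookup key
    sources = [CDXFile(get_test_dir() + 'cdx/iana.cdx')]
    params = {'key': 'org,iana)/', 'output': 'text',
              'resolve_revisits': True, 'limit': 3}

    # each matching cdx line passes through the filter chain lazily
    for line in cdx_load(sources, params):
        sys.stdout.write(line)
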
diff --git a/pywb/cdxserver/cdxserver.py b/pywb/cdx/cdxserver.py
similarity index 53%
rename from pywb/cdxserver/cdxserver.py
rename to pywb/cdx/cdxserver.py
index 82697167..22a14ee0 100644
--- a/pywb/cdxserver/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@@ -1,5 +1,4 @@
 import surt
-from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
 
 from cdxops import cdx_load
 import itertools
@@ -7,39 +6,21 @@
 import logging
 import os
 import urlparse
 
+from cdxsource import CDXSource, CDXFile, RemoteCDXSource
 
-#=================================================================
-class CDXFile:
-    def __init__(self, filename):
-        self.filename = filename
-
-    def load_cdx(self, params):
-        source = FileReader(self.filename)
-
-        match_type = params.get('match_type')
-
-        if match_type == 'prefix':
-            iter_func = iter_prefix
-        else:
-            iter_func = iter_exact
-
-        key = params.get('key')
-
-        return iter_func(source, key)
-
-    def __str__(self):
-        return 'CDX File - ' + self.filename
 
 #=================================================================
 class CDXException(Exception):
-    def __init__(self, msg, url = None):
-        Exception.__init__(self, msg)
-        self.url = url
-
     def status(self):
         return '400 Bad Request'
 
 
+#=================================================================
+class AccessException(CDXException):
+    def status(self):
+        return '403 Access Denied'
+
+
 #=================================================================
 class CDXServer:
     """
@@ -47,33 +28,51 @@ class CDXServer:
     responds to queries and dispatches to the cdx ops for processing
     """
 
-    def __init__(self, sources, surt_ordered = True):
+    @staticmethod
+    def create_from_config(config):
+        paths = config.get('index_paths')
+        surt_ordered = config.get('surt_ordered', True)
+        return CDXServer(paths, surt_ordered)
+
+    def __init__(self, sources, surt_ordered=True):
         self.sources = []
         self.surt_ordered = surt_ordered
+        logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
+        if not isinstance(sources, list):
+            sources = [sources]
+
         for src in sources:
-            if os.path.isdir(src):
-                for file in os.listdir(src):
-                    self.add_cdx_loader(src + file)
-            else:
-                self.add_cdx_loader(src)
+            if isinstance(src, CDXSource):
+                self.add_cdx_source(src)
+            elif isinstance(src, str):
+                if os.path.isdir(src):
+                    for file in os.listdir(src):
+                        self.add_cdx_source(src + file)
+                else:
+                    self.add_cdx_source(src)
 
         if len(self.sources) == 0:
-            logging.exception('No CDX Sources Found!')
+            logging.exception('No CDX Sources Found from: ' + str(sources))
 
-    def add_cdx_loader(self, filename):
-        source = self.create_cdx_loader(filename)
-        if not source:
-            return
+    def add_cdx_source(self, source):
+        if not isinstance(source, CDXSource):
+            source = self.create_cdx_source(source)
+            if not source:
+                return
 
         logging.debug('Adding CDX Source: ' + str(source))
         self.sources.append(source)
 
     @staticmethod
-    def create_cdx_loader(filename):
+    def create_cdx_source(filename):
+        if filename.startswith('http://') or filename.startswith('https://'):
+            return RemoteCDXSource(filename)
+
         if filename.endswith('.cdx'):
             return CDXFile(filename)
 
+        return None
 
         #TODO: support zipnum
         #elif filename.endswith('.summary')
@@ -81,27 +80,52 @@ class CDXServer:
         #elif filename.startswith('redis://')
        #    return RedisCDXSource(filename)
 
     def load_cdx(self, **params):
-        # canonicalize to surt (canonicalization is part of surt conversion)
+        # if key not set, assume 'url' is set and needs canonicalization
+        if not params.get('key'):
+            params['key'] = self._canonicalize(params)
+
+        self._convert_old_style(params)
+
+        return cdx_load(self.sources, params)
+
+    def _canonicalize(self, params):
+        """
+        Canonicalize url and convert to surt
+        If no surt-mode, convert back to url form
+        as surt conversion is currently part of canonicalization
+        """
         try:
             url = params['url']
         except KeyError:
-            raise CDXException('The url= param must be specified to query the cdx server')
+            msg = 'A url= param must be specified to query the cdx server'
+            raise CDXException(msg)
 
         try:
             key = surt.surt(url)
        except Exception as e:
-            raise CDXException('Invalid url: ', url)
+            raise CDXException('Invalid Url: ' + url)
 
         # if not surt, unsurt the surt to get canonicalized non-surt url
         if not self.surt_ordered:
             key = unsurt(key)
 
-        params['key'] = key
+        return key
 
-        return cdx_load(self.sources, params)
+    def _convert_old_style(self, params):
+        """
+        Convert old-style CDX Server param semantics
+        """
+        collapse_time = params.get('collapseTime')
+        if collapse_time:
+            params['collapse_time'] = collapse_time
 
+        resolve_revisits = params.get('resolveRevisits')
+        if resolve_revisits:
+            params['resolve_revisits'] = resolve_revisits
+
+        if params.get('sort') == 'reverse':
+            params['reverse'] = True
 
     def load_cdx_from_request(self, env):
         #url = wbrequest.wb_url.url
@@ -113,7 +137,8 @@
         params['output'] = 'text'
 
         # parse_qs produces arrays for single values
-        # cdxreader expects singleton params for all except filters, so convert here
+        # cdx processing expects singleton params for all params,
+        # except filters, so convert here
         # use first value of the list
         for name, val in params.iteritems():
             if name != 'filter':
@@ -122,13 +147,10 @@
 
         cdx_lines = self.load_cdx(**params)
         return cdx_lines
-
-
     def __str__(self):
         return 'load cdx indexes from ' + str(self.sources)
 
-
 #=================================================================
 def unsurt(surt):
     """
@@ -141,7 +163,8 @@
     'com,example)'
 
     # Long surt
-    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
+    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
+index.html?a=b?c=)/')
     'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
     """
@@ -158,3 +181,6 @@
 
     return surt
 
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py
new file mode 100644
index 00000000..addd60f7
--- /dev/null
+++ b/pywb/cdx/cdxsource.py
@@ -0,0 +1,94 @@
+from pywb.utils.binsearch import iter_exact, iter_prefix
+from pywb.utils.loaders import SeekableTextFileReader
+
+import urllib
+import urllib2
+
+
+#=================================================================
+class CDXSource(object):
+    """
+    Represents any cdx index source
+    """
+    def load_cdx(self, params):
+        raise NotImplementedError('Implement in subclass')
+
+
+#=================================================================
+class CDXFile(CDXSource):
+    """
+    Represents a local plain-text .cdx file
+    """
+    def __init__(self, filename):
+        self.filename = filename
+
+    def load_cdx(self, params):
+        source = SeekableTextFileReader(self.filename)
+
+        match_type = params.get('match_type')
+
+        if match_type == 'prefix':
+            iter_func = iter_prefix
+        else:
+            iter_func = iter_exact
+
+        key = params.get('key')
+
+        return iter_func(source, key)
+
+    def __str__(self):
+        return 'CDX File - ' + self.filename
+
+
+#=================================================================
+class RemoteCDXSource(CDXSource):
+    """
+    Represents a remote cdx server, to which requests will be proxied.
+
+    Only url and match type params are proxied at this time,
+    the stream is passed through all other filters locally.
+    """
+    def __init__(self, filename, cookie=None, proxy_all=True):
+        self.remote_url = filename
+        self.cookie = cookie
+        self.proxy_all = proxy_all
+
+    def load_cdx(self, proxy_params):
+        if self.proxy_all:
+            params = proxy_params
+            params['proxy_all'] = True
+        else:
+            # Only send url and matchType params to remote
+            params = {}
+            params['url'] = proxy_params['url']
+            match_type = proxy_params.get('match_type')
+
+            if match_type:
+                params['matchType'] = match_type
+
+        urlparams = urllib.urlencode(params, True)
+
+        try:
+            request = urllib2.Request(self.remote_url, urlparams)
+
+            if self.cookie:
+                request.add_header('Cookie', self.cookie)
+
+            response = urllib2.urlopen(request)
+
+        except urllib2.HTTPError as e:
+            if e.code == 403:
+                # deferred import to avoid a circular import with cdxserver
+                from pywb.cdx.cdxserver import AccessException
+                exc_msg = e.read()
+                msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg
+                       else 'Excluded')
+
+                raise AccessException(msg)
+            else:
+                raise
+
+        return iter(response)
+
+    def __str__(self):
+        return 'Remote CDX Server: ' + self.remote_url
diff --git a/pywb/cdx/config.yaml b/pywb/cdx/config.yaml
new file mode 100644
index 00000000..2aa4838f
--- /dev/null
+++ b/pywb/cdx/config.yaml
@@ -0,0 +1,3 @@
+#CDX Server WSGI App Config
+index_paths: ./sample_data/
+port: 8090
diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py
new file mode 100644
index 00000000..88ed74dc
--- /dev/null
+++ b/pywb/cdx/test/cdxserver_test.py
@@ -0,0 +1,163 @@
+#=================================================================
+"""
+# Merge Sort Multiple CDX Sources
+>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
+org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
+org,iana)/ 20140127171238
http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz +org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz + + +# Limit CDX Stream +>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3) +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz + + +# Reverse CDX Stream +>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3) +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz + +>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1) +org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz + +# No matching results +>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2) + + +# Filter cdx +>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html']) +org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz +org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz +org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz +org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz +org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz 
+org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + + +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200') +org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz + + +# Collapse by timestamp +# unresolved revisits, different statuscode results in an extra repeat +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11) +org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz +org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz +org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz + +# resolved revisits +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True) +org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - +org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz + + +# Sort by closest timestamp + field select output +>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10) +20140126200826 +20140126200816 +20140126200805 +20140126200912 +20140126200738 +20140126200930 +20140126200718 +20140126200706 +20140126200654 +20140126200625 + +>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) +org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - +org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - + + +>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True) +org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - +org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - + +# equal dist prefer earlier +>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2) +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz + +>>> cdx_ops_test(closest = '20140126200659', url = 
'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp') +20140126200654 +20140126200706 + +>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp') +20140126200706 +20140126200654 + + +# Resolve Revisits +>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True) +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - - +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz + +>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True) +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - + + +# CDX Server init +>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw') +>>> pprint.pprint(x.next().items()) +[('urlkey', 'com,example)/'), + ('timestamp', '20140127171200'), + ('original', 'http://example.com'), + ('mimetype', 'text/html'), + ('statuscode', '200'), + ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'), + ('redirect', '-'), + ('robotflags', '-'), + ('length', '1046'), + ('offset', '334'), + ('filename', 'dupes.warc.gz')] + +# NOTE: external dependency -- need self-contained test +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +>>> pprint.pprint(x.next().items()) +[('urlkey', 'com,example)/'), + ('timestamp', '20020120142510'), + ('original', 'http://example.com:80/'), + ('mimetype', 'text/html'), + ('statuscode', '200'), + ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), + ('length', '1792')] + +""" + +#================================================================= +from pywb.cdx.cdxserver import CDXServer +import os +import sys +import pprint + +from pywb import get_test_dir +#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/' +test_cdx_dir = get_test_dir() + 'cdx/' + +def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): + kwparams['url'] = url + kwparams['output'] = 'text' + + server = CDXServer(sources) + results = server.load_cdx(**kwparams) + + for x in results: + sys.stdout.write(x) + + + +if __name__ == "__main__": + import doctest + 
doctest.testmod() + + diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py new file mode 100644 index 00000000..ecf64b8b --- /dev/null +++ b/pywb/cdx/wsgi_cdxserver.py @@ -0,0 +1,72 @@ +from cdxserver import CDXServer +import logging +import os +import yaml +import pkgutil + +#================================================================= +TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/' + +CONFIG_FILE = 'config.yaml' + +DEFAULT_PORT = 8080 + +if __package__: + config = pkgutil.get_data(__package__, CONFIG_FILE) + config = yaml.load(config) +else: + config = None + + +#================================================================= +def main(): + logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', + level=logging.DEBUG) + + cdx_config = config.get('index_paths') if config else None + + if not cdx_config: + cdx_config = [TEST_CDX_DIR] + + cdxserver = CDXServer(cdx_config) + + def application(env, start_response): + try: + response = cdxserver.load_cdx_from_request(env) + start_response('200 OK', [('Content-Type', 'text/plain')]) + + response = list(response) + + except Exception as exc: + import traceback + err_details = traceback.format_exc(exc) + start_response('400 Error', [('Content-Type', 'text/plain')]) + response = [str(exc)] + print err_details + + return response + + return application + + +if __name__ == "__main__": + from wsgiref.simple_server import make_server + + app = main() + + port = DEFAULT_PORT + if config: + port = config.get('port', DEFAULT_PORT) + + httpd = make_server('', port, app) + + logging.debug('Starting CDX Server on port ' + str(port)) + + try: + httpd.serve_forever() + except KeyboardInterrupt: + pass + + logging.debug('Stopping CDX Server') +else: + application = main() diff --git a/pywb/cdxserver/cdxapp.py b/pywb/cdxserver/cdxapp.py deleted file mode 100644 index 15488582..00000000 --- a/pywb/cdxserver/cdxapp.py +++ /dev/null @@ -1,42 +0,0 @@ -from cdxserver import CDXServer -import logging -import os - - -test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../sample_archive/cdx/' - -#================================================================= -def main(config = None): - logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) - - if not config: - config = [test_cdx_dir] - - cdxserver = CDXServer(config) - - def application(env, start_response): - try: - response = cdxserver.load_cdx_from_request(env) - start_response('200 OK', [('Content-Type', 'text/plain')]) - - response = list(response) - - except Exception as exc: - import traceback - err_details = traceback.format_exc(exc) - start_response('400 Error', [('Content-Type', 'text/plain')]) - response = [str(exc)] - print err_details - - return response - - - return application - - -if __name__ == "__main__": - pass -else: - application = main() - - diff --git a/pywb/config_utils.py b/pywb/config_utils.py index 999eba75..672e8735 100644 --- a/pywb/config_utils.py +++ b/pywb/config_utils.py @@ -1,59 +1,34 @@ -import archiveloader import views import handlers -import indexreader import replay_views -import replay_resolvers import logging -import hmac -import time + +from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.warc.resolvingloader import ResolvingLoader +from pywb.rewrite.rewrite_content import RewriteContent #================================================================= # Config Loading #================================================================= 
 def load_template_file(file, desc = None, view_class = views.J2TemplateView):
     if file:
-        logging.info('Adding {0}: {1}'.format(desc if desc else name, file))
+        logging.debug('Adding {0}: {1}'.format(desc if desc else file, file))
         file = view_class(file)
 
     return file
 
 
 #=================================================================
-# Cookie Signing
-#=================================================================
+def create_wb_handler(cdx_server, config):
 
-class HMACCookieMaker:
-    def __init__(self, key, name):
-        self.key = key
-        self.name = name
+    record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
+    paths = config.get('archive_paths')
 
-    def __call__(self, duration, extra_id = ''):
-        expire = str(long(time.time() + duration))
+    resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
 
-        if extra_id:
-            msg = extra_id + '-' + expire
-        else:
-            msg = expire
+    replayer = replay_views.ReplayView(
+        content_loader = resolving_loader,
 
-        hmacdigest = hmac.new(self.key, msg)
-        hexdigest = hmacdigest.hexdigest()
-
-        if extra_id:
-            cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
-        else:
-            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
-
-        return cookie
-
-
-#=================================================================
-def create_wb_handler(cdx_source, config):
-
-    replayer = replay_views.RewritingReplayView(
-
-        resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
-
-        loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')),
+        content_rewriter = RewriteContent(),
 
         head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
 
@@ -66,7 +41,7 @@ def create_wb_handler(cdx_source, config):
 
     wb_handler = handlers.WBHandler(
 
-        cdx_source,
+        cdx_server,
 
         replayer,
 
diff --git a/pywb/handlers.py b/pywb/handlers.py
index 81314ea3..0061264d 100644
--- a/pywb/handlers.py
+++ b/pywb/handlers.py
@@ -1,13 +1,12 @@
-import views
-import utils
 import urlparse
-
-from wbrequestresponse import WbResponse
-from wburl import WbUrl
-from wbexceptions import WbException, NotFoundException
-
 import pkgutil
 import mimetypes
+import time
+
+from pywb.rewrite.wburl import WbUrl
+from wbrequestresponse import WbResponse
+from wbexceptions import WbException, NotFoundException
+from views import TextCapturesView
 
 
 class BaseHandler:
@@ -22,23 +21,22 @@ class BaseHandler:
 # Standard WB Handler
 #=================================================================
 class WBHandler(BaseHandler):
-    def __init__(self, cdx_reader, replay, html_view = None, search_view = None):
-        self.cdx_reader = cdx_reader
+    def __init__(self, index_reader, replay, html_view = None, search_view = None):
+        self.index_reader = index_reader
         self.replay = replay
 
-        self.text_view = views.TextCapturesView()
+        self.text_view = TextCapturesView()
 
         self.html_view = html_view
         self.search_view = search_view
 
     def __call__(self, wbrequest):
-
         if wbrequest.wb_url_str == '/':
             return self.render_search_page(wbrequest)
 
-        with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
-            cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
+        with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
+            cdx_lines = self.index_reader.load_for_request(wbrequest)
 
         # new special modifier to always show cdx index
         if wbrequest.wb_url.mod == 'cdx_':
@@ -48,8 +46,8 @@ class WBHandler(BaseHandler):
             query_view = self.html_view if self.html_view else self.text_view
             return
query_view.render_response(wbrequest, cdx_lines) - with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: - return self.replay(wbrequest, cdx_lines, self.cdx_reader) + with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: + return self.replay(wbrequest, cdx_lines) def render_search_page(self, wbrequest): @@ -60,18 +58,18 @@ class WBHandler(BaseHandler): def __str__(self): - return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay) + return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) #================================================================= # CDX-Server Handler -- pass all params to cdx server #================================================================= class CDXHandler(BaseHandler): - def __init__(self, cdx_server, view = None): - self.cdx_server = cdx_server - self.view = view if view else views.TextCapturesView() + def __init__(self, index_reader, view = None): + self.index_reader = index_reader + self.view = view if view else TextCapturesView() def __call__(self, wbrequest): - cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env) + cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env) return self.view.render_response(wbrequest, cdx_lines) @@ -81,7 +79,7 @@ class CDXHandler(BaseHandler): return None def __str__(self): - return 'CDX Server: ' + str(self.cdx_server) + return 'Index Reader: ' + str(self.index_reader) #================================================================= @@ -136,4 +134,19 @@ class DebugEchoHandler(BaseHandler): return WbResponse.text_response(str(wbrequest)) +#================================================================= +class PerfTimer: + def __init__(self, perfdict, name): + self.perfdict = perfdict + self.name = name + + def __enter__(self): + self.start = time.clock() + return self + + def __exit__(self, *args): + self.end = time.clock() + if self.perfdict is not None: + self.perfdict[self.name] = str(self.end - self.start) + diff --git a/pywb/indexreader.py b/pywb/indexreader.py index ce4c295b..959e9f67 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -1,17 +1,22 @@ import urllib import urllib2 import wbexceptions -import wbrequestresponse -from collections import OrderedDict -from cdxserver.cdxserver import CDXServer, CDXException -from cdxserver.cdxobject import CDXObject +from itertools import chain +from pprint import pprint -import logging +from pywb.cdx.cdxserver import CDXServer, CDXException +from pywb.cdx.cdxobject import CDXObject #================================================================= -class IndexReader: - def load_for_request(self, wbrequest, parsed_cdx = True): +class IndexReader(object): + def __init__(self, config): + if isinstance(config, str): + self.cdx_server = CDXServer(config) + else: + self.cdx_server = CDXServer.create_from_config(config) + + def load_for_request(self, wbrequest): wburl = wbrequest.wb_url # init standard params @@ -24,147 +29,27 @@ class IndexReader: if wbrequest.custom_params: params.update(wbrequest.custom_params) - #params['url'] = wburl.url - output = 'raw' if parsed_cdx else 'text' - + params['url'] = wburl.url try: - cdxlines = self.load_cdx(url = wburl.url, output = output, **params) + cdxlines = self.load_cdx(output='raw', **params) except CDXException: raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url) - cdxlines = utils.peek_iter(cdxlines) + cdxlines = self.peek_iter(cdxlines) if cdxlines is None: raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + 
wburl.url) - cdxlines = self.filter_cdx(wbrequest, cdxlines) - return cdxlines - def filter_cdx(self, wbrequest, cdxlines): - # Subclasses may wrap cdxlines iterator in a filter - return cdxlines + def load_cdx(self, **params): + return self.cdx_server.load_cdx(**params) - def load_cdx(self, url, params = {}, parsed_cdx = True): - raise NotImplementedError('Override in subclasses') - - @staticmethod - def make_best_cdx_source(paths, config): - # may be a string or list - surt_ordered = config.get('surt_ordered', True) - - # support mixed cdx streams and remote servers? - # for now, list implies local sources - if isinstance(paths, list): - if len(paths) > 1: - return EmbeddedCDXServer(paths, surt_ordered) - else: - # treat as non-list - paths = paths[0] - - # a single uri - uri = paths - - # Check for remote cdx server - if (uri.startswith('http://') or uri.startswith('https://')) and not uri.endswith('.cdx'): - cookie = config.get('cookie', None) - return RemoteCDXServer(uri, cookie = cookie) - else: - return EmbeddedCDXServer([uri], surt_ordered) - - - - -#================================================================= -class EmbeddedCDXServer(CDXServer, IndexReader): def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10): - if wburl.type == wburl.URL_QUERY: raise NotImplementedError('Url Query Not Yet Supported') return { - - wburl.QUERY: - {'collapse_time': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit}, - - wburl.URL_QUERY: - {}, -# raise Exception('Not Yet Implemented') -# {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit, -# 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', -# }, - - wburl.REPLAY: - {'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest_to': wburl.timestamp, 'resolve_revisits': True}, - - wburl.LATEST_REPLAY: - {'reverse': True, 'filter': 'statuscode:[23]..', 'limit': '1', 'resolve_revisits': True} - - }[wburl.type] - - - def __str__(self): - return 'load cdx indexes from ' + str(self.sources) - - - -#================================================================= -class RemoteCDXServer(IndexReader): - """ - >>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2') - >>> pprint(x.next().items()) - [('urlkey', 'com,example)/'), - ('timestamp', '20020120142510'), - ('original', 'http://example.com:80/'), - ('mimetype', 'text/html'), - ('statuscode', '200'), - ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), - ('length', '1792')] - """ - - def __init__(self, server_url, cookie = None): - self.server_url = server_url - self.auth_cookie = cookie - - def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues): - #url is required, must be passed explicitly! 
- params['url'] = url - params.update(**kwvalues) - - urlparams = urllib.urlencode(params, True) - - try: - request = urllib2.Request(self.server_url, urlparams) - - if self.auth_cookie: - request.add_header('Cookie', self.auth_cookie) - - response = urllib2.urlopen(request) - except urllib2.HTTPError, e: - if e.code == 403: - exc_msg = e.read() - msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded' - raise wbexceptions.AccessException(msg) - else: - raise - - if parsed_cdx: - return (CDXObject(cdx) for cdx in response) - else: - return iter(response) - - - # Note: this params are designed to make pywb compatible with the original Java wayback-cdx-server API: - # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server - # Soon, this will be switched over to support the native pywb cdx server - - # BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result - # with lower values if there are too many captures. Ideally, should be around 10-20 - # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make - - def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'): - return { - wburl.QUERY: {'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit}, @@ -184,18 +69,20 @@ class RemoteCDXServer(IndexReader): }[wburl.type] + @staticmethod + def peek_iter(iterable): + try: + first = next(iterable) + except StopIteration: + return None - def __str__(self): - return 'server cdx from ' + self.server_url + return chain([first], iterable) +#================================================================= +class RemoteCDXServer(IndexReader): + def __init__(self, remote_url, cookie=None): + self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True) + self.cdx_server = CDXServer(self.remote) -# Testing - -import utils -if __name__ == "__main__" or utils.enable_doctests(): - from pprint import pprint - - test_dir = utils.test_data_dir() + 'cdx/' - - import doctest - doctest.testmod() + #def load_cdx(self, **params): + #return remote.load_cdx(**params) diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index a781d601..b88c7d72 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -1,11 +1,12 @@ import handlers import indexreader import archivalrouter +import config_utils +import proxy + import os import yaml -import config_utils import logging -import proxy #================================================================= DEFAULTS = { @@ -49,24 +50,20 @@ def pywb_config_manual(passed_config = {}): collections = config.get('collections') for name, value in collections.iteritems(): - route_config = config - - if isinstance(value, dict): - # if a dict, extend with base properies - index_paths = value['index_paths'] - route_config = DictChain(value, config) + if isinstance(value, str): + route_config = config + cdx_server = indexreader.IndexReader(value) else: - index_paths = str(value) - - cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config) + route_config = DictChain(value, config) + cdx_server = indexreader.IndexReader(route_config) wb_handler = config_utils.create_wb_handler( - cdx_source = cdx_source, + cdx_server = cdx_server, config = route_config, ) - logging.info('Adding Collection: ' + name) + logging.debug('Adding Collection: ' + name) route_class = route_config.get('route_class', archivalrouter.Route) @@ -74,7 +71,7 @@ def 
pywb_config_manual(passed_config = {}): # cdx query handler if route_config.get('enable_cdx_api', False): - routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source))) + routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_server))) if config.get('debug_echo_env', False): @@ -125,11 +122,3 @@ def pywb_config(config_file = None): return pywb_config_manual(config) - -import utils -if __name__ == "__main__" or utils.enable_doctests(): - # Just test for execution for now - #pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml') - pywb_config_manual() - - diff --git a/pywb/regex_rewriters.py b/pywb/regex_rewriters.py deleted file mode 100644 index 49727350..00000000 --- a/pywb/regex_rewriters.py +++ /dev/null @@ -1,269 +0,0 @@ -import re -import sys -import itertools - -from url_rewriter import UrlRewriter - -#================================================================= -class RegexRewriter: - """ - # Test https->http converter (other tests below in subclasses) - >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') - 'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com' - """ - - @staticmethod - def comment_out(string): - return '/*' + string + '*/' - - @staticmethod - def remove_https(string): - return string.replace("https", "http") - - @staticmethod - def add_prefix(prefix): - return lambda string: prefix + string - - @staticmethod - def archival_rewrite(rewriter): - return lambda x: rewriter.rewrite(x) - - @staticmethod - def replacer(string): - return lambda x: string - - HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' - - - - DEFAULT_OP = add_prefix - - - def __init__(self, rules): - #rules = self.create_rules(http_prefix) - - # Build regexstr, concatenating regex list - regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules]) - - # ensure it's not middle of a word, wrap in non-capture group - regex_str = '(? 
0: - i += 1 - count -= 1 - - if not m.group(i): - continue - - # Optional filter to skip matches - if not self.filter(m): - return m.group(0) - - # Custom func - if not hasattr(op, '__call__'): - op = RegexRewriter.DEFAULT_OP(op) - - result = op(m.group(i)) - - # if extracting partial match - if i != full_m: - result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)] - - return result - - - -#================================================================= -class JSRewriter(RegexRewriter): - """ - >>> test_js('location = "http://example.com/abc.html"') - 'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"' - - >>> test_js(r'location = "http:\/\/example.com/abc.html"') - 'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"' - - >>> test_js(r'location = "http:\\/\\/example.com/abc.html"') - 'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"' - - >>> test_js(r"location = 'http://example.com/abc.html/'") - "WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'" - - >>> test_js(r'location = http://example.com/abc.html/') - 'WB_wombat_location = http://example.com/abc.html/' - - >>> test_js(r'location = /http:\/\/example.com/abc.html/') - 'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/' - - >>> test_js('"/location" == some_location_val; locations = location;') - '"/location" == some_location_val; locations = WB_wombat_location;' - - >>> test_js('cool_Location = "http://example.com/abc.html"') - 'cool_Location = "/web/20131010im_/http://example.com/abc.html"' - - >>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"') - 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' - - >>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') - 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' - - # custom rules added - >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)]) - 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' - - # scheme-agnostic - >>> test_js('cool_Location = "//example.com/abc.html" //comment') - 'cool_Location = "/web/20131010im_///example.com/abc.html" //comment' - - """ - - JS_HTTPX = r'(?<="|\')(?:https?:)?\\?/\\?/[A-Za-z0-9:_@.-]+' - - def __init__(self, rewriter, extra = []): - rules = self._create_rules(rewriter.get_abs_url()) - rules.extend(extra) - - RegexRewriter.__init__(self, rules) - - - def _create_rules(self, http_prefix): - return [ - (self.JS_HTTPX, http_prefix, 0), - (r'(?>> test_xml('') - '' - - >>> test_xml('') - '' - - >>> test_xml(' http://example.comabchttp://example.com') - ' /web/20131010im_/http://example.comabchttp://example.com' - - >>> test_xml('
http://www.example.com/blah http://example.com
') - '
/web/20131010im_/http://www.example.com/blah /web/20131010im_/http://example.com
' - - """ - - def __init__(self, rewriter, extra = []): - rules = self._create_rules(rewriter.get_abs_url()) - - RegexRewriter.__init__(self, rules) - - # custom filter to reject 'xmlns' attr - def filter(self, m): - attr = m.group(1) - if attr and attr.startswith('xmlns'): - return False - - return True - - def _create_rules(self, http_prefix): - return [ - ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2), - ] - -#================================================================= -class CSSRewriter(RegexRewriter): - r""" - >>> test_css("background: url('/some/path.html')") - "background: url('/web/20131010im_/http://example.com/some/path.html')" - - >>> test_css("background: url('../path.html')") - "background: url('/web/20131010im_/http://example.com/path.html')" - - >>> test_css("background: url(\"http://domain.com/path.html\")") - 'background: url("/web/20131010im_/http://domain.com/path.html")' - - >>> test_css("background: url(file.jpeg)") - 'background: url(/web/20131010im_/http://example.com/file.jpeg)' - - >>> test_css("background: url('')") - "background: url('')" - - >>> test_css("background: url (\"weirdpath\')") - 'background: url ("/web/20131010im_/http://example.com/weirdpath\')' - - >>> test_css("@import url ('path.css')") - "@import url ('/web/20131010im_/http://example.com/path.css')" - - >>> test_css("@import url('path.css')") - "@import url('/web/20131010im_/http://example.com/path.css')" - - >>> test_css("@import ( 'path.css')") - "@import ( '/web/20131010im_/http://example.com/path.css')" - - >>> test_css("@import \"path.css\"") - '@import "/web/20131010im_/http://example.com/path.css"' - - >>> test_css("@import ('../path.css\"") - '@import (\'/web/20131010im_/http://example.com/path.css"' - - >>> test_css("@import ('../url.css\"") - '@import (\'/web/20131010im_/http://example.com/url.css"' - - >>> test_css("@import (\"url.css\")") - '@import ("/web/20131010im_/http://example.com/url.css")' - - >>> test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") - '@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)' - - """ - - CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)" - CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)" - - def __init__(self, rewriter): - rules = self._create_rules(rewriter) - - RegexRewriter.__init__(self, rules) - - - def _create_rules(self, rewriter): - return [ - (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1), - (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1), - ] - -import utils -if __name__ == "__main__" or utils.enable_doctests(): - arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/') - - def test_js(string, extra = []): - return JSRewriter(arcrw, extra).rewrite(string) - - def test_xml(string): - return XMLRewriter(arcrw).rewrite(string) - - def test_css(string): - return CSSRewriter(arcrw).rewrite(string) - - - import doctest - doctest.testmod() - - - diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 78c097b2..53b60fc4 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -1,30 +1,30 @@ import StringIO -from urllib2 import URLError -import chardet -import copy -import itertools -import archiveloader -from wbrequestresponse import WbResponse, StatusAndHeaders -import utils - -from 
url_rewriter import UrlRewriter -from header_rewriter import HeaderRewriter -import html_rewriter -import regex_rewriters +from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.utils.bufferedreaders import ChunkedDataReader +from wbrequestresponse import WbResponse import wbexceptions #================================================================= class ReplayView: - def __init__(self, resolvers, loader = None, reporter = None): - self.resolvers = resolvers - self.loader = loader if loader else archiveloader.ArchiveLoader() + def __init__(self, content_loader, content_rewriter, head_insert_view = None, + redir_to_exact = True, buffer_response = False, reporter = None): + + self.content_loader = content_loader + self.content_rewriter = content_rewriter + + self.head_insert_view = head_insert_view + + self.redir_to_exact = redir_to_exact + # buffer or stream rewritten response + self.buffer_response = buffer_response + self._reporter = reporter - def __call__(self, wbrequest, cdx_lines, cdx_reader): + def __call__(self, wbrequest, cdx_lines): last_e = None first = True @@ -40,9 +40,22 @@ class ReplayView: self._redirect_if_needed(wbrequest, cdx) first = False - (cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files) + (status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files) - response = self.make_response(wbrequest, cdx, status_headers, stream) + # check and reject self-redirect + self._reject_self_redirect(wbrequest, cdx, status_headers) + + # check if redir is needed + self._redirect_if_needed(wbrequest, cdx) + + response = None + + if self.content_rewriter and wbrequest.wb_url.mod != 'id_': + response = self.rewrite_content(wbrequest, cdx, status_headers, stream) + else: + (status_headers, stream) = self.sanitize_content(status_headers, stream) + response_iter = self.stream_to_iter(stream) + response = WbResponse(status_headers, response_iter) # notify reporter callback, if any if self._reporter: @@ -62,288 +75,57 @@ class ReplayView: else: raise wbexceptions.UnresolvedArchiveFileException() - - # callback to issue a redirect to another request - # subclasses may provide custom logic - def _redirect_if_needed(self, wbrequest, cdx): - pass - - - def _load(self, cdx, revisit, failed_files): - if revisit: - (filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length']) - else: - (filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length']) - - #optimization: if same file already failed this request, don't try again - if failed_files and filename in failed_files: - raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed') - - any_found = False - last_exc = None - for resolver in self.resolvers: - possible_paths = resolver(filename) - - if possible_paths: - for path in possible_paths: - any_found = True - try: - return self.loader.load(path, offset, length) - - except Exception as ue: - last_exc = ue - print last_exc - pass - - # Unsuccessful if reached here - if failed_files: - failed_files.append(filename) - - if not any_found: - raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename) - else: - raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '') - - - def resolve_headers_and_payload(self, cdx, wbrequest, cdx_reader, failed_files): - has_curr = (cdx['filename'] != '-') - has_orig = (cdx.get('orig.filename','-') != '-') - - # load headers record from cdx['filename'] unless 
it is '-' (rare) - headers_record = self._load(cdx, False, failed_files) if has_curr else None - - # two index lookups - # Case 1: if mimetype is still warc/revisit - if cdx['mimetype'] == 'warc/revisit' and headers_record: - payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files) - - # single lookup cases - # case 2: non-revisit - elif (has_curr and not has_orig): - payload_record = headers_record - - # case 3: identical url revisit, load payload from orig.filename - elif (has_orig): - payload_record = self._load(cdx, True, failed_files) - - # special case: set header to payload if old-style revisit with missing header - if not headers_record: - headers_record = payload_record - elif headers_record != payload_record: - # close remainder of stream as this record only used for (already parsed) headers - headers_record.stream.close() - - # special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit - if not headers_record.status_headers.headers: - headers_record = payload_record - - - if not headers_record or not payload_record: - raise wbexceptions.CaptureException('Invalid CDX' + str(cdx)) - - - #response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream)) - #response._stream = payload_record.stream - return (cdx, headers_record.status_headers, payload_record.stream) - - - # done here! just return response - # subclasses make override to do additional processing - def make_response(self, wbrequest, cdx, status_headers, stream): - return self.create_stream_response(status_headers, stream) - - - # create response from headers and wrapping stream in generator - def create_stream_response(self, status_headers, stream): - return WbResponse(status_headers, self.create_stream_gen(stream)) - - - # Handle the case where a duplicate of a capture with same digest exists at a different url - # Must query the index at that url filtering by matching digest - # Raise exception if no matches found - def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files): - ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI') - - # Check for unresolved revisit error, if refers to target uri not present or same as the current url - if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')): - raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx)) - - ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date') - - if not ref_target_date: - ref_target_date = cdx['timestamp'] - else: - ref_target_date = utils.iso_date_to_timestamp(ref_target_date) - - # clone WbRequest - orig_wbreq = copy.copy(wbrequest) - orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url) - - orig_wbreq.wb_url.url = ref_target_uri - orig_wbreq.wb_url.timestamp = ref_target_date - - # Must also match digest - orig_wbreq.query_filter.append('digest:' + cdx['digest']) - - orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True) - - for cdx in orig_cdx_lines: - try: - #cdx = cdx_reader.CDXCaptureResult(cdx) - #print cdx - payload_record = self._load(cdx, False, failed_files) - return payload_record - - except wbexceptions.CaptureException as e: - pass - - raise wbexceptions.CaptureException('Original for revisit could not be loaded') - - - def resolve_full(self, filename): - # Attempt to resolve cdx file to full path - full_url = None - for resolver in 
self.resolvers: - full_url = resolver(filename) - if full_url: - return full_url - - raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename) - - - # Create a generator reading from a stream, with optional rewriting and final read call @staticmethod - def create_stream_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None): + def stream_to_iter(stream): try: - buff = first_buff if first_buff else stream.read() + buff = stream.read() while buff: - if rewrite_func: - buff = rewrite_func(buff) yield buff buff = stream.read() - # For adding a tail/handling final buffer - if final_read_func: - buff = final_read_func() - if buff: - yield buff - finally: stream.close() + def sanitize_content(self, status_headers, stream): + # remove transfer encoding chunked and wrap in a dechunking stream + if (status_headers.remove_header('transfer-encoding')): + stream = ChunkedDataReader(stream) - def __str__(self): - return 'find archive files from ' + str(self.resolvers) - -#================================================================= -class RewritingReplayView(ReplayView): - - def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None): - ReplayView.__init__(self, resolvers, loader, reporter) - self.head_insert_view = head_insert_view - self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter() - self.redir_to_exact = redir_to_exact - - # buffer or stream rewritten response - self.buffer_response = buffer_response - - - def _text_content_type(self, content_type): - for ctype, mimelist in self.REWRITE_TYPES.iteritems(): - if any ((mime in content_type) for mime in mimelist): - return ctype - - return None - - - def make_response(self, wbrequest, cdx, status_headers, stream): - # check and reject self-redirect - self._reject_self_redirect(wbrequest, cdx, status_headers) - - # check if redir is needed - self._redirect_if_needed(wbrequest, cdx) + return (status_headers, stream) + def rewrite_content(self, wbrequest, cdx, status_headers, stream): urlrewriter = wbrequest.urlrewriter - rewritten_headers = self.header_rewriter.rewrite(status_headers, urlrewriter) + (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream) - # de_chunking in case chunk encoding is broken - # TODO: investigate further - de_chunk = False - - # handle transfer-encoding: chunked - if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')): - stream = archiveloader.ChunkedLineReader(stream) - de_chunk = True - - # transparent, though still may need to dechunk - if wbrequest.wb_url.mod == 'id_': - if de_chunk: - status_headers.remove_header('transfer-encoding') - - return self.create_stream_response(status_headers, stream) - - # non-text content type, just send through with rewritten headers - # but may need to dechunk + # no rewriting needed! if rewritten_headers.text_type is None: - status_headers = rewritten_headers.status_headers + response_iter = self.stream_to_iter(stream) + return WbResponse(rewritten_headers.status_headers, response_iter) - return self.create_stream_response(status_headers, stream) - - # Handle text rewriting - - # special case -- need to ungzip the body - if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')): - stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor()) - - # TODO: is this right? 
- if rewritten_headers.charset: - encoding = rewritten_headers.charset - first_buff = None + # do head insert + if self.head_insert_view: + head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) else: - (encoding, first_buff) = self._detect_charset(stream) + head_insert_str = None - # if chardet thinks its ascii, use utf-8 - if encoding == 'ascii': - #encoding = None - encoding = 'utf-8' - - # Buffering response for html, streaming for others? - #if rewritten_headers.text_type == 'html': - # return self._rewrite_html(encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff) - #else: - # return self._rewrite_other(rewritten_headers.text_type, encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff) - - text_type = rewritten_headers.text_type - status_headers = rewritten_headers.status_headers - - if text_type == 'html': - head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None - rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str) - elif text_type == 'css': - rewriter = regex_rewriters.CSSRewriter(urlrewriter) - elif text_type == 'js': - rewriter = regex_rewriters.JSRewriter(urlrewriter) - elif text_type == 'xml': - rewriter = regex_rewriters.XMLRewriter(urlrewriter) - else: - raise Exception('Unknown Text Type for Rewrite: ' + text_type) - - # Create generator for response - response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff) + (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str) if self.buffer_response: - return self._create_buffer_response(status_headers, response_gen) - else: - return WbResponse(status_headers, value = response_gen) + if wbrequest.wb_url.mod == 'id_': + status_headers.remove_header('content-length') + + return self.buffered_response(status_headers, response_gen) + + return WbResponse(status_headers, response_gen) - # Buffer rewrite generator and return a response from a string - def _create_buffer_response(self, status_headers, generator): + # Buffer rewrite iterator and return a response from a string + def buffered_response(self, status_headers, iterator): out = StringIO.StringIO() try: - for buff in generator: + for buff in iterator: out.write(buff) finally: @@ -355,53 +137,9 @@ class RewritingReplayView(ReplayView): return WbResponse(status_headers, value = [content]) - # Create rewrite response from record (no Content-Length), may even be chunked by front-end - def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None): - def do_rewrite(buff): - if encoding: - buff = self._decode_buff(buff, stream, encoding) - - buff = rewriter.rewrite(buff) - - if encoding: - buff = buff.encode(encoding) - - return buff - - def do_finish(): - return rewriter.close() - - return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff) - - - def _decode_buff(self, buff, stream, encoding): - try: - buff = buff.decode(encoding) - except UnicodeDecodeError, e: - # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry - for i in range(3): - buff += stream.read(1) - try: - buff = buff.decode(encoding) - break - except UnicodeDecodeError: - pass - else: - raise - - return buff - - - def _detect_charset(self, stream): - buff = stream.read(8192) - result = chardet.detect(buff) - print "chardet result: " + 
str(result) - return (result['encoding'], buff) - def _redirect_if_needed(self, wbrequest, cdx): - is_proxy = wbrequest.is_proxy - if self.redir_to_exact and not is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): + if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original']) raise wbexceptions.InternalRedirect(new_url) diff --git a/pywb/rewrite/README.md b/pywb/rewrite/README.md new file mode 100644 index 00000000..dc658ea9 --- /dev/null +++ b/pywb/rewrite/README.md @@ -0,0 +1,47 @@ +## PyWb Rewrite v0.2 + +[![Build Status](https://travis-ci.org/ikreymer/pywb_rewrite.png?branch=master)](https://travis-ci.org/ikreymer/pywb_rewrite) + +This package includes the content rewriting component of the pywb wayback tool suite. + +This package applies standard content rewriting, in the form of url rewriting, to +HTTP headers, html, css, js and xml content. + +Additional domain-specific rewriting is planned, especially for JS, to allow for proper +replay of difficult pages. + + +### Command-Line Rewriter + +To enable easier testing of rewriting, this package includes a command-line rewriter +which will fetch a live url and apply the registered rewriting rules to that url. + +After installing with: + +`pip install -r requirements.txt` + +Run: + +`python ./pywb_rewrite/rewrite_live.py http://example.com` + +To specify custom timestamp and prefix: + +``` +python ./pywb_rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html +``` + +This will print to stdout the content of `http://example.com` with all urls rewritten relative to +`/mycoll/20141026000102/http://mysite.example.com/path.html`. + +Headers are also rewritten; for further details, consult the `get_rewritten` function in +[pywb_rewrite/rewrite_live.py](pywb_rewrite/rewrite_live.py) (a short programmatic sketch appears under Tests below). + + +### Tests + +Rewriting doctests as well as live rewriting tests (subject to change) are provided.
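For programmatic use, the same rewriting can be driven in-process. A sketch only, mirroring what the live-web tests later in this diff (`test_rewrite_live.py`) exercise; the timestamp and prefix values are illustrative:

```python
# Fetch and rewrite a live page without the command-line wrapper.
# Requires network access; live content is subject to change.
from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.url_rewriter import UrlRewriter

# rewrite all urls relative to this archival url and replay prefix
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')

status_headers, buff = get_rewritten('http://example.com/', urlrewriter)

print status_headers   # rewritten status line and headers
print buff             # rewritten body
```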
+To run full test suite: `python run-tests.py` + + + + diff --git a/pywb/cdxserver/__init__.py b/pywb/rewrite/__init__.py similarity index 100% rename from pywb/cdxserver/__init__.py rename to pywb/rewrite/__init__.py diff --git a/pywb/header_rewriter.py b/pywb/rewrite/header_rewriter.py similarity index 56% rename from pywb/header_rewriter.py rename to pywb/rewrite/header_rewriter.py index fe67f49f..a9b53a46 100644 --- a/pywb/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -1,4 +1,4 @@ -from wbrequestresponse import StatusAndHeaders +from pywb.utils.statusandheaders import StatusAndHeaders #================================================================= class RewrittenStatusAndHeaders: @@ -14,37 +14,6 @@ class RewrittenStatusAndHeaders: #================================================================= class HeaderRewriter: - """ - # Text with charset - >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) - {'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), - ('X-Archive-Orig-Content-Length', '5'), - ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'} - - # Redirect - >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect') - {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), - ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None} - - # gzip - >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) - {'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), - ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None} - - # Binary - >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) - {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), - ('Content-Type', 'image/png'), - ('X-Archive-Orig-Cookie', 'blah'), - ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None} - - Removing Transfer-Encoding always, Was: - ('Content-Encoding', 'gzip'), - ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}} - - """ - - REWRITE_TYPES = { 'html': ['text/html', 'application/xhtml'], 'css': ['text/css'], @@ -122,20 +91,3 @@ class HeaderRewriter: return (new_headers, removed_header_dict) -import utils -if __name__ == "__main__" or utils.enable_doctests(): - import os - import pprint - import url_rewriter - - urlrewriter = url_rewriter.UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') - - headerrewriter = HeaderRewriter() - - def test_rewrite(headers, status = '200 OK'): - rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) - return vars(rewritten) - - import doctest - doctest.testmod() - diff --git a/pywb/html_rewriter.py b/pywb/rewrite/html_rewriter.py similarity index 
65% rename from pywb/html_rewriter.py rename to pywb/rewrite/html_rewriter.py index 25236acd..c6eeab23 100644 --- a/pywb/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -12,75 +12,8 @@ from regex_rewriters import JSRewriter, CSSRewriter # HTMLRewriter -- html parser for custom rewriting, also handlers for script and css #================================================================= class HTMLRewriter(HTMLParser): - r""" - >>> parse('Text') - Text - - >>> parse('
') -
- - >>> parse('
') -
- - >>> parse('') - - - >>> parse('') - - - # HTML Entities - >>> parse('›   >') - ›   > - - # Don't rewrite anchors - >>> parse('Text') - Text - - # Unicode - >>> parse('испытание') - испытание - - # Meta tag - >>> parse('') - - - >>> parse('') - - - >>> parse('') - - - # Script tag - >>> parse('') - - - # Unterminated script tag auto-terminate - >>> parse(' - - >>> parse('') - - - >>> parse('
') -
- - >>> parse('') - - - # Unterminated style tag auto-terminate - >>> parse(' - - # Head Insertion - >>> parse('Test', head_insert = '') - Test - - >>> parse('
SomeTest
', head_insert = '/* Insert */') - /* Insert */
SomeTest
- - >>> parse('
SomeTest
', head_insert = '') -
SomeTest
- + """ + HTML-Parsing Rewriter """ REWRITE_TAGS = { @@ -307,16 +240,4 @@ class HTMLRewriter(HTMLParser): self.out.write(']>') -import utils -if __name__ == "__main__" or utils.enable_doctests(): - - url_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') - - def parse(data, head_insert = None): - parser = HTMLRewriter(url_rewriter, head_insert = head_insert) - print parser.rewrite(data) + parser.close() - - import doctest - doctest.testmod() - diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py new file mode 100644 index 00000000..690775e7 --- /dev/null +++ b/pywb/rewrite/regex_rewriters.py @@ -0,0 +1,156 @@ +import re +import sys +import itertools + +from url_rewriter import UrlRewriter + +#================================================================= +class RegexRewriter(object): + @staticmethod + def comment_out(string): + return '/*' + string + '*/' + + @staticmethod + def remove_https(string): + return string.replace("https", "http") + + @staticmethod + def add_prefix(prefix): + return lambda string: prefix + string + + @staticmethod + def archival_rewrite(rewriter): + return lambda x: rewriter.rewrite(x) + + @staticmethod + def replacer(string): + return lambda x: string + + HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' + + + + DEFAULT_OP = add_prefix + + + def __init__(self, rules): + #rules = self.create_rules(http_prefix) + + # Build regexstr, concatenating regex list + regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules]) + + # ensure it's not middle of a word, wrap in non-capture group + regex_str = '(? 0: + i += 1 + count -= 1 + + if not m.group(i): + continue + + # Optional filter to skip matches + if not self.filter(m): + return m.group(0) + + # Custom func + if not hasattr(op, '__call__'): + op = RegexRewriter.DEFAULT_OP(op) + + result = op(m.group(i)) + + # if extracting partial match + if i != full_m: + result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)] + + return result + + + +#================================================================= +class JSLinkRewriter(RegexRewriter): + """ + JS Rewriter which rewrites absolute http://, https:// and // urls + at the beginning of a string + """ + JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' + + def __init__(self, rewriter, rules = []): + rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)] + super(JSLinkRewriter, self).__init__(rules) + +#================================================================= +class JSLocationAndLinkRewriter(JSLinkRewriter): + """ + JS Rewriter which also rewrites location and domain to the + specified prefix (default: 'WB_wombat_') + """ + + def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'): + rules = rules + [ + (r'(?= 3: + wburl_str = sys.argv[2] + if wburl_str.startswith('/'): + wburl_str = wburl_str[1:] + + prefix, wburl_str = wburl_str.split('/', 1) + prefix = '/' + prefix + '/' + else: + wburl_str = datetime_to_timestamp(datetime.datetime.now()) + '/http://example.com/path/sample.html' + prefix = '/pywb_rewrite/' + + urlrewriter = UrlRewriter(wburl_str, prefix) + + status_headers, buff = get_rewritten(url, urlrewriter) + + sys.stdout.write(buff) + + +#================================================================= +if __name__ == "__main__": + main() diff --git a/pywb/rewrite/test/test_rewrite.py b/pywb/rewrite/test/test_rewrite.py new file mode 100644 index 00000000..d9fe8bfa --- /dev/null +++ 
b/pywb/rewrite/test/test_rewrite.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +r""" + +#================================================================= +# HTML Rewriting +#================================================================= + +>>> parse('Text') +Text + +>>> parse('
') +
+ +>>> parse('
') +
+ +>>> parse('') + + +>>> parse('') + + +# HTML Entities +>>> parse('›   >') +›   > + +# Don't rewrite anchors +>>> parse('Text') +Text + +# Unicode +>>> parse('испытание') +испытание + +# Meta tag +>>> parse('') + + +>>> parse('') + + +>>> parse('') + + +# Script tag +>>> parse('') + + +# Unterminated script tag auto-terminate +>>> parse(' + +>>> parse('') + + +>>> parse('
') +
+ +>>> parse('') + + +# Unterminated style tag auto-terminate +>>> parse(' + +# Head Insertion +>>> parse('Test', head_insert = '') +Test + +>>> parse('
SomeTest
', head_insert = '/* Insert */') +/* Insert */
SomeTest
+ +>>> parse('
SomeTest
', head_insert = '') +
SomeTest
+ +#================================================================= +# Custom Regex +# Test https->http converter (other tests below in subclasses) +>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') +'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com' + + +#================================================================= +# JS Rewriting +#================================================================= + +>>> _test_js('location = "http://example.com/abc.html"') +'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"' + +>>> _test_js(r'location = "http:\/\/example.com/abc.html"') +'WB_wombat_location = "/web/20131010im_/http:\\/\\/example.com/abc.html"' + +>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"') +'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"' + +>>> _test_js(r"location = 'http://example.com/abc.html/'") +"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'" + +>>> _test_js(r'location = http://example.com/abc.html/') +'WB_wombat_location = http://example.com/abc.html/' + +# not rewritten -- to be handled on client side +>>> _test_js(r'location = "/abc.html"') +'WB_wombat_location = "/abc.html"' + +>>> _test_js(r'location = /http:\/\/example.com/abc.html/') +'WB_wombat_location = /http:\\/\\/example.com/abc.html/' + +>>> _test_js('"/location" == some_location_val; locations = location;') +'"/location" == some_location_val; locations = WB_wombat_location;' + +>>> _test_js('cool_Location = "http://example.com/abc.html"') +'cool_Location = "/web/20131010im_/http://example.com/abc.html"' + +>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"') +'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' + +>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') +'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' + +# custom rules added +>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)]) +'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' + +# scheme-agnostic +>>> _test_js('cool_Location = "//example.com/abc.html" //comment') +'cool_Location = "/web/20131010im_///example.com/abc.html" //comment' + + +#================================================================= +# XML Rewriting +#================================================================= + +>>> _test_xml('') +'' + +>>> _test_xml('') +'' + +>>> _test_xml(' http://example.comabchttp://example.com') +' /web/20131010im_/http://example.comabchttp://example.com' + +>>> _test_xml('
http://www.example.com/blah http://example.com
') +'
/web/20131010im_/http://www.example.com/blah /web/20131010im_/http://example.com
' + +#================================================================= +# CSS Rewriting +#================================================================= + +>>> _test_css("background: url('/some/path.html')") +"background: url('/web/20131010im_/http://example.com/some/path.html')" + +>>> _test_css("background: url('../path.html')") +"background: url('/web/20131010im_/http://example.com/path.html')" + +>>> _test_css("background: url(\"http://domain.com/path.html\")") +'background: url("/web/20131010im_/http://domain.com/path.html")' + +>>> _test_css("background: url(file.jpeg)") +'background: url(/web/20131010im_/http://example.com/file.jpeg)' + +>>> _test_css("background: url('')") +"background: url('')" + +>>> _test_css("background: url (\"weirdpath\')") +'background: url ("/web/20131010im_/http://example.com/weirdpath\')' + +>>> _test_css("@import url ('path.css')") +"@import url ('/web/20131010im_/http://example.com/path.css')" + +>>> _test_css("@import url('path.css')") +"@import url('/web/20131010im_/http://example.com/path.css')" + +>>> _test_css("@import ( 'path.css')") +"@import ( '/web/20131010im_/http://example.com/path.css')" + +>>> _test_css("@import \"path.css\"") +'@import "/web/20131010im_/http://example.com/path.css"' + +>>> _test_css("@import ('../path.css\"") +'@import (\'/web/20131010im_/http://example.com/path.css"' + +>>> _test_css("@import ('../url.css\"") +'@import (\'/web/20131010im_/http://example.com/url.css"' + +>>> _test_css("@import (\"url.css\")") +'@import ("/web/20131010im_/http://example.com/url.css")' + +>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") +'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)' + +#================================================================= +HTTP Headers Rewriting +#================================================================= + +# Text with charset +>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) +{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), + ('X-Archive-Orig-Content-Length', '5'), + ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'} + +# Redirect +>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect') +{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), + ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None} + +# gzip +>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) +{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), + ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None} + +# Binary +>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) +{'text_type': None, 'status_headers': StatusAndHeaders(protocol 
= '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), + ('Content-Type', 'image/png'), + ('X-Archive-Orig-Cookie', 'blah'), + ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None} + +Removing Transfer-Encoding always, Was: + ('Content-Encoding', 'gzip'), + ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}} + + +""" + +#================================================================= +from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.rewrite.html_rewriter import HTMLRewriter +from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter +from pywb.rewrite.header_rewriter import HeaderRewriter + +from pywb.utils.statusandheaders import StatusAndHeaders + + +urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') + +def parse(data, head_insert = None): + parser = HTMLRewriter(urlrewriter, head_insert = head_insert) + print parser.rewrite(data) + parser.close() + +arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/') + + +def _test_js(string, extra = []): + return JSRewriter(arcrw, extra).rewrite(string) + +def _test_xml(string): + return XMLRewriter(arcrw).rewrite(string) + +def _test_css(string): + return CSSRewriter(arcrw).rewrite(string) + +headerrewriter = HeaderRewriter() + +def _test_headers(headers, status = '200 OK'): + rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) + return vars(rewritten) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py new file mode 100644 index 00000000..691bec6d --- /dev/null +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -0,0 +1,32 @@ +from pywb.rewrite.rewrite_live import get_rewritten +from pywb.rewrite.url_rewriter import UrlRewriter + +# This module has some rewriting tests against the 'live web' +# As such, the content may change and the test may break + +urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') + + +def test_example_1(): + status_headers, buff = get_rewritten('http://example.com/', urlrewriter) + + # verify header rewriting + assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers + + +def test_example_2(): + status_headers, buff = get_rewritten('http://example.com/', urlrewriter) + + # verify header rewriting + assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers + + assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff + + + +def test_example_3(): + status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) + + assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff + + diff --git a/pywb/url_rewriter.py b/pywb/rewrite/url_rewriter.py similarity index 64% rename from pywb/url_rewriter.py rename to pywb/rewrite/url_rewriter.py index 79199744..c4cc4054 100644 --- a/pywb/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -6,43 +6,43 @@ from wburl import WbUrl class UrlRewriter: """ - >>> test_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') + >>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') 'https://web.archive.org/web/20131010/http://example.com/path/other.html' - >>> test_rewrite('file.js', 
'20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_') + >>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_') 'https://web.archive.org/web/20131010js_/http://example.com/path/file.js' - >>> test_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/') + >>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/') '/coll/20130907*/http://example.com/other.html' - >>> test_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/') + >>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/') '/coll/20130907*/http://example.com/path/other.html' - >>> test_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/') + >>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/') '/coll/20131112im_/http://example.com/other.html' - >>> test_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/') + >>> do_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/*/http://example.com/other.html' - >>> test_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/') + >>> do_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/*/http://example.com/other.html' - >>> test_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') + >>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http://some-other-site.com' - >>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '/') + >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/') '/2020/http://example.com/other.html' - >>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '') + >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '') '2020/http://example.com/other.html' - >>> test_rewrite('', '20131010010203/http://example.com/file.html', '/web/') + >>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/') '/web/20131010010203/http://example.com/file.html' - >>> test_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') + >>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') '#anchor' - >>> test_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') + >>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') 'mailto:example@example.com' >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url() @@ -62,7 +62,6 @@ class UrlRewriter: def __init__(self, wburl, prefix): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix - self.archivalurl_class = self.wburl.__class__ #if self.prefix.endswith('/'): # self.prefix = self.prefix[:-1] @@ -74,7 +73,7 @@ class UrlRewriter: wburl = self.wburl - isAbs = any (url.startswith(x) for x in self.PROTOCOLS) + isAbs = any(url.startswith(x) for x in self.PROTOCOLS) # Optimized rewriter for # -rel urls that don't start with / and don't contain ../ and no special mod @@ -117,12 +116,11 @@ class UrlRewriter: return url -import utils -if 
__name__ == "__main__" or utils.enable_doctests():
-    def test_rewrite(rel_url, base_url, prefix, mod = None):
-        rewriter = UrlRewriter(base_url, prefix)
-        return rewriter.rewrite(rel_url, mod)
+def do_rewrite(rel_url, base_url, prefix, mod = None):
+    rewriter = UrlRewriter(base_url, prefix)
+    return rewriter.rewrite(rel_url, mod)
 
+if __name__ == "__main__":
     import doctest
     doctest.testmod()
 
diff --git a/pywb/wburl.py b/pywb/rewrite/wburl.py
similarity index 75%
rename from pywb/wburl.py
rename to pywb/rewrite/wburl.py
index 70162b80..77bd437d 100644
--- a/pywb/wburl.py
+++ b/pywb/rewrite/wburl.py
@@ -3,9 +3,38 @@
 import re
 import rfc3987
 
-import wbexceptions
-
 # WbUrl : wb archival url representation for WB
+"""
+WbUrl represents the standard wayback archival url format.
+A regular url is a subset of the WbUrl (latest replay).
+
+The WbUrl expresses the common interface for interacting
+with the wayback machine.
+
+The WbUrl may represent one of the following forms:
+
+query form: [/modifier]/[timestamp][-end_timestamp]*/
+
+modifier, timestamp and end_timestamp are optional
+
+*/example.com
+20101112030201*/http://example.com
+2009-2015*/http://example.com
+/cdx/*/http://example.com
+
+url query form: used to indicate query across urls
+same as query form but with a final *
+*/example.com*
+20101112030201*/http://example.com*
+
+
+replay form:
+20101112030201/http://example.com
+20101112030201im_/http://example.com
+
+latest_replay: (no timestamp)
+http://example.com
+"""
 
 class WbUrl:
     """
@@ -38,6 +67,13 @@ class WbUrl:
     >>> repr(WbUrl('*/http://example.com/abc?def=a*'))
     "('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')"
 
+    >>> repr(WbUrl('2010*/http://example.com/abc?def=a'))
+    "('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')"
+
+    # timestamp range query
+    >>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a'))
+    "('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')"
+
     >>> repr(WbUrl('json/*/http://example.com/abc?def=a'))
     "('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')"
 
@@ -59,16 +95,16 @@ class WbUrl:
     # ======================
     >>> x = WbUrl('/#$%#/')
     Traceback (most recent call last):
-    BadUrlException: Bad Request Url: http://#$%#/
+    Exception: Bad Request Url: http://#$%#/
 
     >>> x = WbUrl('/http://example.com:abc/')
     Traceback (most recent call last):
-    BadUrlException: Bad Request Url: http://example.com:abc/
+    Exception: Bad Request Url: http://example.com:abc/
     """
 
     # Regexs
     # ======================
-    QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)\*/?(.*)$')
+    QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
     REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
 
     QUERY = 'query'
@@ -85,13 +121,14 @@ class WbUrl:
         self.type = None
         self.url = ''
         self.timestamp = ''
+        self.end_timestamp = ''
         self.mod = ''
 
         if not any (f(url) for f in [self._init_query, self._init_replay]):
-            raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
+            raise Exception('Invalid WbUrl: ', url)
 
         if len(self.url) == 0:
-            raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
+            raise Exception('Invalid WbUrl: ', url)
 
         # protocol agnostic url -> http://
         #if self.url.startswith('//'):
@@ -105,7 +142,7 @@ class WbUrl:
         matcher = rfc3987.match(self.url.upper(), 'IRI')
 
         if not matcher:
-            raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
+            raise Exception('Bad Request Url: ' +
self.url) # Match query regex # ====================== @@ -118,7 +155,8 @@ class WbUrl: self.mod = res[0] self.timestamp = res[1] - self.url = res[2] + self.end_timestamp = res[2] + self.url = res[3] if self.url.endswith('*'): self.type = self.URL_QUERY self.url = self.url[:-1] @@ -151,6 +189,7 @@ class WbUrl: atype = overrides['type'] if 'type' in overrides else self.type mod = overrides['mod'] if 'mod' in overrides else self.mod timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp + end_timestamp = overrides['end_timestamp'] if 'end_timestamp' in overrides else self.end_timestamp url = overrides['url'] if 'url' in overrides else self.url if atype == self.QUERY or atype == self.URL_QUERY: @@ -159,6 +198,8 @@ class WbUrl: tsmod += mod + "/" if timestamp: tsmod += timestamp + if end_timestamp: + tsmod += '-' + end_timestamp tsmod += "*/" + url if atype == self.URL_QUERY: diff --git a/pywb/utils.py b/pywb/utils.py deleted file mode 100644 index fee5d931..00000000 --- a/pywb/utils.py +++ /dev/null @@ -1,122 +0,0 @@ -import itertools -import time -import zlib -import time -import datetime -import calendar -import re - -def peek_iter(iterable): - try: - first = next(iterable) - except StopIteration: - return None - - return itertools.chain([first], iterable) - - -def split_prefix(key, prefixs): - for p in prefixs: - if key.startswith(p): - plen = len(p) - return (key[:plen], key[plen:]) - - -def create_decompressor(): - return zlib.decompressobj(16 + zlib.MAX_WBITS) - - -#================================================================= -# Adapted from example at -class PerfTimer: - def __init__(self, perfdict, name): - self.perfdict = perfdict - self.name = name - - def __enter__(self): - self.start = time.clock() - return self - - def __exit__(self, *args): - self.end = time.clock() - if self.perfdict is not None: - self.perfdict[self.name] = str(self.end - self.start) - - -#================================================================= -# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters -# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 -# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links -def rel_request_uri(environ, include_query=1): - """ - Return the requested path, optionally including the query string - - # Simple test: - >>> rel_request_uri({'PATH_INFO': '/web/example.com'}) - '/web/example.com' - - # Test all unecoded special chars and double-quote - # (double-quote must be encoded but not single quote) - >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""}) - "/web/example.com/0~!+$&'()*+,;=:%22" - """ - from urllib import quote - url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@') - if include_query and environ.get('QUERY_STRING'): - url += '?' 
+ environ['QUERY_STRING']
-
-    return url
-
-
-
-#=================================================================
-def unsurt(surt):
-    """
-    # Simple surt
-    >>> unsurt('com,example)/')
-    'example.com)/'
-
-    # Broken surt
-    >>> unsurt('com,example)')
-    'com,example)'
-
-    # Long surt
-    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
-    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
-    """
-
-    try:
-        index = surt.index(')/')
-        parts = surt[0:index].split(',')
-        parts.reverse()
-        host = '.'.join(parts)
-        host += surt[index:]
-        return host
-
-    except ValueError:
-        # May not be a valid surt
-        return surt
-
-
-#=================================================================
-# Support for bulk doctest testing via nose or py.test
-# nosetests --with-doctest
-# py.test --doctest_modules
-
-import sys
-is_in_testtool = any(sys.argv[0].endswith(tool) for tool in ['py.test', 'nosetests'])
-
-def enable_doctests():
-    return is_in_testtool
-
-
-def test_data_dir():
-    import os
-    return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
-
-#=================================================================
-
-if __name__ == "__main__" or enable_doctests():
-    import doctest
-    doctest.testmod()
-
diff --git a/pywb/utils/README.md b/pywb/utils/README.md
new file mode 100644
index 00000000..35ebca86
--- /dev/null
+++ b/pywb/utils/README.md
@@ -0,0 +1,16 @@
+## PyWb Utils v0.2 ##
+
+[![Build Status](https://travis-ci.org/ikreymer/pywb_utils.png?branch=master)](https://travis-ci.org/ikreymer/pywb_utils)
+
+This is a standalone module that contains a variety of utils used by the pywb wayback tool suite.
+
+`python run-tests.py` will run all tests
+
+#### Modules
+
+[binsearch.py](pywb_utils/binsearch.py) -- Binary search implementation over text files
+
+[loaders.py](pywb_utils/loaders.py) -- Loading abstraction for http, local file system, as well as buffered and seekable file readers
+
+[timeutils.py](pywb_utils/timeutils.py) -- Utility functions for converting between standard datetime formats and the 14-digit timestamp
+
diff --git a/pywb/utils/__init__.py b/pywb/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pywb/utils/binsearch.py b/pywb/utils/binsearch.py
new file mode 100644
index 00000000..96b2e9de
--- /dev/null
+++ b/pywb/utils/binsearch.py
@@ -0,0 +1,110 @@
+"""
+Utility functions for performing binary search over a sorted text file
+"""
+
+from collections import deque
+import itertools
+
+
+#=================================================================
+def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
+    """
+    Find offset of the line which matches a given 'key' using binary search
+    If key is not found, the offset is of the line after the key
+
+    File is subdivided into block_size (default 8192) sized blocks
+    Optional compare_func may be specified
+    """
+    min_ = 0
+    max_ = reader.getsize() / block_size
+
+    while max_ - min_ > 1:
+        mid = min_ + ((max_ - min_) / 2)
+        reader.seek(mid * block_size)
+
+        if mid > 0:
+            reader.readline() # skip partial line
+
+        line = reader.readline()
+
+        if compare_func(key, line) > 0:
+            min_ = mid
+        else:
+            max_ = mid
+
+    return min_ * block_size
+
+
+#=================================================================
+def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
+    """
+    Perform a binary search for a specified key to within a 'block_size'
+    (default 8192) sized block followed by linear search
+    within the block to find
first matching line.
+
+    When performing linear search, keep track of up to N previous lines before
+    first matching line.
+    """
+    min_ = binsearch_offset(reader, key, compare_func, block_size)
+
+    reader.seek(min_)
+
+    if min_ > 0:
+        reader.readline() # skip partial line
+
+    if prev_size > 1:
+        prev_deque = deque(maxlen=prev_size)
+
+    line = None
+
+    while True:
+        line = reader.readline()
+        if not line:
+            break
+        if compare_func(line, key) >= 0:
+            break
+
+        if prev_size == 1:
+            prev = line
+        elif prev_size > 1:
+            prev_deque.append(line)
+
+    def gen_iter(line):
+        """
+        Create iterator over any previous lines to
+        current matched line
+        """
+        if prev_size == 1:
+            yield prev.rstrip()
+        elif prev_size > 1:
+            for i in prev_deque:
+                yield i.rstrip()
+
+        while line:
+            yield line.rstrip()
+            line = reader.readline()
+
+    return gen_iter(line)
+
+
+#=================================================================
+def iter_prefix(reader, key):
+    """
+    Creates an iterator which iterates over lines that start with prefix
+    'key' in a sorted text file.
+    """
+
+    return itertools.takewhile(
+        lambda line: line.startswith(key),
+        search(reader, key))
+
+
+#=================================================================
+def iter_exact(reader, key, token=' '):
+    """
+    Create an iterator which iterates over lines where the first field matches
+    the 'key', equivalent to a prefix match on key + token.
+    Default field terminator/separator is ' '
+    """
+
+    return iter_prefix(reader, key + token)
diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py
new file mode 100644
index 00000000..27a3ed33
--- /dev/null
+++ b/pywb/utils/bufferedreaders.py
@@ -0,0 +1,204 @@
+import StringIO
+import zlib
+
+
+#=================================================================
+def gzip_decompressor():
+    """
+    Decompressor which can decompress a gzip stream
+    """
+    return zlib.decompressobj(16 + zlib.MAX_WBITS)
+
+
+#=================================================================
+class BufferedReader(object):
+    """
+    A buffered line reader which wraps an existing stream.
+    Read operations operate on underlying buffer, which is filled to
+    block_size (1024 default)
+
+    If an optional decompress type is specified,
+    data is fed through the decompressor when read from the buffer.
+    Currently supported decompression: gzip
+
+    If decompression fails on first try, data is assumed to be uncompressed
+    and no exception is thrown. If a failure occurs after data has been
+    partially decompressed, the exception is propagated.
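+
+    Illustrative usage (a sketch, not part of the test suite; the file name
+    is hypothetical and assumed to contain gzip-compressed data):
+
+        reader = BufferedReader(open('data.gz', 'rb'), decomp_type='gzip')
+        line = reader.readline()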
+ + """ + + DECOMPRESSORS = {'gzip': gzip_decompressor} + + def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None): + self.stream = stream + self.block_size = block_size + + if decomp_type: + try: + self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]() + except KeyError: + raise Exception('Decompression type not supported: ' + + decomp_type) + else: + self.decompressor = None + + self.buff = None + self.num_read = 0 + self.max_len = max_len + + def _fillbuff(self, block_size=None): + if not block_size: + block_size = self.block_size + + if not self.buff or self.buff.pos >= self.buff.len: + if self.max_len > 0: + to_read = min(self.max_len - self.num_read, self.block_size) + else: + to_read = self.block_size + + data = self.stream.read(to_read) + self._process_read(data) + + def _process_read(self, data): + data = self._decompress(data) + self.num_read += len(data) + self.buff = StringIO.StringIO(data) + + def _decompress(self, data): + if self.decompressor and data: + try: + data = self.decompressor.decompress(data) + except Exception: + # if first read attempt, assume non-gzipped stream + if self.num_read == 0: + self.decompressor = None + # otherwise (partly decompressed), something is wrong + else: + raise + return data + + def read(self, length=None): + self._fillbuff() + return self.buff.read(length) + + def readline(self, length=None): + self._fillbuff() + return self.buff.readline(length) + + def close(self): + if self.stream: + self.stream.close() + self.stream = None + + +#================================================================= +class ChunkedDataException(Exception): + pass + + +#================================================================= +class ChunkedDataReader(BufferedReader): + r""" + A ChunkedDataReader is a BufferedReader which also supports de-chunking + of the data if it happens to be http 'chunk-encoded'. + + If at any point the chunked header is not available, the stream is + assumed to not be chunked and no more dechunking occurs. + + Properly formatted chunked data: + >>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n")); + >>> c.read() + c.read() + '1234' + + Non-chunked data: + >>> ChunkedDataReader(StringIO.StringIO("xyz123!@#")).read() + 'xyz123!@#' + + Starts like chunked data, but isn't: + >>> c = ChunkedDataReader(StringIO.StringIO("1\r\nxyz123!@#")); + >>> c.read() + c.read() + '1\r\nx123!@#' + + Chunked data cut off part way through: + >>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12")); + >>> c.read() + c.read() + '123412' + """ + + all_chunks_read = False + not_chunked = False + + # if False, we'll use best-guess fallback for parse errors + raise_chunked_data_exceptions = False + + def _fillbuff(self, block_size=None): + if self.not_chunked: + return BufferedReader._fillbuff(self, block_size) + + if self.all_chunks_read: + return + + if not self.buff or self.buff.pos >= self.buff.len: + length_header = self.stream.readline(64) + self._data = '' + + try: + self._try_decode(length_header) + except ChunkedDataException: + if self.raise_chunked_data_exceptions: + raise + + # Can't parse the data as chunked. + # It's possible that non-chunked data is served + # with a Transfer-Encoding: chunked. + # Treat this as non-chunk encoded from here on. 
+ self._process_read(length_header + self._data) + self.not_chunked = True + + def _try_decode(self, length_header): + # decode length header + try: + chunk_size = int(length_header.strip().split(';')[0], 16) + except ValueError: + raise ChunkedDataException("Couldn't decode length header " + + length_header) + + if not chunk_size: + # chunk_size 0 indicates end of file + self.all_chunks_read = True + #self._process_read('') + return + + data_len = len(self._data) + + # read chunk + while data_len < chunk_size: + new_data = self.stream.read(chunk_size - data_len) + + # if we unexpectedly run out of data, + # either raise an exception or just stop reading, + # assuming file was cut off + if not new_data: + if self.raise_chunked_data_exceptions: + msg = 'Ran out of data before end of chunk' + raise ChunkedDataException(msg) + else: + chunk_size = data_len + self.all_chunks_read = True + + self._data += new_data + data_len = len(self._data) + + # if we successfully read a block without running out, + # it should end in \r\n + if not self.all_chunks_read: + clrf = self.stream.read(2) + if clrf != '\r\n': + raise ChunkedDataException("Chunk terminator not found.") + + # hand to base class for further processing + self._process_read(self._data) + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py new file mode 100644 index 00000000..4d458738 --- /dev/null +++ b/pywb/utils/loaders.py @@ -0,0 +1,152 @@ +""" +This module provides loaders for local file system and over http +local and remote access +""" + +import os +import hmac +import urllib2 +import time + + +#================================================================= +# load a reader from http +#================================================================= +class HttpLoader(object): + """ + Load a file-like reader over http using range requests + and an optional cookie created via a cookie_maker + """ + def __init__(self, cookie_maker=None): + self.cookie_maker = cookie_maker + + def load(self, url, offset, length): + if length > 0: + range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1) + else: + range_header = 'bytes={0}-'.format(offset) + + headers = {} + headers['Range'] = range_header + + if self.cookie_maker: + headers['Cookie'] = self.cookie_maker.make() + + request = urllib2.Request(url, headers=headers) + return urllib2.urlopen(request) + + +#================================================================= +# Signed Cookie-Maker +#================================================================= + +class HMACCookieMaker(object): + """ + Utility class to produce signed HMAC digest cookies + to be used with each http request + """ + def __init__(self, key, name, duration=10): + self.key = key + self.name = name + # duration in seconds + self.duration = duration + + def make(self, extra_id=''): + expire = str(long(time.time() + self.duration)) + + if extra_id: + msg = extra_id + '-' + expire + else: + msg = expire + + hmacdigest = hmac.new(self.key, msg) + hexdigest = hmacdigest.hexdigest() + + if extra_id: + cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, + expire, hexdigest) + else: + cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest) + + return cookie + + +#================================================================= +# load a reader from local filesystem +#================================================================= +class FileLoader(object): + """ + Load a file-like reader from the local file system + """ + + def 
load(self, url, offset, length):
+        if url.startswith('file://'):
+            url = url[len('file://'):]
+
+        afile = open(url, 'rb')
+        afile.seek(offset)
+
+        if length > 0:
+            return LimitReader(afile, length)
+        else:
+            return afile
+
+
+#=================================================================
+# Limit Reader
+#=================================================================
+class LimitReader(object):
+    """
+    A reader which will not read more than specified limit
+    """
+
+    def __init__(self, stream, limit):
+        self.stream = stream
+        self.limit = limit
+
+        if not self.limit:
+            self.limit = 1
+
+    def read(self, length=None):
+        length = min(length, self.limit) if length else self.limit
+        buff = self.stream.read(length)
+        self.limit -= len(buff)
+        return buff
+
+    def readline(self, length=None):
+        length = min(length, self.limit) if length else self.limit
+        buff = self.stream.readline(length)
+        self.limit -= len(buff)
+        return buff
+
+    def close(self):
+        self.stream.close()
+
+
+#=================================================================
+# Local text file with known size -- used for binsearch
+#=================================================================
+class SeekableTextFileReader(object):
+    """
+    A very simple file-like object wrapper that knows its total size,
+    via getsize()
+    Supports seek() operation.
+    Assumed to be a text file. Used for binsearch.
+    """
+    def __init__(self, filename):
+        self.fh = open(filename, 'rb')
+        self.filename = filename
+        self.size = os.path.getsize(filename)
+
+    def getsize(self):
+        return self.size
+
+    def read(self):
+        return self.fh.read()
+
+    def readline(self):
+        return self.fh.readline()
+
+    def seek(self, offset):
+        return self.fh.seek(offset)
+
+    def close(self):
+        return self.fh.close()
diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py
new file mode 100644
index 00000000..85fd241e
--- /dev/null
+++ b/pywb/utils/statusandheaders.py
@@ -0,0 +1,107 @@
+"""
+Representation and parsing of HTTP-style status + headers
+"""
+
+import pprint
+
+
+#=================================================================
+class StatusAndHeaders(object):
+    """
+    Representation of parsed http-style status line and headers
+    The status line is the first line of a request/response
+    Headers is a list of (name, value) tuples
+    An optional protocol which appears on first line may be specified
+    """
+    def __init__(self, statusline, headers, protocol=''):
+        self.statusline = statusline
+        self.headers = headers
+        self.protocol = protocol
+
+    def get_header(self, name):
+        """
+        return value of first header matching name (case-insensitive),
+        if found
+        """
+        name_lower = name.lower()
+        for value in self.headers:
+            if value[0].lower() == name_lower:
+                return value[1]
+
+    def remove_header(self, name):
+        """
+        remove header (case-insensitive)
+        return True if header removed, False otherwise
+        """
+        name_lower = name.lower()
+        for index in xrange(len(self.headers) - 1, -1, -1):
+            if self.headers[index][0].lower() == name_lower:
+                del self.headers[index]
+                return True
+
+        return False
+
+    def __repr__(self):
+        headers_str = pprint.pformat(self.headers, indent=2)
+        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
+headers = {2})".format(self.protocol, self.statusline, headers_str)
+
+    def __eq__(self, other):
+        return (self.statusline == other.statusline and
+                self.headers == other.headers and
+                self.protocol == other.protocol)
+
+
+#=================================================================
+class StatusAndHeadersParser(object):
+    """
+    Parser which consumes a stream supporting readline()
to read + status and headers and return a StatusAndHeaders object + """ + def __init__(self, statuslist): + self.statuslist = statuslist + + def parse(self, stream): + """ + parse stream for status line and headers + return a StatusAndHeaders object + """ + statusline = stream.readline().rstrip() + + protocol_status = self.split_prefix(statusline, self.statuslist) + + if not protocol_status: + msg = 'Expected Status Line - Found: ' + statusline + raise StatusAndHeadersParserException(msg) + + headers = [] + + line = stream.readline().rstrip() + while line and line != '\r\n': + name, value = line.split(':', 1) + header = (name, value.strip()) + headers.append(header) + line = stream.readline().rstrip() + + return StatusAndHeaders(statusline=protocol_status[1].strip(), + headers=headers, + protocol=protocol_status[0]) + + @staticmethod + def split_prefix(key, prefixs): + """ + split key string into prefix and remainder + for first matching prefix from a list + """ + for prefix in prefixs: + if key.startswith(prefix): + plen = len(prefix) + return (key[:plen], key[plen:]) + + +#================================================================= +class StatusAndHeadersParserException(Exception): + """ + status + headers parsing exception + """ + pass diff --git a/pywb/utils/test/binsearch_test.py b/pywb/utils/test/binsearch_test.py new file mode 100644 index 00000000..d35551ec --- /dev/null +++ b/pywb/utils/test/binsearch_test.py @@ -0,0 +1,52 @@ +#================================================================= +""" +# binsearch tests + +# Prefix Search +>>> print_binsearch_results('org,iana)/domains/root', iter_prefix) +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + +>>> print_binsearch_results('org,iana)/domains/root', iter_exact) +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz + +>>> print_binsearch_results('org,iana)/', iter_exact) +org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz + +>>> print_binsearch_results('org,iana)/domains/root/db', iter_exact) +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz + +# Exact Search +>>> print_binsearch_results('org,iaana)/', iter_exact) +>>> print_binsearch_results('org,ibna)/', iter_exact) + +>>> print_binsearch_results('org,iana)/time-zones', iter_exact) +org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz +""" + + +#================================================================= +import os +from pywb.utils.binsearch import iter_prefix, iter_exact +from pywb.utils.loaders 
import SeekableTextFileReader + +from pywb import get_test_dir + +#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' +test_cdx_dir = get_test_dir() + 'cdx/' + +def print_binsearch_results(key, iter_func): + cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') + + for line in iter_func(cdx, key): + print line + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/pywb/utils/test/loaders_test.py b/pywb/utils/test/loaders_test.py new file mode 100644 index 00000000..73d4b3dd --- /dev/null +++ b/pywb/utils/test/loaders_test.py @@ -0,0 +1,69 @@ +#================================================================= +""" +# LimitReader Tests +>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26) +'abcdefghji' + +>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26) +'abcdefgh' + +>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) +'efghji' + +# FileLoader Tests (includes LimitReader) +# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes +>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400')) +100 + +# SeekableTextFileReader Test +>>> sr = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') +>>> sr.getsize() +30399 + +>>> seek_read_full(sr, 100) +'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n' + +#BufferedReader readline() +>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() +' CDX N b a m s k r M S V g\\n' + +#BufferedReader readline() with decompression +>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() +' CDX N b a m s k r M S V g\\n' + +>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() +'Example Domain' +""" + + +#================================================================= +import os +import StringIO +from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker +from pywb.utils.loaders import LimitReader, SeekableTextFileReader +from pywb.utils.bufferedreaders import BufferedReader + +from pywb import get_test_dir +#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' +test_cdx_dir = get_test_dir() + 'cdx/' + + +def read_multiple(reader, inc_reads): + result = None + for x in inc_reads: + result = reader.read(x) + return result + + +def seek_read_full(seekable_reader, offset): + seekable_reader.seek(offset) + seekable_reader.readline() #skip + return seekable_reader.readline() + + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/pywb/cdxserver/timeutils.py b/pywb/utils/timeutils.py similarity index 75% rename from pywb/cdxserver/timeutils.py rename to pywb/utils/timeutils.py index 23ed3bec..62929d50 100644 --- a/pywb/cdxserver/timeutils.py +++ b/pywb/utils/timeutils.py @@ -1,20 +1,25 @@ +""" +utility functions for converting between +datetime, iso date and 14-digit timestamp +""" + import re import time import datetime import calendar +from itertools import imap #================================================================= # str <-> datetime conversion #================================================================= -DATE_TIMESPLIT = re.compile('[^\d]') +DATE_TIMESPLIT = re.compile(r'[^\d]') TIMESTAMP_14 = '%Y%m%d%H%M%S' PAD_STAMP_END = '29991231235959' - def 
iso_date_to_datetime(string): """ >>> iso_date_to_datetime('2013-12-26T10:11:12Z') @@ -28,16 +33,18 @@ def iso_date_to_datetime(string): if nums[-1] == '': nums = nums[:-1] - dt = datetime.datetime(*map(int, nums)) - return dt + the_datetime = datetime.datetime(*imap(int, nums)) + return the_datetime -def datetime_to_timestamp(dt): + +def datetime_to_timestamp(the_datetime): """ >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12)) '20131226101112' """ - return dt.strftime(TIMESTAMP_14) + return the_datetime.strftime(TIMESTAMP_14) + def iso_date_to_timestamp(string): """ @@ -52,7 +59,7 @@ def iso_date_to_timestamp(string): # default pad is end of range for compatibility -def pad_timestamp(string, pad_str = PAD_STAMP_END): +def pad_timestamp(string, pad_str=PAD_STAMP_END): """ >>> pad_timestamp('20') '20991231235959' @@ -76,10 +83,12 @@ def pad_timestamp(string, pad_str = PAD_STAMP_END): def timestamp_to_datetime(string): """ >>> timestamp_to_datetime('20131226095010') - time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1) + time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \ +tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1) >>> timestamp_to_datetime('2014') - time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1) + time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \ +tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1) """ # Default pad to end of range for comptability diff --git a/pywb/views.py b/pywb/views.py index 9e3f0a96..681f6c97 100644 --- a/pywb/views.py +++ b/pywb/views.py @@ -1,4 +1,4 @@ -import cdxserver.timeutils as timeutils +import pywb.utils.timeutils as timeutils import wbrequestresponse import wbexceptions diff --git a/pywb/warc/README.md b/pywb/warc/README.md new file mode 100644 index 00000000..fe6bf216 --- /dev/null +++ b/pywb/warc/README.md @@ -0,0 +1,22 @@ +## PyWb Warc v0.2 + +[![Build Status](https://travis-ci.org/ikreymer/pywb_warc.png?branch=master)](https://travis-ci.org/ikreymer/pywb_warc) + +This is the WARC/ARC record loading component of pywb wayback tool suite. + + +This package provides the following facilities: + +* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers + +* Resolve 'revisit' records from provided index to find a full record with headers and payload content + +* Load WARC and ARC records either locally or via http using http 1.1 range requests + + +### Tests + +This package will include a test suite for different WARC and ARC loading formats. 
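+
+For illustration, loading a single record directly might look like this
+(a sketch only -- the path, offset and length below are hypothetical):
+
+    from pywb.warc.recordloader import ArcWarcRecordLoader
+
+    record = ArcWarcRecordLoader().load('path/to/example.warc.gz', '333', '1043')
+    print record.rec_headers
+    print record.status_headers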
+ +To run: `python run-tests.py` + diff --git a/pywb/warc/__init__.py b/pywb/warc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/replay_resolvers.py b/pywb/warc/pathresolvers.py similarity index 62% rename from pywb/replay_resolvers.py rename to pywb/warc/pathresolvers.py index 45354599..7b275c0c 100644 --- a/pywb/replay_resolvers.py +++ b/pywb/warc/pathresolvers.py @@ -1,13 +1,27 @@ import redis -import binsearch.binsearch + +from pywb.utils.binsearch import iter_exact +from pywb.utils.loaders import SeekableTextFileReader import urlparse import os import logging -#====================================== -# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string -#====================================== +""" +The purpose of this module is to 'resolve' a warc/arc filename, +often found in a CDX file, to a full loadable url. + +Supported resolvers are: url prefix, path index lookup and redis + +make_best_resolver() attempts to guess the resolver method for given uri + +""" + + +#================================================================= +# PrefixResolver - convert cdx file entry to url with prefix +# if url contains specified string +#================================================================= class PrefixResolver: def __init__(self, prefix, contains): self.prefix = prefix @@ -18,14 +32,15 @@ class PrefixResolver: def __repr__(self): if self.contains: - return "PrefixResolver('{0}', contains = '{1}')".format(self.prefix, self.contains) + return ("PrefixResolver('{0}', contains = '{1}')" + .format(self.prefix, self.contains)) else: return "PrefixResolver('{0}')".format(self.prefix) -#====================================== +#================================================================= class RedisResolver: - def __init__(self, redis_url, key_prefix = None): + def __init__(self, redis_url, key_prefix=None): self.redis_url = redis_url self.key_prefix = key_prefix if key_prefix else 'w:' self.redis = redis.StrictRedis.from_url(redis_url) @@ -42,14 +57,14 @@ class RedisResolver: return "RedisResolver('{0}')".format(self.redis_url) -#====================================== +#================================================================= class PathIndexResolver: def __init__(self, pathindex_file): self.pathindex_file = pathindex_file - self.reader = binsearch.binsearch.FileReader(pathindex_file) + self.reader = SeekableTextFileReader(pathindex_file) def __call__(self, filename): - result = binsearch.binsearch.iter_exact(self.reader, filename, '\t') + result = iter_exact(self.reader, filename, '\t') def gen_list(result): for pathline in result: @@ -63,6 +78,7 @@ class PathIndexResolver: return "PathIndexResolver('{0}')".format(self.pathindex_file) +#================================================================= #TODO: more options (remote files, contains param, etc..) 
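+# Usage sketch (mirrors the doctests below; the urls are illustrative):
+#   make_best_resolver('http://myhost.example.com/warcs/')  -> PrefixResolver
+#   make_best_resolver('redis://myhost.example.com:1234/1') -> RedisResolver
+#   make_best_resolver('file://<existing file>')            -> PathIndexResolver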
# find best resolver given the path def make_best_resolver(param): @@ -80,11 +96,14 @@ def make_best_resolver(param): RedisResolver('redis://myhost.example.com:1234/1') # a file - >>> class_name(make_best_resolver('file://' + os.path.realpath(__file__))) + >>> r = make_best_resolver('file://' + os.path.realpath(__file__)) + >>> r.__class__.__name__ 'PathIndexResolver' # a dir - >>> class_name(make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__)))) + >>> path = os.path.realpath(__file__) + >>> r = make_best_resolver('file://' + os.path.dirname(path)) + >>> r.__class__.__name__ 'PrefixResolver' """ @@ -99,27 +118,29 @@ def make_best_resolver(param): url_parts = urlparse.urlsplit(path) if url_parts.scheme == 'redis': - logging.info('Adding Redis Index: ' + path) + logging.debug('Adding Redis Index: ' + path) return RedisResolver(path, arg) if url_parts.scheme == 'file': path = url_parts.path if os.path.isfile(path): - logging.info('Adding Path Index: ' + path) + logging.debug('Adding Path Index: ' + path) return PathIndexResolver(path) # non-file paths always treated as prefix for now else: - logging.info('Adding Archive Path Source: ' + path) + logging.debug('Adding Archive Path Source: ' + path) return PrefixResolver(path, arg) #================================================================= def make_best_resolvers(paths): """ - >>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1']) - [PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')] + >>> r = make_best_resolvers(['http://example.com/warcs/',\ + 'redis://example.com:1234/1']) + >>> map(lambda x: x.__class__.__name__, r) + ['PrefixResolver', 'RedisResolver'] """ if hasattr(paths, '__iter__'): return map(make_best_resolver, paths) @@ -127,13 +148,7 @@ def make_best_resolvers(paths): return [make_best_resolver(paths)] -import utils #================================================================= -if __name__ == "__main__" or utils.enable_doctests(): - - def class_name(obj): - return obj.__class__.__name__ - +if __name__ == "__main__": import doctest doctest.testmod() - diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py new file mode 100644 index 00000000..5937202c --- /dev/null +++ b/pywb/warc/recordloader.py @@ -0,0 +1,161 @@ +import itertools +import urlparse +import collections + +from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.utils.statusandheaders import StatusAndHeadersParser + +from pywb.utils.loaders import FileLoader, HttpLoader +from pywb.utils.bufferedreaders import BufferedReader + +#================================================================= +ArcWarcRecord = collections.namedtuple('ArchiveRecord', + 'type, rec_headers, ' + + 'stream, status_headers') + + +#================================================================= +class ArchiveLoadFailed(Exception): + def __init__(self, reason, filename=''): + super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason)) + #self.filename = filename + #self.reason = reason + + def status(self): + return '503 Service Unavailable' + + +#================================================================= +class ArcWarcRecordLoader: + # Standard ARC headers + ARC_HEADERS = ["uri", "ip-address", "creation-date", + "content-type", "length"] + + # Since loading a range request, + # can only determine gzip-ness based on file extension + # (BufferedReader will however default to non-gzip if + # decompression fails) + FORMAT_MAP = 
{
+        '.warc.gz': ('warc', True),
+        '.arc.gz': ('arc', True),
+        '.warc': ('warc', False),
+        '.arc': ('arc', False),
+    }
+
+    @staticmethod
+    def create_default_loaders(cookie_maker=None):
+        http = HttpLoader(cookie_maker)
+        file = FileLoader()
+        return {
+            'http': http,
+            'https': http,
+            'file': file,
+            '': file
+        }
+
+    def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
+        self.loaders = loaders
+
+        if not self.loaders:
+            self.loaders = self.create_default_loaders(cookie_maker)
+
+        self.chunk_size = chunk_size
+
+        self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
+
+        warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
+        self.warc_parser = StatusAndHeadersParser(warc_types)
+        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
+
+    def load(self, url, offset, length):
+        url_parts = urlparse.urlsplit(url)
+
+        loader = self.loaders.get(url_parts.scheme)
+        if not loader:
+            raise ArchiveLoadFailed('Unknown Protocol', url)
+
+        the_format = None
+
+        for ext, iformat in self.FORMAT_MAP.iteritems():
+            if url.endswith(ext):
+                the_format = iformat
+                break
+
+        if the_format is None:
+            raise ArchiveLoadFailed('Unknown file format', url)
+
+        (a_format, is_gzip) = the_format
+
+        #decomp = utils.create_decompressor() if is_gzip else None
+        decomp_type = 'gzip' if is_gzip else None
+
+        try:
+            length = int(length)
+        except:
+            length = -1
+
+        raw = loader.load(url, long(offset), length)
+
+        stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
+
+        if a_format == 'arc':
+            rec_headers = self.arc_parser.parse(stream)
+            rec_type = 'response'
+            empty = (rec_headers.get_header('length') == '0')
+
+        elif a_format == 'warc':
+            rec_headers = self.warc_parser.parse(stream)
+            rec_type = rec_headers.get_header('WARC-Type')
+            empty = (rec_headers.get_header('Content-Length') == '0')
+
+        # special case: empty w/arc record (hopefully a revisit)
+        if empty:
+            status_headers = StatusAndHeaders('204 No Content', [])
+
+        # special case: warc records that are not expected to have http headers
+        # attempt to add 200 status and content-type
+        elif rec_type == 'metadata' or rec_type == 'resource':
+            content_type = [('Content-Type',
+                            rec_headers.get_header('Content-Type'))]
+
+            status_headers = StatusAndHeaders('200 OK', content_type)
+
+        # special case: http 0.9 response, no status or headers
+        #elif rec_type == 'response':
+        #    content_type = rec_headers.get_header('Content-Type')
+        #    if content_type and (';version=0.9' in content_type):
+        #        status_headers = StatusAndHeaders('200 OK', [])
+
+        # response record: parse HTTP status and headers!
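+        # (after parsing, the stream is positioned at the start of
+        #  the http payload)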
+ else: + #(statusline, http_headers) = self.parse_http_headers(stream) + status_headers = self.http_parser.parse(stream) + + return ArcWarcRecord((a_format, rec_type), + rec_headers, stream, status_headers) + + +#================================================================= +class ARCHeadersParser: + def __init__(self, headernames): + self.headernames = headernames + + def parse(self, stream): + headerline = stream.readline().rstrip() + + parts = headerline.split() + + headernames = self.headernames + + if len(parts) != len(headernames): + msg = 'Wrong # of headers, expected arc headers {0}, Found {1}' + raise ArchiveLoadFailed(msg.format(headernames, parts)) + + headers = [] + + for name, value in itertools.izip(headernames, parts): + headers.append((name, value)) + + return StatusAndHeaders(statusline='', + headers=headers, + protocol='ARC/1.0') diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py new file mode 100644 index 00000000..c4ed557f --- /dev/null +++ b/pywb/warc/resolvingloader.py @@ -0,0 +1,176 @@ +from pywb.utils.timeutils import iso_date_to_timestamp +from recordloader import ArcWarcRecordLoader, ArchiveLoadFailed +from pathresolvers import make_best_resolvers + + +#================================================================= +class ResolvingLoader: + def __init__(self, paths, record_loader=ArcWarcRecordLoader(), + cdx_server=None): + + self.path_resolvers = make_best_resolvers(paths) + self.record_loader = record_loader + self.cdx_server = cdx_server + + def resolve_headers_and_payload(self, cdx, failed_files): + """ + Resolve headers and payload for a given capture + In the simple case, headers and payload are in the same record. + In the case of revisit records, the payload and headers may be in + different records. + + If the original has already been found, lookup original using + orig. fields in cdx dict. + Otherwise, call _load_different_url_payload() to get cdx index + from a different url to find the original record. 
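+
+        Returns a (status_headers, stream) tuple; in the revisit case the
+        status headers and the payload stream may come from two different
+        records.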
+ """ + has_curr = (cdx['filename'] != '-') + has_orig = (cdx.get('orig.filename', '-') != '-') + + # load headers record from cdx['filename'] unless it is '-' (rare) + headers_record = None + if has_curr: + headers_record = self._resolve_path_load(cdx, False, failed_files) + + # two index lookups + # Case 1: if mimetype is still warc/revisit + if cdx['mimetype'] == 'warc/revisit' and headers_record: + payload_record = self._load_different_url_payload(cdx, + headers_record, + failed_files) + + # single lookup cases + # case 2: non-revisit + elif (has_curr and not has_orig): + payload_record = headers_record + + # case 3: identical url revisit, load payload from orig.filename + elif (has_orig): + payload_record = self._resolve_path_load(cdx, True, failed_files) + + # special case: set header to payload if old-style revisit + # with missing header + if not headers_record: + headers_record = payload_record + elif headers_record != payload_record: + # close remainder of stream as this record only used for + # (already parsed) headers + headers_record.stream.close() + + # special case: check if headers record is actually empty + # (eg empty revisit), then use headers from revisit + if not headers_record.status_headers.headers: + headers_record = payload_record + + if not headers_record or not payload_record: + raise ArchiveLoadFailed('Could not load ' + str(cdx)) + + return (headers_record.status_headers, payload_record.stream) + + def _resolve_path_load(self, cdx, is_original, failed_files): + """ + Load specific record based on filename, offset and length + fields in the cdx. + If original=True, use the orig.* fields for the cdx + + Resolve the filename to full path using specified path resolvers + + If failed_files list provided, keep track of failed resolve attempts + """ + + if is_original: + (filename, offset, length) = (cdx['orig.filename'], + cdx['orig.offset'], + cdx['orig.length']) + else: + (filename, offset, length) = (cdx['filename'], + cdx['offset'], + cdx['length']) + + # optimization: if same file already failed this request, + # don't try again + if failed_files and filename in failed_files: + raise ArchiveLoadFailed('Skipping Already Failed', filename) + + any_found = False + last_exc = None + for resolver in self.path_resolvers: + possible_paths = resolver(filename) + + if possible_paths: + for path in possible_paths: + any_found = True + try: + return self.record_loader.load(path, offset, length) + + except Exception as ue: + last_exc = ue + + # Unsuccessful if reached here + if failed_files: + failed_files.append(filename) + + if last_exc: + msg = str(last_exc.__class__.__name__) + else: + msg = 'Archive File Not Found' + + raise ArchiveLoadFailed(msg, filename) + + def _load_different_url_payload(self, cdx, headers_record, failed_files): + """ + Handle the case where a duplicate of a capture with same digest + exists at a different url. + + If a cdx_server is provided, a query is made for matching + url, timestamp and digest. + + Raise exception if no matches found. + """ + + ref_target_uri = (headers_record.rec_headers. + get_header('WARC-Refers-To-Target-URI')) + + target_uri = headers_record.rec_headers.get_header('WARC-Target-URI') + + # Check for unresolved revisit error, + # if refers to target uri not present or same as the current url + if not ref_target_uri or (ref_target_uri == target_uri): + raise ArchiveLoadFailed('Missing Revisit Original') + + ref_target_date = (headers_record.rec_headers. 
+                           get_header('WARC-Refers-To-Date'))
+
+        if not ref_target_date:
+            ref_target_date = cdx['timestamp']
+        else:
+            ref_target_date = iso_date_to_timestamp(ref_target_date)
+
+        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
+                                                ref_target_date,
+                                                cdx['digest'])
+
+        for orig_cdx in orig_cdx_lines:
+            try:
+                payload_record = self._resolve_path_load(orig_cdx, False,
+                                                         failed_files)
+                return payload_record
+
+            except ArchiveLoadFailed:
+                pass
+
+        raise ArchiveLoadFailed('Original for revisit could not be loaded')
+
+    def load_cdx_for_dupe(self, url, timestamp, digest):
+        """
+        If a cdx_server is available, return response from server,
+        otherwise empty list
+        """
+        if not self.cdx_server:
+            return []
+
+        params = {'url': url,
+                  'closest': timestamp,
+                  'filter': 'digest:' + digest,
+                  'output': 'raw'}
+
+        return self.cdx_server.load_cdx(params)
diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py
new file mode 100644
index 00000000..d95aaba5
--- /dev/null
+++ b/pywb/warc/test/test_loading.py
@@ -0,0 +1,199 @@
+
+"""
+Test loading different types of records from a variety of formats
+
+# Load response record from WARC
+>>> load_test_archive('example.warc.gz', '333', '1043')
+(('warc', 'response'),
+ StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
+  ('WARC-Record-ID', ''),
+  ('WARC-Date', '2014-01-03T03:03:21Z'),
+  ('Content-Length', '1610'),
+  ('Content-Type', 'application/http; msgtype=response'),
+  ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
+  ('WARC-Target-URI', 'http://example.com?example=1'),
+  ('WARC-Warcinfo-ID', '')]),
+ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+  ('Cache-Control', 'max-age=604800'),
+  ('Content-Type', 'text/html'),
+  ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
+  ('Etag', '"359670651"'),
+  ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
+  ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
+  ('Server', 'ECS (sjc/4FCE)'),
+  ('X-Cache', 'HIT'),
+  ('x-ec-custom-error', '1'),
+  ('Content-Length', '1270'),
+  ('Connection', 'close')]))
+
+# Load revisit record from WARC
+>>> load_test_archive('example.warc.gz', '1864', '553')
+(('warc', 'revisit'),
+ StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
+  ('WARC-Record-ID', ''),
+  ('WARC-Date', '2014-01-03T03:03:41Z'),
+  ('Content-Length', '340'),
+  ('Content-Type', 'application/http; msgtype=response'),
+  ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
+  ('WARC-Target-URI', 'http://example.com?example=1'),
+  ('WARC-Warcinfo-ID', ''),
+  ( 'WARC-Profile',
+    'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
+  ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
+  ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
+ StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
+  ('Cache-Control', 'max-age=604800'),
+  ('Content-Type', 'text/html'),
+  ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
+  ('Etag', '"359670651"'),
+  ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
+  ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
+  ('Server', 'ECS (sjc/4FCE)'),
+  ('X-Cache', 'HIT'),
+  ('x-ec-custom-error', '1'),
+  ('Content-Length', '1270'),
+  ('Connection', 'close')]))
+
+
+# Test of record loading based on cdx line
+# Print parsed http headers + 2 lines of content
+# ==============================================================================
+
+# Test loading from ARC based on cdx
line +>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz') +StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Cache-Control', 'max-age=604800'), + ('Content-Type', 'text/html'), + ('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'), + ('Etag', '"359670651"'), + ('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'), + ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('x-ec-custom-error', '1'), + ('Content-Length', '1270')]) + + + +>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc') +StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Cache-Control', 'max-age=604800'), + ('Content-Type', 'text/html'), + ('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'), + ('Etag', '"359670651"'), + ('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'), + ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('x-ec-custom-error', '1'), + ('Content-Length', '1270')]) + + + + +# Test loading from WARC based on cdx line +>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz') +StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Cache-Control', 'max-age=604800'), + ('Content-Type', 'text/html'), + ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), + ('Etag', '"359670651"'), + ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'), + ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('x-ec-custom-error', '1'), + ('Content-Length', '1270'), + ('Connection', 'close')]) + + + +# Test cdx w/ revisit +>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz') +StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Cache-Control', 'max-age=604800'), + ('Content-Type', 'text/html'), + ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'), + ('Etag', '"359670651"'), + ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'), + ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), + ('Server', 'ECS (sjc/4FCE)'), + ('X-Cache', 'HIT'), + ('x-ec-custom-error', '1'), + ('Content-Length', '1270'), + ('Connection', 'close')]) + + + +# Test loading warc created by wget 1.14 +>>> load_from_cdx_test('com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz') +StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'), + ('Cache-Control', 'max-age=604800'), + ('Content-Type', 'text/html'), + ('Date', 'Sun, 16 Feb 2014 01:29:08 GMT'), + ('Etag', '"359670651"'), + ('Expires', 'Sun, 23 Feb 2014 01:29:08 GMT'), + ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'), + ('Server', 'ECS (sjc/4FB4)'), + ('X-Cache', 'HIT'), + ('x-ec-custom-error', '1'), + ('Content-Length', '1270')]) + + + +# Error Handling + +# Invalid WARC Offset +>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 
example.warc.gz 1043 333 example.warc.gz')
+Traceback (most recent call last):
+ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+
+# Invalid ARC Offset
+>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
+Traceback (most recent call last):
+ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+
+
+# Error Expected with revisit -- invalid offset on original
+>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
+Traceback (most recent call last):
+ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
+
+"""
+
+import os
+import sys
+import pprint
+
+from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
+from pywb.warc.pathresolvers import make_best_resolvers
+from pywb.warc.resolvingloader import ResolvingLoader
+from pywb.cdx.cdxobject import CDXObject
+
+from pywb import get_test_dir
+
+#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
+test_warc_dir = get_test_dir() + 'warcs/'
+
+def load_test_archive(test_file, offset, length):
+    path = test_warc_dir + test_file
+
+    testloader = ArcWarcRecordLoader()
+
+    archive = testloader.load(path, offset, length)
+    archive = testloader.load(path, offset, length)
+
+    pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
+
+
+def load_from_cdx_test(cdx):
+    resolve_loader = ResolvingLoader(test_warc_dir)
+    cdx = CDXObject(cdx)
+    (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
+    print headers
+    sys.stdout.write(stream.readline())
+    sys.stdout.write(stream.readline())
+
+
+
+
diff --git a/pywb/wbapp.py b/pywb/wbapp.py
index e0bedef6..5fef353e 100644
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@@ -1,8 +1,7 @@
-import utils
 import wbexceptions
 
 from wbrequestresponse import WbResponse, StatusAndHeaders
-from cdxserver.cdxserver import CDXException
+from pywb.cdx.cdxserver import CDXException
 
 import os
 import importlib
@@ -10,13 +9,37 @@
 import logging
 
+#=================================================================
+# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
+# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
+# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
+def rel_request_uri(environ, include_query=1):
+    """
+    Return the requested path, optionally including the query string
+
+    # Simple test:
+    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
+    '/web/example.com'
+
+    # Test all unencoded special chars and double-quote
+    # (double-quote must be encoded but not single quote)
+    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
+    "/web/example.com/0~!+$&'()*+,;=:%22"
+    """
+    from urllib import quote
+    url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
+    if include_query and environ.get('QUERY_STRING'):
+        url += '?'
diff --git a/pywb/wbapp.py b/pywb/wbapp.py
index e0bedef6..5fef353e 100644
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@@ -1,8 +1,7 @@
-import utils
 import wbexceptions
 from wbrequestresponse import WbResponse, StatusAndHeaders
-from cdxserver.cdxserver import CDXException
+from pywb.cdx.cdxserver import CDXException
 
 import os
 import importlib
@@ -10,13 +9,37 @@
 import logging
 
+#=================================================================
+# adapted from wsgiref.request_uri, but doesn't include the domain name and allows all characters
+# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
+# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
+def rel_request_uri(environ, include_query=1):
+    """
+    Return the requested path, optionally including the query string
+
+    # Simple test:
+    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
+    '/web/example.com'
+
+    # Test all unencoded special chars and double-quote
+    # (double-quote must be encoded but not single quote)
+    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
+    "/web/example.com/0~!+$&'()*+,;=:%22"
+    """
+    from urllib import quote
+    url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
+    if include_query and environ.get('QUERY_STRING'):
+        url += '?' + environ['QUERY_STRING']
+
+    return url
+
 #=================================================================
 def create_wb_app(wb_router):
     # Top-level wsgi application
     def application(env, start_response):
         if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
-            env['REL_REQUEST_URI'] = utils.rel_request_uri(env)
+            env['REL_REQUEST_URI'] = rel_request_uri(env)
         else:
             env['REL_REQUEST_URI'] = env['REQUEST_URI']
@@ -95,7 +118,7 @@ def main():
         raise
 
 #=================================================================
-if __name__ == "__main__" or utils.enable_doctests():
+if __name__ == "__main__":
     pass
 else:
     application = main()
diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py
index a1a82045..e2715177 100644
--- a/pywb/wbrequestresponse.py
+++ b/pywb/wbrequestresponse.py
@@ -1,7 +1,6 @@
-from wburl import WbUrl
-from url_rewriter import UrlRewriter
-
-import utils
+from pywb.rewrite.wburl import WbUrl
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.utils.statusandheaders import StatusAndHeaders
 
 import pprint
 
 #WB Request and Response
@@ -182,35 +181,6 @@ class WbResponse:
     def __repr__(self):
         return str(vars(self))
 
-
-#=================================================================
-class StatusAndHeaders:
-    def __init__(self, statusline, headers, protocol = ''):
-        self.statusline = statusline
-        self.headers = headers
-        self.protocol = protocol
-
-    def get_header(self, name):
-        name_lower = name.lower()
-        for value in self.headers:
-            if (value[0].lower() == name_lower):
-                return value[1]
-
-    def remove_header(self, name):
-        name_lower = name.lower()
-        for x in xrange(len(self.headers) - 1, -1, -1):
-            if self.headers[x][0].lower() == name_lower:
-                del self.headers[x]
-                break
-
-    def __repr__(self):
-        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
-        #return pprint.pformat(self.__dict__)
-
-    def __eq__(self, other):
-        return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol
-
-
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
diff --git a/run-tests.py b/run-tests.py
new file mode 100644
index 00000000..f70aedf9
--- /dev/null
+++ b/run-tests.py
@@ -0,0 +1,3 @@
+import pytest
+result = pytest.main('-v --doctest-modules tests/ pywb/')
+exit(result)
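run-tests.py above runs the entire suite, doctests included, through pytest; the same run can be started from the shell as py.test -v --doctest-modules tests/ pywb/. A sketch of the equivalent list form, should the single-string form ever prove brittle:

    import pytest

    # same invocation as run-tests.py, passed as an argument list
    result = pytest.main(['-v', '--doctest-modules', 'tests/', 'pywb/'])
    exit(result)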
diff --git a/sample_archive/cdx/iana.cdx.gz b/sample_archive/cdx/iana.cdx.gz
new file mode 100644
index 00000000..11499ca5
Binary files /dev/null and b/sample_archive/cdx/iana.cdx.gz differ
diff --git a/sample_archive/warcs/example-wget-1-14.warc.gz b/sample_archive/warcs/example-wget-1-14.warc.gz
new file mode 100644
index 00000000..fecc0d80
Binary files /dev/null and b/sample_archive/warcs/example-wget-1-14.warc.gz differ
diff --git a/sample_archive/warcs/example.arc b/sample_archive/warcs/example.arc
new file mode 100644
index 00000000..2024e9c2
--- /dev/null
+++ b/sample_archive/warcs/example.arc
@@ -0,0 +1,69 @@
+filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
+1 0 LiveWeb Capture
+URL IP-address Archive-date Content-type Archive-length
+
+http://example.com/ 93.184.216.119 20140216050221 text/html 1591
+HTTP/1.1 200 OK
+Accept-Ranges: bytes
+Cache-Control: max-age=604800
+Content-Type: text/html
+Date: Sun, 16 Feb 2014 05:02:20 GMT
+Etag: "359670651"
+Expires: Sun, 23 Feb 2014 05:02:20 GMT
+Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
+Server: ECS (sjc/4FCE)
+X-Cache: HIT
+x-ec-custom-error: 1
+Content-Length: 1270
+
+<!doctype html>
+<html>
+<head>
+    <title>Example Domain</title>
+
+    <meta charset="utf-8" />
+    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <style type="text/css">
+    body {
+        background-color: #f0f0f2;
+        margin: 0;
+        padding: 0;
+        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+    }
+    div {
+        width: 600px;
+        margin: 5em auto;
+        padding: 50px;
+        background-color: #fff;
+        border-radius: 1em;
+    }
+    a:link, a:visited {
+        color: #38488f;
+        text-decoration: none;
+    }
+    @media (max-width: 700px) {
+        body {
+            background-color: #fff;
+        }
+        div {
+            width: auto;
+            margin: 0 auto;
+            border-radius: 0;
+            padding: 1em;
+        }
+    }
+    </style>
+</head>
+
+<body>
+<div>
+    <h1>Example Domain</h1>
+    <p>This domain is established to be used for illustrative examples in documents. You may use this
+    domain in examples without prior coordination or asking for permission.</p>
+    <p><a href="http://www.iana.org/domains/example">More information...</a></p>
+</div>
+</body>
+</html>
diff --git a/sample_archive/warcs/example.arc.gz b/sample_archive/warcs/example.arc.gz
new file mode 100644
index 00000000..bc959cf1
Binary files /dev/null and b/sample_archive/warcs/example.arc.gz differ
diff --git a/setup.py b/setup.py
index fe23f1e7..c3bb977d 100755
--- a/setup.py
+++ b/setup.py
@@ -5,18 +5,18 @@ import setuptools
 import glob
 
 setuptools.setup(name='pywb',
-      version='0.1',
+      version='0.2',
       url='https://github.com/ikreymer/pywb',
       author='Ilya Kreymer',
       author_email='ilya@archive.org',
       long_description=open('README.md').read(),
       license='GPL',
-      packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
-      provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
+      packages=['pywb', 'pywb.utils', 'pywb.cdx', 'pywb.warc', 'pywb.rewrite'],
+      provides=['pywb', 'pywb.utils', 'pywb.cdx', 'pywb.warc', 'pywb.rewrite'],
       package_data={'pywb': ['ui/*', 'static/*']},
       data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
                     ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
-      install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest'],
-      tests_require=['WebTest', 'pytest'],
+      install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest', 'pytest'],
+#     tests_require=['WebTest', 'pytest'],
       zip_safe=False)
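The renamed subpackages in setup.py above line up with the import paths used throughout this patch. A few representative imports, each taken verbatim from the modules touched here:

    from pywb.utils.statusandheaders import StatusAndHeaders
    from pywb.cdx.cdxobject import CDXObject
    from pywb.warc.recordloader import ArcWarcRecordLoader
    from pywb.rewrite.wburl import WbUrl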
diff --git a/tests/test_archivalrouter.py b/tests/test_archivalrouter.py
new file mode 100644
index 00000000..415626e6
--- /dev/null
+++ b/tests/test_archivalrouter.py
@@ -0,0 +1,88 @@
+"""
+Test Route
+# route with relative path
+>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
+{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
+
+# route with absolute path, running at script /my_pywb
+>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
+{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
+
+
+# not matching route -- skipped
+>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
+
+
+# Referer Redirect Test
+>>> ReferRedirect('http://localhost:8080/').match_prefixs
+['http://localhost:8080/']
+
+>>> ReferRedirect(['http://example:9090/']).match_prefixs
+['http://example:9090/']
+
+>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
+'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
+
+>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
+'http://localhost:8080/coll/20131010/http://example.com/other.html'
+
+>>> _test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
+'http://localhost:8080/coll/20131010/http://example.com/other.html'
+
+# Custom collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
+'http://localhost:8080/complex/123/20131010/http://example.com/other.html'
+
+# With timestamp included
+>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
+'http://localhost:8080/coll/20131010/http://example.com/other.html'
+
+# With timestamp included
+>>> _test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
+'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
+
+# Wrong Host
+>>> _test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
+False
+
+# Right Host
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
+'http://example.com:8080/coll/20131010/http://example.com/other.html'
+
+# With custom SCRIPT_NAME
+>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
+'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
+
+# With custom SCRIPT_NAME + timestamp
+>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
+'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
+
+# With custom SCRIPT_NAME, bad match
+>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
+False
+
+
+"""
+
+from pywb.archivalrouter import Route, ReferRedirect
+from pywb.handlers import BaseHandler
+
+
+def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
+    env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
+
+    if http_host:
+        env['HTTP_HOST'] = http_host
+
+    routes = [Route(coll, BaseHandler())]
+
+    redir = ReferRedirect(match_host)
+    #req = WbRequest.from_uri(request_uri, env)
+    rep = redir(env, routes)
+    if not rep:
+        return False
+
+    return rep.status_headers.get_header('Location')
+
+
diff --git a/tests/test_binsearch.py b/tests/test_binsearch.py
deleted file mode 100644
index 20f50ea4..00000000
--- a/tests/test_binsearch.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import os
-from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader
-
-test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
-
-def binsearch_cdx_test(key, iter_func):
-    """
-    # Prefix Search
-    >>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix)
-    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
-    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
-    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
-    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
-
-    >>> binsearch_cdx_test('org,iana)/domains/root', iter_exact)
-    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
-
-    >>> binsearch_cdx_test('org,iana)/', iter_exact)
-    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
-
-    >>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact)
-    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
-    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
-
-    # Exact Search
-    >>> binsearch_cdx_test('org,iaana)/', iter_exact)
-    >>> binsearch_cdx_test('org,ibna)/', iter_exact)
-
-    >>> binsearch_cdx_test('org,iana)/time-zones', iter_exact)
-    org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
-    """
-
-    cdx = FileReader(test_cdx_dir + 'iana.cdx')
-
-    for line in iter_func(cdx, key):
-        print line
-
-
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
-
-
diff --git a/tests/test_cdxserve.py b/tests/test_cdxserve.py
deleted file mode 100644
index 77812bc4..00000000
--- a/tests/test_cdxserve.py
+++ /dev/null
@@ -1,149 +0,0 @@
-from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader
-from ..pywb.cdxserver.cdxserver import CDXServer
-import os
-import sys
-import pprint
-
-test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
-
-def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
-    """
-    # Merge Sort Multipe CDX Sources
-    >>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
-    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
-    org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
-    org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
-
-
-    # Limit CDX Stream
-    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
-    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
-    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
-    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
-
-
-    # Reverse CDX Stream
-    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
-    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
-    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
-    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
-
-    >>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
-    org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
-
-    # No matching results
-    >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
-
-
-    # Filter cdx
-    >>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
-    org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
-    org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
-    org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
-    org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
-    org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
-    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
-    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
-    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
-    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
-
-
-    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
-    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
-
-
-    # Collapse by timestamp
-    # unresolved revisits, different statuscode results in an extra repeat
-    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
-    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
-    org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
-    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
-
-    # resolved revisits
-    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
-    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
-    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
-
-
-    # Sort by closest timestamp + field select output
-    >>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
-    20140126200826
-    20140126200816
-    20140126200805
-    20140126200912
-    20140126200738
-    20140126200930
-    20140126200718
-    20140126200706
-    20140126200654
-    20140126200625
-
-    >>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
-    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
-    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
-
-
-    >>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
-    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
-    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
-
-
-    # equal dist prefer earlier
-    >>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
-    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
-    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
-
-    >>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
-    20140126200654
-    20140126200706
-
-    >>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
-    20140126200706
-    20140126200654
-
-
-    # Resolve Revisits
-    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
-    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
-    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
-    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
-    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
-    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
-
-    >>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
-    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
-    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
-
-
-    # CDX Server init
-    >>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
-    >>> pprint.pprint(x.next().items())
-    [('urlkey', 'com,example)/'),
-     ('timestamp', '20140127171200'),
-     ('original', 'http://example.com'),
-     ('mimetype', 'text/html'),
-     ('statuscode', '200'),
-     ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
-     ('redirect', '-'),
-     ('robotflags', '-'),
-     ('length', '1046'),
-     ('offset', '334'),
-     ('filename', 'dupes.warc.gz')]
-
-    """
-
-    kwparams['url'] = url
-    kwparams['output'] = 'text'
-
-    server = CDXServer(sources)
-    results = server.load_cdx(**kwparams)
-
-    for x in results:
-        sys.stdout.write(x)
-
-
-
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
-
-
diff --git a/tests/test_integration.py b/tests/test_integration.py
index abd104c5..9047e184 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -1,7 +1,7 @@
 import webtest
-from ..pywb.pywb_init import pywb_config
-from ..pywb.wbapp import create_wb_app
-from ..pywb.cdxserver.cdxobject import CDXObject
+from pywb.pywb_init import pywb_config
+from pywb.wbapp import create_wb_app
+from pywb.cdx.cdxobject import CDXObject
 
 class TestWb:
     TEST_CONFIG = 'test_config.yaml'
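The integration tests drive the full WSGI app in-process through WebTest. A minimal standalone sketch along the same lines; note that passing the YAML filename to pywb_config() is inferred from TEST_CONFIG, and the '/pywb/' collection prefix is a hypothetical route, not confirmed by this patch:

    import webtest
    from pywb.pywb_init import pywb_config
    from pywb.wbapp import create_wb_app

    # assumes pywb_config() accepts a config path, as TEST_CONFIG suggests
    app = webtest.TestApp(create_wb_app(pywb_config('test_config.yaml')))

    # '/pywb/' is a placeholder collection name defined by the test config
    resp = app.get('/pywb/http://example.com/')
    assert resp.status_int == 200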