diff --git a/pywb/pycdx_server/__init__.py b/pywb/pycdx_server/__init__.py deleted file mode 100644 index ef680e2a..00000000 --- a/pywb/pycdx_server/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -#Allow importing - -#from pkgutil import extend_path -#__path__ = extend_path(__path__, __name__) diff --git a/pywb/pycdx_server/binsearch.py b/pywb/pycdx_server/binsearch.py deleted file mode 100644 index ef3171f1..00000000 --- a/pywb/pycdx_server/binsearch.py +++ /dev/null @@ -1,92 +0,0 @@ -from collections import deque -import os -import itertools - -class FileReader: - def __init__(self, filename): - self.fh = open(filename, 'rb') - self.size = os.path.getsize(filename) - - def getsize(self): - return self.size - - def readline(self): - return self.fh.readline() - - def seek(self, offset): - return self.fh.seek(offset) - - -def binsearch_offset(reader, key, compare_func = cmp, block_size = 8192): - min = 0 - max = reader.getsize() / block_size - - while (max - min > 1): - mid = min + ((max - min) / 2) - reader.seek(mid * block_size) - - if mid > 0: - reader.readline() # skip partial line - - line = reader.readline() - - if compare_func(key, line) > 0: - min = mid - else: - max = mid - - return (min * block_size) - - -def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192): - min = binsearch_offset(reader, key, compare_func, block_size) - - reader.seek(min) - - if min > 0: - reader.readline() # skip partial line - - if prev_size > 1: - prev_deque = deque(maxlen = prev_size) - - line = None - - while True: - line = reader.readline() - if not line: - break - if compare_func(line, key) >= 0: - break - - if prev_size == 1: - prev = line - elif prev_size > 1: - prev_deque.append(line) - - def gen_iter(line): - if prev_size == 1: - yield prev - elif prev_size > 1: - for i in prev_deque: - yield i - - while line: - yield line - line = reader.readline() - - return gen_iter(line) - - -# Iterate over exact matches -def iter_exact(reader, key): - lines = 
search(reader, key) - for x in lines: - if not x.startswith(key): - break - - yield x - - - - - diff --git a/pywb/query.py b/pywb/query.py index d8aa4a60..a42d3a64 100644 --- a/pywb/query.py +++ b/pywb/query.py @@ -73,11 +73,11 @@ class J2QueryRenderer: ## =========== ## Simple handlers for debugging -class EchoEnv: +class DebugEchoEnv: def __call__(self, wbrequest): return wbrequestresponse.WbResponse.text_response(str(wbrequest.env)) -class EchoRequest: +class DebugEchoRequest: def __call__(self, wbrequest): return wbrequestresponse.WbResponse.text_response(str(wbrequest)) diff --git a/pywb/replay_resolvers.py b/pywb/replay_resolvers.py index cd3a7117..99c24b44 100644 --- a/pywb/replay_resolvers.py +++ b/pywb/replay_resolvers.py @@ -1,9 +1,9 @@ import redis -import pycdx_server.binsearch as binsearch +import binsearch #====================================== # PrefixResolver - convert cdx file entry to url with prefix if url contains specified string #====================================== -def PrefixResolver(prefix, contains): +def PrefixResolver(prefix, contains = ''): def makeUrl(url): return [prefix + url] if (contains in url) else [] diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 42ba4e1d..a2555a9a 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -1,14 +1,10 @@ from utils import rel_request_uri -from query import QueryHandler, EchoEnv, EchoRequest -from replay import WBHandler import wbexceptions -import indexreader from wbrequestresponse import WbResponse, StatusAndHeaders -from archivalrouter import ArchivalRequestRouter, Route ## =========== -headInsert = """ +default_head_insert = """ @@ -19,8 +15,6 @@ headInsert = """ ## =========== ''' -The below createDefaultWB() function is just a sample/debug which loads publicly accessible cdx data - To declare Wayback with one collection, `mycoll` and will be accessed by user at: @@ -36,81 +30,87 @@ and look for warcs at paths: `http://warcs.example.com/servewarc/` and `http://warcs.example.com/anotherpath/`, 
-one could declare a `createWB()` method as follows: - - def createWB(): - aloader = archiveloader.ArchiveLoader() - query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx')) - - prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'), - replay.PrefixResolver('http://warcs.example.com/anotherpath/')] - - replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert) - - return ArchivalRequestRouter( - { - Route('mycoll', WBHandler(query, replay)) - }, - hostpaths = ['http://mywb.example.com:8080/']) +one could declare a `sample_wb_settings()` method as follows ''' -## =========== -def createDefaultWB(headInsert): - query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')) + +# TODO: simplify this!! + +def sample_wb_settings(): + import archiveloader + import query, indexreader + import replay, replay_resolvers + from archivalrouter import ArchivalRequestRouter, Route + + + # Standard loader which supports WARC/ARC files + aloader = archiveloader.ArchiveLoader() + + # Remote cdx server used as the index source for queries + query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx')) + + # Loads warcs specified in cdx from these locations + prefixes = [replay_resolvers.PrefixResolver('http://warcs.example.com/servewarc/'), + replay_resolvers.PrefixResolver('http://warcs.example.com/anotherpath/')] + + # Create rewriting replay handler to rewrite records + replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = default_head_insert) + + # Create Jinja2 based html query renderer + htmlquery = query.J2QueryRenderer('./ui/', 'query.html') + + # Handler which combines query_h, replayer, and htmlquery + wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery) + + # Finally, create wb router return ArchivalRequestRouter( - { - Route('echo', EchoEnv()), # Just echo the env - Route('req', EchoRequest()), # 
Echo the WbRequest - Route('cdx', query), # Query the CDX - Route('web', query), # Query the CDX - }, - hostpaths = ['http://localhost:9090/']) -## =========== - - -try: - import globalwb - wbparser = globalwb.createDefaultWB(headInsert) -except: - print " *** Note: Inited With Sample Wayback *** " - wbparser = createDefaultWB(headInsert) - import traceback - traceback.print_exc() + { + Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request + Route('mycoll', wb_handler) + }, + # Specify hostnames that pywb will be running on + # This will help catch occasionally missed rewrites that fall-through to the host + # (See archivalrouter.ReferRedirect) + hostpaths = ['http://mywb.example.com:8080/']) +def create_wb_app(wb_router): -def application(env, start_response): + # Top-level wsgi application + def application(env, start_response): + if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): + env['REL_REQUEST_URI'] = rel_request_uri(env) + else: + env['REL_REQUEST_URI'] = env['REQUEST_URI'] - if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): - env['REL_REQUEST_URI'] = rel_request_uri(env) - else: - env['REL_REQUEST_URI'] = env['REQUEST_URI'] + response = None - response = None + try: + response = wb_router(env) - try: - response = wbparser(env) + if not response: + raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found') - if not response: - raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found') + except wbexceptions.InternalRedirect as ir: + response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) - except wbexceptions.InternalRedirect as ir: - response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) + except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e: + print "[INFO]: " + str(e) + response = handle_exception(env, e) - except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e: - print "[INFO]: " + str(e) - response = 
handleException(env, e) + except Exception as e: + last_exc = e + import traceback + traceback.print_exc() + response = handle_exception(env, e) - except Exception as e: - last_exc = e - import traceback - traceback.print_exc() - response = handleException(env, e) - - return response(env, start_response) + return response(env, start_response) -def handleException(env, exc): + return application + + +def handle_exception(env, exc): if hasattr(exc, 'status'): status = exc.status() else: @@ -119,3 +119,13 @@ def handleException(env, exc): return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) +if __name__ == "__main__": + app = create_wb_app(sample_wb_settings()) + + +#================================================================= +import globalwb +application = create_wb_app(globalwb.create_global_wb(default_head_insert)) +#================================================================= + + diff --git a/setup.py b/setup.py index aecb3512..daf86a99 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import setuptools setuptools.setup(name='pywb', - version='1.0', + version='0.1', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ilya@archive.org',