1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

remove pycdx_server pkg for now, move binsearch into pywb package,

update setup.py
This commit is contained in:
Ilya Kreymer 2014-01-24 00:54:48 -08:00
parent 03b6938b9c
commit 391f3bf81d
6 changed files with 82 additions and 168 deletions

View File

@ -1,4 +0,0 @@
#Allow importing
#from pkgutil import extend_path
#__path__ = extend_path(__path__, __name__)

View File

@ -1,92 +0,0 @@
from collections import deque
import os
import itertools
class FileReader:
    """Minimal reader over a file opened in binary mode.

    Exposes the three operations the binary-search helpers call on a
    reader: getsize(), readline() and seek(offset).  The size is read
    once, at construction time.

    The instance can also be used as a context manager so the underlying
    file handle is released deterministically instead of being leaked.
    """

    def __init__(self, filename):
        """Open `filename` for binary reading and record its size in bytes."""
        self.fh = open(filename, 'rb')
        try:
            self.size = os.path.getsize(filename)
        except OSError:
            # Do not leak the handle if the size lookup fails
            self.fh.close()
            raise

    def getsize(self):
        """Return the file size in bytes, as recorded when the file was opened."""
        return self.size

    def readline(self):
        """Return the next line from the current position ('' / b'' at EOF)."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte `offset`."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle; calling it again is harmless."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.close()
        # Never suppress an exception raised inside the with-block
        return False
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search `reader` block by block and return a byte offset.

    The reader is probed at multiples of `block_size`.  At each probe the
    first line read after the seek is discarded when the probe is not at
    offset 0 (it may be a partial line), and the following line is
    compared against `key`.

    NOTE(review): presumably the lines are sorted consistently with
    `compare_func`; the search is only meaningful in that case.

    :param reader: object providing getsize(), seek(offset) and readline()
    :param key: value to look for
    :param compare_func: three-way comparison called as
        compare_func(key, line); defaults to a cmp()-style ordering
    :param block_size: probe granularity in bytes
    :return: offset (a multiple of `block_size`) of the last probed block
        whose compared line sorted before `key`, or 0 if there is none
    """
    if compare_func is None:
        # Same result as the Python 2 builtin cmp(), without depending on it
        compare_func = lambda a, b: (a > b) - (a < b)

    low = 0
    # Floor division keeps block indexes integral on any Python version
    high = reader.getsize() // block_size

    while high - low > 1:
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        if compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Return an iterator over the lines of `reader` starting at `key`.

    binsearch_offset() is used to jump near the key, then lines are read
    sequentially until one compares greater than or equal to `key`.  The
    returned iterator first yields up to `prev_size` of the lines read
    just before that line, then that line and every following line until
    the reader is exhausted.

    :param reader: object providing getsize(), seek(offset) and readline()
    :param key: value to look for
    :param prev_size: maximum number of preceding lines to yield first
    :param compare_func: three-way comparison called as
        compare_func(line, key); defaults to a cmp()-style ordering
    :param block_size: probe granularity passed to binsearch_offset()
    :return: generator of lines
    """
    if compare_func is None:
        # Same result as the Python 2 builtin cmp(), without depending on it
        compare_func = lambda a, b: (a > b) - (a < b)

    offset = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(offset)

    if offset > 0:
        reader.readline()  # skip partial line

    # A bounded deque keeps only the most recent `prev_size` lines and is
    # simply empty when nothing precedes the match, so the prev_size == 1
    # case can no longer reference an unassigned variable.
    preceding = deque(maxlen=prev_size) if prev_size > 0 else None

    line = reader.readline()
    while line and compare_func(line, key) < 0:
        if preceding is not None:
            preceding.append(line)
        line = reader.readline()

    def gen_iter(line):
        if preceding is not None:
            for prev_line in preceding:
                yield prev_line

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
# Iterate over exact matches
def iter_exact(reader, key):
    """Yield the lines found by search() for as long as they start with `key`.

    Iteration stops at the first line that does not start with `key`.
    """
    def has_prefix(candidate):
        return candidate.startswith(key)

    for match in itertools.takewhile(has_prefix, search(reader, key)):
        yield match

View File

@ -73,11 +73,11 @@ class J2QueryRenderer:
## =========== ## ===========
## Simple handlers for debugging ## Simple handlers for debugging
class EchoEnv: class DebugEchoEnv:
def __call__(self, wbrequest): def __call__(self, wbrequest):
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env)) return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
class EchoRequest: class DebugEchoRequest:
def __call__(self, wbrequest): def __call__(self, wbrequest):
return wbrequestresponse.WbResponse.text_response(str(wbrequest)) return wbrequestresponse.WbResponse.text_response(str(wbrequest))

View File

@ -1,9 +1,9 @@
import redis import redis
import pycdx_server.binsearch as binsearch import binsearch
#====================================== #======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string # PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#====================================== #======================================
def PrefixResolver(prefix, contains): def PrefixResolver(prefix, contains = ''):
def makeUrl(url): def makeUrl(url):
return [prefix + url] if (contains in url) else [] return [prefix + url] if (contains in url) else []

View File

@ -1,14 +1,10 @@
from utils import rel_request_uri from utils import rel_request_uri
from query import QueryHandler, EchoEnv, EchoRequest
from replay import WBHandler
import wbexceptions import wbexceptions
import indexreader
from wbrequestresponse import WbResponse, StatusAndHeaders from wbrequestresponse import WbResponse, StatusAndHeaders
from archivalrouter import ArchivalRequestRouter, Route
## =========== ## ===========
headInsert = """ default_head_insert = """
<!-- WB Insert --> <!-- WB Insert -->
<script src='/static/wb.js'> </script> <script src='/static/wb.js'> </script>
@ -19,8 +15,6 @@ headInsert = """
## =========== ## ===========
''' '''
The below createDefaultWB() function is just a sample/debug which loads publicly accessible cdx data
To declare Wayback with one collection, `mycoll` To declare Wayback with one collection, `mycoll`
and will be accessed by user at: and will be accessed by user at:
@ -36,81 +30,87 @@ and look for warcs at paths:
`http://warcs.example.com/servewarc/` and `http://warcs.example.com/servewarc/` and
`http://warcs.example.com/anotherpath/`, `http://warcs.example.com/anotherpath/`,
one could declare a `createWB()` method as follows: one could declare a `sample_wb_settings()` method as follows
def createWB():
aloader = archiveloader.ArchiveLoader()
query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'),
replay.PrefixResolver('http://warcs.example.com/anotherpath/')]
replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert)
return ArchivalRequestRouter(
{
Route('mycoll', WBHandler(query, replay))
},
hostpaths = ['http://mywb.example.com:8080/'])
''' '''
## ===========
def createDefaultWB(headInsert): # TODO: simplify this!!
query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx'))
def sample_wb_settings():
import archiveloader
import query, indexreader
import replay, replay_resolvers
from archivalrouter import ArchivalRequestRouter, Route
# Standard loader which supports WARC/ARC files
aloader = archiveloader.ArchiveLoader()
# Source for cdx source
query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
# Loads warcs specified in cdx from these locations
prefixes = [replay_resolvers.PrefixResolver('http://warcs.example.com/servewarc/'),
replay_resolvers.PrefixResolver('http://warcs.example.com/anotherpath/')]
# Create rewriting replay handler to rewrite records
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = default_head_insert)
# Create Jinja2 based html query renderer
htmlquery = query.J2QueryRenderer('./ui/', 'query.html')
# Handler which combines query, replayer, and html_query
wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery)
# Finally, create wb router
return ArchivalRequestRouter( return ArchivalRequestRouter(
{ {
Route('echo', EchoEnv()), # Just echo the env Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request
Route('req', EchoRequest()), # Echo the WbRequest Route('mycoll', wb_handler)
Route('cdx', query), # Query the CDX },
Route('web', query), # Query the CDX # Specify hostnames that pywb will be running on
}, # This will help catch occasionally missed rewrites that fall-through to the host
hostpaths = ['http://localhost:9090/']) # (See archivalrouter.ReferRedirect)
## =========== hostpaths = ['http://mywb.example.com:8080/'])
try:
import globalwb
wbparser = globalwb.createDefaultWB(headInsert)
except:
print " *** Note: Inited With Sample Wayback *** "
wbparser = createDefaultWB(headInsert)
import traceback
traceback.print_exc()
def create_wb_app(wb_router):
def application(env, start_response): # Top-level wsgi application
def application(env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
env['REL_REQUEST_URI'] = env['REQUEST_URI']
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): response = None
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
env['REL_REQUEST_URI'] = env['REQUEST_URI']
response = None try:
response = wb_router(env)
try: if not response:
response = wbparser(env) raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found')
if not response: except wbexceptions.InternalRedirect as ir:
raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found') response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except wbexceptions.InternalRedirect as ir: except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) print "[INFO]: " + str(e)
response = handle_exception(env, e)
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e: except Exception as e:
print "[INFO]: " + str(e) last_exc = e
response = handleException(env, e) import traceback
traceback.print_exc()
response = handle_exception(env, e)
except Exception as e: return response(env, start_response)
last_exc = e
import traceback
traceback.print_exc()
response = handleException(env, e)
return response(env, start_response)
def handleException(env, exc): return application
def handle_exception(env, exc):
if hasattr(exc, 'status'): if hasattr(exc, 'status'):
status = exc.status() status = exc.status()
else: else:
@ -119,3 +119,13 @@ def handleException(env, exc):
return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
if __name__ == "__main__":
app = create_wb_app(sample_wb_settings())
#=================================================================
import globalwb
application = create_wb_app(globalwb.create_global_wb(default_head_insert))
#=================================================================

View File

@ -4,7 +4,7 @@
import setuptools import setuptools
setuptools.setup(name='pywb', setuptools.setup(name='pywb',
version='1.0', version='0.1',
url='https://github.com/ikreymer/pywb', url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer', author='Ilya Kreymer',
author_email='ilya@archive.org', author_email='ilya@archive.org',