1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

remove pycdx_server pkg for now, move binsearch into pywb package,

update setup.py
This commit is contained in:
Ilya Kreymer 2014-01-24 00:54:48 -08:00
parent 03b6938b9c
commit 391f3bf81d
6 changed files with 82 additions and 168 deletions

View File

@ -1,4 +0,0 @@
#Allow importing
#from pkgutil import extend_path
#__path__ = extend_path(__path__, __name__)

View File

@ -1,92 +0,0 @@
from collections import deque
import os
import itertools
class FileReader(object):
    """Line-oriented, seekable reader over a file opened in binary mode.

    Exposes the small interface used by the search functions in this
    module: ``getsize()``, ``readline()`` and ``seek(offset)``.

    The underlying handle is opened in the constructor; call ``close()``
    (or use the instance as a context manager) to release it.
    """

    def __init__(self, filename):
        """Open *filename* for binary reading and record its size in bytes."""
        self.fh = open(filename, 'rb')
        try:
            self.size = os.path.getsize(filename)
        except Exception:
            # Do not leak the handle if the size lookup fails after open().
            self.fh.close()
            raise

    def getsize(self):
        """Return the file size in bytes, as measured at construction time."""
        return self.size

    def readline(self):
        """Read and return the next line from the current position."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte *offset*."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle; safe to call more than once."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False
def binsearch_offset(reader, key, compare_func = None, block_size = 8192):
    """Binary-search *reader* by fixed-size blocks and return a byte offset.

    The returned offset is a multiple of *block_size*: the start of the last
    probed block whose first complete line compares less than *key*, or 0
    when no such block was found. A linear scan for *key* can start there.

    reader       -- object providing getsize(), seek(offset) and readline()
    key          -- value compared against lines read from the reader
    compare_func -- three-way comparison called as compare_func(key, line);
                    a positive result means the key sorts after the line.
                    Defaults to an ordinary ``<``/``>`` based comparison.
    block_size   -- size in bytes of each probe block

    The lines are assumed to be sorted consistently with compare_func;
    the search result is meaningless otherwise.
    """
    if compare_func is None:
        # Equivalent of the Python 2 builtin cmp(), spelled out so the
        # function does not depend on that builtin being available.
        compare_func = lambda a, b: (a > b) - (a < b)

    low = 0
    # Floor division keeps the block arithmetic integral on every Python version.
    high = reader.getsize() // block_size

    while high - low > 1:
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)

        # mid is always >= 1 here, so the seek may have landed inside a
        # line: discard up to the next newline, then compare against the
        # first complete line of the block.
        reader.readline()
        line = reader.readline()

        if compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size
def search(reader, key, prev_size = 0, compare_func = None, block_size = 8192):
    """Return an iterator over lines of *reader* starting at *key*.

    Uses binsearch_offset() to find a starting block, then scans forward
    to the first line that does not compare less than *key*. The returned
    iterator yields up to *prev_size* lines that immediately precede that
    line (as far as the scan saw them), then that line and every following
    line until the reader is exhausted. Lines after the first match are
    read lazily from *reader* as the iterator is consumed.

    reader       -- object providing getsize(), seek(offset) and readline()
    key          -- value to locate
    prev_size    -- number of preceding lines to include (0 for none)
    compare_func -- three-way comparison; defaults to an ordinary
                    ``<``/``>`` based comparison
    block_size   -- probe block size passed on to binsearch_offset()
    """
    if compare_func is None:
        # Equivalent of the Python 2 builtin cmp(); resolved here so a
        # concrete function is always handed to binsearch_offset().
        compare_func = lambda a, b: (a > b) - (a < b)

    offset = binsearch_offset(reader, key, compare_func, block_size)
    reader.seek(offset)

    if offset > 0:
        reader.readline() # skip partial line

    # A bounded deque covers every prev_size uniformly: maxlen 0 retains
    # nothing, and an empty deque simply yields nothing. This avoids the
    # unbound-variable failure the separate prev_size == 1 path had when
    # no line preceded the match.
    preceding = deque(maxlen = max(prev_size, 0))

    line = reader.readline()
    while line and compare_func(line, key) < 0:
        preceding.append(line)
        line = reader.readline()

    def gen_iter(line):
        for earlier in preceding:
            yield earlier

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
def iter_exact(reader, key):
    """Yield consecutive lines from search(reader, key) that start with *key*.

    Iteration stops at the first line that does not begin with *key*.
    """
    has_key_prefix = lambda candidate: candidate.startswith(key)

    for match in itertools.takewhile(has_key_prefix, search(reader, key)):
        yield match

View File

@ -73,11 +73,11 @@ class J2QueryRenderer:
## ===========
## Simple handlers for debugging
class EchoEnv:
class DebugEchoEnv:
def __call__(self, wbrequest):
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
class EchoRequest:
class DebugEchoRequest:
def __call__(self, wbrequest):
return wbrequestresponse.WbResponse.text_response(str(wbrequest))

View File

@ -1,9 +1,9 @@
import redis
import pycdx_server.binsearch as binsearch
import binsearch
#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
def PrefixResolver(prefix, contains):
def PrefixResolver(prefix, contains = ''):
def makeUrl(url):
return [prefix + url] if (contains in url) else []

View File

@ -1,14 +1,10 @@
from utils import rel_request_uri
from query import QueryHandler, EchoEnv, EchoRequest
from replay import WBHandler
import wbexceptions
import indexreader
from wbrequestresponse import WbResponse, StatusAndHeaders
from archivalrouter import ArchivalRequestRouter, Route
## ===========
headInsert = """
default_head_insert = """
<!-- WB Insert -->
<script src='/static/wb.js'> </script>
@ -19,8 +15,6 @@ headInsert = """
## ===========
'''
The below createDefaultWB() function is just a sample/debug which loads publicly accessible cdx data
To declare Wayback with one collection, `mycoll`
and will be accessed by user at:
@ -36,81 +30,87 @@ and look for warcs at paths:
`http://warcs.example.com/servewarc/` and
`http://warcs.example.com/anotherpath/`,
one could declare a `createWB()` method as follows:
def createWB():
aloader = archiveloader.ArchiveLoader()
query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'),
replay.PrefixResolver('http://warcs.example.com/anotherpath/')]
replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert)
return ArchivalRequestRouter(
{
Route('mycoll', WBHandler(query, replay))
},
hostpaths = ['http://mywb.example.com:8080/'])
one could declare a `sample_wb_settings()` method as follows
'''
## ===========
def createDefaultWB(headInsert):
query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx'))
# TODO: simplify this!!
def sample_wb_settings():
import archiveloader
import query, indexreader
import replay, replay_resolvers
from archivalrouter import ArchivalRequestRouter, Route
# Standard loader which supports WARC/ARC files
aloader = archiveloader.ArchiveLoader()
# Query handler backed by a remote cdx server
query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
# Loads warcs specified in cdx from these locations
prefixes = [replay_resolvers.PrefixResolver('http://warcs.example.com/servewarc/'),
replay_resolvers.PrefixResolver('http://warcs.example.com/anotherpath/')]
# Create rewriting replay handler to rewrite records
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = default_head_insert)
# Create Jinja2 based html query renderer
htmlquery = query.J2QueryRenderer('./ui/', 'query.html')
# Handler which combines query, replayer, and html_query
wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery)
# Finally, create wb router
return ArchivalRequestRouter(
{
Route('echo', EchoEnv()), # Just echo the env
Route('req', EchoRequest()), # Echo the WbRequest
Route('cdx', query), # Query the CDX
Route('web', query), # Query the CDX
},
hostpaths = ['http://localhost:9090/'])
## ===========
try:
import globalwb
wbparser = globalwb.createDefaultWB(headInsert)
except:
print " *** Note: Inited With Sample Wayback *** "
wbparser = createDefaultWB(headInsert)
import traceback
traceback.print_exc()
{
Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request
Route('mycoll', wb_handler)
},
# Specify hostnames that pywb will be running on
# This will help catch occasionally missed rewrites that fall-through to the host
# (See archivalrouter.ReferRedirect)
hostpaths = ['http://mywb.example.com:8080/'])
def create_wb_app(wb_router):
def application(env, start_response):
# Top-level wsgi application
def application(env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
env['REL_REQUEST_URI'] = env['REQUEST_URI']
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
env['REL_REQUEST_URI'] = env['REQUEST_URI']
response = None
response = None
try:
response = wb_router(env)
try:
response = wbparser(env)
if not response:
raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found')
if not response:
raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found')
except wbexceptions.InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except wbexceptions.InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
print "[INFO]: " + str(e)
response = handle_exception(env, e)
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
print "[INFO]: " + str(e)
response = handleException(env, e)
except Exception as e:
last_exc = e
import traceback
traceback.print_exc()
response = handle_exception(env, e)
except Exception as e:
last_exc = e
import traceback
traceback.print_exc()
response = handleException(env, e)
return response(env, start_response)
return response(env, start_response)
def handleException(env, exc):
return application
def handle_exception(env, exc):
if hasattr(exc, 'status'):
status = exc.status()
else:
@ -119,3 +119,13 @@ def handleException(env, exc):
return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
if __name__ == "__main__":
app = create_wb_app(sample_wb_settings())
#=================================================================
import globalwb
application = create_wb_app(globalwb.create_global_wb(default_head_insert))
#=================================================================

View File

@ -4,7 +4,7 @@
import setuptools
setuptools.setup(name='pywb',
version='1.0',
version='0.1',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ilya@archive.org',