mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
remove pycdx_server pkg for now, move binsearch into pywb package,
update setup.py
This commit is contained in:
parent
03b6938b9c
commit
391f3bf81d
@ -1,4 +0,0 @@
|
||||
#Allow importing
|
||||
|
||||
#from pkgutil import extend_path
|
||||
#__path__ = extend_path(__path__, __name__)
|
@ -1,92 +0,0 @@
|
||||
from collections import deque
|
||||
import os
|
||||
import itertools
|
||||
|
||||
class FileReader:
    """Seekable, line-oriented reader over a file on disk.

    Exposes the getsize() / readline() / seek() interface that
    binsearch_offset() and search() call on their `reader` argument.
    The file is opened in binary mode, so readline() returns raw,
    undecoded lines.

    The reader owns its file handle: call close() when finished, or use
    the reader as a context manager (`with FileReader(path) as reader:`).
    """

    def __init__(self, filename):
        """Open `filename` for binary reading and record its size in bytes."""
        self.fh = open(filename, 'rb')
        try:
            self.size = os.path.getsize(filename)
        except Exception:
            # Do not leak the handle if the size lookup fails.
            self.fh.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False

    def close(self):
        """Close the underlying file handle (safe to call more than once)."""
        self.fh.close()

    def getsize(self):
        """Return the file size in bytes, as measured when the file was opened."""
        return self.size

    def readline(self):
        """Return the next line (including its newline), or an empty value at EOF."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte `offset`."""
        return self.fh.seek(offset)
|
||||
|
||||
|
||||
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search a sorted, line-oriented reader at block granularity.

    The reader is probed at multiples of `block_size`; at each probe the
    (possibly partial) line under the offset is discarded and the next
    full line is compared against `key`.

    :param reader: object providing getsize(), seek(offset) and readline()
    :param key: value to search for, comparable with the reader's lines
    :param compare_func: three-way comparison `f(a, b)` returning a
        negative, zero or positive number; defaults to natural ordering
        (the behavior of the Python 2 builtin cmp)
    :param block_size: probe granularity in bytes
    :return: byte offset (a multiple of `block_size`) from which a linear
        scan should start in order to find the first line >= key
    """
    if compare_func is None:
        def natural_order(a, b):
            return (a > b) - (a < b)
        compare_func = natural_order

    lo = 0
    # Floor division keeps the bounds integral on Python 3 as well.
    hi = reader.getsize() // block_size

    while hi - lo > 1:
        # hi - lo >= 2 here, so mid >= 1 and the probe never lands at offset 0.
        mid = lo + (hi - lo) // 2
        reader.seek(mid * block_size)

        reader.readline()  # skip partial line
        line = reader.readline()

        # An empty read means no full line starts at or after this block,
        # so the answer cannot lie to the right of it: narrow the upper bound.
        if line and compare_func(key, line) > 0:
            lo = mid
        else:
            hi = mid

    return lo * block_size
|
||||
|
||||
|
||||
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Return an iterator over the reader's lines starting at the first line >= key.

    binsearch_offset() locates the starting block, then lines are scanned
    linearly until one compares >= `key` (or EOF is reached). The returned
    generator yields up to `prev_size` lines that preceded that line,
    followed by the line itself and every remaining line of the reader.

    Only lines read during the linear scan are remembered, so fewer than
    `prev_size` preceding lines are yielded when the match sits near the
    start of the scanned block.

    :param reader: object providing getsize(), seek(offset) and readline()
    :param key: value to search for, comparable with the reader's lines
    :param prev_size: maximum number of preceding lines to yield first
    :param compare_func: three-way comparison `f(a, b)`; defaults to
        natural ordering (the behavior of the Python 2 builtin cmp)
    :param block_size: probe granularity in bytes for the binary search
    :return: a generator of lines; it reads lazily from `reader`
    """
    if compare_func is None:
        def natural_order(a, b):
            return (a > b) - (a < b)
        compare_func = natural_order

    start = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(start)

    if start > 0:
        reader.readline()  # skip partial line

    # A bounded deque covers every prev_size uniformly; maxlen 0 keeps
    # nothing, and a negative prev_size is treated as "no preceding lines".
    preceding = deque(maxlen=max(prev_size, 0))

    while True:
        line = reader.readline()
        if not line or compare_func(line, key) >= 0:
            break
        preceding.append(line)

    def gen_iter(line):
        for earlier in preceding:
            yield earlier

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
|
||||
|
||||
|
||||
# Iterate over exact matches
|
||||
def iter_exact(reader, key):
    """Yield the consecutive lines found by search() that start with `key`.

    Iteration stops at the first line that does not begin with `key`.
    """
    def has_prefix(entry):
        return entry.startswith(key)

    for entry in itertools.takewhile(has_prefix, search(reader, key)):
        yield entry
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -73,11 +73,11 @@ class J2QueryRenderer:
|
||||
|
||||
## ===========
|
||||
## Simple handlers for debugging
|
||||
class EchoEnv:
|
||||
class DebugEchoEnv:
|
||||
def __call__(self, wbrequest):
|
||||
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
|
||||
|
||||
class EchoRequest:
|
||||
class DebugEchoRequest:
|
||||
def __call__(self, wbrequest):
|
||||
return wbrequestresponse.WbResponse.text_response(str(wbrequest))
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
import redis
|
||||
import pycdx_server.binsearch as binsearch
|
||||
import binsearch
|
||||
#======================================
|
||||
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
||||
#======================================
|
||||
def PrefixResolver(prefix, contains):
|
||||
def PrefixResolver(prefix, contains = ''):
|
||||
def makeUrl(url):
|
||||
return [prefix + url] if (contains in url) else []
|
||||
|
||||
|
144
pywb/wbapp.py
144
pywb/wbapp.py
@ -1,14 +1,10 @@
|
||||
from utils import rel_request_uri
|
||||
from query import QueryHandler, EchoEnv, EchoRequest
|
||||
from replay import WBHandler
|
||||
import wbexceptions
|
||||
import indexreader
|
||||
|
||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||
from archivalrouter import ArchivalRequestRouter, Route
|
||||
|
||||
## ===========
|
||||
headInsert = """
|
||||
default_head_insert = """
|
||||
|
||||
<!-- WB Insert -->
|
||||
<script src='/static/wb.js'> </script>
|
||||
@ -19,8 +15,6 @@ headInsert = """
|
||||
|
||||
## ===========
|
||||
'''
|
||||
The below createDefaultWB() function is just a sample/debug which loads publicly accessible cdx data
|
||||
|
||||
|
||||
To declare Wayback with one collection, `mycoll`
|
||||
and will be accessed by user at:
|
||||
@ -36,81 +30,87 @@ and look for warcs at paths:
|
||||
`http://warcs.example.com/servewarc/` and
|
||||
`http://warcs.example.com/anotherpath/`,
|
||||
|
||||
one could declare a `createWB()` method as follows:
|
||||
|
||||
def createWB():
|
||||
aloader = archiveloader.ArchiveLoader()
|
||||
query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
||||
|
||||
prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'),
|
||||
replay.PrefixResolver('http://warcs.example.com/anotherpath/')]
|
||||
|
||||
replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert)
|
||||
|
||||
return ArchivalRequestRouter(
|
||||
{
|
||||
Route('mycoll', WBHandler(query, replay))
|
||||
},
|
||||
hostpaths = ['http://mywb.example.com:8080/'])
|
||||
one could declare a `sample_wb_settings()` method as follows
|
||||
'''
|
||||
## ===========
|
||||
def createDefaultWB(headInsert):
|
||||
query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx'))
|
||||
|
||||
# TODO: simplify this!!
|
||||
|
||||
def sample_wb_settings():
|
||||
import archiveloader
|
||||
import query, indexreader
|
||||
import replay, replay_resolvers
|
||||
from archivalrouter import ArchivalRequestRouter, Route
|
||||
|
||||
|
||||
# Standard loader which supports WARC/ARC files
|
||||
aloader = archiveloader.ArchiveLoader()
|
||||
|
||||
# Source for cdx source
|
||||
query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
||||
|
||||
# Loads warcs specified in cdx from these locations
|
||||
prefixes = [replay_resolvers.PrefixResolver('http://warcs.example.com/servewarc/'),
|
||||
replay_resolvers.PrefixResolver('http://warcs.example.com/anotherpath/')]
|
||||
|
||||
# Create rewriting replay handler to rewrite records
|
||||
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = default_head_insert)
|
||||
|
||||
# Create Jinja2 based html query renderer
|
||||
htmlquery = query.J2QueryRenderer('./ui/', 'query.html')
|
||||
|
||||
# Handler which combines query, replayer, and html_query
|
||||
wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery)
|
||||
|
||||
# Finally, create wb router
|
||||
return ArchivalRequestRouter(
|
||||
{
|
||||
Route('echo', EchoEnv()), # Just echo the env
|
||||
Route('req', EchoRequest()), # Echo the WbRequest
|
||||
Route('cdx', query), # Query the CDX
|
||||
Route('web', query), # Query the CDX
|
||||
},
|
||||
hostpaths = ['http://localhost:9090/'])
|
||||
## ===========
|
||||
|
||||
|
||||
try:
|
||||
import globalwb
|
||||
wbparser = globalwb.createDefaultWB(headInsert)
|
||||
except:
|
||||
print " *** Note: Inited With Sample Wayback *** "
|
||||
wbparser = createDefaultWB(headInsert)
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
{
|
||||
Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request
|
||||
Route('mycoll', wb_handler)
|
||||
},
|
||||
# Specify hostnames that pywb will be running on
|
||||
# This will help catch occasionally missed rewrites that fall-through to the host
|
||||
# (See archivalrouter.ReferRedirect)
|
||||
hostpaths = ['http://mywb.example.com:8080/'])
|
||||
|
||||
|
||||
|
||||
def create_wb_app(wb_router):
|
||||
|
||||
def application(env, start_response):
|
||||
# Top-level wsgi application
|
||||
def application(env, start_response):
|
||||
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
|
||||
env['REL_REQUEST_URI'] = rel_request_uri(env)
|
||||
else:
|
||||
env['REL_REQUEST_URI'] = env['REQUEST_URI']
|
||||
|
||||
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
|
||||
env['REL_REQUEST_URI'] = rel_request_uri(env)
|
||||
else:
|
||||
env['REL_REQUEST_URI'] = env['REQUEST_URI']
|
||||
response = None
|
||||
|
||||
response = None
|
||||
try:
|
||||
response = wb_router(env)
|
||||
|
||||
try:
|
||||
response = wbparser(env)
|
||||
if not response:
|
||||
raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found')
|
||||
|
||||
if not response:
|
||||
raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found')
|
||||
except wbexceptions.InternalRedirect as ir:
|
||||
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
|
||||
|
||||
except wbexceptions.InternalRedirect as ir:
|
||||
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
|
||||
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
|
||||
print "[INFO]: " + str(e)
|
||||
response = handle_exception(env, e)
|
||||
|
||||
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
|
||||
print "[INFO]: " + str(e)
|
||||
response = handleException(env, e)
|
||||
except Exception as e:
|
||||
last_exc = e
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
response = handle_exception(env, e)
|
||||
|
||||
except Exception as e:
|
||||
last_exc = e
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
response = handleException(env, e)
|
||||
|
||||
return response(env, start_response)
|
||||
return response(env, start_response)
|
||||
|
||||
|
||||
def handleException(env, exc):
|
||||
return application
|
||||
|
||||
|
||||
def handle_exception(env, exc):
|
||||
if hasattr(exc, 'status'):
|
||||
status = exc.status()
|
||||
else:
|
||||
@ -119,3 +119,13 @@ def handleException(env, exc):
|
||||
return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app = create_wb_app(sample_wb_settings())
|
||||
|
||||
|
||||
#=================================================================
|
||||
import globalwb
|
||||
application = create_wb_app(globalwb.create_global_wb(default_head_insert))
|
||||
#=================================================================
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user