mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
remove pycdx_server pkg for now, move binsearch into pywb package,
update setup.py
This commit is contained in:
parent
03b6938b9c
commit
391f3bf81d
@ -1,4 +0,0 @@
|
|||||||
#Allow importing
|
|
||||||
|
|
||||||
#from pkgutil import extend_path
|
|
||||||
#__path__ = extend_path(__path__, __name__)
|
|
@ -1,92 +0,0 @@
|
|||||||
from collections import deque
|
|
||||||
import os
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
class FileReader:
    """Thin wrapper around a file on disk exposing the reader interface
    used by the search functions: ``getsize()``, ``readline()`` and
    ``seek(offset)``.

    The file is opened in binary mode (``'rb'``) and its size is captured
    once at construction time. The underlying handle can be released with
    ``close()`` or by using the instance as a context manager::

        with FileReader(filename) as reader:
            ...
    """

    def __init__(self, filename):
        """Open ``filename`` for binary reading and record its size.

        Raises whatever ``open`` or ``os.path.getsize`` raise; the handle
        is closed again if the size lookup fails.
        """
        self.fh = open(filename, 'rb')
        try:
            self.size = os.path.getsize(filename)
        except Exception:
            # Do not leak the already-opened handle on failure.
            self.fh.close()
            raise

    def getsize(self):
        """Return the file size in bytes, as measured when it was opened."""
        return self.size

    def readline(self):
        """Read and return the next line from the current position."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte ``offset``."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle (safe to call repeatedly)."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False
|
|
||||||
|
|
||||||
|
|
||||||
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search ``reader`` in units of ``block_size`` bytes and return
    the byte offset of the block from which a linear scan for ``key``
    should start.

    reader       -- object providing ``getsize()``, ``seek(offset)`` and
                    ``readline()``
    key          -- value compared against lines read from ``reader``
    compare_func -- three-way comparison called as
                    ``compare_func(key, line)``; a result > 0 means the key
                    sorts after the line. Defaults to a comparison
                    equivalent to the Python 2 ``cmp`` builtin.
    block_size   -- granularity of the search, in bytes

    Returns an integer that is a multiple of ``block_size`` (0 when the
    reader holds fewer than two blocks).

    NOTE(review): this presumably requires the lines in ``reader`` to be
    sorted consistently with ``compare_func`` -- confirm against callers.
    """
    if compare_func is None:
        # ``cmp`` does not exist on Python 3; this is the same three-way
        # comparison and works on Python 2 as well.
        compare_func = lambda a, b: (a > b) - (a < b)

    low = 0
    # Floor division keeps block indices integral under true division.
    high = reader.getsize() // block_size

    while high - low > 1:
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        if compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size
|
|
||||||
|
|
||||||
|
|
||||||
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Locate ``key`` in ``reader`` and return a generator over the lines
    from the first line that does not compare below ``key`` to the end of
    the reader.

    reader       -- object providing ``getsize()``, ``seek(offset)`` and
                    ``readline()``
    key          -- value to search for
    prev_size    -- number of lines immediately preceding the first match
                    to yield ahead of it (values <= 0 yield none)
    compare_func -- three-way comparison called as
                    ``compare_func(line, key)`` here and as
                    ``compare_func(key, line)`` by ``binsearch_offset``.
                    Defaults to a comparison equivalent to the Python 2
                    ``cmp`` builtin.
    block_size   -- block granularity passed to ``binsearch_offset``

    The starting block is found with ``binsearch_offset``; the reader is
    then scanned linearly from there. Fewer than ``prev_size`` preceding
    lines are yielded when fewer were seen during that scan (previously a
    ``prev_size`` of 1 with no preceding line raised NameError).
    """
    if compare_func is None:
        # ``cmp`` does not exist on Python 3; this is the same three-way
        # comparison and works on Python 2 as well.
        compare_func = lambda a, b: (a > b) - (a < b)

    offset = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(offset)

    if offset > 0:
        reader.readline()  # skip partial line

    # Bounded window of the lines seen before the first match. A maxlen of
    # 0 silently discards everything, which covers prev_size <= 0.
    preceding = deque(maxlen=max(prev_size, 0))

    line = reader.readline()
    while line and compare_func(line, key) < 0:
        preceding.append(line)
        line = reader.readline()

    def gen_iter(line):
        for earlier in preceding:
            yield earlier

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
|
|
||||||
|
|
||||||
|
|
||||||
# Iterate over the leading run of lines that start with key
def iter_exact(reader, key):
    """Yield lines from ``search(reader, key)`` for as long as each line
    starts with ``key``; stop at the first line that does not.
    """
    def has_key_prefix(entry):
        return entry.startswith(key)

    for entry in itertools.takewhile(has_key_prefix, search(reader, key)):
        yield entry
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -73,11 +73,11 @@ class J2QueryRenderer:
|
|||||||
|
|
||||||
## ===========
|
## ===========
|
||||||
## Simple handlers for debugging
|
## Simple handlers for debugging
|
||||||
class EchoEnv:
|
class DebugEchoEnv:
|
||||||
def __call__(self, wbrequest):
|
def __call__(self, wbrequest):
|
||||||
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
|
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
|
||||||
|
|
||||||
class EchoRequest:
|
class DebugEchoRequest:
|
||||||
def __call__(self, wbrequest):
|
def __call__(self, wbrequest):
|
||||||
return wbrequestresponse.WbResponse.text_response(str(wbrequest))
|
return wbrequestresponse.WbResponse.text_response(str(wbrequest))
|
||||||
|
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
import redis
|
import redis
|
||||||
import pycdx_server.binsearch as binsearch
|
import binsearch
|
||||||
#======================================
|
#======================================
|
||||||
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
||||||
#======================================
|
#======================================
|
||||||
def PrefixResolver(prefix, contains):
|
def PrefixResolver(prefix, contains = ''):
|
||||||
def makeUrl(url):
|
def makeUrl(url):
|
||||||
return [prefix + url] if (contains in url) else []
|
return [prefix + url] if (contains in url) else []
|
||||||
|
|
||||||
|
144
pywb/wbapp.py
144
pywb/wbapp.py
@ -1,14 +1,10 @@
|
|||||||
from utils import rel_request_uri
|
from utils import rel_request_uri
|
||||||
from query import QueryHandler, EchoEnv, EchoRequest
|
|
||||||
from replay import WBHandler
|
|
||||||
import wbexceptions
|
import wbexceptions
|
||||||
import indexreader
|
|
||||||
|
|
||||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||||
from archivalrouter import ArchivalRequestRouter, Route
|
|
||||||
|
|
||||||
## ===========
|
## ===========
|
||||||
headInsert = """
|
default_head_insert = """
|
||||||
|
|
||||||
<!-- WB Insert -->
|
<!-- WB Insert -->
|
||||||
<script src='/static/wb.js'> </script>
|
<script src='/static/wb.js'> </script>
|
||||||
@ -19,8 +15,6 @@ headInsert = """
|
|||||||
|
|
||||||
## ===========
|
## ===========
|
||||||
'''
|
'''
|
||||||
The below createDefaultWB() function is just a sample/debug which loads publicly accessible cdx data
|
|
||||||
|
|
||||||
|
|
||||||
To declare Wayback with one collection, `mycoll`
|
To declare Wayback with one collection, `mycoll`
|
||||||
and will be accessed by user at:
|
and will be accessed by user at:
|
||||||
@ -36,81 +30,87 @@ and look for warcs at paths:
|
|||||||
`http://warcs.example.com/servewarc/` and
|
`http://warcs.example.com/servewarc/` and
|
||||||
`http://warcs.example.com/anotherpath/`,
|
`http://warcs.example.com/anotherpath/`,
|
||||||
|
|
||||||
one could declare a `createWB()` method as follows:
|
one could declare a `sample_wb_settings()` method as follows
|
||||||
|
|
||||||
def createWB():
|
|
||||||
aloader = archiveloader.ArchiveLoader()
|
|
||||||
query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
|
||||||
|
|
||||||
prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'),
|
|
||||||
replay.PrefixResolver('http://warcs.example.com/anotherpath/')]
|
|
||||||
|
|
||||||
replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert)
|
|
||||||
|
|
||||||
return ArchivalRequestRouter(
|
|
||||||
{
|
|
||||||
Route('mycoll', WBHandler(query, replay))
|
|
||||||
},
|
|
||||||
hostpaths = ['http://mywb.example.com:8080/'])
|
|
||||||
'''
|
'''
|
||||||
## ===========
|
|
||||||
def createDefaultWB(headInsert):
|
# TODO: simplify this!!
|
||||||
query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx'))
|
|
||||||
|
def sample_wb_settings():
|
||||||
|
import archiveloader
|
||||||
|
import query, indexreader
|
||||||
|
import replay, replay_resolvers
|
||||||
|
from archivalrouter import ArchivalRequestRouter, Route
|
||||||
|
|
||||||
|
|
||||||
|
# Standard loader which supports WARC/ARC files
|
||||||
|
aloader = archiveloader.ArchiveLoader()
|
||||||
|
|
||||||
|
# Remote CDX server used as the index source
|
||||||
|
query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
||||||
|
|
||||||
|
# Loads warcs specified in cdx from these locations
|
||||||
|
prefixes = [replay_resolvers.PrefixResolver('http://warcs.example.com/servewarc/'),
|
||||||
|
replay_resolvers.PrefixResolver('http://warcs.example.com/anotherpath/')]
|
||||||
|
|
||||||
|
# Create rewriting replay handler to rewrite records
|
||||||
|
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = default_head_insert)
|
||||||
|
|
||||||
|
# Create Jinja2 based html query renderer
|
||||||
|
htmlquery = query.J2QueryRenderer('./ui/', 'query.html')
|
||||||
|
|
||||||
|
# Handler which combines query, replayer, and html_query
|
||||||
|
wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery)
|
||||||
|
|
||||||
|
# Finally, create wb router
|
||||||
return ArchivalRequestRouter(
|
return ArchivalRequestRouter(
|
||||||
{
|
{
|
||||||
Route('echo', EchoEnv()), # Just echo the env
|
Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request
|
||||||
Route('req', EchoRequest()), # Echo the WbRequest
|
Route('mycoll', wb_handler)
|
||||||
Route('cdx', query), # Query the CDX
|
},
|
||||||
Route('web', query), # Query the CDX
|
# Specify hostnames that pywb will be running on
|
||||||
},
|
# This will help catch occasionally missed rewrites that fall-through to the host
|
||||||
hostpaths = ['http://localhost:9090/'])
|
# (See archivalrouter.ReferRedirect)
|
||||||
## ===========
|
hostpaths = ['http://mywb.example.com:8080/'])
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
import globalwb
|
|
||||||
wbparser = globalwb.createDefaultWB(headInsert)
|
|
||||||
except:
|
|
||||||
print " *** Note: Inited With Sample Wayback *** "
|
|
||||||
wbparser = createDefaultWB(headInsert)
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def create_wb_app(wb_router):
|
||||||
|
|
||||||
def application(env, start_response):
|
# Top-level wsgi application
|
||||||
|
def application(env, start_response):
|
||||||
|
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
|
||||||
|
env['REL_REQUEST_URI'] = rel_request_uri(env)
|
||||||
|
else:
|
||||||
|
env['REL_REQUEST_URI'] = env['REQUEST_URI']
|
||||||
|
|
||||||
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
|
response = None
|
||||||
env['REL_REQUEST_URI'] = rel_request_uri(env)
|
|
||||||
else:
|
|
||||||
env['REL_REQUEST_URI'] = env['REQUEST_URI']
|
|
||||||
|
|
||||||
response = None
|
try:
|
||||||
|
response = wb_router(env)
|
||||||
|
|
||||||
try:
|
if not response:
|
||||||
response = wbparser(env)
|
raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found')
|
||||||
|
|
||||||
if not response:
|
except wbexceptions.InternalRedirect as ir:
|
||||||
raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found')
|
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
|
||||||
|
|
||||||
except wbexceptions.InternalRedirect as ir:
|
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
|
||||||
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
|
print "[INFO]: " + str(e)
|
||||||
|
response = handle_exception(env, e)
|
||||||
|
|
||||||
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
|
except Exception as e:
|
||||||
print "[INFO]: " + str(e)
|
last_exc = e
|
||||||
response = handleException(env, e)
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
response = handle_exception(env, e)
|
||||||
|
|
||||||
except Exception as e:
|
return response(env, start_response)
|
||||||
last_exc = e
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
response = handleException(env, e)
|
|
||||||
|
|
||||||
return response(env, start_response)
|
|
||||||
|
|
||||||
|
|
||||||
def handleException(env, exc):
|
return application
|
||||||
|
|
||||||
|
|
||||||
|
def handle_exception(env, exc):
|
||||||
if hasattr(exc, 'status'):
|
if hasattr(exc, 'status'):
|
||||||
status = exc.status()
|
status = exc.status()
|
||||||
else:
|
else:
|
||||||
@ -119,3 +119,13 @@ def handleException(env, exc):
|
|||||||
return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
|
return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app = create_wb_app(sample_wb_settings())
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
import globalwb
|
||||||
|
application = create_wb_app(globalwb.create_global_wb(default_head_insert))
|
||||||
|
#=================================================================
|
||||||
|
|
||||||
|
|
||||||
|
2
setup.py
2
setup.py
@ -4,7 +4,7 @@
|
|||||||
import setuptools
|
import setuptools
|
||||||
|
|
||||||
setuptools.setup(name='pywb',
|
setuptools.setup(name='pywb',
|
||||||
version='1.0',
|
version='0.1',
|
||||||
url='https://github.com/ikreymer/pywb',
|
url='https://github.com/ikreymer/pywb',
|
||||||
author='Ilya Kreymer',
|
author='Ilya Kreymer',
|
||||||
author_email='ilya@archive.org',
|
author_email='ilya@archive.org',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user