mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Update README
Rename FullHandler -> WBHandler Add additional comments!
This commit is contained in:
parent
d820a8c06a
commit
c4457abc4c
94
README.md
94
README.md
@ -1,6 +1,92 @@
|
|||||||
pywb
|
PyWb 0.0.1
|
||||||
====
|
==========
|
||||||
|
|
||||||
`pip install uwsgi`
|
Python implementation of Wayback Machine replay.
|
||||||
|
|
||||||
Test with: pywb/run.sh (requires UWSGI)
|
Currently, this module handles the replay and routing components.
|
||||||
|
|
||||||
|
(The calendar page/query is just a raw CDX stream at the moment)
|
||||||
|
|
||||||
|
It reads records from WARC and ARC files and rewrites them in
|
||||||
|
'archival url' format like:
|
||||||
|
|
||||||
|
`http://<host>/<collection>/<timestamp>/<original url>`
|
||||||
|
|
||||||
|
The Internet Archive Wayback Machine has urls of the form:
|
||||||
|
|
||||||
|
`http://web.archive.org/web/20131015120316/http://archive.org/`
|
||||||
|
|
||||||
|
Note: The module consumes a CDX stream, currently produced by the [wayback-cdx-server][1], and does not read the CDX index files itself.
|
||||||
|
|
||||||
|
|
||||||
|
### Installation/Reqs
|
||||||
|
|
||||||
|
Currently only supports Python 2.7.x
|
||||||
|
|
||||||
|
`python setup.py install`
|
||||||
|
|
||||||
|
(Tested under 2.7.3 with uWSGI 1.9.20)
|
||||||
|
|
||||||
|
Start with `run.sh`
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Sample Setup
|
||||||
|
------------
|
||||||
|
|
||||||
|
The main driver is `wbapp.py`, which contains a sample WB declaration.
|
||||||
|
|
||||||
|
To declare Wayback with one collection, `mycoll`
|
||||||
|
which will be accessed by users at:
|
||||||
|
|
||||||
|
`http://mywb.example.com:8080/mycoll/`
|
||||||
|
|
||||||
|
and will load cdx from cdx server running at:
|
||||||
|
|
||||||
|
`http://cdx.example.com/cdx`
|
||||||
|
|
||||||
|
and look for warcs at paths:
|
||||||
|
|
||||||
|
`http://warcs.example.com/servewarc/` and
|
||||||
|
`http://warcs.example.com/anotherpath/`,
|
||||||
|
|
||||||
|
one could declare a `createWB()` method as follows:
|
||||||
|
|
||||||
|
def createWB():
|
||||||
|
aloader = archiveloader.ArchiveLoader()
|
||||||
|
query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
||||||
|
|
||||||
|
prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'),
|
||||||
|
replay.PrefixResolver('http://warcs.example.com/anotherpath/')]
|
||||||
|
|
||||||
|
replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert)
|
||||||
|
|
||||||
|
return ArchivalRequestRouter(
|
||||||
|
{
|
||||||
|
'mycoll': [WBHandler(query, replay)],
|
||||||
|
},
|
||||||
|
hostpaths = ['http://mywb.example.com:8080/'])
|
||||||
|
|
||||||
|
|
||||||
|
Quick File Reference
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
- `archivalrouter.py`- Archival mode routing and referer fallback
|
||||||
|
|
||||||
|
- `archiveloader.py` - IO for loading W/ARC data
|
||||||
|
|
||||||
|
- `indexreader.py`,`query.py` - CDX reading (from remote cdx server)
|
||||||
|
and parsing cdx
|
||||||
|
|
||||||
|
- `wbarchivalurl.py` - representation of the 'archival url' eg: `/<collection>/<timestamp>/<original url>` form
|
||||||
|
|
||||||
|
- `url_rewriter.py`, `header_rewriter.py`, `html_rewriter.py`, `regex_rewriter.py` - Various types of rewriters. The urlrewriter converts url -> archival url, and is used by all the others. JS/CSS/XML are rewritten via regexes.
|
||||||
|
|
||||||
|
- `wbrequestresponse.py` - Wrappers for request and response for WSGI, and wrapping status and headers
|
||||||
|
|
||||||
|
- `replay.py` - drives the replay from archival content, either transparently or with rewriting
|
||||||
|
|
||||||
|
- `utils.py`, `wbexceptions.py` - Misc util functions and all exceptions
|
||||||
|
|
||||||
|
|
||||||
|
[1]: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
|
||||||
|
@ -50,6 +50,10 @@ class RemoteCDXServer:
|
|||||||
else:
|
else:
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
# BUG: Setting replayClosest to a high number for now, as the cdx server sometimes returns wrong results
|
||||||
|
# with lower values if there are too many captures. Ideally, should be around 10-20
|
||||||
|
# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
||||||
return {
|
return {
|
||||||
|
@ -16,7 +16,7 @@ import regex_rewriters
|
|||||||
import wbexceptions
|
import wbexceptions
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class FullHandler:
|
class WBHandler:
|
||||||
def __init__(self, query, replay):
|
def __init__(self, query, replay):
|
||||||
self.query = query
|
self.query = query
|
||||||
self.replay = replay
|
self.replay = replay
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from query import QueryHandler, EchoEnv, EchoRequest
|
from query import QueryHandler, EchoEnv, EchoRequest
|
||||||
from replay import FullHandler
|
from replay import WBHandler
|
||||||
import wbexceptions
|
import wbexceptions
|
||||||
import indexreader
|
import indexreader
|
||||||
|
|
||||||
@ -18,15 +18,51 @@ headInsert = """
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
## ===========
|
||||||
|
'''
|
||||||
|
The below createDefaultWB() function is just a sample/debug which loads publicly accessible cdx data
|
||||||
|
|
||||||
|
|
||||||
|
To declare Wayback with one collection, `mycoll`
|
||||||
|
and will be accessed by user at:
|
||||||
|
|
||||||
|
`http://mywb.example.com:8080/mycoll/`
|
||||||
|
|
||||||
|
and will load cdx from cdx server running at:
|
||||||
|
|
||||||
|
`http://cdx.example.com/cdx`
|
||||||
|
|
||||||
|
and look for warcs at paths:
|
||||||
|
|
||||||
|
`http://warcs.example.com/servewarc/` and
|
||||||
|
`http://warcs.example.com/anotherpath/`,
|
||||||
|
|
||||||
|
one could declare a `createWB()` method as follows:
|
||||||
|
|
||||||
|
def createWB():
|
||||||
|
aloader = archiveloader.ArchiveLoader()
|
||||||
|
query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
||||||
|
|
||||||
|
prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'),
|
||||||
|
replay.PrefixResolver('http://warcs.example.com/anotherpath/')]
|
||||||
|
|
||||||
|
replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert)
|
||||||
|
|
||||||
|
return ArchivalRequestRouter(
|
||||||
|
{
|
||||||
|
'mycoll': [WBHandler(query, replay)],
|
||||||
|
},
|
||||||
|
hostpaths = ['http://mywb.example.com:8080/'])
|
||||||
|
'''
|
||||||
## ===========
|
## ===========
|
||||||
def createDefaultWB(headInsert):
|
def createDefaultWB(headInsert):
|
||||||
query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx'))
|
query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx'))
|
||||||
return ArchivalRequestRouter(
|
return ArchivalRequestRouter(
|
||||||
{
|
{
|
||||||
'echo' : [EchoEnv()],
|
'echo' : [EchoEnv()], # Just echo the env
|
||||||
'req' : [EchoRequest()],
|
'req' : [EchoRequest()], # Echo the WbRequest
|
||||||
'cdx' : [query],
|
'cdx' : [query], # Query the CDX
|
||||||
'web' : [query],
|
'web' : [query], # Query the CDX
|
||||||
},
|
},
|
||||||
hostpaths = ['http://localhost:9090/'])
|
hostpaths = ['http://localhost:9090/'])
|
||||||
## ===========
|
## ===========
|
||||||
@ -36,7 +72,7 @@ try:
|
|||||||
import globalwb
|
import globalwb
|
||||||
wbparser = globalwb.createDefaultWB(headInsert)
|
wbparser = globalwb.createDefaultWB(headInsert)
|
||||||
except:
|
except:
|
||||||
print " *** Test Wayback Inited *** "
|
print " *** Note: Inited With Sample Wayback *** "
|
||||||
wbparser = createDefaultWB(headInsert)
|
wbparser = createDefaultWB(headInsert)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user