Update README

Rename FullHandler -> WBHandler Add additional comments!
2025-03-15 00:03:28 +01:00 · 2014-01-03 21:44:20 -08:00 · 2014-01-03 21:44:20 -08:00 · c4457abc4c
commit c4457abc4c
parent d820a8c06a
4 changed files with 137 additions and 11 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,92 @@
-pywb
-====
+PyWb 0.0.1
+==========

-`pip install uwsgi`
+Python implementation of Wayback Machine replay.

-Test with: pywb/run.sh (requires UWSGI)
+Currently, this module handles the replay and routing components.
+
+(The calendar page/query is just a raw CDX stream at the moment)
+
+It reads records from WARC and ARC files and rewrites them in
+'archival url' format like:
+
+`http://<host>/<collection>/<timestamp>/<original url>`
+
+The Internet Archive Wayback Machine has urls of the form:
+
+`http://web.archive.org/web/20131015120316/http://archive.org/`
+
+Note: The module consumes a CDX stream, currently produced by the [wayback-cdx-server][1] and does not read the CDX index files itself.
+
+
+### Installation/Reqs
+
+Currently only supports Python 2.7.x
+
+`python setup.py install`
+
+(Tested under 2.7.3 with uWSGI 1.9.20)
+
+Start with `run.sh`
+
+
+
+Sample Setup
+------------
+
+The main driver is wbapp.py and contains a sample WB declaration.
+
+To declare a Wayback instance with one collection, `mycoll`,
+which will be accessed by users at:
+
+`http://mywb.example.com:8080/mycoll/`
+
+and will load cdx from cdx server running at:
+
+`http://cdx.example.com/cdx`
+
+and look for warcs at paths:
+
+`http://warcs.example.com/servewarc/` and
+`http://warcs.example.com/anotherpath/`,
+
+one could declare a `createWB()` method as follows:
+
+    def createWB():
+        aloader = archiveloader.ArchiveLoader()
+        query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
+    
+        prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'),
+                   replay.PrefixResolver('http://warcs.example.com/anotherpath/')]
+    
+        replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert)
+    
+        return ArchivalRequestRouter(
+        {
+              'mycoll': [WBHandler(query, replayer)],
+        },
+        hostpaths = ['http://mywb.example.com:8080/'])
+
+
+Quick File Reference
+--------------------
+
+ - `archivalrouter.py`- Archival mode routing and referer fallback
+
+ - `archiveloader.py` - IO for loading W/ARC data
+
+ - `indexreader.py`,`query.py` - CDX reading (from remote cdx server)
+   and parsing cdx
+
+ - `wbarchivalurl.py` - representation of the 'archival url' eg: `/<collection>/<timestamp>/<original url>` form
+
+ - `url_rewriter.py`, `header_rewriter.py`, `html_rewriter.py`, `regex_rewriter.py` - Various types of rewriters. The urlrewriter converts url -> archival url, and is used by all the others. JS/CSS/XML are rewritten via regexes.
+ 
+ - `wbrequestresponse.py` - Wrappers for request and response for WSGI, and wrapping status and headers
+ 
+ - `replay.py` - drives the replay from archival content, either transparently or with rewriting
+
+ - `utils.py`, `wbexceptions.py` - Misc util functions and all exceptions
+
+
+  [1]: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@ -50,6 +50,10 @@ class RemoteCDXServer:
        else:
            return response

+    # BUG: Setting replayClosest to a high number for now, as the cdx server sometimes returns wrong results
+    # with lower values if there are too many captures. Ideally, it should be around 10-20.
+    # replayClosest is the max number of cdx lines, and thus the max number of retry attempts that WB will make.
+
    @staticmethod
    def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
        return {
--- a/pywb/replay.py
+++ b/pywb/replay.py
@ -16,7 +16,7 @@ import regex_rewriters
 import wbexceptions

 #=================================================================
-class FullHandler:
+class WBHandler:
    def __init__(self, query, replay):
        self.query = query
        self.replay = replay
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@ -1,5 +1,5 @@
 from query import QueryHandler, EchoEnv, EchoRequest
-from replay import FullHandler
+from replay import WBHandler
 import wbexceptions
 import indexreader

@ -18,15 +18,51 @@ headInsert = """
 """


+## ===========
+'''
+The below createDefaultWB() function is just a sample/debug which loads publicly accessible cdx data
+
+
+To declare Wayback with one collection, `mycoll`
+and will be accessed by user at:
+
+`http://mywb.example.com:8080/mycoll/`
+
+and will load cdx from cdx server running at:
+
+`http://cdx.example.com/cdx`
+
+and look for warcs at paths:
+
+`http://warcs.example.com/servewarc/` and
+`http://warcs.example.com/anotherpath/`,
+
+one could declare a `createWB()` method as follows:
+
+    def createWB():
+        aloader = archiveloader.ArchiveLoader()
+        query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
+    
+        prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'),
+                   replay.PrefixResolver('http://warcs.example.com/anotherpath/')]
+    
+        replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert)
+    
+        return ArchivalRequestRouter(
+        {
+              'mycoll': [WBHandler(query, replayer)],
+        },
+        hostpaths = ['http://mywb.example.com:8080/'])
+'''
 ## ===========
 def createDefaultWB(headInsert):
    query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx'))
    return ArchivalRequestRouter(
    {
-     'echo' : [EchoEnv()],
-     'req'  : [EchoRequest()],
-     'cdx'  : [query],
-     'web'  : [query],
+     'echo' : [EchoEnv()],     # Just echo the env
+     'req'  : [EchoRequest()], # Echo the WbRequest
+     'cdx'  : [query],         # Query the CDX
+     'web'  : [query],         # Query the CDX
    },
    hostpaths = ['http://localhost:9090/'])
 ## ===========
@ -36,7 +72,7 @@ try:
    import globalwb
    wbparser = globalwb.createDefaultWB(headInsert)
 except:
-    print " *** Test Wayback Inited *** "
+    print " *** Note: Inited With Sample Wayback *** "
    wbparser = createDefaultWB(headInsert)