From c4457abc4c32eb34a15ff22ed1ed4f29c3e20fa7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 3 Jan 2014 21:44:20 -0800 Subject: [PATCH] Update README Rename FullHandler -> WBHandler Add additional comments! --- README.md | 94 +++++++++++++++++++++++++++++++++++++++++++-- pywb/indexreader.py | 4 ++ pywb/replay.py | 2 +- pywb/wbapp.py | 48 ++++++++++++++++++++--- 4 files changed, 137 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index adeea037..a9ed0be0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,92 @@ -pywb -==== +PyWb 0.0.1 +========== -`pip install uwsgi` +Python implementation of Wayback Machine replay. -Test with: pywb/run.sh (requires UWSGI) +Currently, this module handles the replay and routing components. + +(The calendar page/query is just a raw CDX stream at the moment) + +It reads records from WARC and ARC files and rewrites them in +'archival url' format like: + +`http://<host>/<collection>/<timestamp>/<original url>` + +The Internet Archive Wayback Machine has urls of the form: + +`http://web.archive.org/web/20131015120316/http://archive.org/` + +Note: The module consumes a CDX stream, currently produced by the [wayback-cdx-server][1] and does not read the CDX index files itself. + + +### Installation/Reqs + +Currently only supports Python 2.7.x + +`python setup.py install` + +(Tested under 2.7.3 with uWSGI 1.9.20) + +Start with `run.sh` + + + +Sample Setup +------------ + +The main driver is wbapp.py and contains a sample WB declaration. 
+ +To declare Wayback with one collection, `mycoll` +and will be accessed by user at: + +`http://mywb.example.com:8080/mycoll/` + +and will load cdx from cdx server running at: + +`http://cdx.example.com/cdx` + +and look for warcs at paths: + +`http://warcs.example.com/servewarc/` and +`http://warcs.example.com/anotherpath/`, + +one could declare a `createWB()` method as follows: + + def createWB(): + aloader = archiveloader.ArchiveLoader() + query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx')) + + prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'), + replay.PrefixResolver('http://warcs.example.com/anotherpath/')] + + replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert) + + return ArchivalRequestRouter( + { + 'mycoll': [WBHandler(query, replay)], + }, + hostpaths = ['http://mywb.example.com:8080/']) + + +Quick File Reference +-------------------- + + - `archivalrouter.py`- Archival mode routing and referer fallback + + - `archiveloader.py` - IO for loading W/ARC data + + - `indexreader.py`,`query.py` - CDX reading (from remote cdx server) + and parsing cdx + + - `wbarchivalurl.py` - representation of the 'archival url' eg: `/<collection>/<timestamp>/<original url>` form + + - `url_rewriter.py`, `header_rewriter.py`, `html_rewriter.py`,`regex_rewriter.py`- Various types of rewriters. The urlrewriter converts url -> archival url, and is used by all the others. JS/CSS/XML are rewritten via regexs. 
+ + - `wbrequestresponse.py` - Wrappers for request and response for WSGI, and wrapping status and headers + + - `replay.py` - drives the replay from archival content, either transparently or with rewriting + + - `utils.py`, `wbexceptions.py` - Misc util functions and all exceptions + + + [1]: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 30a9e0f0..c87d011a 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -50,6 +50,10 @@ class RemoteCDXServer: else: return response + # BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result + # with lower values if there are too many captures. Ideally, should be around 10-20 + # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make + @staticmethod def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'): return { diff --git a/pywb/replay.py b/pywb/replay.py index 659f7aa9..660a167d 100644 --- a/pywb/replay.py +++ b/pywb/replay.py @@ -16,7 +16,7 @@ import regex_rewriters import wbexceptions #================================================================= -class FullHandler: +class WBHandler: def __init__(self, query, replay): self.query = query self.replay = replay diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 3e27dbee..d93ae92b 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -1,5 +1,5 @@ from query import QueryHandler, EchoEnv, EchoRequest -from replay import FullHandler +from replay import WBHandler import wbexceptions import indexreader @@ -18,15 +18,51 @@ headInsert = """ """ +## =========== +''' +The below createDefaultWB() function is just a sample/debug which loads publicly accessible cdx data + + +To declare Wayback with one collection, `mycoll` +and will be accessed by user at: + +`http://mywb.example.com:8080/mycoll/` + +and will load cdx from cdx server running at: + +`http://cdx.example.com/cdx` + 
+and look for warcs at paths: + +`http://warcs.example.com/servewarc/` and +`http://warcs.example.com/anotherpath/`, + +one could declare a `createWB()` method as follows: + + def createWB(): + aloader = archiveloader.ArchiveLoader() + query = QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx')) + + prefixes = [replay.PrefixResolver('http://warcs.example.com/servewarc/'), + replay.PrefixResolver('http://warcs.example.com/anotherpath/')] + + replay = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = headInsert) + + return ArchivalRequestRouter( + { + 'mycoll': [WBHandler(query, replay)], + }, + hostpaths = ['http://mywb.example.com:8080/']) +''' ## =========== def createDefaultWB(headInsert): query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')) return ArchivalRequestRouter( { - 'echo' : [EchoEnv()], - 'req' : [EchoRequest()], - 'cdx' : [query], - 'web' : [query], + 'echo' : [EchoEnv()], # Just echo the env + 'req' : [EchoRequest()], # Echo the WbRequest + 'cdx' : [query], # Query the CDX + 'web' : [query], # Query the CDX }, hostpaths = ['http://localhost:9090/']) ## =========== @@ -36,7 +72,7 @@ try: import globalwb wbparser = globalwb.createDefaultWB(headInsert) except: - print " *** Test Wayback Inited *** " + print " *** Note: Inited With Sample Wayback *** " wbparser = createDefaultWB(headInsert)