From 9ff3fc300bd528a9a246bf1f7c70b6068bbb70c2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 20 Jan 2014 10:50:06 -0800 Subject: [PATCH] Fix #5, bringing back customParams optional params sent to cdx server Rename archivalrouter.MatchRegex -> archivalrouter.Route, supporting regex/prefix matching add redir_to_exact to turn off redirect to exact timestamp in RewritingReplayHandler update README --- README.md | 4 ++-- pywb/archivalrouter.py | 45 +++------------------------------------ pywb/query.py | 5 ++++- pywb/replay.py | 5 +++-- pywb/wbapp.py | 12 +++++------ pywb/wbrequestresponse.py | 2 ++ 6 files changed, 20 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index acbb69c7..24cfc9ad 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ one could declare a `createWB()` method as follows: return ArchivalRequestRouter( { - MatchPrefix('mycoll': replay.WBHandler(query, replay)), + Route('mycoll', replay.WBHandler(query, replay)), }, hostpaths = ['http://mywb.example.com:8080/']) @@ -73,7 +73,7 @@ one could declare a `createWB()` method as follows: Quick File Reference -------------------- - - `archivalrouter.py`- Archival mode routing and referer fallback, include MatchPrefix and MatchRegex + - `archivalrouter.py`- Archival mode routing by regex and fallback based on referrer - `archiveloader.py` - IO for loading W/ARC data diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index b9ff2e7d..529be99a 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -28,51 +28,12 @@ class ArchivalRequestRouter: -#================================================================= -# Route by matching prefix -- deprecated, as MatchRegex -# also supports the same -#================================================================= - -class MatchPrefix: - def __init__(self, prefix, handler): - self.prefix = '/' + prefix + '/' if prefix else '/' - self.coll = prefix - self.handler = handler - - - def __call__(self, env, useAbsPrefix, 
archivalurl_class): - request_uri = env['REL_REQUEST_URI'] - if not request_uri.startswith(self.prefix): - return None - - if self.coll: - wb_prefix = env['SCRIPT_NAME'] + self.prefix - wb_url = request_uri[len(self.coll) + 1:] - else: - wb_prefix = env['SCRIPT_NAME'] + self.prefix - wb_url = request_uri - - wbrequest = WbRequest(env, - request_uri = request_uri, - coll = self.coll, - wb_url = wb_url, - wb_prefix = wb_prefix, - use_abs_prefix = useAbsPrefix, - archivalurl_class = archivalurl_class) - - return self._handleRequest(wbrequest) - - - def _handleRequest(self, wbrequest): - return self.handler(wbrequest) - - #================================================================= -# Route by matching regex of request uri (excluding first '/') -# May be a fixed prefix +# Route by matching regex (or fixed prefix) +# of request uri (excluding first '/') #================================================================= -class MatchRegex: +class Route: def __init__(self, regex, handler, coll_group = 0): self.regex = re.compile(regex) self.handler = handler diff --git a/pywb/query.py b/pywb/query.py index b92ba534..d8aa4a60 100644 --- a/pywb/query.py +++ b/pywb/query.py @@ -18,10 +18,13 @@ class QueryHandler: # init standard params params = self.cdxserver.getQueryParams(wburl) - # add any custom params from the request + # add any custom filter from the request if wbrequest.queryFilter: params['filter'] = wbrequest.queryFilter + if wbrequest.customParams: + params.update(wbrequest.customParams) + cdxlines = self.cdxserver.load(wburl.url, params) cdxlines = utils.peek_iter(cdxlines) diff --git a/pywb/replay.py b/pywb/replay.py index acce6cf8..adc87d52 100644 --- a/pywb/replay.py +++ b/pywb/replay.py @@ -198,12 +198,13 @@ class ReplayHandler(object): #================================================================= class RewritingReplayHandler(ReplayHandler): - def __init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None): + def 
__init__(self, resolvers, archiveloader, headInsert = None, headerRewriter = None, redir_to_exact = True): ReplayHandler.__init__(self, resolvers, archiveloader) self.headInsert = headInsert if not headerRewriter: headerRewriter = HeaderRewriter() self.headerRewriter = headerRewriter + self.redir_to_exact = redir_to_exact def _textContentType(self, contentType): @@ -333,7 +334,7 @@ class RewritingReplayHandler(ReplayHandler): return (result['encoding'], buff) def _checkRedir(self, wbrequest, cdx): - if cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): + if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original']) raise wbexceptions.InternalRedirect(newUrl) #return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp']) diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 5a2fec75..42ba4e1d 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -5,7 +5,7 @@ import wbexceptions import indexreader from wbrequestresponse import WbResponse, StatusAndHeaders -from archivalrouter import ArchivalRequestRouter, MatchPrefix +from archivalrouter import ArchivalRequestRouter, Route ## =========== headInsert = """ @@ -49,7 +49,7 @@ one could declare a `createWB()` method as follows: return ArchivalRequestRouter( { - MatchPrefix('mycoll', WBHandler(query, replay)) + Route('mycoll', WBHandler(query, replay)) }, hostpaths = ['http://mywb.example.com:8080/']) ''' @@ -58,10 +58,10 @@ def createDefaultWB(headInsert): query = QueryHandler(indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')) return ArchivalRequestRouter( { - MatchPrefix('echo', EchoEnv()), # Just echo the env - MatchPrefix('req', EchoRequest()), # Echo the WbRequest - MatchPrefix('cdx', query), # Query the CDX - MatchPrefix('web', query), # Query the CDX + Route('echo', EchoEnv()), # Just echo the env + Route('req', EchoRequest()), # Echo the WbRequest + Route('cdx', query), # Query the 
CDX + Route('web', query), # Query the CDX }, hostpaths = ['http://localhost:9090/']) ## =========== diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index ce64098e..b12c6c20 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -78,6 +78,8 @@ class WbRequest: self.queryFilter = [] + self.customParams = {} + # PERF env['X_PERF'] = {}