#=================================================================
class IndexReader:
    """
    Base class for cdx index readers.

    load_for_request() drives a lookup for a single request: it asks the
    subclass for the standard query params, merges in any per-request
    filter / custom params, loads the cdx lines and finally passes them
    through filter_cdx().

    Subclasses are expected to override load_cdx() and get_query_params();
    filter_cdx() is an optional hook.
    """

    def load_for_request(self, wbrequest, parsed_cdx = True):
        """
        Load the cdx lines for the url of the given request.

        wbrequest  -- request object; its wb_url, queryFilter and
                      customParams attributes are read here
        parsed_cdx -- passed through unchanged to load_cdx()

        Returns the result of filter_cdx() applied to the loaded cdx lines.
        Raises wbexceptions.NotFoundException if utils.peek_iter() returns
        None for the loaded cdx lines.
        """
        wburl = wbrequest.wb_url

        # init standard params
        params = self.get_query_params(wburl)

        # add any custom filter from the request
        if wbrequest.queryFilter:
            params['filter'] = wbrequest.queryFilter

        if wbrequest.customParams:
            params.update(wbrequest.customParams)

        cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)

        # a None result from peek_iter is treated as 'no captures for this url'
        cdxlines = utils.peek_iter(cdxlines)

        if cdxlines is None:
            raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)

        cdxlines = self.filter_cdx(wbrequest, cdxlines)

        return cdxlines

    def filter_cdx(self, wbrequest, cdxlines):
        """
        Hook for post-processing the cdx lines of a request.

        The default implementation returns cdxlines unchanged.
        """
        # Subclasses may wrap cdxlines iterator in a filter
        return cdxlines

    def get_query_params(self, wburl, **kwargs):
        """
        Return the dict of standard query params for wburl.

        load_for_request() requires this hook; declaring it here makes a
        subclass that forgets to provide it fail with the same explicit
        NotImplementedError as load_cdx(), instead of an AttributeError.
        """
        raise NotImplementedError('Override in subclasses')

    def load_cdx(self, url, params = None, parsed_cdx = True):
        """
        Load cdx lines for url using the given query params.

        params defaults to None rather than a shared dict literal, so an
        overriding implementation that copies this signature does not
        inherit a mutable default.
        """
        raise NotImplementedError('Override in subclasses')
def load_cdx(self, url, params = None, parsed_cdx = True, **kwvalues):
    """
    Query the local cdx sources for url.

    url        -- url to look up; converted to a surt key via surt.surt()
    params     -- optional dict of query params; it is copied, never
                  modified in place
    parsed_cdx -- if true the 'output' param is set to 'raw', otherwise
                  to 'text'
    kwvalues   -- extra query params, merged on top of params

    Returns the result of cdxserve.cdx_serve() for the key, using
    binsearch.iter_exact as the match function.
    """
    # convert to surt
    key = surt.surt(url)
    match_func = binsearch.iter_exact

    # Work on a private copy: the previous shared '{}' default was mutated
    # below, so params from one call (e.g. limit) leaked into every later
    # call that relied on the default, and a caller-supplied dict was
    # silently modified.
    params = dict(params) if params else {}

    params.update(kwvalues)
    params['output'] = 'raw' if parsed_cdx else 'text'

    return cdxserve.cdx_serve(key, params, self.sources, match_func)
params['url'] = url params.update(**kwvalues) @@ -103,7 +159,7 @@ class RemoteCDXServer: else: raise e - if parse_cdx: + if parsed_cdx: return map(CDXCaptureResult, response) else: return response @@ -112,8 +168,7 @@ class RemoteCDXServer: # with lower values if there are too many captures. Ideally, should be around 10-20 # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make - @staticmethod - def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'): + def get_query_params(self, wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'): return { wburl.QUERY: @@ -136,6 +191,7 @@ class RemoteCDXServer: }[wburl.type] +#================================================================= class CDXCaptureResult(OrderedDict): CDX_FORMATS = [ # Public CDX Format @@ -197,7 +253,7 @@ import utils if __name__ == "__main__" or utils.enable_doctests(): from pprint import pprint - cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx') + test_dir = os.path.dirname(os.path.realpath(__file__)) + '/../test/' import doctest doctest.testmod() diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index d6eea0c7..7570f90e 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -1,5 +1,5 @@ import archiveloader -import query +import views import indexreader import replay import replay_resolvers @@ -18,7 +18,7 @@ def pywb_config(head_insert = ''): # Source for cdx source #query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx')) #test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx'] - query_h = query.QueryHandler(indexreader.LocalCDXServer([test_dir])) + indexs = indexreader.LocalCDXServer([test_dir]) # Loads warcs specified in cdx from these locations prefixes = [replay_resolvers.PrefixResolver(test_dir)] @@ -26,18 +26,17 @@ def pywb_config(head_insert = ''): # Create rewriting replay handler to rewrite records replayer = 
replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True) - # Create Jinja2 based html query renderer - htmlquery = query.J2QueryRenderer('./ui/', 'query.html') + # Create Jinja2 based html query view + html_view = views.J2QueryView('./ui/', 'query.html') - # Handler which combins query, replayer, and html_query - wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery) + # WB handler which uses the index reader, replayer, and html_view + wb_handler = replay.WBHandler(indexs, replayer, html_view) # Finally, create wb router return ArchivalRequestRouter( { - Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request + Route('echo_req', views.DebugEchoView()), # Debug ex: just echo parsed request Route('pywb', wb_handler), - Route('cdx', query_h) }, # Specify hostnames that pywb will be running on # This will help catch occasionally missed rewrites that fall-through to the host diff --git a/pywb/query.py b/pywb/query.py deleted file mode 100644 index a42d3a64..00000000 --- a/pywb/query.py +++ /dev/null @@ -1,84 +0,0 @@ -import indexreader -import utils -import wbrequestresponse -import wbexceptions - -from jinja2 import Environment, FileSystemLoader - -class QueryHandler: - def __init__(self, cdxserver = None): - if not cdxserver: - cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx') - - self.cdxserver = cdxserver - - def __call__(self, wbrequest): - wburl = wbrequest.wb_url - - # init standard params - params = self.cdxserver.getQueryParams(wburl) - - # add any custom filter from the request - if wbrequest.queryFilter: - params['filter'] = wbrequest.queryFilter - - if wbrequest.customParams: - params.update(wbrequest.customParams) - - cdxlines = self.cdxserver.load(wburl.url, params) - - cdxlines = utils.peek_iter(cdxlines) - - if cdxlines is None: - raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url) - - 
#=================================================================
class WBHandler:
    """
    Request handler tying together an index reader, a replay handler and
    the query views.

    cdx_reader -- object providing load_for_request(wbrequest, parsed_cdx)
    replay     -- callable invoked as replay(wbrequest, cdx_lines, cdx_reader)
    html_view  -- optional callable invoked as html_view(wbrequest, cdx_lines);
                  when not set, query results fall back to the text view
    """

    def __init__(self, cdx_reader, replay, html_view = None):
        self.cdx_reader = cdx_reader
        self.replay = replay
        self.html_view = html_view
        self.text_view = views.TextQueryView()

    def __call__(self, wbrequest):
        # Index lookup, timed under the 'query' label
        with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as query_timer:
            cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)

        wb_url = wbrequest.wb_url

        # new special modifier to always show cdx index
        if wb_url.mod == 'cdx_':
            return self.text_view(wbrequest, cdx_lines)

        # Query requests are rendered by a view instead of being replayed
        url_type = wb_url.type
        is_query = (url_type == wb_url.QUERY) or (url_type == wb_url.URL_QUERY)

        if is_query:
            view = self.html_view or self.text_view
            return view(wbrequest, cdx_lines)

        # Everything else is a replay, timed under the 'replay' label
        with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as replay_timer:
            return self.replay(wbrequest, cdx_lines, self.cdx_reader)
cdx @@ -100,7 +104,7 @@ class ReplayHandler(object): for path in possible_paths: any_found = True try: - return self.archiveloader.load(path, offset, length) + return self.loader.load(path, offset, length) except URLError as ue: last_exc = ue @@ -117,7 +121,7 @@ class ReplayHandler(object): raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '') - def doReplay(self, cdx, wbrequest, query, failedFiles): + def doReplay(self, cdx, wbrequest, cdx_reader, failedFiles): hasCurr = (cdx['filename'] != '-') hasOrig = (cdx.get('orig.filename','-') != '-') @@ -127,7 +131,7 @@ class ReplayHandler(object): # two index lookups # Case 1: if mimetype is still warc/revisit if cdx['mimetype'] == 'warc/revisit' and headersRecord: - payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles) + payloadRecord = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headersRecord, failedFiles) # single lookup cases # case 2: non-revisit @@ -163,7 +167,7 @@ class ReplayHandler(object): # Handle the case where a duplicate of a capture with same digest exists at a different url # Must query the index at that url filtering by matching digest # Raise exception if no matches found - def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles): + def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headersRecord, failedFiles): ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI') # Check for unresolved revisit error, if refers to target uri not present or same as the current url @@ -187,11 +191,11 @@ class ReplayHandler(object): # Must also match digest orig_wbreq.queryFilter.append('digest:' + cdx['digest']) - orig_cdxlines = query(orig_wbreq).body + orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True) - for cdx in orig_cdxlines: + for cdx in orig_cdx_lines: try: - cdx = indexreader.CDXCaptureResult(cdx) + #cdx = 
#=================================================================
class TextQueryView:
    """
    Plain text view of a cdx query: one cdx entry per line.
    """
    def __call__(self, wbrequest, cdx_lines):
        # str() each entry lazily and terminate it with a newline;
        # the input iterator is not buffered
        cdx_lines = (str(cdx) + '\n' for cdx in cdx_lines)
        return wbrequestresponse.WbResponse.text_stream(cdx_lines)

#=================================================================
class J2QueryView:
    """
    Jinja2 template based html view of a cdx query.

    template_dir  -- directory handed to the jinja2 FileSystemLoader
    template_file -- name of the template to render
    buffer_index  -- if true, the cdx lines are converted to a list before
                     rendering so the template has their length available
    """
    def __init__(self, template_dir, template_file, buffer_index = True):
        self.template_file = template_file
        self.buffer_index = buffer_index

        self.jinja_env = Environment(loader = FileSystemLoader(template_dir), trim_blocks = True)


    def __call__(self, wbrequest, cdx_lines):
        template = self.jinja_env.get_template(self.template_file)

        # buffer/convert to list so we have length available for template
        if self.buffer_index:
            cdx_lines = list(cdx_lines)

        # the template receives the cdx lines, the queried url and the wb prefix
        response = template.render(cdx_lines = cdx_lines,
                                   url = wbrequest.wb_url.url,
                                   prefix = wbrequest.wb_prefix)

        return wbrequestresponse.WbResponse.text_response(str(response), content_type = 'text/html')


#=================================================================
# Simple views for debugging
#
# These two classes used to share the name DebugEchoView, so the env echo
# was silently replaced by the request echo defined after it.
# DebugEchoView keeps the behavior that name resolved to (echo the parsed
# request); the env echo is available again as DebugEchoEnvView.
class DebugEchoEnvView:
    """
    Debug view: respond with str() of the request's env.
    """
    def __call__(self, wbrequest):
        return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))

#=================================================================
class DebugEchoView:
    """
    Debug view: respond with str() of the parsed request.
    """
    def __call__(self, wbrequest):
        return wbrequestresponse.WbResponse.text_response(str(wbrequest))
- Captures of {{ url }} + {{ cdx_lines | length }} captures of {{ url }}{{ cdx.timestamp }} | {{ cdx['filename'] }} |