1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

refactor: remove intermediate query object.

rename query -> views
WBHandler now queries the index reader directly, then either passes the results to the replayer or renders them via a view

New feature: the 'cdx_' modifier can be used to render the cdx index as text for any request
This commit is contained in:
Ilya Kreymer 2014-01-28 16:41:19 -08:00
parent a83d527702
commit 1a234f2953
7 changed files with 168 additions and 144 deletions

View File

@ -48,6 +48,9 @@ def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
if limit: if limit:
cdx_iter = cdx_limit(cdx_iter, limit) cdx_iter = cdx_limit(cdx_iter, limit)
# output raw cdx objects
if params.get('output') == 'raw':
return cdx_iter
def write_cdx(fields): def write_cdx(fields):
for cdx in cdx_iter: for cdx in cdx_iter:

View File

@ -2,6 +2,7 @@ import urllib
import urllib2 import urllib2
import wbexceptions import wbexceptions
import itertools import itertools
import wbrequestresponse
import surt import surt
from collections import OrderedDict from collections import OrderedDict
@ -13,7 +14,58 @@ import logging
import os import os
#================================================================= #=================================================================
class LocalCDXServer: class IndexReader:
def load_for_request(self, wbrequest, parsed_cdx = True):
    """
    Load the cdx lines matching an incoming request.

    Query params are built from the request's wb_url via get_query_params(),
    then extended with the request's queryFilter and customParams when those
    are set. The cdx is fetched with load_cdx() and finally passed through
    filter_cdx().

    Raises wbexceptions.NotFoundException when utils.peek_iter() returns None
    for the loaded cdx (presumably meaning the iterator was empty -- confirm
    against utils.peek_iter).
    """
    target = wbrequest.wb_url

    # standard params for this type of wb_url
    query = self.get_query_params(target)

    # optional filter carried by the request
    extra_filter = wbrequest.queryFilter
    if extra_filter:
        query['filter'] = extra_filter

    # optional custom params carried by the request
    custom = wbrequest.customParams
    if custom:
        query.update(custom)

    results = utils.peek_iter(self.load_cdx(target.url, query, parsed_cdx))
    if results is None:
        raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + target.url)

    return self.filter_cdx(wbrequest, results)
def filter_cdx(self, wbrequest, cdxlines):
    """
    Extension hook: subclasses may wrap the cdxlines iterator in a filter.

    This base implementation hands cdxlines back untouched.
    """
    return cdxlines
def load_cdx(self, url, params = {}, parsed_cdx = True):
    """
    Abstract cdx loader; concrete index readers must provide their own.

    Always raises NotImplementedError here.
    """
    message = 'Override in subclasses'
    raise NotImplementedError(message)
#=================================================================
class LocalCDXServer(IndexReader):
"""
>>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1)
>>> pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('redirect', '-'),
('robotflags', '-'),
('length', '1046'),
('offset', '334'),
('filename', 'dupes.warc.gz')]
"""
def __init__(self, sources): def __init__(self, sources):
self.sources = [] self.sources = []
@ -29,8 +81,22 @@ class LocalCDXServer:
self.sources.append(src) self.sources.append(src)
@staticmethod def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
def getQueryParams(wburl, limit = 150000, collapse_time = None, replay_closest = 10): # convert to surt
key = surt.surt(url)
match_func = binsearch.iter_exact
params.update(**kwvalues)
params['output'] = 'raw' if parsed_cdx else 'text'
return cdxserve.cdx_serve(key, params, self.sources, match_func)
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported')
return { return {
wburl.QUERY: wburl.QUERY:
@ -52,21 +118,11 @@ class LocalCDXServer:
}[wburl.type] }[wburl.type]
def load(self, url, params):
# convert to surt
key = surt.surt(url)
match_func = binsearch.iter_exact
print key + ' ' + urllib.urlencode(params, True)
return cdxserve.cdx_serve(key, params, self.sources, match_func)
#================================================================= #=================================================================
class RemoteCDXServer: class RemoteCDXServer(IndexReader):
""" """
>>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2') >>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
>>> pprint(x[0].items()) >>> pprint(x[0].items())
[('urlkey', 'com,example)/'), [('urlkey', 'com,example)/'),
('timestamp', '20020120142510'), ('timestamp', '20020120142510'),
@ -81,7 +137,7 @@ class RemoteCDXServer:
self.serverUrl = serverUrl self.serverUrl = serverUrl
self.authCookie = cookie self.authCookie = cookie
def load(self, url, params = {}, parse_cdx = False, **kwvalues): def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
#url is required, must be passed explicitly! #url is required, must be passed explicitly!
params['url'] = url params['url'] = url
params.update(**kwvalues) params.update(**kwvalues)
@ -103,7 +159,7 @@ class RemoteCDXServer:
else: else:
raise e raise e
if parse_cdx: if parsed_cdx:
return map(CDXCaptureResult, response) return map(CDXCaptureResult, response)
else: else:
return response return response
@ -112,8 +168,7 @@ class RemoteCDXServer:
# with lower values if there are too many captures. Ideally, should be around 10-20 # with lower values if there are too many captures. Ideally, should be around 10-20
# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
@staticmethod def get_query_params(self, wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
return { return {
wburl.QUERY: wburl.QUERY:
@ -136,6 +191,7 @@ class RemoteCDXServer:
}[wburl.type] }[wburl.type]
#=================================================================
class CDXCaptureResult(OrderedDict): class CDXCaptureResult(OrderedDict):
CDX_FORMATS = [ CDX_FORMATS = [
# Public CDX Format # Public CDX Format
@ -197,7 +253,7 @@ import utils
if __name__ == "__main__" or utils.enable_doctests(): if __name__ == "__main__" or utils.enable_doctests():
from pprint import pprint from pprint import pprint
cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx') test_dir = os.path.dirname(os.path.realpath(__file__)) + '/../test/'
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -1,5 +1,5 @@
import archiveloader import archiveloader
import query import views
import indexreader import indexreader
import replay import replay
import replay_resolvers import replay_resolvers
@ -18,7 +18,7 @@ def pywb_config(head_insert = ''):
# Source for cdx source # Source for cdx source
#query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx')) #query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
#test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx'] #test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx']
query_h = query.QueryHandler(indexreader.LocalCDXServer([test_dir])) indexs = indexreader.LocalCDXServer([test_dir])
# Loads warcs specified in cdx from these locations # Loads warcs specified in cdx from these locations
prefixes = [replay_resolvers.PrefixResolver(test_dir)] prefixes = [replay_resolvers.PrefixResolver(test_dir)]
@ -26,18 +26,17 @@ def pywb_config(head_insert = ''):
# Create rewriting replay handler to rewrite records # Create rewriting replay handler to rewrite records
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True) replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True)
# Create Jinja2 based html query renderer # Create Jinja2 based html query view
htmlquery = query.J2QueryRenderer('./ui/', 'query.html') html_view = views.J2QueryView('./ui/', 'query.html')
# Handler which combins query, replayer, and html_query # WB handler which uses the index reader, replayer, and html_view
wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery) wb_handler = replay.WBHandler(indexs, replayer, html_view)
# Finally, create wb router # Finally, create wb router
return ArchivalRequestRouter( return ArchivalRequestRouter(
{ {
Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request Route('echo_req', views.DebugEchoView()), # Debug ex: just echo parsed request
Route('pywb', wb_handler), Route('pywb', wb_handler),
Route('cdx', query_h)
}, },
# Specify hostnames that pywb will be running on # Specify hostnames that pywb will be running on
# This will help catch occasionally missed rewrites that fall-through to the host # This will help catch occasionally missed rewrites that fall-through to the host

View File

@ -1,84 +0,0 @@
import indexreader
import utils
import wbrequestresponse
import wbexceptions
from jinja2 import Environment, FileSystemLoader
class QueryHandler:
    """
    Request handler that queries a cdx server and streams back the raw cdx.

    When no cdx server is supplied, a RemoteCDXServer pointed at
    http://web.archive.org/cdx/search/cdx is used.
    """

    def __init__(self, cdxserver = None):
        self.cdxserver = cdxserver or indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    def __call__(self, wbrequest):
        target = wbrequest.wb_url

        # standard params for this type of wb_url
        query = self.cdxserver.getQueryParams(target)

        # optional filter carried by the request
        extra_filter = wbrequest.queryFilter
        if extra_filter:
            query['filter'] = extra_filter

        # optional custom params carried by the request
        custom = wbrequest.customParams
        if custom:
            query.update(custom)

        results = utils.peek_iter(self.cdxserver.load(target.url, query))
        if results is None:
            raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + target.url)

        # respond with the (possibly filtered) raw cdx stream
        return wbrequestresponse.WbResponse.text_stream(self.filterCdx(wbrequest, results))

    def filterCdx(self, wbrequest, cdxlines):
        # Extension hook: subclasses may wrap the iterator in a filter
        return cdxlines
class J2QueryRenderer:
    """
    Renders the body of a query response as html through a Jinja2 template.
    """

    def __init__(self, template_dir, template_file):
        self.template_file = template_file
        self.jinja_env = Environment(loader = FileSystemLoader(template_dir), trim_blocks = True)

    def _iter_parsed(self, raw_lines):
        # Lazily convert each cdx line to a CDXCaptureResult; lines rejected
        # with InvalidCDXException are reported via traceback and skipped
        for raw in raw_lines:
            try:
                yield indexreader.CDXCaptureResult(raw)
            except wbexceptions.InvalidCDXException:
                import traceback
                traceback.print_exc()

    def __call__(self, wbrequest, query_response):
        raw_lines = query_response.body
        template = self.jinja_env.get_template(self.template_file)

        parsed = self._iter_parsed(raw_lines)
        page_url = wbrequest.wb_url.url
        page_prefix = wbrequest.wb_prefix

        html = template.render(cdxlines = parsed,
                               url = page_url,
                               prefix = page_prefix)

        return wbrequestresponse.WbResponse.text_response(str(html), content_type = 'text/html')
## ===========
## Simple handlers for debugging
class DebugEchoEnv:
    """Debug handler: responds with the string form of the request's env attribute."""

    def __call__(self, wbrequest):
        env_text = str(wbrequest.env)
        return wbrequestresponse.WbResponse.text_response(env_text)
class DebugEchoRequest:
    """Debug handler: responds with the string form of the parsed request."""

    def __call__(self, wbrequest):
        request_text = str(wbrequest)
        return wbrequestresponse.WbResponse.text_response(request_text)

View File

@ -2,8 +2,10 @@ import StringIO
from urllib2 import URLError from urllib2 import URLError
import chardet import chardet
import copy import copy
import itertools
import indexreader, archiveloader import archiveloader
import views
from wbrequestresponse import WbResponse, StatusAndHeaders from wbrequestresponse import WbResponse, StatusAndHeaders
from wbarchivalurl import ArchivalUrl from wbarchivalurl import ArchivalUrl
import utils import utils
@ -17,33 +19,37 @@ import wbexceptions
#================================================================= #=================================================================
class WBHandler: class WBHandler:
def __init__(self, query, replay, htmlquery = None): def __init__(self, cdx_reader, replay, html_view = None):
self.query = query self.cdx_reader = cdx_reader
self.replay = replay self.replay = replay
self.htmlquery = htmlquery self.html_view = html_view
self.text_view = views.TextQueryView()
def __call__(self, wbrequest): def __call__(self, wbrequest):
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
query_response = self.query(wbrequest) cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
# new special modifier to always show cdx index
if wbrequest.wb_url.mod == 'cdx_':
return self.text_view(wbrequest, cdx_lines)
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY): if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
if wbrequest.wb_url.mod == 'text' or not self.htmlquery: if not self.html_view:
return query_response return self.text_view(wbrequest, cdx_lines)
else: else:
return self.htmlquery(wbrequest, query_response) return self.html_view(wbrequest, cdx_lines)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, query_response, self.query) return self.replay(wbrequest, cdx_lines, self.cdx_reader)
#================================================================= #=================================================================
class ReplayHandler(object): class ReplayHandler(object):
def __init__(self, resolvers, archiveloader): def __init__(self, resolvers, archiveloader):
self.resolvers = resolvers self.resolvers = resolvers
self.archiveloader = archiveloader self.loader = archiveloader
def __call__(self, wbrequest, query_response, query): def __call__(self, wbrequest, cdx_lines, cdx_reader):
cdxlist = query_response.body
last_e = None last_e = None
first = True first = True
@ -52,16 +58,14 @@ class ReplayHandler(object):
# Iterate over the cdx until find one that works # Iterate over the cdx until find one that works
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server) # The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
for cdx in cdxlist: for cdx in cdx_lines:
try: try:
cdx = indexreader.CDXCaptureResult(cdx)
# ability to intercept and redirect # ability to intercept and redirect
if first: if first:
self._checkRedir(wbrequest, cdx) self._checkRedir(wbrequest, cdx)
first = False first = False
response = self.doReplay(cdx, wbrequest, query, failedFiles) response = self.doReplay(cdx, wbrequest, cdx_reader, failedFiles)
if response: if response:
response.cdx = cdx response.cdx = cdx
@ -100,7 +104,7 @@ class ReplayHandler(object):
for path in possible_paths: for path in possible_paths:
any_found = True any_found = True
try: try:
return self.archiveloader.load(path, offset, length) return self.loader.load(path, offset, length)
except URLError as ue: except URLError as ue:
last_exc = ue last_exc = ue
@ -117,7 +121,7 @@ class ReplayHandler(object):
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '') raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
def doReplay(self, cdx, wbrequest, query, failedFiles): def doReplay(self, cdx, wbrequest, cdx_reader, failedFiles):
hasCurr = (cdx['filename'] != '-') hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx.get('orig.filename','-') != '-') hasOrig = (cdx.get('orig.filename','-') != '-')
@ -127,7 +131,7 @@ class ReplayHandler(object):
# two index lookups # two index lookups
# Case 1: if mimetype is still warc/revisit # Case 1: if mimetype is still warc/revisit
if cdx['mimetype'] == 'warc/revisit' and headersRecord: if cdx['mimetype'] == 'warc/revisit' and headersRecord:
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles) payloadRecord = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headersRecord, failedFiles)
# single lookup cases # single lookup cases
# case 2: non-revisit # case 2: non-revisit
@ -163,7 +167,7 @@ class ReplayHandler(object):
# Handle the case where a duplicate of a capture with same digest exists at a different url # Handle the case where a duplicate of a capture with same digest exists at a different url
# Must query the index at that url filtering by matching digest # Must query the index at that url filtering by matching digest
# Raise exception if no matches found # Raise exception if no matches found
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles): def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headersRecord, failedFiles):
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI') ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
# Check for unresolved revisit error, if refers to target uri not present or same as the current url # Check for unresolved revisit error, if refers to target uri not present or same as the current url
@ -187,11 +191,11 @@ class ReplayHandler(object):
# Must also match digest # Must also match digest
orig_wbreq.queryFilter.append('digest:' + cdx['digest']) orig_wbreq.queryFilter.append('digest:' + cdx['digest'])
orig_cdxlines = query(orig_wbreq).body orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
for cdx in orig_cdxlines: for cdx in orig_cdx_lines:
try: try:
cdx = indexreader.CDXCaptureResult(cdx) #cdx = cdx_reader.CDXCaptureResult(cdx)
#print cdx #print cdx
payloadRecord = self._load(cdx, False, failedFiles) payloadRecord = self._load(cdx, False, failedFiles)
return payloadRecord return payloadRecord
@ -256,11 +260,11 @@ class RewritingReplayHandler(ReplayHandler):
return None return None
def __call__(self, wbrequest, query_response, query): def __call__(self, wbrequest, index, cdx_reader):
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix) urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
wbrequest.urlrewriter = urlrewriter wbrequest.urlrewriter = urlrewriter
response = ReplayHandler.__call__(self, wbrequest, query_response, query) response = ReplayHandler.__call__(self, wbrequest, index, cdx_reader)
if response and response.cdx: if response and response.cdx:
self._checkRedir(wbrequest, response.cdx) self._checkRedir(wbrequest, response.cdx)
@ -414,8 +418,8 @@ class RewritingReplayHandler(ReplayHandler):
return None return None
def doReplay(self, cdx, wbrequest, query, failedFiles): def doReplay(self, cdx, wbrequest, index, failedFiles):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, query, failedFiles) wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, index, failedFiles)
# Check for self redirect # Check for self redirect
if wbresponse.status_headers.statusline.startswith('3'): if wbresponse.status_headers.statusline.startswith('3'):

49
pywb/views.py Normal file
View File

@ -0,0 +1,49 @@
import indexreader
import utils
import wbrequestresponse
import wbexceptions
from itertools import imap
from jinja2 import Environment, FileSystemLoader
#=================================================================
class TextQueryView:
    """View that streams cdx lines back as text, one newline-terminated line per entry."""

    def __call__(self, wbrequest, cdx_lines):
        # lazily stringify each cdx entry; nothing is buffered here
        text_lines = (str(entry) + '\n' for entry in cdx_lines)
        return wbrequestresponse.WbResponse.text_stream(text_lines)
#=================================================================
class J2QueryView:
    """
    View that renders cdx lines as html through a Jinja2 template.

    template_dir and template_file select the template. When buffer_index is
    true (the default), the cdx lines are converted to a list before rendering.
    """

    def __init__(self, template_dir, template_file, buffer_index = True):
        self.template_file = template_file
        self.buffer_index = buffer_index
        self.jinja_env = Environment(loader = FileSystemLoader(template_dir), trim_blocks = True)

    def __call__(self, wbrequest, cdx_lines):
        page = self.jinja_env.get_template(self.template_file)

        # a list (unlike an iterator) lets the template ask for the length
        entries = list(cdx_lines) if self.buffer_index else cdx_lines

        page_url = wbrequest.wb_url.url
        page_prefix = wbrequest.wb_prefix

        html = page.render(cdx_lines = entries,
                           url = page_url,
                           prefix = page_prefix)

        return wbrequestresponse.WbResponse.text_response(str(html), content_type = 'text/html')
#=================================================================
class DebugEchoEnvView:
    """
    Debug view: responds with the string form of the request's env attribute.

    Given its own name because a second class called DebugEchoView (which
    echoes the parsed request) is defined later in this module; sharing the
    name rebound it and left this env-echoing class unreachable.
    """

    def __call__(self, wbrequest):
        return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
#=================================================================
class DebugEchoView:
    """Debug view: responds with the string form of the parsed request."""

    def __call__(self, wbrequest):
        request_text = str(wbrequest)
        return wbrequestresponse.WbResponse.text_response(request_text)

View File

@ -1,14 +1,11 @@
<body> <body>
<b><span id="count"></span>&nbsp;Captures of {{ url }}</b> <b>{{ cdx_lines | length }} captures of {{ url }}</b>
<table id="captures"> <table id="captures">
{% for cdx in cdxlines %} {% for cdx in cdx_lines %}
<tr> <tr>
<td><a href="{{ prefix}}{{ cdx.timestamp }}/{{ url }}">{{ cdx.timestamp }}</a></td> <td><a href="{{ prefix}}{{ cdx.timestamp }}/{{ url }}">{{ cdx.timestamp }}</a></td>
<td><a href="https://archive.org/details/{{ cdx['filename'] }}">{{ cdx['filename'] }}</a></td> <td><a href="https://archive.org/details/{{ cdx['filename'] }}">{{ cdx['filename'] }}</a></td>
</tr> </tr>
{% endfor %} {% endfor %}
</table> </table>
<script>
document.getElementById("count").innerHTML = document.getElementById("captures").getElementsByTagName("tr").length
</script>
</body> </body>