1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor: remove intermediate query object.

rename query -> views
WBHandler now queries the index directly, then either replays the capture via the replayer or renders the results via a view

new feature: 'cdx_' modifier can be used to render cdx from any request
This commit is contained in:
Ilya Kreymer 2014-01-28 16:41:19 -08:00
parent a83d527702
commit 1a234f2953
7 changed files with 168 additions and 144 deletions

View File

@ -48,6 +48,9 @@ def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
if limit:
cdx_iter = cdx_limit(cdx_iter, limit)
# output raw cdx objects
if params.get('output') == 'raw':
return cdx_iter
def write_cdx(fields):
for cdx in cdx_iter:

View File

@ -2,6 +2,7 @@ import urllib
import urllib2
import wbexceptions
import itertools
import wbrequestresponse
import surt
from collections import OrderedDict
@ -13,7 +14,58 @@ import logging
import os
#=================================================================
class LocalCDXServer:
class IndexReader:
    """
    Base class for cdx index readers.

    Turns a wbrequest into an iterator of cdx lines. Subclasses provide
    load_cdx() and get_query_params(), and may override filter_cdx().
    """

    def load_for_request(self, wbrequest, parsed_cdx = True):
        """
        Load the cdx lines for the url of the given request.

        wbrequest  -- request providing wb_url, queryFilter and customParams
        parsed_cdx -- passed through unchanged to load_cdx()

        Returns the cdx lines after they have been passed through
        filter_cdx().

        Raises wbexceptions.NotFoundException when utils.peek_iter()
        returns None for the loaded lines (presumably: nothing was found
        for the url -- confirm against utils.peek_iter).
        """
        wburl = wbrequest.wb_url

        # init standard params
        params = self.get_query_params(wburl)

        # add any custom filter from the request
        if wbrequest.queryFilter:
            params['filter'] = wbrequest.queryFilter

        # request-specific params override the standard ones
        if wbrequest.customParams:
            params.update(wbrequest.customParams)

        cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)

        cdxlines = utils.peek_iter(cdxlines)

        if cdxlines is None:
            raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)

        return self.filter_cdx(wbrequest, cdxlines)

    def filter_cdx(self, wbrequest, cdxlines):
        """
        Hook for post-processing the loaded cdx lines.

        Subclasses may wrap the cdxlines iterator in a filter; the base
        implementation returns it unchanged.
        """
        return cdxlines

    def get_query_params(self, wburl):
        """
        Build the standard query params for the given wburl.

        load_for_request() depends on this method, so it is declared here
        to make the contract explicit. Must be overridden in subclasses.
        """
        raise NotImplementedError('Override in subclasses')

    def load_cdx(self, url, params = None, parsed_cdx = True):
        """
        Load the cdx lines for url using the given query params.

        The default for params is None rather than a shared dict so that
        overriding implementations are not invited to mutate a default
        that persists between calls. Must be overridden in subclasses.
        """
        raise NotImplementedError('Override in subclasses')
#=================================================================
class LocalCDXServer(IndexReader):
"""
>>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1)
>>> pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('redirect', '-'),
('robotflags', '-'),
('length', '1046'),
('offset', '334'),
('filename', 'dupes.warc.gz')]
"""
def __init__(self, sources):
self.sources = []
@ -29,8 +81,22 @@ class LocalCDXServer:
self.sources.append(src)
@staticmethod
def getQueryParams(wburl, limit = 150000, collapse_time = None, replay_closest = 10):
def load_cdx(self, url, params = None, parsed_cdx = True, **kwvalues):
    """
    Query the local cdx sources for the given url.

    url        -- url to look up; converted to its surt key for matching
    params     -- optional dict of query params; it is not modified
    parsed_cdx -- if True request 'raw' output from cdx_serve,
                  otherwise request 'text' output
    kwvalues   -- extra query params, merged over params

    Returns whatever cdxserve.cdx_serve() returns for the query.
    """
    # convert to surt
    key = surt.surt(url)

    match_func = binsearch.iter_exact

    # Work on a private copy: updating a shared default dict (or the
    # caller's dict) would leak values such as 'limit' and 'output'
    # into later, unrelated queries.
    params = dict(params) if params is not None else {}
    params.update(kwvalues)
    params['output'] = 'raw' if parsed_cdx else 'text'

    return cdxserve.cdx_serve(key, params, self.sources, match_func)
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported')
return {
wburl.QUERY:
@ -52,21 +118,11 @@ class LocalCDXServer:
}[wburl.type]
def load(self, url, params):
# convert to surt
key = surt.surt(url)
match_func = binsearch.iter_exact
print key + ' ' + urllib.urlencode(params, True)
return cdxserve.cdx_serve(key, params, self.sources, match_func)
#=================================================================
class RemoteCDXServer:
class RemoteCDXServer(IndexReader):
"""
>>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
>>> pprint(x[0].items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
@ -81,7 +137,7 @@ class RemoteCDXServer:
self.serverUrl = serverUrl
self.authCookie = cookie
def load(self, url, params = {}, parse_cdx = False, **kwvalues):
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
#url is required, must be passed explicitly!
params['url'] = url
params.update(**kwvalues)
@ -103,7 +159,7 @@ class RemoteCDXServer:
else:
raise e
if parse_cdx:
if parsed_cdx:
return map(CDXCaptureResult, response)
else:
return response
@ -112,8 +168,7 @@ class RemoteCDXServer:
# with lower values if there are too many captures. Ideally, should be around 10-20
# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
@staticmethod
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
def get_query_params(self, wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
return {
wburl.QUERY:
@ -136,6 +191,7 @@ class RemoteCDXServer:
}[wburl.type]
#=================================================================
class CDXCaptureResult(OrderedDict):
CDX_FORMATS = [
# Public CDX Format
@ -197,7 +253,7 @@ import utils
if __name__ == "__main__" or utils.enable_doctests():
from pprint import pprint
cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
test_dir = os.path.dirname(os.path.realpath(__file__)) + '/../test/'
import doctest
doctest.testmod()

View File

@ -1,5 +1,5 @@
import archiveloader
import query
import views
import indexreader
import replay
import replay_resolvers
@ -18,7 +18,7 @@ def pywb_config(head_insert = ''):
# Source for cdx source
#query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
#test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx']
query_h = query.QueryHandler(indexreader.LocalCDXServer([test_dir]))
indexs = indexreader.LocalCDXServer([test_dir])
# Loads warcs specified in cdx from these locations
prefixes = [replay_resolvers.PrefixResolver(test_dir)]
@ -26,18 +26,17 @@ def pywb_config(head_insert = ''):
# Create rewriting replay handler to rewrite records
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True)
# Create Jinja2 based html query renderer
htmlquery = query.J2QueryRenderer('./ui/', 'query.html')
# Create Jinja2 based html query view
html_view = views.J2QueryView('./ui/', 'query.html')
# Handler which combines query, replayer, and html_query
wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery)
# WB handler which uses the index reader, replayer, and html_view
wb_handler = replay.WBHandler(indexs, replayer, html_view)
# Finally, create wb router
return ArchivalRequestRouter(
{
Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request
Route('echo_req', views.DebugEchoView()), # Debug ex: just echo parsed request
Route('pywb', wb_handler),
Route('cdx', query_h)
},
# Specify hostnames that pywb will be running on
# This will help catch occasionally missed rewrites that fall-through to the host

View File

@ -1,84 +0,0 @@
import indexreader
import utils
import wbrequestresponse
import wbexceptions
from jinja2 import Environment, FileSystemLoader
class QueryHandler:
    """
    Callable handler which queries a cdx server for the url of a request
    and returns the matching cdx lines as a text stream response.
    """

    def __init__(self, cdxserver = None):
        # Fall back to the public web.archive.org cdx server when no
        # (or a falsy) cdx server is supplied.
        self.cdxserver = cdxserver or indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    def __call__(self, wbrequest):
        """
        Run the cdx query for wbrequest.

        Raises wbexceptions.NotFoundException when utils.peek_iter()
        returns None for the loaded lines.
        """
        target = wbrequest.wb_url

        # Standard params first, then request-specific additions
        query_params = self.cdxserver.getQueryParams(target)

        if wbrequest.queryFilter:
            query_params['filter'] = wbrequest.queryFilter

        if wbrequest.customParams:
            query_params.update(wbrequest.customParams)

        loaded = utils.peek_iter(self.cdxserver.load(target.url, query_params))

        if loaded is None:
            raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + target.url)

        # Output raw cdx stream
        filtered = self.filterCdx(wbrequest, loaded)
        return wbrequestresponse.WbResponse.text_stream(filtered)

    def filterCdx(self, wbrequest, cdxlines):
        """Hook for subclasses to wrap the cdx iterator; identity by default."""
        return cdxlines
class J2QueryRenderer:
    """
    Renders the body of a query response as html using a Jinja2 template
    loaded from template_dir.
    """

    def __init__(self, template_dir, template_file):
        self.template_file = template_file
        loader = FileSystemLoader(template_dir)
        self.jinja_env = Environment(loader = loader, trim_blocks = True)

    def _parse_cdx(self, raw_lines):
        """Lazily convert raw lines to CDXCaptureResult, skipping invalid ones."""
        for raw in raw_lines:
            try:
                yield indexreader.CDXCaptureResult(raw)
            except wbexceptions.InvalidCDXException:
                # Best effort: report the bad line and carry on with the rest
                import traceback
                traceback.print_exc()

    def __call__(self, wbrequest, query_response):
        """Return a text/html response rendering the cdx lines of query_response."""
        parsed = self._parse_cdx(query_response.body)

        template = self.jinja_env.get_template(self.template_file)

        html = template.render(cdxlines = parsed,
                               url = wbrequest.wb_url.url,
                               prefix = wbrequest.wb_prefix)

        return wbrequestresponse.WbResponse.text_response(str(html), content_type = 'text/html')
## ===========
## Simple handlers for debugging
class DebugEchoEnv:
    """Debug handler: responds with the string form of wbrequest.env."""

    def __call__(self, wbrequest):
        env_text = str(wbrequest.env)
        return wbrequestresponse.WbResponse.text_response(env_text)
class DebugEchoRequest:
    """Debug handler: responds with the string form of the parsed wbrequest."""

    def __call__(self, wbrequest):
        request_text = str(wbrequest)
        return wbrequestresponse.WbResponse.text_response(request_text)

View File

@ -2,8 +2,10 @@ import StringIO
from urllib2 import URLError
import chardet
import copy
import itertools
import indexreader, archiveloader
import archiveloader
import views
from wbrequestresponse import WbResponse, StatusAndHeaders
from wbarchivalurl import ArchivalUrl
import utils
@ -17,33 +19,37 @@ import wbexceptions
#=================================================================
class WBHandler:
def __init__(self, query, replay, htmlquery = None):
self.query = query
def __init__(self, cdx_reader, replay, html_view = None):
self.cdx_reader = cdx_reader
self.replay = replay
self.htmlquery = htmlquery
self.html_view = html_view
self.text_view = views.TextQueryView()
def __call__(self, wbrequest):
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
query_response = self.query(wbrequest)
cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
# new special modifier to always show cdx index
if wbrequest.wb_url.mod == 'cdx_':
return self.text_view(wbrequest, cdx_lines)
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
if wbrequest.wb_url.mod == 'text' or not self.htmlquery:
return query_response
if not self.html_view:
return self.text_view(wbrequest, cdx_lines)
else:
return self.htmlquery(wbrequest, query_response)
return self.html_view(wbrequest, cdx_lines)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, query_response, self.query)
return self.replay(wbrequest, cdx_lines, self.cdx_reader)
#=================================================================
class ReplayHandler(object):
def __init__(self, resolvers, archiveloader):
self.resolvers = resolvers
self.archiveloader = archiveloader
self.loader = archiveloader
def __call__(self, wbrequest, query_response, query):
cdxlist = query_response.body
def __call__(self, wbrequest, cdx_lines, cdx_reader):
last_e = None
first = True
@ -52,16 +58,14 @@ class ReplayHandler(object):
# Iterate over the cdx until find one that works
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
for cdx in cdxlist:
for cdx in cdx_lines:
try:
cdx = indexreader.CDXCaptureResult(cdx)
# ability to intercept and redirect
if first:
self._checkRedir(wbrequest, cdx)
first = False
response = self.doReplay(cdx, wbrequest, query, failedFiles)
response = self.doReplay(cdx, wbrequest, cdx_reader, failedFiles)
if response:
response.cdx = cdx
@ -100,7 +104,7 @@ class ReplayHandler(object):
for path in possible_paths:
any_found = True
try:
return self.archiveloader.load(path, offset, length)
return self.loader.load(path, offset, length)
except URLError as ue:
last_exc = ue
@ -117,7 +121,7 @@ class ReplayHandler(object):
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
def doReplay(self, cdx, wbrequest, query, failedFiles):
def doReplay(self, cdx, wbrequest, cdx_reader, failedFiles):
hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx.get('orig.filename','-') != '-')
@ -127,7 +131,7 @@ class ReplayHandler(object):
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx['mimetype'] == 'warc/revisit' and headersRecord:
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
payloadRecord = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headersRecord, failedFiles)
# single lookup cases
# case 2: non-revisit
@ -163,7 +167,7 @@ class ReplayHandler(object):
# Handle the case where a duplicate of a capture with same digest exists at a different url
# Must query the index at that url filtering by matching digest
# Raise exception if no matches found
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headersRecord, failedFiles):
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
@ -187,11 +191,11 @@ class ReplayHandler(object):
# Must also match digest
orig_wbreq.queryFilter.append('digest:' + cdx['digest'])
orig_cdxlines = query(orig_wbreq).body
orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
for cdx in orig_cdxlines:
for cdx in orig_cdx_lines:
try:
cdx = indexreader.CDXCaptureResult(cdx)
#cdx = cdx_reader.CDXCaptureResult(cdx)
#print cdx
payloadRecord = self._load(cdx, False, failedFiles)
return payloadRecord
@ -256,11 +260,11 @@ class RewritingReplayHandler(ReplayHandler):
return None
def __call__(self, wbrequest, query_response, query):
def __call__(self, wbrequest, index, cdx_reader):
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
wbrequest.urlrewriter = urlrewriter
response = ReplayHandler.__call__(self, wbrequest, query_response, query)
response = ReplayHandler.__call__(self, wbrequest, index, cdx_reader)
if response and response.cdx:
self._checkRedir(wbrequest, response.cdx)
@ -414,8 +418,8 @@ class RewritingReplayHandler(ReplayHandler):
return None
def doReplay(self, cdx, wbrequest, query, failedFiles):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, query, failedFiles)
def doReplay(self, cdx, wbrequest, index, failedFiles):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, index, failedFiles)
# Check for self redirect
if wbresponse.status_headers.statusline.startswith('3'):

49
pywb/views.py Normal file
View File

@ -0,0 +1,49 @@
import indexreader
import utils
import wbrequestresponse
import wbexceptions
from itertools import imap
from jinja2 import Environment, FileSystemLoader
#=================================================================
class TextQueryView:
    """View which streams the cdx lines as text, one newline-terminated line each."""

    def __call__(self, wbrequest, cdx_lines):
        # Stay lazy: each cdx is only converted to text as the stream is consumed
        text_lines = (str(cdx) + '\n' for cdx in cdx_lines)
        return wbrequestresponse.WbResponse.text_stream(text_lines)
#=================================================================
class J2QueryView:
    """
    View which renders the cdx lines as html using a Jinja2 template.

    template_dir  -- directory the template is loaded from
    template_file -- name of the template within template_dir
    buffer_index  -- if True, the cdx lines are converted to a list
                     before rendering
    """

    def __init__(self, template_dir, template_file, buffer_index = True):
        self.template_file = template_file
        self.buffer_index = buffer_index

        loader = FileSystemLoader(template_dir)
        self.jinja_env = Environment(loader = loader, trim_blocks = True)

    def __call__(self, wbrequest, cdx_lines):
        """Return a text/html response rendering cdx_lines for wbrequest."""
        template = self.jinja_env.get_template(self.template_file)

        # buffer/convert to list so we have length available for template
        if self.buffer_index:
            cdx_lines = list(cdx_lines)

        context = {'cdx_lines': cdx_lines,
                   'url': wbrequest.wb_url.url,
                   'prefix': wbrequest.wb_prefix}

        html = template.render(**context)

        return wbrequestresponse.WbResponse.text_response(str(html), content_type = 'text/html')
#=================================================================
class DebugEchoEnvView:
    """
    Debug view which responds with the string form of wbrequest.env.

    Named DebugEchoEnvView because a second class called DebugEchoView
    (echoing the whole request) is defined later in this module; sharing
    that name left this class shadowed and unreachable.
    """

    def __call__(self, wbrequest):
        return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
#=================================================================
class DebugEchoView:
    """Debug view which responds with the string form of the parsed wbrequest."""

    def __call__(self, wbrequest):
        request_text = str(wbrequest)
        return wbrequestresponse.WbResponse.text_response(request_text)

View File

@ -1,14 +1,11 @@
<body>
<b><span id="count"></span>&nbsp;Captures of {{ url }}</b>
<b>{{ cdx_lines | length }} captures of {{ url }}</b>
<table id="captures">
{% for cdx in cdxlines %}
{% for cdx in cdx_lines %}
<tr>
<td><a href="{{ prefix}}{{ cdx.timestamp }}/{{ url }}">{{ cdx.timestamp }}</a></td>
<td><a href="https://archive.org/details/{{ cdx['filename'] }}">{{ cdx['filename'] }}</a></td>
</tr>
{% endfor %}
</table>
<script>
document.getElementById("count").innerHTML = document.getElementById("captures").getElementsByTagName("tr").length
</script>
</body>