mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor: remove intermediate query object.
rename query -> views; wbhandler queries index, replayer and renders via view; new feature: 'cdx_' modifier can be used to render cdx from any request
This commit is contained in:
parent
a83d527702
commit
1a234f2953
@ -48,6 +48,9 @@ def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
|
||||
if limit:
|
||||
cdx_iter = cdx_limit(cdx_iter, limit)
|
||||
|
||||
# output raw cdx objects
|
||||
if params.get('output') == 'raw':
|
||||
return cdx_iter
|
||||
|
||||
def write_cdx(fields):
|
||||
for cdx in cdx_iter:
|
||||
|
@ -2,6 +2,7 @@ import urllib
|
||||
import urllib2
|
||||
import wbexceptions
|
||||
import itertools
|
||||
import wbrequestresponse
|
||||
import surt
|
||||
from collections import OrderedDict
|
||||
|
||||
@ -13,7 +14,58 @@ import logging
|
||||
import os
|
||||
|
||||
#=================================================================
|
||||
class LocalCDXServer:
|
||||
class IndexReader:
    """
    Base class for cdx index readers.

    load_for_request() builds the query params for a request, loads the
    matching cdx lines through load_cdx() and hands them to filter_cdx().
    Subclasses must supply load_cdx() as well as the get_query_params()
    method that load_for_request() calls (it is not defined on this class).
    """

    def load_for_request(self, wbrequest, parsed_cdx = True):
        """
        Load the cdx lines for the url of the given request.

        wbrequest  -- the request; its wb_url, queryFilter and customParams
                      attributes are read
        parsed_cdx -- passed through unchanged to load_cdx()

        Returns whatever filter_cdx() returns for the loaded lines.
        Raises wbexceptions.NotFoundException if utils.peek_iter() gives
        back None for the loaded lines.
        """
        target = wbrequest.wb_url

        # start from the standard params for this url
        query = self.get_query_params(target)

        # a custom filter from the request overrides any standard 'filter'
        custom_filter = wbrequest.queryFilter
        if custom_filter:
            query['filter'] = custom_filter

        extra_params = wbrequest.customParams
        if extra_params:
            query.update(extra_params)

        loaded = utils.peek_iter(self.load_cdx(target.url, query, parsed_cdx))

        # NOTE(review): peek_iter presumably returns None for an empty
        # iterator -- confirm against utils
        if loaded is None:
            raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + target.url)

        return self.filter_cdx(wbrequest, loaded)

    def filter_cdx(self, wbrequest, cdxlines):
        """
        Hook for subclasses, which may wrap the cdxlines iterator in a filter.
        The base implementation returns cdxlines untouched.
        """
        return cdxlines

    def load_cdx(self, url, params = {}, parsed_cdx = True):
        """
        Load the cdx lines for url; must be overridden in subclasses.
        The base implementation always raises NotImplementedError.
        """
        raise NotImplementedError('Override in subclasses')
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LocalCDXServer(IndexReader):
|
||||
"""
|
||||
>>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1)
|
||||
>>> pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20140127171200'),
|
||||
('original', 'http://example.com'),
|
||||
('mimetype', 'text/html'),
|
||||
('statuscode', '200'),
|
||||
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
|
||||
('redirect', '-'),
|
||||
('robotflags', '-'),
|
||||
('length', '1046'),
|
||||
('offset', '334'),
|
||||
('filename', 'dupes.warc.gz')]
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, sources):
|
||||
self.sources = []
|
||||
|
||||
@ -29,8 +81,22 @@ class LocalCDXServer:
|
||||
self.sources.append(src)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def getQueryParams(wburl, limit = 150000, collapse_time = None, replay_closest = 10):
|
||||
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
    """
    Query the local cdx sources for url.

    url        -- url to look up; converted to a surt key before searching
    params     -- cdx query params; never modified by this method
    parsed_cdx -- if true, 'output' is set to 'raw', otherwise to 'text'
    kwvalues   -- extra query params, merged on top of params

    Returns the result of cdxserve.cdx_serve() for an exact-match search
    over self.sources.
    """
    # Work on a private copy: the default {} is shared between calls, so
    # updating it in place would leak values such as 'limit' from one call
    # into the next, and would also modify the caller's dict.
    params = dict(params) if params else {}
    params.update(kwvalues)
    params['output'] = 'raw' if parsed_cdx else 'text'

    # convert to surt
    key = surt.surt(url)

    return cdxserve.cdx_serve(key, params, self.sources, binsearch.iter_exact)
|
||||
|
||||
|
||||
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
|
||||
|
||||
if wburl.type == wburl.URL_QUERY:
|
||||
raise NotImplementedError('Url Query Not Yet Supported')
|
||||
|
||||
return {
|
||||
|
||||
wburl.QUERY:
|
||||
@ -52,21 +118,11 @@ class LocalCDXServer:
|
||||
}[wburl.type]
|
||||
|
||||
|
||||
def load(self, url, params):
|
||||
|
||||
# convert to surt
|
||||
key = surt.surt(url)
|
||||
match_func = binsearch.iter_exact
|
||||
|
||||
print key + ' ' + urllib.urlencode(params, True)
|
||||
|
||||
return cdxserve.cdx_serve(key, params, self.sources, match_func)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RemoteCDXServer:
|
||||
class RemoteCDXServer(IndexReader):
|
||||
"""
|
||||
>>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
|
||||
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
|
||||
>>> pprint(x[0].items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20020120142510'),
|
||||
@ -81,7 +137,7 @@ class RemoteCDXServer:
|
||||
self.serverUrl = serverUrl
|
||||
self.authCookie = cookie
|
||||
|
||||
def load(self, url, params = {}, parse_cdx = False, **kwvalues):
|
||||
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
|
||||
#url is required, must be passed explicitly!
|
||||
params['url'] = url
|
||||
params.update(**kwvalues)
|
||||
@ -103,7 +159,7 @@ class RemoteCDXServer:
|
||||
else:
|
||||
raise e
|
||||
|
||||
if parse_cdx:
|
||||
if parsed_cdx:
|
||||
return map(CDXCaptureResult, response)
|
||||
else:
|
||||
return response
|
||||
@ -112,8 +168,7 @@ class RemoteCDXServer:
|
||||
# with lower values if there are too many captures. Ideally, should be around 10-20
|
||||
# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
|
||||
|
||||
@staticmethod
|
||||
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
||||
def get_query_params(self, wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
||||
return {
|
||||
|
||||
wburl.QUERY:
|
||||
@ -136,6 +191,7 @@ class RemoteCDXServer:
|
||||
}[wburl.type]
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXCaptureResult(OrderedDict):
|
||||
CDX_FORMATS = [
|
||||
# Public CDX Format
|
||||
@ -197,7 +253,7 @@ import utils
|
||||
if __name__ == "__main__" or utils.enable_doctests():
|
||||
from pprint import pprint
|
||||
|
||||
cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
|
||||
test_dir = os.path.dirname(os.path.realpath(__file__)) + '/../test/'
|
||||
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -1,5 +1,5 @@
|
||||
import archiveloader
|
||||
import query
|
||||
import views
|
||||
import indexreader
|
||||
import replay
|
||||
import replay_resolvers
|
||||
@ -18,7 +18,7 @@ def pywb_config(head_insert = ''):
|
||||
# Source for cdx source
|
||||
#query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
||||
#test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx']
|
||||
query_h = query.QueryHandler(indexreader.LocalCDXServer([test_dir]))
|
||||
indexs = indexreader.LocalCDXServer([test_dir])
|
||||
|
||||
# Loads warcs specified in cdx from these locations
|
||||
prefixes = [replay_resolvers.PrefixResolver(test_dir)]
|
||||
@ -26,18 +26,17 @@ def pywb_config(head_insert = ''):
|
||||
# Create rewriting replay handler to rewrite records
|
||||
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True)
|
||||
|
||||
# Create Jinja2 based html query renderer
|
||||
htmlquery = query.J2QueryRenderer('./ui/', 'query.html')
|
||||
# Create Jinja2 based html query view
|
||||
html_view = views.J2QueryView('./ui/', 'query.html')
|
||||
|
||||
# Handler which combines query, replayer, and html_query
|
||||
wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery)
|
||||
# WB handler which uses the index reader, replayer, and html_view
|
||||
wb_handler = replay.WBHandler(indexs, replayer, html_view)
|
||||
|
||||
# Finally, create wb router
|
||||
return ArchivalRequestRouter(
|
||||
{
|
||||
Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request
|
||||
Route('echo_req', views.DebugEchoView()), # Debug ex: just echo parsed request
|
||||
Route('pywb', wb_handler),
|
||||
Route('cdx', query_h)
|
||||
},
|
||||
# Specify hostnames that pywb will be running on
|
||||
# This will help catch occasionally missed rewrites that fall-through to the host
|
||||
|
@ -1,84 +0,0 @@
|
||||
import indexreader
|
||||
import utils
|
||||
import wbrequestresponse
|
||||
import wbexceptions
|
||||
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
||||
class QueryHandler:
    """
    Handler that looks up the url of a request on a cdx server and
    responds with the resulting cdx lines as a text stream.
    """

    def __init__(self, cdxserver = None):
        """
        cdxserver -- cdx server to query; when none (or a falsy value) is
                     given, a RemoteCDXServer for web.archive.org is used
        """
        self.cdxserver = cdxserver or indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    def __call__(self, wbrequest):
        """
        Query the cdx server for wbrequest.wb_url.

        Returns a WbResponse text stream over the (filtered) cdx lines.
        Raises wbexceptions.NotFoundException if utils.peek_iter() gives
        back None for the loaded lines.
        """
        target = wbrequest.wb_url

        # start from the standard params for this url
        query = self.cdxserver.getQueryParams(target)

        # a custom filter from the request overrides any standard 'filter'
        custom_filter = wbrequest.queryFilter
        if custom_filter:
            query['filter'] = custom_filter

        extra_params = wbrequest.customParams
        if extra_params:
            query.update(extra_params)

        loaded = utils.peek_iter(self.cdxserver.load(target.url, query))

        if loaded is None:
            raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + target.url)

        # output the raw cdx stream
        return wbrequestresponse.WbResponse.text_stream(self.filterCdx(wbrequest, loaded))

    def filterCdx(self, wbrequest, cdxlines):
        """
        Hook for subclasses, which may wrap the cdxlines iterator in a filter.
        The base implementation returns cdxlines untouched.
        """
        return cdxlines
|
||||
|
||||
|
||||
class J2QueryRenderer:
    """
    Renders the cdx lines of a query response as html through a Jinja2
    template.
    """

    def __init__(self, template_dir, template_file):
        """
        template_dir  -- directory the Jinja2 FileSystemLoader reads from
        template_file -- name of the template to render on each call
        """
        loader = FileSystemLoader(template_dir)
        self.jinja_env = Environment(loader = loader, trim_blocks = True)
        self.template_file = template_file

    def __call__(self, wbrequest, query_response):
        """
        Render query_response.body with the configured template.

        The template receives 'cdxlines' (a generator of CDXCaptureResult
        objects), 'url' and 'prefix'. Returns a text/html WbResponse.
        """
        raw_lines = query_response.body

        def parse_cdx():
            # lines that fail to parse are reported on stderr and skipped
            for line in raw_lines:
                try:
                    yield indexreader.CDXCaptureResult(line)
                except wbexceptions.InvalidCDXException:
                    import traceback
                    traceback.print_exc()

        page = self.jinja_env.get_template(self.template_file).render(
            cdxlines = parse_cdx(),
            url = wbrequest.wb_url.url,
            prefix = wbrequest.wb_prefix)

        return wbrequestresponse.WbResponse.text_response(str(page), content_type = 'text/html')
|
||||
|
||||
|
||||
## ===========
|
||||
## Simple handlers for debugging
|
||||
class DebugEchoEnv:
    """Debug handler: responds with the string form of wbrequest.env."""

    def __call__(self, wbrequest):
        env_text = str(wbrequest.env)
        return wbrequestresponse.WbResponse.text_response(env_text)
|
||||
|
||||
class DebugEchoRequest:
    """Debug handler: responds with the string form of the request itself."""

    def __call__(self, wbrequest):
        request_text = str(wbrequest)
        return wbrequestresponse.WbResponse.text_response(request_text)
|
||||
|
||||
|
@ -2,8 +2,10 @@ import StringIO
|
||||
from urllib2 import URLError
|
||||
import chardet
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
import indexreader, archiveloader
|
||||
import archiveloader
|
||||
import views
|
||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||
from wbarchivalurl import ArchivalUrl
|
||||
import utils
|
||||
@ -17,33 +19,37 @@ import wbexceptions
|
||||
|
||||
#=================================================================
|
||||
class WBHandler:
|
||||
def __init__(self, query, replay, htmlquery = None):
|
||||
self.query = query
|
||||
def __init__(self, cdx_reader, replay, html_view = None):
|
||||
self.cdx_reader = cdx_reader
|
||||
self.replay = replay
|
||||
self.htmlquery = htmlquery
|
||||
self.html_view = html_view
|
||||
self.text_view = views.TextQueryView()
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
|
||||
query_response = self.query(wbrequest)
|
||||
cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
|
||||
|
||||
# new special modifier to always show cdx index
|
||||
if wbrequest.wb_url.mod == 'cdx_':
|
||||
return self.text_view(wbrequest, cdx_lines)
|
||||
|
||||
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
||||
if wbrequest.wb_url.mod == 'text' or not self.htmlquery:
|
||||
return query_response
|
||||
if not self.html_view:
|
||||
return self.text_view(wbrequest, cdx_lines)
|
||||
else:
|
||||
return self.htmlquery(wbrequest, query_response)
|
||||
return self.html_view(wbrequest, cdx_lines)
|
||||
|
||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||
return self.replay(wbrequest, query_response, self.query)
|
||||
return self.replay(wbrequest, cdx_lines, self.cdx_reader)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ReplayHandler(object):
|
||||
def __init__(self, resolvers, archiveloader):
|
||||
self.resolvers = resolvers
|
||||
self.archiveloader = archiveloader
|
||||
self.loader = archiveloader
|
||||
|
||||
def __call__(self, wbrequest, query_response, query):
|
||||
cdxlist = query_response.body
|
||||
def __call__(self, wbrequest, cdx_lines, cdx_reader):
|
||||
last_e = None
|
||||
first = True
|
||||
|
||||
@ -52,16 +58,14 @@ class ReplayHandler(object):
|
||||
|
||||
# Iterate over the cdx until find one that works
|
||||
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
|
||||
for cdx in cdxlist:
|
||||
for cdx in cdx_lines:
|
||||
try:
|
||||
cdx = indexreader.CDXCaptureResult(cdx)
|
||||
|
||||
# ability to intercept and redirect
|
||||
if first:
|
||||
self._checkRedir(wbrequest, cdx)
|
||||
first = False
|
||||
|
||||
response = self.doReplay(cdx, wbrequest, query, failedFiles)
|
||||
response = self.doReplay(cdx, wbrequest, cdx_reader, failedFiles)
|
||||
|
||||
if response:
|
||||
response.cdx = cdx
|
||||
@ -100,7 +104,7 @@ class ReplayHandler(object):
|
||||
for path in possible_paths:
|
||||
any_found = True
|
||||
try:
|
||||
return self.archiveloader.load(path, offset, length)
|
||||
return self.loader.load(path, offset, length)
|
||||
|
||||
except URLError as ue:
|
||||
last_exc = ue
|
||||
@ -117,7 +121,7 @@ class ReplayHandler(object):
|
||||
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
|
||||
|
||||
|
||||
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
||||
def doReplay(self, cdx, wbrequest, cdx_reader, failedFiles):
|
||||
hasCurr = (cdx['filename'] != '-')
|
||||
hasOrig = (cdx.get('orig.filename','-') != '-')
|
||||
|
||||
@ -127,7 +131,7 @@ class ReplayHandler(object):
|
||||
# two index lookups
|
||||
# Case 1: if mimetype is still warc/revisit
|
||||
if cdx['mimetype'] == 'warc/revisit' and headersRecord:
|
||||
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
|
||||
payloadRecord = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headersRecord, failedFiles)
|
||||
|
||||
# single lookup cases
|
||||
# case 2: non-revisit
|
||||
@ -163,7 +167,7 @@ class ReplayHandler(object):
|
||||
# Handle the case where a duplicate of a capture with same digest exists at a different url
|
||||
# Must query the index at that url filtering by matching digest
|
||||
# Raise exception if no matches found
|
||||
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
|
||||
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headersRecord, failedFiles):
|
||||
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
|
||||
|
||||
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
|
||||
@ -187,11 +191,11 @@ class ReplayHandler(object):
|
||||
# Must also match digest
|
||||
orig_wbreq.queryFilter.append('digest:' + cdx['digest'])
|
||||
|
||||
orig_cdxlines = query(orig_wbreq).body
|
||||
orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
|
||||
|
||||
for cdx in orig_cdxlines:
|
||||
for cdx in orig_cdx_lines:
|
||||
try:
|
||||
cdx = indexreader.CDXCaptureResult(cdx)
|
||||
#cdx = cdx_reader.CDXCaptureResult(cdx)
|
||||
#print cdx
|
||||
payloadRecord = self._load(cdx, False, failedFiles)
|
||||
return payloadRecord
|
||||
@ -256,11 +260,11 @@ class RewritingReplayHandler(ReplayHandler):
|
||||
return None
|
||||
|
||||
|
||||
def __call__(self, wbrequest, query_response, query):
|
||||
def __call__(self, wbrequest, index, cdx_reader):
|
||||
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
|
||||
wbrequest.urlrewriter = urlrewriter
|
||||
|
||||
response = ReplayHandler.__call__(self, wbrequest, query_response, query)
|
||||
response = ReplayHandler.__call__(self, wbrequest, index, cdx_reader)
|
||||
|
||||
if response and response.cdx:
|
||||
self._checkRedir(wbrequest, response.cdx)
|
||||
@ -414,8 +418,8 @@ class RewritingReplayHandler(ReplayHandler):
|
||||
return None
|
||||
|
||||
|
||||
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
||||
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, query, failedFiles)
|
||||
def doReplay(self, cdx, wbrequest, index, failedFiles):
|
||||
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, index, failedFiles)
|
||||
|
||||
# Check for self redirect
|
||||
if wbresponse.status_headers.statusline.startswith('3'):
|
||||
|
49
pywb/views.py
Normal file
49
pywb/views.py
Normal file
@ -0,0 +1,49 @@
|
||||
import indexreader
|
||||
import utils
|
||||
import wbrequestresponse
|
||||
import wbexceptions
|
||||
|
||||
from itertools import imap
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
||||
|
||||
#=================================================================
|
||||
class TextQueryView:
    """View that streams cdx lines back as plain text, one per line."""

    def __call__(self, wbrequest, cdx_lines):
        """
        wbrequest -- the request (not used by this view)
        cdx_lines -- iterable of cdx entries; each is converted with str()

        Returns a WbResponse text stream; lines are produced lazily.
        """
        text_lines = (str(entry) + '\n' for entry in cdx_lines)
        return wbrequestresponse.WbResponse.text_stream(text_lines)
|
||||
|
||||
#=================================================================
|
||||
class J2QueryView:
    """View that renders cdx lines as html through a Jinja2 template."""

    def __init__(self, template_dir, template_file, buffer_index = True):
        """
        template_dir  -- directory the Jinja2 FileSystemLoader reads from
        template_file -- name of the template to render on each call
        buffer_index  -- if true, the cdx lines are converted to a list
                         before rendering
        """
        loader = FileSystemLoader(template_dir)
        self.jinja_env = Environment(loader = loader, trim_blocks = True)
        self.buffer_index = buffer_index
        self.template_file = template_file

    def __call__(self, wbrequest, cdx_lines):
        """
        Render cdx_lines with the configured template.

        The template receives 'cdx_lines', 'url' and 'prefix'.
        Returns a text/html WbResponse.
        """
        page_template = self.jinja_env.get_template(self.template_file)

        # buffer/convert to list so the template has the length available
        rows = list(cdx_lines) if self.buffer_index else cdx_lines

        page = page_template.render(cdx_lines = rows,
                                    url = wbrequest.wb_url.url,
                                    prefix = wbrequest.wb_prefix)

        return wbrequestresponse.WbResponse.text_response(str(page), content_type = 'text/html')
|
||||
|
||||
|
||||
#=================================================================
|
||||
class DebugEchoEnvView:
    """
    Debug view: responds with the string form of wbrequest.env.

    Named DebugEchoEnvView so that it no longer shares a name with the
    request-echoing DebugEchoView defined later in this module; with both
    classes called DebugEchoView, the later definition replaced this one
    and the env-echo view could never be used.
    """

    def __call__(self, wbrequest):
        return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
|
||||
|
||||
#=================================================================
|
||||
class DebugEchoView:
    """Debug view: responds with the string form of the request itself."""

    def __call__(self, wbrequest):
        request_text = str(wbrequest)
        return wbrequestresponse.WbResponse.text_response(request_text)
|
||||
|
||||
|
@ -1,14 +1,11 @@
|
||||
<body>
|
||||
<b><span id="count"></span> Captures of {{ url }}</b>
|
||||
<b>{{ cdx_lines | length }} captures of {{ url }}</b>
|
||||
<table id="captures">
|
||||
{% for cdx in cdxlines %}
|
||||
{% for cdx in cdx_lines %}
|
||||
<tr>
|
||||
<td><a href="{{ prefix}}{{ cdx.timestamp }}/{{ url }}">{{ cdx.timestamp }}</a></td>
|
||||
<td><a href="https://archive.org/details/{{ cdx['filename'] }}">{{ cdx['filename'] }}</a></td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
<script>
|
||||
document.getElementById("count").innerHTML = document.getElementById("captures").getElementsByTagName("tr").length
|
||||
</script>
|
||||
</body>
|
||||
|
Loading…
x
Reference in New Issue
Block a user