mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
refactor: remove intermediate query object.
rename query -> views; wbhandler queries index, replayer and renders via view. New feature: 'cdx_' modifier can be used to render cdx from any request.
This commit is contained in:
parent
a83d527702
commit
1a234f2953
@ -48,6 +48,9 @@ def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
|
|||||||
if limit:
|
if limit:
|
||||||
cdx_iter = cdx_limit(cdx_iter, limit)
|
cdx_iter = cdx_limit(cdx_iter, limit)
|
||||||
|
|
||||||
|
# output raw cdx objects
|
||||||
|
if params.get('output') == 'raw':
|
||||||
|
return cdx_iter
|
||||||
|
|
||||||
def write_cdx(fields):
|
def write_cdx(fields):
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
|
@ -2,6 +2,7 @@ import urllib
|
|||||||
import urllib2
|
import urllib2
|
||||||
import wbexceptions
|
import wbexceptions
|
||||||
import itertools
|
import itertools
|
||||||
|
import wbrequestresponse
|
||||||
import surt
|
import surt
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
@ -13,7 +14,58 @@ import logging
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class LocalCDXServer:
|
class IndexReader:
|
||||||
|
def load_for_request(self, wbrequest, parsed_cdx = True):
|
||||||
|
wburl = wbrequest.wb_url
|
||||||
|
|
||||||
|
# init standard params
|
||||||
|
params = self.get_query_params(wburl)
|
||||||
|
|
||||||
|
# add any custom filter from the request
|
||||||
|
if wbrequest.queryFilter:
|
||||||
|
params['filter'] = wbrequest.queryFilter
|
||||||
|
|
||||||
|
if wbrequest.customParams:
|
||||||
|
params.update(wbrequest.customParams)
|
||||||
|
|
||||||
|
cdxlines = self.load_cdx(wburl.url, params, parsed_cdx)
|
||||||
|
|
||||||
|
cdxlines = utils.peek_iter(cdxlines)
|
||||||
|
|
||||||
|
if cdxlines is None:
|
||||||
|
raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
|
||||||
|
|
||||||
|
cdxlines = self.filter_cdx(wbrequest, cdxlines)
|
||||||
|
|
||||||
|
return cdxlines
|
||||||
|
|
||||||
|
def filter_cdx(self, wbrequest, cdxlines):
|
||||||
|
# Subclasses may wrap cdxlines iterator in a filter
|
||||||
|
return cdxlines
|
||||||
|
|
||||||
|
def load_cdx(self, url, params = {}, parsed_cdx = True):
|
||||||
|
raise NotImplementedError('Override in subclasses')
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class LocalCDXServer(IndexReader):
|
||||||
|
"""
|
||||||
|
>>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1)
|
||||||
|
>>> pprint(x.next().items())
|
||||||
|
[('urlkey', 'com,example)/'),
|
||||||
|
('timestamp', '20140127171200'),
|
||||||
|
('original', 'http://example.com'),
|
||||||
|
('mimetype', 'text/html'),
|
||||||
|
('statuscode', '200'),
|
||||||
|
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
|
||||||
|
('redirect', '-'),
|
||||||
|
('robotflags', '-'),
|
||||||
|
('length', '1046'),
|
||||||
|
('offset', '334'),
|
||||||
|
('filename', 'dupes.warc.gz')]
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, sources):
|
def __init__(self, sources):
|
||||||
self.sources = []
|
self.sources = []
|
||||||
|
|
||||||
@ -29,8 +81,22 @@ class LocalCDXServer:
|
|||||||
self.sources.append(src)
|
self.sources.append(src)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
|
||||||
def getQueryParams(wburl, limit = 150000, collapse_time = None, replay_closest = 10):
|
# convert to surt
|
||||||
|
key = surt.surt(url)
|
||||||
|
match_func = binsearch.iter_exact
|
||||||
|
|
||||||
|
params.update(**kwvalues)
|
||||||
|
params['output'] = 'raw' if parsed_cdx else 'text'
|
||||||
|
|
||||||
|
return cdxserve.cdx_serve(key, params, self.sources, match_func)
|
||||||
|
|
||||||
|
|
||||||
|
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
|
||||||
|
|
||||||
|
if wburl.type == wburl.URL_QUERY:
|
||||||
|
raise NotImplementedError('Url Query Not Yet Supported')
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
||||||
wburl.QUERY:
|
wburl.QUERY:
|
||||||
@ -52,21 +118,11 @@ class LocalCDXServer:
|
|||||||
}[wburl.type]
|
}[wburl.type]
|
||||||
|
|
||||||
|
|
||||||
def load(self, url, params):
|
|
||||||
|
|
||||||
# convert to surt
|
|
||||||
key = surt.surt(url)
|
|
||||||
match_func = binsearch.iter_exact
|
|
||||||
|
|
||||||
print key + ' ' + urllib.urlencode(params, True)
|
|
||||||
|
|
||||||
return cdxserve.cdx_serve(key, params, self.sources, match_func)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RemoteCDXServer:
|
class RemoteCDXServer(IndexReader):
|
||||||
"""
|
"""
|
||||||
>>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
|
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
|
||||||
>>> pprint(x[0].items())
|
>>> pprint(x[0].items())
|
||||||
[('urlkey', 'com,example)/'),
|
[('urlkey', 'com,example)/'),
|
||||||
('timestamp', '20020120142510'),
|
('timestamp', '20020120142510'),
|
||||||
@ -81,7 +137,7 @@ class RemoteCDXServer:
|
|||||||
self.serverUrl = serverUrl
|
self.serverUrl = serverUrl
|
||||||
self.authCookie = cookie
|
self.authCookie = cookie
|
||||||
|
|
||||||
def load(self, url, params = {}, parse_cdx = False, **kwvalues):
|
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
|
||||||
#url is required, must be passed explicitly!
|
#url is required, must be passed explicitly!
|
||||||
params['url'] = url
|
params['url'] = url
|
||||||
params.update(**kwvalues)
|
params.update(**kwvalues)
|
||||||
@ -103,7 +159,7 @@ class RemoteCDXServer:
|
|||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
if parse_cdx:
|
if parsed_cdx:
|
||||||
return map(CDXCaptureResult, response)
|
return map(CDXCaptureResult, response)
|
||||||
else:
|
else:
|
||||||
return response
|
return response
|
||||||
@ -112,8 +168,7 @@ class RemoteCDXServer:
|
|||||||
# with lower values if there are too many captures. Ideally, should be around 10-20
|
# with lower values if there are too many captures. Ideally, should be around 10-20
|
||||||
# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
|
# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
|
||||||
|
|
||||||
@staticmethod
|
def get_query_params(self, wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
||||||
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
|
||||||
return {
|
return {
|
||||||
|
|
||||||
wburl.QUERY:
|
wburl.QUERY:
|
||||||
@ -136,6 +191,7 @@ class RemoteCDXServer:
|
|||||||
}[wburl.type]
|
}[wburl.type]
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
class CDXCaptureResult(OrderedDict):
|
class CDXCaptureResult(OrderedDict):
|
||||||
CDX_FORMATS = [
|
CDX_FORMATS = [
|
||||||
# Public CDX Format
|
# Public CDX Format
|
||||||
@ -197,7 +253,7 @@ import utils
|
|||||||
if __name__ == "__main__" or utils.enable_doctests():
|
if __name__ == "__main__" or utils.enable_doctests():
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
|
test_dir = os.path.dirname(os.path.realpath(__file__)) + '/../test/'
|
||||||
|
|
||||||
import doctest
|
import doctest
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import archiveloader
|
import archiveloader
|
||||||
import query
|
import views
|
||||||
import indexreader
|
import indexreader
|
||||||
import replay
|
import replay
|
||||||
import replay_resolvers
|
import replay_resolvers
|
||||||
@ -18,7 +18,7 @@ def pywb_config(head_insert = ''):
|
|||||||
# Source for cdx source
|
# Source for cdx source
|
||||||
#query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
#query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
|
||||||
#test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx']
|
#test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx']
|
||||||
query_h = query.QueryHandler(indexreader.LocalCDXServer([test_dir]))
|
indexs = indexreader.LocalCDXServer([test_dir])
|
||||||
|
|
||||||
# Loads warcs specified in cdx from these locations
|
# Loads warcs specified in cdx from these locations
|
||||||
prefixes = [replay_resolvers.PrefixResolver(test_dir)]
|
prefixes = [replay_resolvers.PrefixResolver(test_dir)]
|
||||||
@ -26,18 +26,17 @@ def pywb_config(head_insert = ''):
|
|||||||
# Create rewriting replay handler to rewrite records
|
# Create rewriting replay handler to rewrite records
|
||||||
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True)
|
replayer = replay.RewritingReplayHandler(resolvers = prefixes, archiveloader = aloader, headInsert = head_insert, buffer_response = True)
|
||||||
|
|
||||||
# Create Jinja2 based html query renderer
|
# Create Jinja2 based html query view
|
||||||
htmlquery = query.J2QueryRenderer('./ui/', 'query.html')
|
html_view = views.J2QueryView('./ui/', 'query.html')
|
||||||
|
|
||||||
# Handler which combins query, replayer, and html_query
|
# WB handler which uses the index reader, replayer, and html_view
|
||||||
wb_handler = replay.WBHandler(query_h, replayer, htmlquery = htmlquery)
|
wb_handler = replay.WBHandler(indexs, replayer, html_view)
|
||||||
|
|
||||||
# Finally, create wb router
|
# Finally, create wb router
|
||||||
return ArchivalRequestRouter(
|
return ArchivalRequestRouter(
|
||||||
{
|
{
|
||||||
Route('echo_req', query.DebugEchoRequest()), # Debug ex: just echo parsed request
|
Route('echo_req', views.DebugEchoView()), # Debug ex: just echo parsed request
|
||||||
Route('pywb', wb_handler),
|
Route('pywb', wb_handler),
|
||||||
Route('cdx', query_h)
|
|
||||||
},
|
},
|
||||||
# Specify hostnames that pywb will be running on
|
# Specify hostnames that pywb will be running on
|
||||||
# This will help catch occasionally missed rewrites that fall-through to the host
|
# This will help catch occasionally missed rewrites that fall-through to the host
|
||||||
|
@ -1,84 +0,0 @@
|
|||||||
import indexreader
|
|
||||||
import utils
|
|
||||||
import wbrequestresponse
|
|
||||||
import wbexceptions
|
|
||||||
|
|
||||||
from jinja2 import Environment, FileSystemLoader
|
|
||||||
|
|
||||||
class QueryHandler:
|
|
||||||
def __init__(self, cdxserver = None):
|
|
||||||
if not cdxserver:
|
|
||||||
cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
|
|
||||||
|
|
||||||
self.cdxserver = cdxserver
|
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
|
||||||
wburl = wbrequest.wb_url
|
|
||||||
|
|
||||||
# init standard params
|
|
||||||
params = self.cdxserver.getQueryParams(wburl)
|
|
||||||
|
|
||||||
# add any custom filter from the request
|
|
||||||
if wbrequest.queryFilter:
|
|
||||||
params['filter'] = wbrequest.queryFilter
|
|
||||||
|
|
||||||
if wbrequest.customParams:
|
|
||||||
params.update(wbrequest.customParams)
|
|
||||||
|
|
||||||
cdxlines = self.cdxserver.load(wburl.url, params)
|
|
||||||
|
|
||||||
cdxlines = utils.peek_iter(cdxlines)
|
|
||||||
|
|
||||||
if cdxlines is None:
|
|
||||||
raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
|
|
||||||
|
|
||||||
cdxlines = self.filterCdx(wbrequest, cdxlines)
|
|
||||||
|
|
||||||
# Output raw cdx stream
|
|
||||||
return wbrequestresponse.WbResponse.text_stream(cdxlines)
|
|
||||||
|
|
||||||
def filterCdx(self, wbrequest, cdxlines):
|
|
||||||
# Subclasses may wrap cdxlines iterator in a filter
|
|
||||||
return cdxlines
|
|
||||||
|
|
||||||
|
|
||||||
class J2QueryRenderer:
|
|
||||||
def __init__(self, template_dir, template_file):
|
|
||||||
self.template_file = template_file
|
|
||||||
|
|
||||||
self.jinja_env = Environment(loader = FileSystemLoader(template_dir), trim_blocks = True)
|
|
||||||
|
|
||||||
def __call__(self, wbrequest, query_response):
|
|
||||||
cdxlines = query_response.body
|
|
||||||
|
|
||||||
def parse_cdx():
|
|
||||||
for cdx in cdxlines:
|
|
||||||
try:
|
|
||||||
cdx = indexreader.CDXCaptureResult(cdx)
|
|
||||||
yield cdx
|
|
||||||
|
|
||||||
except wbexceptions.InvalidCDXException:
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
template = self.jinja_env.get_template(self.template_file)
|
|
||||||
response = template.render(cdxlines = parse_cdx(),
|
|
||||||
url = wbrequest.wb_url.url,
|
|
||||||
prefix = wbrequest.wb_prefix)
|
|
||||||
|
|
||||||
return wbrequestresponse.WbResponse.text_response(str(response), content_type = 'text/html')
|
|
||||||
|
|
||||||
|
|
||||||
## ===========
|
|
||||||
## Simple handlers for debugging
|
|
||||||
class DebugEchoEnv:
|
|
||||||
def __call__(self, wbrequest):
|
|
||||||
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
|
|
||||||
|
|
||||||
class DebugEchoRequest:
|
|
||||||
def __call__(self, wbrequest):
|
|
||||||
return wbrequestresponse.WbResponse.text_response(str(wbrequest))
|
|
||||||
|
|
||||||
|
|
@ -2,8 +2,10 @@ import StringIO
|
|||||||
from urllib2 import URLError
|
from urllib2 import URLError
|
||||||
import chardet
|
import chardet
|
||||||
import copy
|
import copy
|
||||||
|
import itertools
|
||||||
|
|
||||||
import indexreader, archiveloader
|
import archiveloader
|
||||||
|
import views
|
||||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||||
from wbarchivalurl import ArchivalUrl
|
from wbarchivalurl import ArchivalUrl
|
||||||
import utils
|
import utils
|
||||||
@ -17,33 +19,37 @@ import wbexceptions
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class WBHandler:
|
class WBHandler:
|
||||||
def __init__(self, query, replay, htmlquery = None):
|
def __init__(self, cdx_reader, replay, html_view = None):
|
||||||
self.query = query
|
self.cdx_reader = cdx_reader
|
||||||
self.replay = replay
|
self.replay = replay
|
||||||
self.htmlquery = htmlquery
|
self.html_view = html_view
|
||||||
|
self.text_view = views.TextQueryView()
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
def __call__(self, wbrequest):
|
||||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
|
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
|
||||||
query_response = self.query(wbrequest)
|
cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
|
||||||
|
|
||||||
|
# new special modifier to always show cdx index
|
||||||
|
if wbrequest.wb_url.mod == 'cdx_':
|
||||||
|
return self.text_view(wbrequest, cdx_lines)
|
||||||
|
|
||||||
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
||||||
if wbrequest.wb_url.mod == 'text' or not self.htmlquery:
|
if not self.html_view:
|
||||||
return query_response
|
return self.text_view(wbrequest, cdx_lines)
|
||||||
else:
|
else:
|
||||||
return self.htmlquery(wbrequest, query_response)
|
return self.html_view(wbrequest, cdx_lines)
|
||||||
|
|
||||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||||
return self.replay(wbrequest, query_response, self.query)
|
return self.replay(wbrequest, cdx_lines, self.cdx_reader)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ReplayHandler(object):
|
class ReplayHandler(object):
|
||||||
def __init__(self, resolvers, archiveloader):
|
def __init__(self, resolvers, archiveloader):
|
||||||
self.resolvers = resolvers
|
self.resolvers = resolvers
|
||||||
self.archiveloader = archiveloader
|
self.loader = archiveloader
|
||||||
|
|
||||||
def __call__(self, wbrequest, query_response, query):
|
def __call__(self, wbrequest, cdx_lines, cdx_reader):
|
||||||
cdxlist = query_response.body
|
|
||||||
last_e = None
|
last_e = None
|
||||||
first = True
|
first = True
|
||||||
|
|
||||||
@ -52,16 +58,14 @@ class ReplayHandler(object):
|
|||||||
|
|
||||||
# Iterate over the cdx until find one that works
|
# Iterate over the cdx until find one that works
|
||||||
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
|
# The cdx should already be sorted in closest-to-timestamp order (from the cdx server)
|
||||||
for cdx in cdxlist:
|
for cdx in cdx_lines:
|
||||||
try:
|
try:
|
||||||
cdx = indexreader.CDXCaptureResult(cdx)
|
|
||||||
|
|
||||||
# ability to intercept and redirect
|
# ability to intercept and redirect
|
||||||
if first:
|
if first:
|
||||||
self._checkRedir(wbrequest, cdx)
|
self._checkRedir(wbrequest, cdx)
|
||||||
first = False
|
first = False
|
||||||
|
|
||||||
response = self.doReplay(cdx, wbrequest, query, failedFiles)
|
response = self.doReplay(cdx, wbrequest, cdx_reader, failedFiles)
|
||||||
|
|
||||||
if response:
|
if response:
|
||||||
response.cdx = cdx
|
response.cdx = cdx
|
||||||
@ -100,7 +104,7 @@ class ReplayHandler(object):
|
|||||||
for path in possible_paths:
|
for path in possible_paths:
|
||||||
any_found = True
|
any_found = True
|
||||||
try:
|
try:
|
||||||
return self.archiveloader.load(path, offset, length)
|
return self.loader.load(path, offset, length)
|
||||||
|
|
||||||
except URLError as ue:
|
except URLError as ue:
|
||||||
last_exc = ue
|
last_exc = ue
|
||||||
@ -117,7 +121,7 @@ class ReplayHandler(object):
|
|||||||
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
|
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
|
||||||
|
|
||||||
|
|
||||||
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
def doReplay(self, cdx, wbrequest, cdx_reader, failedFiles):
|
||||||
hasCurr = (cdx['filename'] != '-')
|
hasCurr = (cdx['filename'] != '-')
|
||||||
hasOrig = (cdx.get('orig.filename','-') != '-')
|
hasOrig = (cdx.get('orig.filename','-') != '-')
|
||||||
|
|
||||||
@ -127,7 +131,7 @@ class ReplayHandler(object):
|
|||||||
# two index lookups
|
# two index lookups
|
||||||
# Case 1: if mimetype is still warc/revisit
|
# Case 1: if mimetype is still warc/revisit
|
||||||
if cdx['mimetype'] == 'warc/revisit' and headersRecord:
|
if cdx['mimetype'] == 'warc/revisit' and headersRecord:
|
||||||
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
|
payloadRecord = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headersRecord, failedFiles)
|
||||||
|
|
||||||
# single lookup cases
|
# single lookup cases
|
||||||
# case 2: non-revisit
|
# case 2: non-revisit
|
||||||
@ -163,7 +167,7 @@ class ReplayHandler(object):
|
|||||||
# Handle the case where a duplicate of a capture with same digest exists at a different url
|
# Handle the case where a duplicate of a capture with same digest exists at a different url
|
||||||
# Must query the index at that url filtering by matching digest
|
# Must query the index at that url filtering by matching digest
|
||||||
# Raise exception if no matches found
|
# Raise exception if no matches found
|
||||||
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
|
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headersRecord, failedFiles):
|
||||||
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
|
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
|
||||||
|
|
||||||
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
|
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
|
||||||
@ -187,11 +191,11 @@ class ReplayHandler(object):
|
|||||||
# Must also match digest
|
# Must also match digest
|
||||||
orig_wbreq.queryFilter.append('digest:' + cdx['digest'])
|
orig_wbreq.queryFilter.append('digest:' + cdx['digest'])
|
||||||
|
|
||||||
orig_cdxlines = query(orig_wbreq).body
|
orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
|
||||||
|
|
||||||
for cdx in orig_cdxlines:
|
for cdx in orig_cdx_lines:
|
||||||
try:
|
try:
|
||||||
cdx = indexreader.CDXCaptureResult(cdx)
|
#cdx = cdx_reader.CDXCaptureResult(cdx)
|
||||||
#print cdx
|
#print cdx
|
||||||
payloadRecord = self._load(cdx, False, failedFiles)
|
payloadRecord = self._load(cdx, False, failedFiles)
|
||||||
return payloadRecord
|
return payloadRecord
|
||||||
@ -256,11 +260,11 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, wbrequest, query_response, query):
|
def __call__(self, wbrequest, index, cdx_reader):
|
||||||
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
|
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
|
||||||
wbrequest.urlrewriter = urlrewriter
|
wbrequest.urlrewriter = urlrewriter
|
||||||
|
|
||||||
response = ReplayHandler.__call__(self, wbrequest, query_response, query)
|
response = ReplayHandler.__call__(self, wbrequest, index, cdx_reader)
|
||||||
|
|
||||||
if response and response.cdx:
|
if response and response.cdx:
|
||||||
self._checkRedir(wbrequest, response.cdx)
|
self._checkRedir(wbrequest, response.cdx)
|
||||||
@ -414,8 +418,8 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
def doReplay(self, cdx, wbrequest, index, failedFiles):
|
||||||
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, query, failedFiles)
|
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, index, failedFiles)
|
||||||
|
|
||||||
# Check for self redirect
|
# Check for self redirect
|
||||||
if wbresponse.status_headers.statusline.startswith('3'):
|
if wbresponse.status_headers.statusline.startswith('3'):
|
||||||
|
49
pywb/views.py
Normal file
49
pywb/views.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import indexreader
|
||||||
|
import utils
|
||||||
|
import wbrequestresponse
|
||||||
|
import wbexceptions
|
||||||
|
|
||||||
|
from itertools import imap
|
||||||
|
from jinja2 import Environment, FileSystemLoader
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class TextQueryView:
|
||||||
|
def __call__(self, wbrequest, cdx_lines):
|
||||||
|
cdx_lines = imap(lambda x: str(x) + '\n', cdx_lines)
|
||||||
|
return wbrequestresponse.WbResponse.text_stream(cdx_lines)
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class J2QueryView:
|
||||||
|
def __init__(self, template_dir, template_file, buffer_index = True):
|
||||||
|
self.template_file = template_file
|
||||||
|
self.buffer_index = buffer_index
|
||||||
|
|
||||||
|
self.jinja_env = Environment(loader = FileSystemLoader(template_dir), trim_blocks = True)
|
||||||
|
|
||||||
|
|
||||||
|
def __call__(self, wbrequest, cdx_lines):
|
||||||
|
template = self.jinja_env.get_template(self.template_file)
|
||||||
|
|
||||||
|
# buffer/convert to list so we have length available for template
|
||||||
|
if self.buffer_index:
|
||||||
|
cdx_lines = list(cdx_lines)
|
||||||
|
|
||||||
|
response = template.render(cdx_lines = cdx_lines,
|
||||||
|
url = wbrequest.wb_url.url,
|
||||||
|
prefix = wbrequest.wb_prefix)
|
||||||
|
|
||||||
|
return wbrequestresponse.WbResponse.text_response(str(response), content_type = 'text/html')
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class DebugEchoView:
|
||||||
|
def __call__(self, wbrequest):
|
||||||
|
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class DebugEchoView:
|
||||||
|
def __call__(self, wbrequest):
|
||||||
|
return wbrequestresponse.WbResponse.text_response(str(wbrequest))
|
||||||
|
|
||||||
|
|
@ -1,14 +1,11 @@
|
|||||||
<body>
|
<body>
|
||||||
<b><span id="count"></span> Captures of {{ url }}</b>
|
<b>{{ cdx_lines | length }} captures of {{ url }}</b>
|
||||||
<table id="captures">
|
<table id="captures">
|
||||||
{% for cdx in cdxlines %}
|
{% for cdx in cdx_lines %}
|
||||||
<tr>
|
<tr>
|
||||||
<td><a href="{{ prefix}}{{ cdx.timestamp }}/{{ url }}">{{ cdx.timestamp }}</a></td>
|
<td><a href="{{ prefix}}{{ cdx.timestamp }}/{{ url }}">{{ cdx.timestamp }}</a></td>
|
||||||
<td><a href="https://archive.org/details/{{ cdx['filename'] }}">{{ cdx['filename'] }}</a></td>
|
<td><a href="https://archive.org/details/{{ cdx['filename'] }}">{{ cdx['filename'] }}</a></td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</table>
|
</table>
|
||||||
<script>
|
|
||||||
document.getElementById("count").innerHTML = document.getElementById("captures").getElementsByTagName("tr").length
|
|
||||||
</script>
|
|
||||||
</body>
|
</body>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user