1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

support for url-agnostic dedup, e.g. loading the payload from a different url than the revisit
This commit is contained in:
Ilya Kreymer 2014-01-19 12:31:19 -08:00
parent 7ce6d0d22b
commit 354040a7e0
5 changed files with 108 additions and 12 deletions

View File

@ -29,7 +29,7 @@ class RemoteCDXServer:
params['url'] = url
params.update(**kwvalues)
urlparams = urllib.urlencode(params)
urlparams = urllib.urlencode(params, True)
try:
request = urllib2.Request(self.serverUrl, urlparams)

View File

@ -19,7 +19,8 @@ class QueryHandler:
params = self.cdxserver.getQueryParams(wburl)
# add any custom params from the request
params.update(wbrequest.customParams)
if wbrequest.queryFilter:
params['filter'] = wbrequest.queryFilter
cdxlines = self.cdxserver.load(wburl.url, params)

View File

@ -2,6 +2,7 @@ import StringIO
from urllib2 import URLError
import chardet
import redis
import copy
import indexreader
from wbrequestresponse import WbResponse, StatusAndHeaders
@ -30,7 +31,7 @@ class WBHandler:
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, query_response)
return self.replay(wbrequest, query_response, self.query)
#=================================================================
@ -39,7 +40,7 @@ class ReplayHandler(object):
self.resolvers = resolvers
self.archiveloader = archiveloader
def __call__(self, wbrequest, query_response):
def __call__(self, wbrequest, query_response, query):
cdxlist = query_response.body
last_e = None
first = True
@ -58,7 +59,7 @@ class ReplayHandler(object):
self._checkRedir(wbrequest, cdx)
first = False
response = self.doReplay(cdx, wbrequest, failedFiles)
response = self.doReplay(cdx, wbrequest, query, failedFiles)
if response:
response.cdx = cdx
@ -98,7 +99,7 @@ class ReplayHandler(object):
raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
def doReplay(self, cdx, wbrequest, failedFiles):
def doReplay(self, cdx, wbrequest, query, failedFiles):
hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx['orig.filename'] != '-')
@ -130,11 +131,59 @@ class ReplayHandler(object):
isRevisit = True
else:
raise wbexceptions.CaptureException('Invalid CDX' + cdx)
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
# special cases: if mimetype is still warc/revisit.. need to look further
if cdx['mimetype'] == 'warc/revisit':
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)
# Handle the case where a duplicate of a capture with same digest exists at a different url
# Must query the index at that url filtering by matching digest
# Raise exception if no matches found
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
    """
    Load the payload record for a revisit whose original capture lives at
    a different url (url-agnostic dedup).

    The url (and optionally date) of the original are read from the
    WARC-Refers-To-* headers of the revisit record, the index is queried
    at that url filtered by the revisit's digest, and the first match
    that can be loaded is returned.

    wbrequest     -- the current request; cloned, never modified
    query         -- callable taking a request and returning a response
                     whose .body iterates over matching cdx lines
    cdx           -- cdx entry of the revisit record
    headersRecord -- the already loaded revisit record
    failedFiles   -- passed through unchanged to self._load()

    Raises wbexceptions.CaptureException if the revisit does not point at
    a different url, or if no matching original could be loaded.
    """
    rec_headers = headersRecord.rec_headers
    ref_target_uri = rec_headers.getHeader('WARC-Refers-To-Target-URI')

    # Case 5: unresolved revisit error, if refers to target uri not present or same as the current uri
    if not ref_target_uri or (ref_target_uri == rec_headers.getHeader('WARC-Target-URI')):
        raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))

    # Case 6: url-agnostic revisit with different original url (either same or different date)
    ref_target_date = rec_headers.getHeader('WARC-Refers-To-Date')

    if not ref_target_date:
        # no explicit date: fall back to the timestamp of the revisit itself
        ref_target_date = cdx['timestamp']
    else:
        ref_target_date = utils.iso_date_to_timestamp(ref_target_date)

    # clone WbRequest (and its wb_url) so the current request is left untouched
    orig_wbreq = copy.copy(wbrequest)
    orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url)

    orig_wbreq.wb_url.url = ref_target_uri
    orig_wbreq.wb_url.timestamp = ref_target_date

    # Must also match digest.
    # copy.copy() is shallow, so the clone still shares the queryFilter list
    # with wbrequest; build a new list instead of appending in place,
    # otherwise the digest filter leaks into the original request.
    orig_wbreq.queryFilter = list(wbrequest.queryFilter) + ['digest:' + cdx['digest']]

    orig_cdxlines = query(orig_wbreq).body

    for orig_cdx in orig_cdxlines:
        try:
            orig_cdx = indexreader.CDXCaptureResult(orig_cdx)
            return self._load(orig_cdx, False, failedFiles)
        except wbexceptions.CaptureException:
            # best effort: this candidate could not be loaded, try the next one
            continue

    raise wbexceptions.CaptureException('Original for revisit could not be loaded')
def resolveFull(self, filename):
# Attempt to resolve cdx file to full path
fullUrl = None
@ -165,11 +214,11 @@ class RewritingReplayHandler(ReplayHandler):
return None
def __call__(self, wbrequest, query_response):
def __call__(self, wbrequest, query_response, query):
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
wbrequest.urlrewriter = urlrewriter
response = ReplayHandler.__call__(self, wbrequest, query_response)
response = ReplayHandler.__call__(self, wbrequest, query_response, query)
if response and response.cdx:
self._checkRedir(wbrequest, response.cdx)
@ -276,8 +325,8 @@ class RewritingReplayHandler(ReplayHandler):
return None
def doReplay(self, cdx, wbrequest, failedFiles):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)
def doReplay(self, cdx, wbrequest, query, failedFiles):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, query, failedFiles)
# Check for self redirect
if wbresponse.status_headers.statusline.startswith('3'):

View File

@ -3,6 +3,8 @@ import hmac
import time
import zlib
import time
import datetime
import re
def peek_iter(iterable):
try:
@ -67,3 +69,47 @@ class PerfTimer:
self.perfdict[self.name] = str(self.end - self.start)
# Matches any single non-digit character; used to break a date string into
# its numeric fields. Raw string so the regex escape is not treated as a
# (invalid) string escape.
DATE_TIMESPLIT = re.compile(r'[^\d]')


def iso_date_to_datetime(string):
    """
    Convert an ISO-8601 style date string to a naive datetime.

    The string is split on every non-digit character and the numeric
    fields are passed positionally to datetime.datetime(), so the exact
    separators (including 'T' and a trailing 'Z') are ignored and no
    timezone conversion is performed.

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """
    nums = DATE_TIMESPLIT.split(string)

    # a trailing separator such as 'Z' leaves an empty final field
    if nums[-1] == '':
        nums = nums[:-1]

    return datetime.datetime(*map(int, nums))
def datetime_to_timestamp(dt):
    """
    Format a datetime as a 14-digit timestamp string (YYYYMMDDhhmmss).

    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """
    timestamp = dt.strftime('%Y%m%d%H%M%S')
    return timestamp
def iso_date_to_timestamp(string):
    """
    Convert an ISO-8601 style date string directly to a 14-digit
    timestamp string, by parsing it with iso_date_to_datetime() and
    formatting the result with datetime_to_timestamp().

    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """
    parsed = iso_date_to_datetime(string)
    return datetime_to_timestamp(parsed)
# Run the doctests embedded in this module when executed as a script.
if __name__ == '__main__':
    from doctest import testmod
    testmod()

View File

@ -76,7 +76,7 @@ class WbRequest:
self.is_ajax = self._is_ajax()
self.customParams = {}
self.queryFilter = []
# PERF
env['X_PERF'] = {}