1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Support for url-agnostic dedup, e.g. loading the payload from a different url than the revisit
This commit is contained in:
Ilya Kreymer 2014-01-19 12:31:19 -08:00
parent 7ce6d0d22b
commit 354040a7e0
5 changed files with 108 additions and 12 deletions

View File

@ -29,7 +29,7 @@ class RemoteCDXServer:
params['url'] = url params['url'] = url
params.update(**kwvalues) params.update(**kwvalues)
urlparams = urllib.urlencode(params) urlparams = urllib.urlencode(params, True)
try: try:
request = urllib2.Request(self.serverUrl, urlparams) request = urllib2.Request(self.serverUrl, urlparams)

View File

@ -19,7 +19,8 @@ class QueryHandler:
params = self.cdxserver.getQueryParams(wburl) params = self.cdxserver.getQueryParams(wburl)
# add any custom params from the request # add any custom params from the request
params.update(wbrequest.customParams) if wbrequest.queryFilter:
params['filter'] = wbrequest.queryFilter
cdxlines = self.cdxserver.load(wburl.url, params) cdxlines = self.cdxserver.load(wburl.url, params)

View File

@ -2,6 +2,7 @@ import StringIO
from urllib2 import URLError from urllib2 import URLError
import chardet import chardet
import redis import redis
import copy
import indexreader import indexreader
from wbrequestresponse import WbResponse, StatusAndHeaders from wbrequestresponse import WbResponse, StatusAndHeaders
@ -30,7 +31,7 @@ class WBHandler:
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, query_response) return self.replay(wbrequest, query_response, self.query)
#================================================================= #=================================================================
@ -39,7 +40,7 @@ class ReplayHandler(object):
self.resolvers = resolvers self.resolvers = resolvers
self.archiveloader = archiveloader self.archiveloader = archiveloader
def __call__(self, wbrequest, query_response): def __call__(self, wbrequest, query_response, query):
cdxlist = query_response.body cdxlist = query_response.body
last_e = None last_e = None
first = True first = True
@ -58,7 +59,7 @@ class ReplayHandler(object):
self._checkRedir(wbrequest, cdx) self._checkRedir(wbrequest, cdx)
first = False first = False
response = self.doReplay(cdx, wbrequest, failedFiles) response = self.doReplay(cdx, wbrequest, query, failedFiles)
if response: if response:
response.cdx = cdx response.cdx = cdx
@ -98,7 +99,7 @@ class ReplayHandler(object):
raise wbexceptions.ArchiveLoadFailed(filename, ue.reason) raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
def doReplay(self, cdx, wbrequest, failedFiles): def doReplay(self, cdx, wbrequest, query, failedFiles):
hasCurr = (cdx['filename'] != '-') hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx['orig.filename'] != '-') hasOrig = (cdx['orig.filename'] != '-')
@ -130,11 +131,59 @@ class ReplayHandler(object):
isRevisit = True isRevisit = True
else: else:
raise wbexceptions.CaptureException('Invalid CDX' + cdx) raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
# special cases: if mimetype is still warc/revisit.. need to look further
if cdx['mimetype'] == 'warc/revisit':
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream) return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)
# Handle the case where a duplicate of a capture with same digest exists at a different url
# Must query the index at that url filtering by matching digest
# Raise exception if no matches found
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
    """
    Load the payload for a revisit record whose original capture is at a different url.

    The index is queried again at the url named by the revisit record's
    WARC-Refers-To-Target-URI header, filtered to captures whose digest
    matches the revisit's digest, and the first capture that loads is returned.

    wbrequest     -- the current request; a clone is queried, the request
                     itself (including its queryFilter list) is left untouched
    query         -- callable taking a request and returning a response whose
                     ``body`` iterates over the matching cdx lines
    cdx           -- cdx entry of the revisit record being replayed
    headersRecord -- already loaded revisit record supplying the WARC headers
    failedFiles   -- passed through unchanged to self._load()

    Returns the record produced by self._load() for the first loadable original.

    Raises wbexceptions.CaptureException if the revisit does not refer to a
    different url, or if none of the matching originals could be loaded.
    """
    rec_headers = headersRecord.rec_headers
    ref_target_uri = rec_headers.getHeader('WARC-Refers-To-Target-URI')

    # Case 5: unresolved revisit error, if refers to target uri not present or same as the current uri
    if not ref_target_uri or (ref_target_uri == rec_headers.getHeader('WARC-Target-URI')):
        raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))

    # Case 6: url-agnostic revisit with different original url (either same or different date)
    ref_target_date = rec_headers.getHeader('WARC-Refers-To-Date')
    if ref_target_date:
        ref_target_date = utils.iso_date_to_timestamp(ref_target_date)
    else:
        # No explicit date on the revisit: fall back to the revisit's own timestamp
        ref_target_date = cdx['timestamp']

    # clone WbRequest (and its wb_url) so the original request keeps its url and timestamp
    orig_wbreq = copy.copy(wbrequest)
    orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url)
    orig_wbreq.wb_url.url = ref_target_uri
    orig_wbreq.wb_url.timestamp = ref_target_date

    # Must also match digest.
    # copy.copy() is shallow, so the clone still shares the queryFilter list with
    # wbrequest; build a new list instead of appending in place, otherwise the
    # digest filter would leak into (and pile up on) the original request.
    orig_wbreq.queryFilter = list(wbrequest.queryFilter) + ['digest:' + cdx['digest']]

    for orig_cdxline in query(orig_wbreq).body:
        try:
            orig_cdx = indexreader.CDXCaptureResult(orig_cdxline)
            return self._load(orig_cdx, False, failedFiles)
        except wbexceptions.CaptureException:
            # Deliberate best effort: this candidate could not be loaded, try the next one
            continue

    raise wbexceptions.CaptureException('Original for revisit could not be loaded')
def resolveFull(self, filename): def resolveFull(self, filename):
# Attempt to resolve cdx file to full path # Attempt to resolve cdx file to full path
fullUrl = None fullUrl = None
@ -165,11 +214,11 @@ class RewritingReplayHandler(ReplayHandler):
return None return None
def __call__(self, wbrequest, query_response): def __call__(self, wbrequest, query_response, query):
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix) urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
wbrequest.urlrewriter = urlrewriter wbrequest.urlrewriter = urlrewriter
response = ReplayHandler.__call__(self, wbrequest, query_response) response = ReplayHandler.__call__(self, wbrequest, query_response, query)
if response and response.cdx: if response and response.cdx:
self._checkRedir(wbrequest, response.cdx) self._checkRedir(wbrequest, response.cdx)
@ -276,8 +325,8 @@ class RewritingReplayHandler(ReplayHandler):
return None return None
def doReplay(self, cdx, wbrequest, failedFiles): def doReplay(self, cdx, wbrequest, query, failedFiles):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles) wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, query, failedFiles)
# Check for self redirect # Check for self redirect
if wbresponse.status_headers.statusline.startswith('3'): if wbresponse.status_headers.statusline.startswith('3'):

View File

@ -3,6 +3,8 @@ import hmac
import time import time
import zlib import zlib
import time import time
import datetime
import re
def peek_iter(iterable): def peek_iter(iterable):
try: try:
@ -67,3 +69,47 @@ class PerfTimer:
self.perfdict[self.name] = str(self.end - self.start) self.perfdict[self.name] = str(self.end - self.start)
# Matches any single non-digit character; used to split a date string into its numeric fields
DATE_TIMESPLIT = re.compile(r'[^\d]')


def iso_date_to_datetime(string):
    """
    Parse an ISO-8601 style date string into a datetime.

    The string is split on every non-digit character and the numeric fields
    are passed positionally to datetime.datetime(), so they must appear in
    year, month, day[, hour, minute, second] order. Empty fields, such as the
    one left by a trailing 'Z' or by surrounding whitespace, are ignored.

    Raises whatever datetime.datetime() raises when the fields do not form
    a valid date.

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """
    fields = [int(part) for part in DATE_TIMESPLIT.split(string) if part]
    return datetime.datetime(*fields)
def datetime_to_timestamp(dt):
    """
    Format a datetime as a YYYYMMDDhhmmss timestamp string.

    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """
    timestamp_format = '%Y%m%d%H%M%S'
    return dt.strftime(timestamp_format)
def iso_date_to_timestamp(string):
    """
    Convert an ISO-8601 style date string to a YYYYMMDDhhmmss timestamp string.

    The string is parsed with iso_date_to_datetime() and the result is
    formatted with datetime_to_timestamp().

    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """
    parsed = iso_date_to_datetime(string)
    return datetime_to_timestamp(parsed)
if __name__ == "__main__":
    # When run as a script, execute the doctests embedded in this module's docstrings
    from doctest import testmod
    testmod()

View File

@ -76,7 +76,7 @@ class WbRequest:
self.is_ajax = self._is_ajax() self.is_ajax = self._is_ajax()
self.customParams = {} self.queryFilter = []
# PERF # PERF
env['X_PERF'] = {} env['X_PERF'] = {}