mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Support for URL-agnostic dedup, e.g. loading the payload from a different URL
than the revisit record
This commit is contained in:
parent
7ce6d0d22b
commit
354040a7e0
@ -29,7 +29,7 @@ class RemoteCDXServer:
|
|||||||
params['url'] = url
|
params['url'] = url
|
||||||
params.update(**kwvalues)
|
params.update(**kwvalues)
|
||||||
|
|
||||||
urlparams = urllib.urlencode(params)
|
urlparams = urllib.urlencode(params, True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
request = urllib2.Request(self.serverUrl, urlparams)
|
request = urllib2.Request(self.serverUrl, urlparams)
|
||||||
|
@ -19,7 +19,8 @@ class QueryHandler:
|
|||||||
params = self.cdxserver.getQueryParams(wburl)
|
params = self.cdxserver.getQueryParams(wburl)
|
||||||
|
|
||||||
# add any custom params from the request
|
# add any custom params from the request
|
||||||
params.update(wbrequest.customParams)
|
if wbrequest.queryFilter:
|
||||||
|
params['filter'] = wbrequest.queryFilter
|
||||||
|
|
||||||
cdxlines = self.cdxserver.load(wburl.url, params)
|
cdxlines = self.cdxserver.load(wburl.url, params)
|
||||||
|
|
||||||
|
@ -2,6 +2,7 @@ import StringIO
|
|||||||
from urllib2 import URLError
|
from urllib2 import URLError
|
||||||
import chardet
|
import chardet
|
||||||
import redis
|
import redis
|
||||||
|
import copy
|
||||||
|
|
||||||
import indexreader
|
import indexreader
|
||||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||||
@ -30,7 +31,7 @@ class WBHandler:
|
|||||||
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
|
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
|
||||||
|
|
||||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||||
return self.replay(wbrequest, query_response)
|
return self.replay(wbrequest, query_response, self.query)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -39,7 +40,7 @@ class ReplayHandler(object):
|
|||||||
self.resolvers = resolvers
|
self.resolvers = resolvers
|
||||||
self.archiveloader = archiveloader
|
self.archiveloader = archiveloader
|
||||||
|
|
||||||
def __call__(self, wbrequest, query_response):
|
def __call__(self, wbrequest, query_response, query):
|
||||||
cdxlist = query_response.body
|
cdxlist = query_response.body
|
||||||
last_e = None
|
last_e = None
|
||||||
first = True
|
first = True
|
||||||
@ -58,7 +59,7 @@ class ReplayHandler(object):
|
|||||||
self._checkRedir(wbrequest, cdx)
|
self._checkRedir(wbrequest, cdx)
|
||||||
first = False
|
first = False
|
||||||
|
|
||||||
response = self.doReplay(cdx, wbrequest, failedFiles)
|
response = self.doReplay(cdx, wbrequest, query, failedFiles)
|
||||||
|
|
||||||
if response:
|
if response:
|
||||||
response.cdx = cdx
|
response.cdx = cdx
|
||||||
@ -98,7 +99,7 @@ class ReplayHandler(object):
|
|||||||
raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
|
raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
|
||||||
|
|
||||||
|
|
||||||
def doReplay(self, cdx, wbrequest, failedFiles):
|
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
||||||
hasCurr = (cdx['filename'] != '-')
|
hasCurr = (cdx['filename'] != '-')
|
||||||
hasOrig = (cdx['orig.filename'] != '-')
|
hasOrig = (cdx['orig.filename'] != '-')
|
||||||
|
|
||||||
@ -130,11 +131,59 @@ class ReplayHandler(object):
|
|||||||
isRevisit = True
|
isRevisit = True
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise wbexceptions.CaptureException('Invalid CDX' + cdx)
|
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
|
||||||
|
|
||||||
|
# special cases: if mimetype is still warc/revisit.. need to look further
|
||||||
|
if cdx['mimetype'] == 'warc/revisit':
|
||||||
|
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
|
||||||
|
|
||||||
return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)
|
return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Handle the case where a duplicate of a capture with same digest exists at a different url
|
||||||
|
# Must query the index at that url filtering by matching digest
|
||||||
|
# Raise exception if no matches found
|
||||||
|
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
    """
    Load the payload record for a revisit whose original capture lives at a
    different url (url-agnostic dedup).

    The WARC headers of ``headersRecord`` name the original url
    (``WARC-Refers-To-Target-URI``) and optionally its date
    (``WARC-Refers-To-Date``). The index is re-queried at that url, filtered
    by the digest of ``cdx``, and the first matching capture that loads
    successfully is returned.

    :param wbrequest: the current request; it is cloned, never modified
    :param query: callable taking a request and returning a response whose
        ``body`` iterates over cdx lines
    :param cdx: cdx entry of the revisit being replayed
    :param headersRecord: already-loaded record supplying the WARC headers
    :param failedFiles: passed through unchanged to ``self._load``
    :raises wbexceptions.CaptureException: if the revisit does not point at a
        different url, or if no matching original could be loaded
    """
    rec_headers = headersRecord.rec_headers
    ref_target_uri = rec_headers.getHeader('WARC-Refers-To-Target-URI')

    # Case 5: unresolved revisit error, if refers to target uri not present or same as the current uri
    if not ref_target_uri or (ref_target_uri == rec_headers.getHeader('WARC-Target-URI')):
        raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))

    # Case 6: url-agnostic revisit with different original url (either same or different date)
    ref_target_date = rec_headers.getHeader('WARC-Refers-To-Date')

    if not ref_target_date:
        # No explicit date on the record: fall back to the revisit's own timestamp
        ref_target_date = cdx['timestamp']
    else:
        ref_target_date = utils.iso_date_to_timestamp(ref_target_date)

    # Clone the request (and its wb_url) so the lookup below does not alter
    # the request currently being replayed
    orig_wbreq = copy.copy(wbrequest)
    orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url)

    orig_wbreq.wb_url.url = ref_target_uri
    orig_wbreq.wb_url.timestamp = ref_target_date

    # Must also match digest.
    # copy.copy() is shallow, so the clone still shares the queryFilter list
    # with the original request; build a new list instead of appending in
    # place, otherwise the digest filter leaks into the caller's request.
    orig_wbreq.queryFilter = list(wbrequest.queryFilter) + ['digest:' + cdx['digest']]

    orig_cdxlines = query(orig_wbreq).body

    for orig_line in orig_cdxlines:
        try:
            orig_cdx = indexreader.CDXCaptureResult(orig_line)
            return self._load(orig_cdx, False, failedFiles)
        except wbexceptions.CaptureException:
            # Best effort: this candidate could not be used, try the next one
            continue

    raise wbexceptions.CaptureException('Original for revisit could not be loaded')
|
||||||
|
|
||||||
|
|
||||||
def resolveFull(self, filename):
|
def resolveFull(self, filename):
|
||||||
# Attempt to resolve cdx file to full path
|
# Attempt to resolve cdx file to full path
|
||||||
fullUrl = None
|
fullUrl = None
|
||||||
@ -165,11 +214,11 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, wbrequest, query_response):
|
def __call__(self, wbrequest, query_response, query):
|
||||||
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
|
urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
|
||||||
wbrequest.urlrewriter = urlrewriter
|
wbrequest.urlrewriter = urlrewriter
|
||||||
|
|
||||||
response = ReplayHandler.__call__(self, wbrequest, query_response)
|
response = ReplayHandler.__call__(self, wbrequest, query_response, query)
|
||||||
|
|
||||||
if response and response.cdx:
|
if response and response.cdx:
|
||||||
self._checkRedir(wbrequest, response.cdx)
|
self._checkRedir(wbrequest, response.cdx)
|
||||||
@ -276,8 +325,8 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def doReplay(self, cdx, wbrequest, failedFiles):
|
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
||||||
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)
|
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, query, failedFiles)
|
||||||
|
|
||||||
# Check for self redirect
|
# Check for self redirect
|
||||||
if wbresponse.status_headers.statusline.startswith('3'):
|
if wbresponse.status_headers.statusline.startswith('3'):
|
||||||
|
@ -3,6 +3,8 @@ import hmac
|
|||||||
import time
|
import time
|
||||||
import zlib
|
import zlib
|
||||||
import time
|
import time
|
||||||
|
import datetime
|
||||||
|
import re
|
||||||
|
|
||||||
def peek_iter(iterable):
|
def peek_iter(iterable):
|
||||||
try:
|
try:
|
||||||
@ -67,3 +69,47 @@ class PerfTimer:
|
|||||||
self.perfdict[self.name] = str(self.end - self.start)
|
self.perfdict[self.name] = str(self.end - self.start)
|
||||||
|
|
||||||
|
|
||||||
|
# Splits a date string on every non-digit character
DATE_TIMESPLIT = re.compile(r'[^\d]')


def iso_date_to_datetime(string):
    """
    Parse an ISO-8601 style date string into a naive datetime.

    The string is split on every non-digit character and the resulting
    numeric fields are passed positionally to ``datetime.datetime``
    (year, month, day, hour, minute, second, ...). Empty fields, such as
    the one left behind by a trailing 'Z' or by surrounding whitespace,
    are ignored.

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """
    # Drop every empty component, not just a single trailing one, so that
    # int() is never handed an empty string
    nums = [int(part) for part in DATE_TIMESPLIT.split(string) if part]
    return datetime.datetime(*nums)
|
||||||
|
|
||||||
|
def datetime_to_timestamp(dt):
    """
    Format a date/datetime as a 14-digit 'YYYYMMDDhhmmss' timestamp string.

    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'

    >>> datetime_to_timestamp(datetime.datetime(987, 1, 2, 3, 4, 5))
    '09870102030405'
    """
    # Format the fields explicitly rather than via strftime(): '%Y' is not
    # zero-padded for years < 1000 on some platforms, and Python 2 rejects
    # years before 1900, so strftime() cannot guarantee a fixed-width result.
    return '%04d%02d%02d%02d%02d%02d' % tuple(dt.timetuple()[:6])
|
||||||
|
|
||||||
|
def iso_date_to_timestamp(string):
    """
    Convert an ISO-8601 style date string to a 'YYYYMMDDhhmmss' timestamp
    by parsing it with iso_date_to_datetime() and formatting the result
    with datetime_to_timestamp().

    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """
    parsed = iso_date_to_datetime(string)
    return datetime_to_timestamp(parsed)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module
    from doctest import testmod
    testmod()
|
||||||
|
|
||||||
|
@ -76,7 +76,7 @@ class WbRequest:
|
|||||||
|
|
||||||
self.is_ajax = self._is_ajax()
|
self.is_ajax = self._is_ajax()
|
||||||
|
|
||||||
self.customParams = {}
|
self.queryFilter = []
|
||||||
|
|
||||||
# PERF
|
# PERF
|
||||||
env['X_PERF'] = {}
|
env['X_PERF'] = {}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user