Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-15 00:03:28 +01:00)
archiveloader: support for loading warc/arc records using the hanzo parser (for record header parsing only)
ReplayHandler: load replay from the query response and pick the best capture; basic support for matching the url and checking for self-redirects!
This commit is contained in:
parent
787dfc136e
commit
16f458d5ec
189
pywb/archiveloader.py
Normal file
@@ -0,0 +1,189 @@
import hanzo.warctools

import re
import utils
import wbexceptions
import zlib
import urllib2
import StringIO
import urlparse
import collections

#=================================================================
class HttpStreamLoader:
    def __init__(self, hmac = None, hmacDuration = 30):
        self.hmac = hmac
        self.hmacDuration = hmacDuration

    def load(self, url, offset, length):
        if length:
            rangeHeader = 'bytes={0}-{1}'.format(offset, int(offset) + int(length) - 1)
        else:
            rangeHeader = 'bytes={0}-'.format(offset)

        headers = {}
        headers['Range'] = rangeHeader

        if self.hmac:
            headers['Cookie'] = self.hmac(self.hmacDuration)

        request = urllib2.Request(url, headers = headers)
        return urllib2.urlopen(request)

#=================================================================
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'parsed, stream, statusline, httpHeaders')

#=================================================================
class ArchiveLoader:
    # Standard ARC headers
    ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]

    # Since loading a range request, can only determine gzip-ness based on file extension
    FORMAT_MAP = {
        '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
        '.arc.gz':  (hanzo.warctools.ArcRecord,  'arc',  True),
        '.warc':    (hanzo.warctools.WarcRecord, 'warc', False),
        '.arc':     (hanzo.warctools.ArcRecord,  'arc',  False),
    }

    HTTP_STATUS_REGEX = re.compile(r'^HTTP/[\d.]+ ((\d+).*)$')

    @staticmethod
    def createDefaultLoaders():
        http = HttpStreamLoader()
        return {
            'http': http,
            'https': http,
        }

    def __init__(self, loaders = {}, chunkSize = 8192):
        self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
        self.chunkSize = chunkSize

    def load(self, url, offset, length):
        urlParts = urlparse.urlsplit(url)

        # dict.get never raises, so check the result explicitly
        loader = self.loaders.get(urlParts.scheme)
        if loader is None:
            raise wbexceptions.UnknownLoaderProtocolException(url)

        loaderCls = None
        aFormat = None
        isGzip = False

        for ext, (cls, fmt, gzip) in ArchiveLoader.FORMAT_MAP.iteritems():
            if url.endswith(ext):
                loaderCls = cls
                aFormat = fmt
                isGzip = gzip
                break

        if loaderCls is None:
            raise wbexceptions.UnknownArchiveFormatException(url)

        if isGzip:
            decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
        else:
            decomp = None

        raw = loader.load(url, offset, length)

        reader = LineReader(raw, self.chunkSize, decomp)

        parser = loaderCls.make_parser()

        if aFormat == 'arc':
            parser.headers = ArchiveLoader.ARC_HEADERS

        (parsed, errors, _) = parser.parse(reader, 0)

        if errors:
            reader.close()
            raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)

        if aFormat == 'arc':
            recType = 'arc-response'
            empty = (utils.get_header(parsed.headers, 'length') == 0)
        else:
            recType = utils.get_header(parsed.headers, 'WARC-Type')
            empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')

        parsed.recType = recType
        parsed.aFormat = aFormat

        if empty:
            return WBArchiveRecord(parsed, reader, '400', [])

        elif recType == 'metadata' or recType == 'resource':
            headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]

            return WBArchiveRecord(parsed, reader, '200 OK', headers)

        else:
            (statusline, headers) = self.parseHttpHeaders(reader)

            return WBArchiveRecord(parsed, reader, statusline, headers)

    def parseHttpHeaders(self, stream):
        def nextHeaderLine(stream):
            return stream.readline().rstrip()

        line = nextHeaderLine(stream)
        matched = self.HTTP_STATUS_REGEX.match(line)

        if not matched:
            raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)

        #status = int(matched.group(2))
        statusline = matched.group(1)
        headers = []

        line = nextHeaderLine(stream)

        # rstrip() already strips the trailing \r\n, so a blank line ends the headers
        while line:
            name, value = line.split(':', 1)
            value = value.strip()
            headers.append((name, value))
            line = nextHeaderLine(stream)

        return (statusline, headers)

#=================================================================
class LineReader:
    def __init__(self, stream, chunkSize = 1024, decomp = None):
        self.stream = stream
        self.chunkSize = chunkSize
        self.decomp = decomp
        self.buff = None
        self.numread = 0

    def _fillbuff(self, chunkSize = None):
        if not chunkSize:
            chunkSize = self.chunkSize

        # refill only once the current buffer is fully consumed
        if not self.buff or self.buff.pos >= self.buff.len:
            data = self.stream.read(chunkSize)
            self.numread += len(data)
            if self.decomp:
                data = self.decomp.decompress(data)

            self.buff = StringIO.StringIO(data)

    def read(self):
        self._fillbuff()
        return self.buff.read()

    def readline(self):
        self._fillbuff()
        return self.buff.readline()

    def close(self):
        if self.stream:
            self.stream.close()
            self.stream = None
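
A minimal usage sketch for the new loader (the WARC URL, offset, and length are hypothetical; a real offset/length pair would come from a CDX index entry; assumes the pywb modules are on the path, as the flat intra-package imports above suggest):

    from archiveloader import ArchiveLoader

    loader = ArchiveLoader()

    # hypothetical location of a single gzipped WARC record
    record = loader.load('http://archive.example.host/coll/example.warc.gz', '0', '1024')

    print record.parsed.recType     # e.g. 'response'
    print record.statusline         # e.g. '200 OK'
    print record.httpHeaders        # parsed (name, value) header pairs
    payload = record.stream.read()  # next chunk of the record payload
    record.stream.close()
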
pywb/indexreader.py
@@ -1,13 +1,14 @@
 import urllib
 import urllib2
 import wbexceptions
+import itertools

 from wbarchivalurl import ArchivalUrl

 class RemoteCDXServer:
     """
     >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
-    >>> pprint(vars(x[0]))
+    >>> pprint(x[0])
     {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
      'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
      'length': '1792',
@@ -20,7 +21,23 @@ class RemoteCDXServer:
      'timestamp': '20020120142510',
      'urlkey': 'com,example)/'}

-    """
+    >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
+    >>> pprint(x[0])
+    {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
+     'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
+     'length': '523',
+     'mimetype': 'warc/revisit',
+     'offset': '247256770',
+     'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
+     'orig.length': '529',
+     'orig.offset': '769759',
+     'original': 'http://www.example.com/',
+     'redirect': '-',
+     'robotflags': '-',
+     'statuscode': '-',
+     'timestamp': '20131210052355',
+     'urlkey': 'com,example)/'}
+    """

     def __init__(self, serverUrl):
         self.serverUrl = serverUrl
@@ -69,9 +86,22 @@ class RemoteCDXServer:
         }[wburl.type]


-class CDXCaptureResult:
-    CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
-                   ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]
+class CDXCaptureResult(dict):
+    CDX_FORMATS = [
+        # CDX 11 Format
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+
+        # CDX 9 Format
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
+
+        # CDX 11 Format + 3 revisit resolve fields
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
+         "orig.length","orig.offset","orig.filename"],
+
+        # CDX 9 Format + 3 revisit resolve fields
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
+         "orig.length","orig.offset","orig.filename"]
+    ]

     def __init__(self, cdxline):
         cdxline = cdxline.rstrip()
@@ -83,13 +113,14 @@ class CDXCaptureResult:
                 cdxformat = i

         if not cdxformat:
-            raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
+            raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))

-        for header, field in zip(cdxformat, fields):
-            setattr(self, header, field)
+        for header, field in itertools.izip(cdxformat, fields):
+            self[header] = field
+            # setattr(self, header, field)

-    def __repr__(self):
-        return str(vars(self))
+    #def __repr__(self):
+    #    return str(vars(self))

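Since CDXCaptureResult is now a dict, fields are read by subscript rather than attribute. A quick sketch with a hypothetical 9-field CDX line (urlkey, timestamp, and digest mirror the first doctest above; the offset and filename are made up):

    import indexreader

    line = ('com,example)/ 20020120142510 http://example.com:80/ text/html 200 '
            'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA - 39942239 example.arc.gz')

    cdx = indexreader.CDXCaptureResult(line)
    print cdx['timestamp']   # '20020120142510'
    print cdx['filename']    # 'example.arc.gz'
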
24
pywb/query.py
Normal file
@@ -0,0 +1,24 @@
import indexreader
import utils
import wbrequestresponse
import wbexceptions

class QueryHandler:
    def __init__(self):
        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    def __call__(self, wbrequest, prev_wbresponse):
        wburl = wbrequest.wb_url

        params = self.cdxserver.getQueryParams(wburl)

        cdxlines = self.cdxserver.load(wburl.url, params)

        cdxlines = utils.peek_iter(cdxlines)

        if cdxlines is not None:
            return wbrequestresponse.WbResponse.text_stream(cdxlines)

        raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
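
The not-found check hinges on utils.peek_iter (its hunk appears below): it returns None for an exhausted iterator, otherwise it re-chains the peeked item so nothing is lost. A quick sketch:

    import utils

    lines = utils.peek_iter(iter(['line1\n', 'line2\n']))
    print list(lines)                 # ['line1\n', 'line2\n']

    print utils.peek_iter(iter([]))   # None -> NotFoundException above
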
110
pywb/replay.py
Normal file
@@ -0,0 +1,110 @@
import indexreader
from wbrequestresponse import WbResponse
import utils
import wbexceptions

class ReplayHandler:
    def __init__(self, resolvers, archiveloader):
        self.resolvers = resolvers
        self.archiveloader = archiveloader

    def __call__(self, wbrequest, query_response):
        cdxlist = query_response.body
        last_e = None
        first = True
        for cdx in cdxlist:
            try:
                cdx = indexreader.CDXCaptureResult(cdx)

                # First time through: check whether to redirect to the exact timestamp before loading the warc record
                if first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
                    return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])

                response = self.doReplay(cdx, wbrequest)

                if response:
                    # if a fallback capture was used, redirect to its exact timestamp!
                    if not first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
                        response.close()
                        return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])

                    return response

                first = False

            except Exception, e:
                import traceback
                traceback.print_exc()
                last_e = e

        if last_e:
            raise last_e

    def _load(self, cdx, revisit = False):
        prefix = '' if not revisit else 'orig.'
        return self.archiveloader.load(self.resolveFull(cdx[prefix + 'filename']), cdx[prefix + 'offset'], cdx[prefix + 'length'])

    def doReplay(self, cdx, wbrequest):
        hasCurr = (cdx['filename'] != '-')
        hasOrig = (cdx['orig.filename'] != '-')

        # Case 1: non-revisit
        if (hasCurr and not hasOrig):
            headersRecord = self._load(cdx, False)
            payloadRecord = headersRecord
            isRevisit = False

        # Case 2: old-style revisit, load headers from original payload
        elif (not hasCurr and hasOrig):
            payloadRecord = self._load(cdx, False)
            headersRecord = payloadRecord
            isRevisit = True

        # Case 3: modern revisit, load headers from curr, payload from original
        elif (hasCurr and hasOrig):
            headersRecord = self._load(cdx, False)
            payloadRecord = self._load(cdx, True)

            # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
            if not headersRecord.httpHeaders:
                headersRecord.stream.close()
                headersRecord = payloadRecord

            isRevisit = True

        else:
            raise wbexceptions.CaptureException('Invalid CDX: ' + str(cdx))

        # Check for self redirect
        if headersRecord.statusline.startswith('3'):
            if self.isSelfRedirect(wbrequest, headersRecord):
                raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

        return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)

    def isSelfRedirect(self, wbrequest, record):
        requestUrl = wbrequest.wb_url.url.lower()
        locationUrl = utils.get_header(record.httpHeaders, 'Location')
        # get_header returns None if no Location header is present
        if locationUrl is None:
            return False
        return requestUrl == locationUrl.lower()
        #ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl)

    def resolveFull(self, filename):
        # Attempt to resolve cdx file to full path
        fullUrl = None
        for resolver in self.resolvers:
            fullUrl = resolver(filename)
            if fullUrl:
                return fullUrl

        raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)

#======================================
# PrefixResolver - convert cdx file entry to url with prefix, if the filename contains the specified string
#======================================
def PrefixResolver(prefix, contains):
    def makeUrl(url):
        return prefix + url if (contains in url) else None

    return makeUrl
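
A sketch of how a resolver chain feeds ReplayHandler (the prefix url and the 'crawl' substring are hypothetical; testwb.createReplay in the app setup below presumably builds something similar):

    import archiveloader

    resolvers = [PrefixResolver('http://archive.example.host/data/', 'crawl')]
    replay = ReplayHandler(resolvers, archiveloader.ArchiveLoader())

    # resolveFull maps a bare CDX filename to a fetchable URL
    print replay.resolveFull('crawl2/example.warc.gz')
    # http://archive.example.host/data/crawl2/example.warc.gz
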
pywb/utils.py
@@ -1,4 +1,6 @@
 import itertools
+import hmac
+import time

 def peek_iter(iterable):
     try:
@@ -7,3 +9,39 @@ def peek_iter(iterable):
         return None

     return itertools.chain([first], iterable)


+def get_header(headersList, name):
+    nameLower = name.lower()
+    for value in headersList:
+        if (value[0].lower() == nameLower):
+            return value[1]
+
+
+class HMACCookieMaker:
+    def __init__(self, key, name):
+        self.key = key
+        self.name = name
+
+    def __call__(self, duration, extraId = ''):
+        expire = str(long(time.time() + duration))
+
+        if extraId:
+            msg = extraId + '-' + expire
+        else:
+            msg = expire
+
+        hmacdigest = hmac.new(self.key, msg)
+        hexdigest = hmacdigest.hexdigest()
+
+        if extraId:
+            cookie = '{0}-{1}={2}-{3}'.format(self.name, extraId, expire, hexdigest)
+        else:
+            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
+
+        return cookie
+
+        #return cookie + hexdigest
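
The cookie maker plugs into HttpStreamLoader from archiveloader.py above; a sketch with a hypothetical key and cookie name:

    import utils
    import archiveloader

    cookieMaker = utils.HMACCookieMaker('secret-key', 'wbhmac')
    print cookieMaker(30)   # 'wbhmac=<expire>-<hexdigest>', valid for ~30 seconds

    # every ranged request now carries the HMAC cookie
    loader = archiveloader.HttpStreamLoader(hmac = cookieMaker, hmacDuration = 30)
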
@@ -1,11 +1,11 @@
-import indexreader
 import json
+from query import QueryHandler
 import wbexceptions
 import utils

 from wbrequestresponse import WbResponse
 from archivalrouter import ArchivalRequestRouter


 ## ===========
 class EchoEnv:
     def __call__(self, wbrequest, _):
         return WbResponse.text_response(str(wbrequest.env))
@@ -14,33 +14,20 @@ class WBHandler:
     def __call__(self, wbrequest, _):
         return WbResponse.text_response(str(wbrequest))

-class QueryHandler:
-    def __init__(self):
-        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
-
-
-    def __call__(self, wbrequest, prev_wbresponse):
-        wburl = wbrequest.wb_url
-
-        params = self.cdxserver.getQueryParams(wburl)
-
-        cdxlines = self.cdxserver.load(wburl.url, params)
-
-        cdxlines = utils.peek_iter(cdxlines)
-
-        if cdxlines is not None:
-            return WbResponse.text_stream(cdxlines)
-
-        raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
-
 ## ===========
+query = QueryHandler()
+
+import testwb
+replay = testwb.createReplay()
+
+## ===========
 parser = ArchivalRequestRouter(
     {
         't0' : [EchoEnv()],
         't1' : [WBHandler()],
-        't2' : [QueryHandler()]
+        't2' : [query],
+        't3' : [query, replay],
     },
     hostpaths = ['http://localhost:9090/'])
 ## ===========
@@ -63,6 +50,7 @@ def application(env, start_response):
     return response(env, start_response)


 def handleException(env, exc):
     if hasattr(exc, 'status'):
         status = exc.status()
pywb/wbexceptions.py
@@ -18,3 +18,24 @@ class InvalidCDXException(Exception):
 class NotFoundException(Exception):
     def status(_):
         return '404'

+# Exceptions that affect a specific capture and result in a retry
+class CaptureException(Exception):
+    def status(_):
+        return '500'
+
+class UnresolvedArchiveFileException(CaptureException):
+    pass
+
+class UnknownArchiveFormatException(CaptureException):
+    pass
+
+class UnknownLoaderProtocolException(CaptureException):
+    pass
+
+class InvalidArchiveRecordException(CaptureException):
+    def __init__(self, msg, errList = None):
+        super(InvalidArchiveRecordException, self).__init__(msg)
+        self.errList = errList
pywb/wbrequestresponse.py
@@ -1,4 +1,5 @@
 from wbarchivalurl import ArchivalUrl
+import utils
 #WB Request and Response

 class WbRequest:
@@ -106,11 +107,27 @@ class WbResponse:
     def redir_response(location, status = '302 Redirect'):
         return WbResponse(status, headersList = [('Location', location)])

+    @staticmethod
+    def stream_response(statusline, headers, stream):
+        def streamGen():
+            try:
+                buff = stream.read()
+                while buff:
+                    yield buff
+                    buff = stream.read()
+            finally:
+                stream.close()
+
+        return WbResponse(statusline, headersList = headers, value = streamGen())
+
+    @staticmethod
+    def better_timestamp_response(wbrequest, newTimestamp):
+        wbrequest.wb_url.timestamp = newTimestamp
+        newUrl = wbrequest.wb_prefix + str(wbrequest.wb_url)[1:]
+        return WbResponse.redir_response(newUrl)
+
     def get_header(self, name):
-        name_upp = name.upper()
-        for value in self.headersList:
-            if (value[0].upper() == name_upp):
-                return value[1]
+        return utils.get_header(self.headersList, name)

     def __call__(self, env, start_response):
         #headersList = []
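
stream_response drives the streaming replay path: the generator yields chunks until the stream is exhausted, then closes it. A sketch with StringIO standing in for an archive record stream (assuming the value argument is stored as .body, as __call__ below suggests):

    import StringIO
    from wbrequestresponse import WbResponse

    stream = StringIO.StringIO('hello world')
    resp = WbResponse.stream_response('200 OK', [('Content-Type', 'text/plain')], stream)

    print ''.join(resp.body)   # 'hello world'; the stream is closed once drained
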
@@ -119,6 +136,12 @@ class WbResponse:
         start_response(self.status, self.headersList)

+        if env['REQUEST_METHOD'] == 'HEAD':
+            if hasattr(self.body, 'close'):
+                self.body.close()
+                return self.body
+            return []
+
         if hasattr(self.body, '__iter__'):
             return self.body
         else:
@@ -37,6 +37,9 @@ class ArchivalUrlRewriter:
     >>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
     '/abc/19960708im_/'

+    >>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
+    True
     """

     NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
@@ -85,6 +88,14 @@ class ArchivalUrlRewriter:
     def setBaseUrl(self, newUrl):
         self.wburl.url = newUrl

+    @staticmethod
+    def stripProtocol(url):
+        for protocol in ArchivalUrlRewriter.PROTOCOLS:
+            if url.startswith(protocol):
+                return url[len(protocol):]
+
+        return url
+
 if __name__ == "__main__":
     import doctest