1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/core/replay_views.py
Ilya Kreymer e384425d48 proxy cleanup: move HttpsUrlRewriter to url_rewriter module,
move strip_scheme to replay_views where it is used
regex rewriters: use url rewriter for rewriting http:// in JS,
instead of just prefix, to support custom rewriters (such as
https->http rewriter in proxy mode)
2014-03-09 14:21:32 -07:00

249 lines
8.7 KiB
Python

import re
from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.loaders import LimitReader
#=================================================================
class ReplayView(object):
    """
    Callable view that replays an archived capture.

    Given a request and an iterable of candidate cdx entries, each entry is
    tried in order until one can be loaded through ``content_loader``.
    The loaded payload is then either passed through ``content_rewriter``
    or streamed back with only transfer-encoding handling (``id_`` modifier,
    or no rewriter configured).
    """

    # Splits a url into an optional leading "scheme:" (followed by any number
    # of slashes) and the remainder; used to compare urls scheme-insensitively.
    STRIP_SCHEME = re.compile(r'^([\w]+:[/]*)?(.*?)$')

    def __init__(self, content_loader, content_rewriter, head_insert_view=None,
                 redir_to_exact=True, buffer_response=False, reporter=None):
        """
        :param content_loader: object providing
            ``resolve_headers_and_payload(cdx, failed_files, cdx_loader)``
        :param content_rewriter: object providing ``rewrite_headers`` and
            ``rewrite_content``; if falsy, content is never rewritten
        :param head_insert_view: optional view whose ``render_to_string`` is
            used to produce the head insert during rewriting
        :param redir_to_exact: if true, redirect to the capture's exact
            timestamp when it differs from the requested one
        :param buffer_response: buffer the rewritten response fully (and set
            Content-Length) instead of streaming it
        :param reporter: optional callback invoked as
            ``reporter(wbrequest, cdx, response)`` for each served response
        """
        self.content_loader = content_loader
        self.content_rewriter = content_rewriter

        self.head_insert_view = head_insert_view

        self.redir_to_exact = redir_to_exact

        # buffer or stream rewritten response
        self.buffer_response = buffer_response

        self._reporter = reporter

    def __call__(self, wbrequest, cdx_lines, cdx_loader):
        """
        Return a response for the first cdx entry that can be replayed.

        :param wbrequest: the current request
        :param cdx_lines: iterable of cdx entries; expected to already be
            sorted closest-to-timestamp first (by the cdx server)
        :param cdx_loader: passed through to the content loader
        :raises InternalRedirect: from ``_redirect_if_needed`` when the
            capture timestamp differs from the requested one
        :raises CaptureException, ArchiveLoadFailed: the last such failure,
            when every entry failed
        :raises WbException: when ``cdx_lines`` yielded no entries at all
        """
        last_e = None
        first = True

        # List of already failed w/arcs
        failed_files = []

        # Iterate over the cdx until find one that works
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the
                # cdx, no need to load w/arc data
                if first:
                    self._redirect_if_needed(wbrequest, cdx)
                    first = False

                (status_headers, stream) = (
                    self.content_loader.resolve_headers_and_payload(
                        cdx, failed_files, cdx_loader))

                # check and reject self-redirect
                self._reject_self_redirect(wbrequest, cdx, status_headers)

                # check if redir is needed
                self._redirect_if_needed(wbrequest, cdx)

                # one more check for referrer-based self-redirect
                self._reject_referrer_self_redirect(wbrequest)

                # if Content-Length for payload is present,
                # ensure we don't read past it
                content_length = status_headers.get_header('content-length')
                if content_length:
                    stream = LimitReader.wrap_stream(stream, content_length)

                if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
                    response = self.rewrite_content(wbrequest, cdx,
                                                    status_headers, stream)
                else:
                    (status_headers, stream) = self.sanitize_content(
                        status_headers, stream)

                    response_iter = self.stream_to_iter(stream)
                    response = WbResponse(status_headers, response_iter)

                # notify reporter callback, if any
                if self._reporter:
                    self._reporter(wbrequest, cdx, response)

                return response

            except (CaptureException, ArchiveLoadFailed) as ce:
                # This capture is unusable: log it and fall through to the
                # next cdx entry, remembering the failure for re-raising.
                import traceback
                traceback.print_exc()
                last_e = ce

        if last_e:
            raise last_e

        # No cdx entries were supplied at all.
        # WbException is not among this module's top-level imports, so it is
        # imported here; wb_url is converted explicitly since it is an object
        # elsewhere passed through str() before concatenation.
        from pywb.framework.wbexceptions import WbException
        raise WbException('No Content Loaded for: ' + str(wbrequest.wb_url))

    @staticmethod
    def stream_to_iter(stream):
        """
        Generator yielding successive ``stream.read()`` results until a
        falsy value is returned; the stream is closed when the generator
        finishes or is closed early.
        """
        try:
            buff = stream.read()
            while buff:
                yield buff
                buff = stream.read()

        finally:
            stream.close()

    def sanitize_content(self, status_headers, stream):
        """
        Prepare an un-rewritten payload for output.

        If a transfer-encoding header was present (and is now removed), wrap
        the stream in a de-chunking reader.

        :return: ``(status_headers, stream)`` tuple
        """
        # remove transfer encoding chunked and wrap in a dechunking stream
        if status_headers.remove_header('transfer-encoding'):
            stream = ChunkedDataReader(stream)

        return (status_headers, stream)

    def rewrite_content(self, wbrequest, cdx, status_headers, stream):
        """
        Rewrite headers and, when the content is of a rewritable text type,
        the payload as well.

        :return: a WbResponse, buffered if ``self.buffer_response`` is set
        """
        urlrewriter = wbrequest.urlrewriter

        result = self.content_rewriter.rewrite_headers(urlrewriter,
                                                       status_headers,
                                                       stream,
                                                       cdx['urlkey'])
        (rewritten_headers, stream) = result

        # no rewriting needed!
        if rewritten_headers.text_type is None:
            response_iter = self.stream_to_iter(stream)
            return WbResponse(rewritten_headers.status_headers, response_iter)

        def make_head_insert(rule):
            return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
                                                           cdx=cdx,
                                                           rule=rule))

        # do head insert
        if self.head_insert_view:
            head_insert_func = make_head_insert
        else:
            head_insert_func = None

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       rewritten_headers,
                                                       stream,
                                                       head_insert_func,
                                                       cdx['urlkey'])
        (status_headers, response_gen) = result

        if self.buffer_response:
            if wbrequest.wb_url.mod == 'id_':
                status_headers.remove_header('content-length')

            return self.buffered_response(status_headers, response_gen)

        return WbResponse(status_headers, response_gen)

    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator):
        """
        Consume ``iterator`` fully and return a WbResponse with a single
        content chunk and a matching Content-Length header.

        Headers are only modified once the whole iterator has been read
        successfully; if iteration raises, the exception propagates and
        ``status_headers`` is left as it was.
        """
        out = BytesIO()

        try:
            for buff in iterator:
                out.write(bytes(buff))

            content = out.getvalue()
        finally:
            out.close()

        # Drop any existing length header first so that only one
        # Content-Length (that of the buffered body) is sent.
        status_headers.remove_header('content-length')
        status_headers.headers.append(('Content-Length', str(len(content))))

        return WbResponse(status_headers, value=[content])

    def _redirect_if_needed(self, wbrequest, cdx):
        """
        Raise InternalRedirect to the exact-timestamp url of ``cdx`` when
        exact redirects are enabled, the request is not a proxy request and
        the capture timestamp differs from the requested one.
        """
        if (self.redir_to_exact and
                not wbrequest.is_proxy and
                cdx and
                (cdx['timestamp'] != wbrequest.wb_url.timestamp)):
            new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'],
                                                              cdx['original'])
            raise InternalRedirect(new_url)

        return None

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop

        304 responses are also rejected, except for ``id_`` requests.
        """
        if not status_headers.statusline.startswith('3'):
            return

        # skip all 304s
        if (status_headers.statusline.startswith('304') and
                wbrequest.wb_url.mod != 'id_'):
            raise CaptureException('Skipping 304 Modified: ' + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header('Location')
        if not location_url:
            return

        location_url = location_url.lower()

        if (ReplayView.strip_scheme(request_url) ==
                ReplayView.strip_scheme(location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))

    def _reject_referrer_self_redirect(self, wbrequest):
        """
        Perform final check for referrer based self-redirect.
        This method should be called after verifying request timestamp
        matches capture.
        if referrer is same as current url, reject this response and try
        another capture
        """
        if not wbrequest.referrer:
            return

        # build full url even if using relative-rewriting
        request_url = (wbrequest.host_prefix +
                       wbrequest.rel_prefix + str(wbrequest.wb_url))

        if (ReplayView.strip_scheme(request_url) ==
                ReplayView.strip_scheme(wbrequest.referrer)):
            raise CaptureException('Self Redirect via Referrer: ' +
                                   str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme(url):
        """
        Return ``url`` without its leading scheme, or ``url`` unchanged when
        nothing would remain after stripping (or the pattern does not match).

        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com')
        True

        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com')
        True

        >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com')
        True

        >>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com')
        True
        """
        m = ReplayView.STRIP_SCHEME.match(url)
        if not m:
            return url

        # group(2) is everything after the optional scheme; fall back to the
        # full url when that remainder is empty
        return m.group(2) or url
# When executed directly as a script, run the doctests embedded in this
# module's docstrings.
if __name__ == "__main__":
    import doctest

    doctest.testmod()