# Source: pywb/pywb/webapp/replay_views.py
# (mirror of https://github.com/webrecorder/pywb.git,
#  synced 2025-03-30 10:45:31 +02:00; 354 lines, 12 KiB, Python)

import re
import logging
# (web-view blame timestamp: 2014-03-08 09:30:19 -08:00)
from io import BytesIO
from urlparse import urlsplit
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import LimitReader
from pywb.utils.timeutils import timestamp_now
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent
# (web-view blame timestamp: 2014-02-17 10:23:37 -08:00)
from pywb.warc.recordloader import ArchiveLoadFailed
from views import J2TemplateView, add_env_globals
from views import J2HtmlCapturesView, HeadInsertView
from rangecache import range_cache
#=================================================================
class CaptureException(WbException):
    """
    Signals a problem with one specific capture.

    ReplayView.render_content catches this exception (together with
    ArchiveLoadFailed), logs it and moves on to the next capture.
    If no capture can be replayed, the last such exception is re-raised
    and its status() supplies the 502 status line.
    """

    def status(self):
        # NOTE(review): the standard reason phrase for 502 is
        # 'Bad Gateway'; the text is left untouched as it is runtime output
        return '502 Internal Server Error'
#=================================================================
class ReplayView(object):
    """
    Replay an archived capture for a request.

    Given a sequence of candidate cdx entries, each one is tried in order
    until one can be loaded, rewritten and wrapped in a response.
    A redirect to the exact capture url is issued first when needed.
    """

    # group(1): optional 'scheme:' + any slashes + optional 'www<digits>.'
    # group(2): remainder of the url, used for loose url comparison
    STRIP_SCHEME_WWW = re.compile(r'^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$')

    def __init__(self, content_loader, config):
        """
        :param content_loader: callable invoked as
            content_loader(cdx, failed_files, cdx_loader, wbrequest),
            returning a (status_headers, stream) pair
        :param config: mapping of settings; the keys read here are
            'framed_replay', 'buffer_response', 'redir_to_exact',
            'enable_memento', 'enable_ranges' and 'reporter'
            (HeadInsertView.init_from_config also receives the mapping)
        """
        self.content_loader = content_loader

        framed = config.get('framed_replay')
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.buffer_response = config.get('buffer_response', True)

        self.redir_to_exact = config.get('redir_to_exact', True)

        memento = config.get('enable_memento', False)
        if memento:
            self.response_class = MementoResponse
        else:
            self.response_class = WbResponse

        self.enable_range_cache = config.get('enable_ranges', True)

        # optional callback, invoked as reporter(wbrequest, cdx, response)
        self._reporter = config.get('reporter')

    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        """
        Return a response for the first cdx entry that can be replayed.

        :param wbrequest: the current request
        :param cdx_lines: iterable of cdx entries, expected to be sorted
            in closest-to-timestamp order (from the cdx server)
        :param cdx_loader: passed through to the content loader
        :raises: the last CaptureException / ArchiveLoadFailed seen if
            no capture could be replayed, or NotFoundException if
            cdx_lines yielded nothing
        """
        last_e = None
        first = True

        # List of already failed w/arcs
        failed_files = []

        response = None

        # Iterate over the cdx until find one that works
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from
                # the cdx, no need to load w/arc data if requiring
                # exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    first = False

                response = self.cached_replay_capture(wbrequest,
                                                      cdx,
                                                      cdx_loader,
                                                      failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                # remember the failure and fall through to the next capture
                logging.debug(ce)
                last_e = ce

            if response:
                return response

        if not last_e:
            # can only get here if cdx_lines is empty somehow
            # should be filtered out before hand, but if not
            msg = 'No Captures found for: ' + wbrequest.wb_url.url
            last_e = NotFoundException(msg)

        raise last_e

    def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        """
        Replay a capture, going through the range cache for range requests.

        When range support is disabled, or the request carries no range,
        this is the same as calling replay_capture() directly.
        Otherwise range_cache.handle_range() is given a callback that
        performs the replay, and its (status, iterator) result is wrapped
        in a response.
        """
        def get_capture():
            return self.replay_capture(wbrequest,
                                       cdx,
                                       cdx_loader,
                                       failed_files)

        if not self.enable_range_cache:
            return get_capture()

        range_info = wbrequest.extract_range()

        if not range_info:
            return get_capture()

        range_status, range_iter = (range_cache.
                                    handle_range(wbrequest,
                                                 cdx.get('digest'),
                                                 get_capture,
                                                 *range_info))

        response = self.response_class(range_status,
                                       range_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)
        return response

    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        """
        Load, rewrite and return the response for a single cdx entry.

        May instead return a redirect response (see _redirect_if_needed).

        :raises CaptureException: if the capture is a self-redirect or
            a skipped 304 (see _reject_self_redirect)
        """
        (status_headers, stream) = (self.content_loader(cdx,
                                                        failed_files,
                                                        cdx_loader,
                                                        wbrequest))

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        length = status_headers.get_header('content-length')
        stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx['original']

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wbrequest))

        result = (self.content_rewriter.
                  rewrite_content(urlrewriter,
                                  headers=status_headers,
                                  stream=stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=cdx['urlkey'],
                                  cdx=cdx))

        (status_headers, response_iter, is_rewritten) = result

        # buffer response if buffering enabled
        if self.buffer_response:
            response_iter = self.buffered_response(status_headers,
                                                   response_iter)

        response = self.response_class(status_headers,
                                       response_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response

    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator):
        """
        Drain iterator into memory and return the content as a
        one-element list, setting Content-Length on status_headers
        to the buffered size.

        The header is updated in a finally clause, so it is set to the
        size of whatever was buffered even if the iterator raises
        (the exception still propagates).
        """
        out = BytesIO()

        try:
            for buff in iterator:
                out.write(bytes(buff))

        finally:
            content = out.getvalue()

            content_length_str = str(len(content))

            # replace existing content length
            status_headers.replace_header('Content-Length',
                                          content_length_str)
            out.close()

        return [content]

    def _redirect_if_needed(self, wbrequest, cdx):
        """
        Return a 302 redirect response to the capture's own replay url,
        or None if no redirect should be issued.

        No redirect is issued for proxy requests, when the 'noredir'
        custom param is set, for range requests (when range support is
        enabled) or for POST requests.
        """
        # .get() for consistency with the 'is_timegate' lookup below
        if wbrequest.options.get('is_proxy'):
            return None

        if wbrequest.custom_params.get('noredir'):
            return None

        is_timegate = (wbrequest.options.get('is_timegate', False))
        if not is_timegate:
            is_timegate = wbrequest.wb_url.is_latest_replay()

        redir_needed = is_timegate

        if not redir_needed and self.redir_to_exact:
            redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        if self.enable_range_cache and wbrequest.extract_range():
            return None

        if is_timegate and not self.redir_to_exact:
            timestamp = timestamp_now()
        else:
            timestamp = cdx['timestamp']

        new_url = (wbrequest.urlrewriter.
                   get_new_url(timestamp=timestamp,
                               url=cdx['original']))

        response_cdx = cdx

        if wbrequest.method == 'POST':
            # FF shows a confirm dialog, so can't use 307 effectively
            # was: statusline = '307 Same-Method Internal Redirect'
            return None
        elif is_timegate:
            statusline = '302 Found'
        else:
            # clear cdx line to indicate internal redirect
            statusline = '302 Internal Redirect'
            response_cdx = None

        status_headers = StatusAndHeaders(statusline,
                                          [('Location', new_url)])

        return self.response_class(status_headers,
                                   wbrequest=wbrequest,
                                   cdx=response_cdx)

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop

        Also rejects any 304 unless the request url is an identity
        request (wb_url.is_identity).

        :raises CaptureException: if the capture is rejected
        """
        if not status_headers.statusline.startswith('3'):
            return

        # skip all 304s
        if (status_headers.statusline.startswith('304') and
                not wbrequest.wb_url.is_identity):
            raise CaptureException('Skipping 304 Modified: ' + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header('Location')
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('/'):
            # Resolve against the capture url, keeping its scheme:
            # strip_scheme_www() only drops a 'www.' prefix that follows
            # a scheme, so a bare 'www.host/path' would never compare
            # equal to the (scheme-qualified) request url
            orig = urlsplit(cdx['original'].lower())
            if location_url.startswith('//'):
                # protocol-relative: only the scheme is inherited
                location_url = (orig.scheme or 'http') + ':' + location_url
            else:
                prefix = orig.netloc
                if orig.scheme:
                    prefix = orig.scheme + '://' + prefix
                location_url = prefix + location_url

        if (ReplayView.strip_scheme_www(request_url) ==
                ReplayView.strip_scheme_www(location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))

    # TODO: reevaluate this, as it may reject valid refreshes of a page
    def _reject_referrer_self_redirect(self, wbrequest):  # pragma: no cover
        """
        Perform final check for referrer based self-redirect.
        This method should be called after verifying that
        the request timestamp == capture timestamp

        If referrer is same as current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return

        # build full url even if using relative-rewriting
        request_url = (wbrequest.host_prefix +
                       wbrequest.rel_prefix + str(wbrequest.wb_url))

        if (ReplayView.strip_scheme_www(request_url) ==
                ReplayView.strip_scheme_www(wbrequest.referrer)):
            raise CaptureException('Self Redirect via Referrer: ' +
                                   str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme_www(url):
        """
        Return url without its leading scheme, slashes and 'www<N>.'
        prefix. The 'www<N>.' prefix is only removed when it follows
        a scheme. A url the pattern cannot match (one with an embedded
        newline) is returned unchanged.

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http:/example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://www2.example.com')
        True

        >>> ReplayView.strip_scheme_www('about://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('http://') ==\
            ReplayView.strip_scheme_www('')
        True

        >>> ReplayView.strip_scheme_www('#!@?') ==\
            ReplayView.strip_scheme_www('#!@?')
        True
        """
        m = ReplayView.STRIP_SCHEME_WWW.match(url)
        if m is None:
            # '.' does not match a newline, so such urls cannot match
            return url

        return m.group(2)
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module
    import doctest

    doctest.testmod()