import StringIO
from urllib2 import URLError
import chardet
import copy
import itertools

import archiveloader
from wbrequestresponse import WbResponse, StatusAndHeaders
import utils

from url_rewriter import UrlRewriter
from header_rewriter import HeaderRewriter
import html_rewriter
import regex_rewriters

import wbexceptions

#=================================================================
class ReplayView:
    def __init__(self, resolvers, loader = None):
        self.resolvers = resolvers
        self.loader = loader if loader else archiveloader.ArchiveLoader()
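
    # Illustrative sketch (assumption, not from the original source): each
    # resolver is expected to be a callable mapping a W/ARC filename to one
    # or more candidate full paths, e.g.:
    #
    #   def prefix_resolver(filename):
    #       return ['/data/archives/' + filename]
    #
    #   replay = ReplayView(resolvers = [prefix_resolver])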

    def __call__(self, wbrequest, cdx_lines, cdx_reader):
        last_e = None
        first = True

        # List of already failed w/arcs
        failed_files = []

        # Iterate over the cdx lines until one can be loaded successfully.
        # The cdx should already be sorted in closest-to-timestamp order (from the cdx server).
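        #
        # Illustrative sketch (assumption, not from the original source): each
        # cdx line behaves like a dict of capture fields, roughly:
        #
        #   {'timestamp': '20140101123456',
        #    'original': 'http://example.com/',
        #    'mimetype': 'text/html',
        #    'filename': 'example.warc.gz',
        #    'offset': '1234',
        #    'length': '5678',
        #    'digest': 'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'}
        #
        # with 'orig.filename' / 'orig.offset' / 'orig.length' added for revisits.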
        for cdx in cdx_lines:
            try:
                # optimize: can detect if a redirect is needed just from the cdx,
                # no need to load the w/arc data yet
                if first:
                    self._redirect_if_needed(wbrequest, cdx)
                    first = False

                (cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)

                return self.make_response(wbrequest, cdx, status_headers, stream)

            except wbexceptions.CaptureException as ce:
                import traceback
                traceback.print_exc()
                last_e = ce

        if last_e:
            raise last_e
        else:
            raise wbexceptions.UnresolvedArchiveFileException()

    # callback to issue a redirect to another request;
    # subclasses may provide custom logic (see RewritingReplayView._redirect_if_needed below)
    def _redirect_if_needed(self, wbrequest, cdx):
        pass

    def _load(self, cdx, revisit, failed_files):
        if revisit:
            (filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
        else:
            (filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])

        # optimization: if the same file already failed during this request, don't try it again
        if failed_files and filename in failed_files:
            raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')

        any_found = False
        last_exc = None
        for resolver in self.resolvers:
            possible_paths = resolver(filename)

            if possible_paths:
                for path in possible_paths:
                    any_found = True
                    try:
                        return self.loader.load(path, offset, length)

                    except URLError as ue:
                        last_exc = ue
                        print last_exc

        # Unsuccessful if reached here; record the failure for this request
        # (note: must test against None, an empty list is falsy but still writable)
        if failed_files is not None:
            failed_files.append(filename)

        if not any_found:
            raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
        else:
            raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
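
    # Illustrative note (assumed interface, not from the original source):
    # self.loader.load(path, offset, length) reads a single W/ARC record at
    # the given byte offset/length and returns an object exposing (at least)
    # .rec_headers, .status_headers and .stream, e.g.:
    #
    #   record = loader.load('/data/archives/example.warc.gz', '1234', '5678')
    #   print record.status_headers.statusline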

    def resolve_headers_and_payload(self, cdx, wbrequest, cdx_reader, failed_files):
        has_curr = (cdx['filename'] != '-')
        has_orig = (cdx.get('orig.filename', '-') != '-')

        # load headers record from cdx['filename'] unless it is '-' (rare)
        headers_record = self._load(cdx, False, failed_files) if has_curr else None

        payload_record = None

        # two index lookups
        # Case 1: mimetype is still warc/revisit
        if cdx['mimetype'] == 'warc/revisit' and headers_record:
            payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files)

        # single lookup cases
        # Case 2: non-revisit
        elif (has_curr and not has_orig):
            payload_record = headers_record

        # Case 3: identical url revisit, load payload from orig.filename
        elif (has_orig):
            payload_record = self._load(cdx, True, failed_files)

        # special case: set headers to payload if old-style revisit with missing headers
        if not headers_record:
            headers_record = payload_record
        elif headers_record != payload_record:
            # close remainder of stream as this record is only used for (already parsed) headers
            headers_record.stream.close()

            # special case: if the headers record is actually empty (eg empty revisit),
            # use the headers from the payload record instead
            if not headers_record.status_headers.headers:
                headers_record = payload_record

        if not headers_record or not payload_record:
            raise wbexceptions.CaptureException('Invalid CDX: ' + str(cdx))

        return (cdx, headers_record.status_headers, payload_record.stream)

    # done here! just return the response;
    # subclasses may override to do additional processing
    def make_response(self, wbrequest, cdx, status_headers, stream):
        return self.create_stream_response(status_headers, stream)

    # create a response from the headers, wrapping the stream in a generator
    def create_stream_response(self, status_headers, stream):
        return WbResponse(status_headers, self.create_stream_gen(stream))

    # Handle the case where a duplicate of a capture with the same digest
    # exists at a different url. Must query the index at that url, filtering
    # by matching digest. Raise an exception if no matches found.
    def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files):
        ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI')

        # Check for unresolved revisit error: refers-to target uri not present, or same as the current url
        if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')):
            raise wbexceptions.CaptureException('Missing Revisit Original: ' + str(cdx))

        ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date')

        if not ref_target_date:
            ref_target_date = cdx['timestamp']
        else:
            ref_target_date = utils.iso_date_to_timestamp(ref_target_date)

        # clone WbRequest, pointing it at the original url and timestamp
        orig_wbreq = copy.copy(wbrequest)
        orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url)

        orig_wbreq.wb_url.url = ref_target_uri
        orig_wbreq.wb_url.timestamp = ref_target_date

        # Must also match the digest
        orig_wbreq.query_filter.append('digest:' + cdx['digest'])

        orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)

        for cdx in orig_cdx_lines:
            try:
                payload_record = self._load(cdx, False, failed_files)
                return payload_record

            except wbexceptions.CaptureException:
                pass

        raise wbexceptions.CaptureException('Original for revisit could not be loaded')
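
    # Illustrative sketch (assumption, not from the original source): the WARC
    # revisit record headers consulted above look roughly like:
    #
    #   WARC-Type: revisit
    #   WARC-Target-URI: http://www.example.com/
    #   WARC-Refers-To-Target-URI: http://example.com/
    #   WARC-Refers-To-Date: 2014-01-01T12:34:56Z
    #
    # and utils.iso_date_to_timestamp() is assumed to convert the ISO date
    # into the 14-digit form used in the cdx ('20140101123456').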

    def resolve_full(self, filename):
        # Attempt to resolve cdx file to full path
        full_url = None
        for resolver in self.resolvers:
            full_url = resolver(filename)
            if full_url:
                return full_url

        raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)

    # Create a generator reading from a stream, with optional rewriting and a final read call
    @staticmethod
    def create_stream_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
        try:
            buff = first_buff if first_buff else stream.read()
            while buff:
                if rewrite_func:
                    buff = rewrite_func(buff)
                yield buff
                buff = stream.read()

            # For adding a tail/handling the final buffer
            if final_read_func:
                buff = final_read_func()
                if buff:
                    yield buff

        finally:
            stream.close()
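
    # Illustrative usage sketch (assumption, not from the original source):
    #
    #   import StringIO
    #   src = StringIO.StringIO('hello world')
    #   gen = ReplayView.create_stream_gen(src, rewrite_func = str.upper)
    #   ''.join(gen)   # -> 'HELLO WORLD', and src is closed afterwards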

    def __str__(self):
        return 'find archive files from ' + str(self.resolvers)


#=================================================================
class RewritingReplayView(ReplayView):

    def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False):
        ReplayView.__init__(self, resolvers, loader)
        self.head_insert_view = head_insert_view
        self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
        self.redir_to_exact = redir_to_exact

        # buffer or stream the rewritten response
        self.buffer_response = buffer_response

    def _text_content_type(self, content_type):
        # REWRITE_TYPES is assumed to map a text type ('html', 'css', 'js', 'xml')
        # to a list of mime substrings identifying it; it is not defined in this
        # section
        for ctype, mimelist in self.REWRITE_TYPES.iteritems():
            if any((mime in content_type) for mime in mimelist):
                return ctype

        return None

    def make_response(self, wbrequest, cdx, status_headers, stream):
        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if a redirect to the exact capture timestamp is needed
        self._redirect_if_needed(wbrequest, cdx)

        urlrewriter = wbrequest.urlrewriter

        rewritten_headers = self.header_rewriter.rewrite(status_headers, urlrewriter)

        # de-chunking in case chunked encoding is broken
        # TODO: investigate further
        de_chunk = False

        # handle transfer-encoding: chunked
        if rewritten_headers.contains_removed_header('transfer-encoding', 'chunked'):
            stream = archiveloader.ChunkedLineReader(stream)
            de_chunk = True

        # transparent ('id_') mode, though the stream may still need de-chunking
        if wbrequest.wb_url.mod == 'id_':
            if de_chunk:
                status_headers.remove_header('transfer-encoding')

            return self.create_stream_response(status_headers, stream)

        # non-text content type: just send through with rewritten headers,
        # though the stream may still need de-chunking
        if rewritten_headers.text_type is None:
            status_headers = rewritten_headers.status_headers

            return self.create_stream_response(status_headers, stream)

        # Handle text rewriting

        # special case -- need to ungzip the body first
        if rewritten_headers.contains_removed_header('content-encoding', 'gzip'):
            stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())

        # use the declared charset if present, otherwise sniff it
        # TODO: is this right?
        if rewritten_headers.charset:
            encoding = rewritten_headers.charset
            first_buff = None
        else:
            (encoding, first_buff) = self._detect_charset(stream)

        # if chardet thinks it's ascii, use utf-8 instead
        if encoding == 'ascii':
            encoding = 'utf-8'

        text_type = rewritten_headers.text_type
        status_headers = rewritten_headers.status_headers

        if text_type == 'html':
            head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None
            rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str)
        elif text_type == 'css':
            rewriter = regex_rewriters.CSSRewriter(urlrewriter)
        elif text_type == 'js':
            rewriter = regex_rewriters.JSRewriter(urlrewriter)
        elif text_type == 'xml':
            rewriter = regex_rewriters.XMLRewriter(urlrewriter)
        else:
            raise Exception('Unknown Text Type for Rewrite: ' + text_type)

        # Create the generator for the response
        response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)

        if self.buffer_response:
            return self._create_buffer_response(status_headers, response_gen)
        else:
            return WbResponse(status_headers, value = response_gen)
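
    # Illustrative sketch (assumed behavior, not from the original source):
    # each rewriter transforms urls inside the text to point back at the
    # archive, e.g. a CSSRewriter might turn
    #
    #   url(/images/logo.png)
    # into
    #   url(/web/20140101123456/http://example.com/images/logo.png)
    #
    # while the HTMLRewriter additionally injects head_insert_str (typically
    # a banner/script template) into the <head> of the page.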

    # Buffer the rewrite generator and return a response built from a single string
    def _create_buffer_response(self, status_headers, generator):
        out = StringIO.StringIO()

        try:
            for buff in generator:
                out.write(buff)

        finally:
            content = out.getvalue()

            content_length_str = str(len(content))
            status_headers.headers.append(('Content-Length', content_length_str))
            out.close()

        return WbResponse(status_headers, value = [content])

    # Create a rewrite response streamed from the record (no Content-Length);
    # may even be chunked by the front-end
    def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None):
        def do_rewrite(buff):
            if encoding:
                buff = self._decode_buff(buff, stream, encoding)

            buff = rewriter.rewrite(buff)

            if encoding:
                buff = buff.encode(encoding)

            return buff

        def do_finish():
            return rewriter.close()

        return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)

    def _decode_buff(self, buff, stream, encoding):
        try:
            buff = buff.decode(encoding)
        except UnicodeDecodeError:
            # the chunk boundary may have cut apart a multi-byte character --
            # add 1-3 bytes and retry
            for i in range(3):
                buff += stream.read(1)
                try:
                    buff = buff.decode(encoding)
                    break
                except UnicodeDecodeError:
                    pass
            else:
                raise

        return buff
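
    # Illustrative note (assumption, not from the original source): a utf-8
    # character such as u'\u00e9' is encoded as two bytes ('\xc3\xa9'); if a
    # read boundary falls between them, the first decode attempt fails and
    # the retry loop above pulls in the remaining byte(s):
    #
    #   '\xc3'.decode('utf-8')              # raises UnicodeDecodeError
    #   ('\xc3' + '\xa9').decode('utf-8')   # -> u'\xe9'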

    def _detect_charset(self, stream):
        buff = stream.read(8192)
        result = chardet.detect(buff)
        print "chardet result: " + str(result)
        return (result['encoding'], buff)
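
    # For reference, chardet.detect() returns a dict such as
    # {'encoding': 'utf-8', 'confidence': 0.99}; only the encoding is used
    # here, and the sniffed buffer is passed along as first_buff so no
    # bytes are lost.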

    def _redirect_if_needed(self, wbrequest, cdx):
        is_proxy = wbrequest.is_proxy
        if self.redir_to_exact and not is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
            new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
            raise wbexceptions.InternalRedirect(new_url)

        return None
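
    # Illustrative sketch (assumed url layout, not from the original source):
    # a request for /web/20140101000000/http://example.com/ whose closest
    # capture is actually 20140103130303 raises InternalRedirect to
    # /web/20140103130303/http://example.com/, so the address bar always
    # shows the exact capture timestamp.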

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        if status_headers.statusline.startswith('3'):
            request_url = wbrequest.wb_url.url.lower()
            location_url = status_headers.get_header('Location')

            # guard against a 3xx response with no Location header
            if not location_url:
                return

            location_url = location_url.lower()

            #TODO: canonicalize before testing?
            if UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url):
                raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))
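

#=================================================================
# Illustrative wiring sketch (assumptions, not from the original source):
# 'my_resolver' and 'my_head_insert_view' are hypothetical stand-ins for a
# path resolver and a rendered banner/head-insert template.
#
#   def my_resolver(filename):
#       return ['/data/archives/' + filename]
#
#   replay_view = RewritingReplayView(resolvers = [my_resolver],
#                                     head_insert_view = my_head_insert_view,
#                                     buffer_response = True)
#
# The view is then called with a wbrequest, an iterable of cdx lines and a
# cdx reader, returning a WbResponse suitable for the WSGI front-end.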