mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
pep8 pass: fix spacing, line length, issues
also remove references to obsolete cached_replay, hostnames in pywb_init
This commit is contained in:
parent
51919ed1e7
commit
181c18a1b8
@ -106,7 +106,6 @@ class FuzzyQuery:
|
|||||||
if inx > 0:
|
if inx > 0:
|
||||||
url = url[:inx + 1]
|
url = url[:inx + 1]
|
||||||
|
|
||||||
|
|
||||||
if matched_rule.match_type == 'domain':
|
if matched_rule.match_type == 'domain':
|
||||||
host = urlparse.urlsplit(url).netloc
|
host = urlparse.urlsplit(url).netloc
|
||||||
# remove the subdomain
|
# remove the subdomain
|
||||||
@ -174,8 +173,8 @@ class CDXDomainSpecificRule(BaseRule):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def make_query_match_regex(params_list):
|
def make_query_match_regex(params_list):
|
||||||
r"""
|
r"""
|
||||||
>>> CDXDomainSpecificRule.make_query_match_regex(['param1', 'id', 'abc'])
|
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
|
||||||
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](param1=[^&]+)'
|
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||||
|
|
||||||
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
|
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
|
||||||
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
|
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
|
||||||
|
@ -44,7 +44,7 @@ class CDXObject(OrderedDict):
|
|||||||
["urlkey", "timestamp", "original", "mimetype", "statuscode",
|
["urlkey", "timestamp", "original", "mimetype", "statuscode",
|
||||||
"digest", "redirect", "offset", "filename",
|
"digest", "redirect", "offset", "filename",
|
||||||
"orig.length", "orig.offset", "orig.filename"]
|
"orig.length", "orig.offset", "orig.filename"]
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, cdxline=''):
|
def __init__(self, cdxline=''):
|
||||||
OrderedDict.__init__(self)
|
OrderedDict.__init__(self)
|
||||||
|
@ -169,8 +169,8 @@ def cdx_filter(cdx_iter, filter_strings):
|
|||||||
# no field set, apply filter to entire cdx
|
# no field set, apply filter to entire cdx
|
||||||
if len(parts) == 1:
|
if len(parts) == 1:
|
||||||
self.field = ''
|
self.field = ''
|
||||||
else:
|
|
||||||
# apply filter to cdx[field]
|
# apply filter to cdx[field]
|
||||||
|
else:
|
||||||
self.field = parts[0]
|
self.field = parts[0]
|
||||||
string = parts[1]
|
string = parts[1]
|
||||||
|
|
||||||
|
@ -194,7 +194,7 @@ def main(args=None):
|
|||||||
help=('use specified root cert (.pem file) ' +
|
help=('use specified root cert (.pem file) ' +
|
||||||
'to create signed cert'))
|
'to create signed cert'))
|
||||||
|
|
||||||
parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
|
parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
|
||||||
help='name for root certificate')
|
help='name for root certificate')
|
||||||
|
|
||||||
parser.add_argument('-d', '--certs-dir', default=CERTS_DIR)
|
parser.add_argument('-d', '--certs-dir', default=CERTS_DIR)
|
||||||
|
@ -159,7 +159,7 @@ class ProxyRouter(object):
|
|||||||
if env['pywb.proxy_host'] == self.magic_name:
|
if env['pywb.proxy_host'] == self.magic_name:
|
||||||
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
|
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
|
||||||
|
|
||||||
# special case for proxy install
|
# special case for proxy install
|
||||||
response = self.handle_cert_install(env)
|
response = self.handle_cert_install(env)
|
||||||
if response:
|
if response:
|
||||||
return response
|
return response
|
||||||
@ -307,7 +307,7 @@ class ProxyRouter(object):
|
|||||||
|
|
||||||
name = name.replace('-', '_').upper()
|
name = name.replace('-', '_').upper()
|
||||||
|
|
||||||
if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
|
if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
|
||||||
name = 'HTTP_' + name
|
name = 'HTTP_' + name
|
||||||
|
|
||||||
env[name] = value
|
env[name] = value
|
||||||
|
@ -83,8 +83,8 @@ class WbRequest(object):
|
|||||||
rewrite_opts)
|
rewrite_opts)
|
||||||
|
|
||||||
self.urlrewriter.deprefix_url()
|
self.urlrewriter.deprefix_url()
|
||||||
else:
|
|
||||||
# no wb_url, just store blank wb_url
|
# no wb_url, just store blank wb_url
|
||||||
|
else:
|
||||||
self.wb_url = None
|
self.wb_url = None
|
||||||
self.urlrewriter = None
|
self.urlrewriter = None
|
||||||
|
|
||||||
@ -113,6 +113,7 @@ class WbRequest(object):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
RANGE_ARG_RX = re.compile('.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')
|
RANGE_ARG_RX = re.compile('.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')
|
||||||
|
|
||||||
RANGE_HEADER = re.compile('bytes=(\d+)-(\d+)?')
|
RANGE_HEADER = re.compile('bytes=(\d+)-(\d+)?')
|
||||||
|
|
||||||
def extract_range(self):
|
def extract_range(self):
|
||||||
|
@ -73,6 +73,8 @@ class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
|
|||||||
|
|
||||||
self._remove_age_opts(morsel)
|
self._remove_age_opts(morsel)
|
||||||
return morsel
|
return morsel
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
||||||
"""
|
"""
|
||||||
|
@ -33,7 +33,8 @@ class HeaderRewriter:
|
|||||||
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
||||||
}
|
}
|
||||||
|
|
||||||
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range', 'accept-ranges']
|
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range',
|
||||||
|
'accept-ranges']
|
||||||
|
|
||||||
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
|
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
|
||||||
|
|
||||||
|
@ -172,7 +172,7 @@ class HTMLRewriterMixin(object):
|
|||||||
|
|
||||||
# special case: inline JS/event handler
|
# special case: inline JS/event handler
|
||||||
if ((attr_value and attr_value.startswith('javascript:'))
|
if ((attr_value and attr_value.startswith('javascript:'))
|
||||||
or attr_name.startswith('on')):
|
or attr_name.startswith('on')):
|
||||||
attr_value = self._rewrite_script(attr_value)
|
attr_value = self._rewrite_script(attr_value)
|
||||||
|
|
||||||
# special case: inline CSS/style attribute
|
# special case: inline CSS/style attribute
|
||||||
@ -193,7 +193,7 @@ class HTMLRewriterMixin(object):
|
|||||||
# don't rewrite rel=canonical
|
# don't rewrite rel=canonical
|
||||||
elif tag == 'link' and attr_name == 'href':
|
elif tag == 'link' and attr_name == 'href':
|
||||||
if (self.opts.get('rewrite_rel_canon', True) or
|
if (self.opts.get('rewrite_rel_canon', True) or
|
||||||
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
|
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
|
||||||
rw_mod = handler.get(attr_name)
|
rw_mod = handler.get(attr_name)
|
||||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||||
|
|
||||||
|
@ -123,7 +123,6 @@ class JSLinkRewriterMixin(object):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class JSLocationRewriterMixin(object):
|
class JSLocationRewriterMixin(object):
|
||||||
#class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
|
|
||||||
"""
|
"""
|
||||||
JS Rewriter mixin which rewrites location and domain to the
|
JS Rewriter mixin which rewrites location and domain to the
|
||||||
specified prefix (default: 'WB_wombat_')
|
specified prefix (default: 'WB_wombat_')
|
||||||
@ -131,23 +130,23 @@ class JSLocationRewriterMixin(object):
|
|||||||
|
|
||||||
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
|
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
|
||||||
rules = rules + [
|
rules = rules + [
|
||||||
(r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
|
(r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
|
||||||
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
||||||
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
|
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
|
||||||
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
|
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
|
||||||
|
|
||||||
#todo: move to mixin?
|
#todo: move to mixin?
|
||||||
(r'(?:[\s=(){]|^)(top)(?:[\s!=}()]|$)',
|
(r'(?:[\s=(){]|^)(top)(?:[\s!=}()]|$)',
|
||||||
RegexRewriter.add_prefix(prefix), 1),
|
RegexRewriter.add_prefix(prefix), 1),
|
||||||
|
|
||||||
(r'(?<=window\.)top',
|
(r'(?<=window\.)top',
|
||||||
RegexRewriter.add_prefix(prefix), 0),
|
RegexRewriter.add_prefix(prefix), 0),
|
||||||
|
|
||||||
# (r'\b(top)\b[!=\W]+(?:self|window)',
|
# (r'\b(top)\b[!=\W]+(?:self|window)',
|
||||||
# RegexRewriter.add_prefix(prefix), 1),
|
# RegexRewriter.add_prefix(prefix), 1),
|
||||||
|
|
||||||
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
|
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
|
||||||
#RegexRewriter.add_prefix(prefix), 1),
|
#RegexRewriter.add_prefix(prefix), 1),
|
||||||
]
|
]
|
||||||
super(JSLocationRewriterMixin, self).__init__(rewriter, rules)
|
super(JSLocationRewriterMixin, self).__init__(rewriter, rules)
|
||||||
|
|
||||||
@ -161,6 +160,7 @@ class JSLocationOnlyRewriter(JSLocationRewriterMixin, RegexRewriter):
|
|||||||
class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter):
|
class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class JSLinkAndLocationRewriter(JSLocationRewriterMixin,
|
class JSLinkAndLocationRewriter(JSLocationRewriterMixin,
|
||||||
JSLinkRewriterMixin,
|
JSLinkRewriterMixin,
|
||||||
@ -190,9 +190,9 @@ class XMLRewriter(RegexRewriter):
|
|||||||
|
|
||||||
def _create_rules(self, rewriter):
|
def _create_rules(self, rewriter):
|
||||||
return [
|
return [
|
||||||
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
|
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
|
||||||
RegexRewriter.HTTPX_MATCH_STR + ')',
|
RegexRewriter.HTTPX_MATCH_STR + ')',
|
||||||
RegexRewriter.archival_rewrite(rewriter), 2),
|
RegexRewriter.archival_rewrite(rewriter), 2),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@ -210,9 +210,9 @@ class CSSRewriter(RegexRewriter):
|
|||||||
|
|
||||||
def _create_rules(self, rewriter):
|
def _create_rules(self, rewriter):
|
||||||
return [
|
return [
|
||||||
(CSSRewriter.CSS_URL_REGEX,
|
(CSSRewriter.CSS_URL_REGEX,
|
||||||
RegexRewriter.archival_rewrite(rewriter), 1),
|
RegexRewriter.archival_rewrite(rewriter), 1),
|
||||||
|
|
||||||
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
|
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
|
||||||
RegexRewriter.archival_rewrite(rewriter), 1),
|
RegexRewriter.archival_rewrite(rewriter), 1),
|
||||||
]
|
]
|
||||||
|
@ -76,7 +76,7 @@ class RewriteContent:
|
|||||||
wb_url = urlrewriter.wburl
|
wb_url = urlrewriter.wburl
|
||||||
|
|
||||||
if (wb_url.is_identity or
|
if (wb_url.is_identity or
|
||||||
(not head_insert_func and wb_url.is_banner_only)):
|
(not head_insert_func and wb_url.is_banner_only)):
|
||||||
status_headers, stream = self.sanitize_content(headers, stream)
|
status_headers, stream = self.sanitize_content(headers, stream)
|
||||||
return (status_headers, self.stream_to_gen(stream), False)
|
return (status_headers, self.stream_to_gen(stream), False)
|
||||||
|
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
from pywb.utils.dsrules import BaseRule
|
from pywb.utils.dsrules import BaseRule
|
||||||
|
|
||||||
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||||
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter, JSLocationOnlyRewriter
|
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||||
|
from regex_rewriters import JSLocationOnlyRewriter
|
||||||
|
|
||||||
from header_rewriter import HeaderRewriter
|
from header_rewriter import HeaderRewriter
|
||||||
from html_rewriter import HTMLRewriter
|
from html_rewriter import HTMLRewriter
|
||||||
|
@ -35,13 +35,13 @@ class UrlRewriter(object):
|
|||||||
return url
|
return url
|
||||||
|
|
||||||
if (self.prefix and
|
if (self.prefix and
|
||||||
self.prefix != '/' and
|
self.prefix != '/' and
|
||||||
url.startswith(self.prefix)):
|
url.startswith(self.prefix)):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
if (self.full_prefix and
|
if (self.full_prefix and
|
||||||
self.full_prefix != self.prefix and
|
self.full_prefix != self.prefix and
|
||||||
url.startswith(self.full_prefix)):
|
url.startswith(self.full_prefix)):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
wburl = self.wburl
|
wburl = self.wburl
|
||||||
|
@ -41,6 +41,7 @@ wayback url format.
|
|||||||
import re
|
import re
|
||||||
import urllib
|
import urllib
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class BaseWbUrl(object):
|
class BaseWbUrl(object):
|
||||||
QUERY = 'query'
|
QUERY = 'query'
|
||||||
@ -107,7 +108,8 @@ class WbUrl(BaseWbUrl):
|
|||||||
m = self.PARTIAL_ENC_RX.match(self.url)
|
m = self.PARTIAL_ENC_RX.match(self.url)
|
||||||
if m:
|
if m:
|
||||||
len_ = len(m.group(0))
|
len_ = len(m.group(0))
|
||||||
self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
|
self.url = (urllib.unquote_plus(self.url[:len_]) +
|
||||||
|
self.url[len_:])
|
||||||
inx = self.url.find(':/')
|
inx = self.url.find(':/')
|
||||||
|
|
||||||
if inx < 0:
|
if inx < 0:
|
||||||
@ -160,7 +162,6 @@ class WbUrl(BaseWbUrl):
|
|||||||
self.timestamp = timestamp
|
self.timestamp = timestamp
|
||||||
self.type = self.REPLAY
|
self.type = self.REPLAY
|
||||||
|
|
||||||
|
|
||||||
def deprefix_url(self, prefix):
|
def deprefix_url(self, prefix):
|
||||||
prefix = urllib.quote_plus(prefix)
|
prefix = urllib.quote_plus(prefix)
|
||||||
rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?'
|
rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?'
|
||||||
|
@ -173,7 +173,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
|||||||
|
|
||||||
# if tld, use com, as start_key
|
# if tld, use com, as start_key
|
||||||
# otherwise, stick with com,example)/
|
# otherwise, stick with com,example)/
|
||||||
if not ',' in host:
|
if ',' not in host:
|
||||||
start_key = host + ','
|
start_key = host + ','
|
||||||
else:
|
else:
|
||||||
start_key = host + ')/'
|
start_key = host + ')/'
|
||||||
|
@ -277,7 +277,7 @@ def create_record_iter(arcv_iter, options):
|
|||||||
compute_digest = False
|
compute_digest = False
|
||||||
|
|
||||||
if (entry.digest == '-' and
|
if (entry.digest == '-' and
|
||||||
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
||||||
|
|
||||||
compute_digest = True
|
compute_digest = True
|
||||||
|
|
||||||
@ -315,11 +315,11 @@ def join_request_records(entry_iter, options):
|
|||||||
|
|
||||||
# check for concurrency also
|
# check for concurrency also
|
||||||
elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
|
elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
|
||||||
prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
|
prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
elif (entry.merge_request_data(prev_entry, options) or
|
elif (entry.merge_request_data(prev_entry, options) or
|
||||||
prev_entry.merge_request_data(entry, options)):
|
prev_entry.merge_request_data(entry, options)):
|
||||||
yield prev_entry
|
yield prev_entry
|
||||||
yield entry
|
yield entry
|
||||||
prev_entry = None
|
prev_entry = None
|
||||||
@ -423,7 +423,7 @@ def create_index_iter(fh, **options):
|
|||||||
|
|
||||||
for entry in entry_iter:
|
for entry in entry_iter:
|
||||||
if (entry.record.rec_type in ('request', 'warcinfo') and
|
if (entry.record.rec_type in ('request', 'warcinfo') and
|
||||||
not options.get('include_all')):
|
not options.get('include_all')):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
yield entry
|
yield entry
|
||||||
|
@ -42,7 +42,7 @@ class CDXAPIHandler(BaseHandler):
|
|||||||
if name != 'filter':
|
if name != 'filter':
|
||||||
params[name] = val[0]
|
params[name] = val[0]
|
||||||
|
|
||||||
if not 'output' in params:
|
if 'output' not in params:
|
||||||
params['output'] = 'text'
|
params['output'] = 'text'
|
||||||
elif params['output'] not in ('text'):
|
elif params['output'] not in ('text'):
|
||||||
params['output'] = 'text'
|
params['output'] = 'text'
|
||||||
|
@ -16,7 +16,6 @@ from pywb.warc.resolvingloader import ResolvingLoader
|
|||||||
|
|
||||||
from views import J2TemplateView
|
from views import J2TemplateView
|
||||||
from replay_views import ReplayView
|
from replay_views import ReplayView
|
||||||
from cached_replay import CachedReplayView
|
|
||||||
from pywb.framework.memento import MementoResponse
|
from pywb.framework.memento import MementoResponse
|
||||||
from pywb.utils.timeutils import datetime_to_timestamp
|
from pywb.utils.timeutils import datetime_to_timestamp
|
||||||
|
|
||||||
@ -65,8 +64,8 @@ class SearchPageWbUrlHandler(WbUrlHandler):
|
|||||||
# render top level frame if in frame mode
|
# render top level frame if in frame mode
|
||||||
# (not supported in proxy mode)
|
# (not supported in proxy mode)
|
||||||
if (self.is_frame_mode and wbrequest.wb_url and
|
if (self.is_frame_mode and wbrequest.wb_url and
|
||||||
not wbrequest.wb_url.is_query() and
|
not wbrequest.wb_url.is_query() and
|
||||||
not wbrequest.options['is_proxy']):
|
not wbrequest.options['is_proxy']):
|
||||||
|
|
||||||
if wbrequest.wb_url.is_top_frame:
|
if wbrequest.wb_url.is_top_frame:
|
||||||
return self.get_top_frame_response(wbrequest)
|
return self.get_top_frame_response(wbrequest)
|
||||||
@ -154,8 +153,8 @@ class WBHandler(SearchPageWbUrlHandler):
|
|||||||
|
|
||||||
def handle_not_found(self, wbrequest, nfe):
|
def handle_not_found(self, wbrequest, nfe):
|
||||||
if (not self.fallback_handler or
|
if (not self.fallback_handler or
|
||||||
wbrequest.wb_url.is_query() or
|
wbrequest.wb_url.is_query() or
|
||||||
wbrequest.wb_url.is_identity):
|
wbrequest.wb_url.is_identity):
|
||||||
raise
|
raise
|
||||||
|
|
||||||
return self.fallback_handler(wbrequest)
|
return self.fallback_handler(wbrequest)
|
||||||
|
@ -89,7 +89,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
if rangeres:
|
if rangeres:
|
||||||
url, start, end, use_206 = rangeres
|
url, start, end, use_206 = rangeres
|
||||||
|
|
||||||
# if bytes=0- Range request, simply remove the range and still proxy
|
# if bytes=0- Range request,
|
||||||
|
# simply remove the range and still proxy
|
||||||
if start == 0 and not end and use_206:
|
if start == 0 and not end and use_206:
|
||||||
wbrequest.wb_url.url = url
|
wbrequest.wb_url.url = url
|
||||||
del wbrequest.env['HTTP_RANGE']
|
del wbrequest.env['HTTP_RANGE']
|
||||||
@ -111,10 +112,12 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
wbresponse = self._make_response(wbrequest, *result)
|
wbresponse = self._make_response(wbrequest, *result)
|
||||||
|
|
||||||
if readd_range:
|
if readd_range:
|
||||||
content_length = wbresponse.status_headers.get_header('Content-Length')
|
content_length = (wbresponse.status_headers.
|
||||||
|
get_header('Content-Length'))
|
||||||
try:
|
try:
|
||||||
content_length = int(content_length)
|
content_length = int(content_length)
|
||||||
wbresponse.status_headers.add_range(0, content_length, content_length)
|
wbresponse.status_headers.add_range(0, content_length,
|
||||||
|
content_length)
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -165,7 +168,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
verify=False,
|
verify=False,
|
||||||
stream=True)
|
stream=True)
|
||||||
|
|
||||||
# don't actually read whole response, proxy response for writing it
|
# don't actually read whole response,
|
||||||
|
# proxy response for writing it
|
||||||
resp.close()
|
resp.close()
|
||||||
except:
|
except:
|
||||||
del self._cache[key]
|
del self._cache[key]
|
||||||
@ -176,6 +180,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
resp = self.get_video_info(wbrequest,
|
resp = self.get_video_info(wbrequest,
|
||||||
info_url=referrer,
|
info_url=referrer,
|
||||||
video_url=url)
|
video_url=url)
|
||||||
|
|
||||||
def wrap_buff_gen(gen):
|
def wrap_buff_gen(gen):
|
||||||
for x in gen:
|
for x in gen:
|
||||||
yield x
|
yield x
|
||||||
|
@ -24,7 +24,6 @@ import logging
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
DEFAULTS = {
|
DEFAULTS = {
|
||||||
'hostpaths': ['http://localhost:8080'],
|
|
||||||
'collections': {'pywb': './sample_archive/cdx/'},
|
'collections': {'pywb': './sample_archive/cdx/'},
|
||||||
'archive_paths': './sample_archive/warcs/',
|
'archive_paths': './sample_archive/warcs/',
|
||||||
|
|
||||||
@ -153,13 +152,6 @@ def create_wb_router(passed_config={}):
|
|||||||
|
|
||||||
routes = []
|
routes = []
|
||||||
|
|
||||||
# TODO: examine this more
|
|
||||||
hostname = os.environ.get('PYWB_HOST_NAME')
|
|
||||||
if hostname:
|
|
||||||
hostpaths = [hostname]
|
|
||||||
else:
|
|
||||||
hostpaths = config.get('hostpaths')
|
|
||||||
|
|
||||||
port = config.get('port')
|
port = config.get('port')
|
||||||
|
|
||||||
# collections based on cdx source
|
# collections based on cdx source
|
||||||
@ -238,18 +230,18 @@ def create_wb_router(passed_config={}):
|
|||||||
router = ProxyArchivalRouter
|
router = ProxyArchivalRouter
|
||||||
|
|
||||||
view = J2TemplateView.create_template(
|
view = J2TemplateView.create_template(
|
||||||
config.get('proxy_select_html'),
|
config.get('proxy_select_html'),
|
||||||
'Proxy Coll Selector')
|
'Proxy Coll Selector')
|
||||||
|
|
||||||
if not 'proxy_options' in passed_config:
|
if 'proxy_options' not in passed_config:
|
||||||
passed_config['proxy_options'] = {}
|
passed_config['proxy_options'] = {}
|
||||||
|
|
||||||
if view:
|
if view:
|
||||||
passed_config['proxy_options']['proxy_select_view'] = view
|
passed_config['proxy_options']['proxy_select_view'] = view
|
||||||
|
|
||||||
view = J2TemplateView.create_template(
|
view = J2TemplateView.create_template(
|
||||||
config.get('proxy_cert_download_html'),
|
config.get('proxy_cert_download_html'),
|
||||||
'Proxy Cert Download')
|
'Proxy Cert Download')
|
||||||
|
|
||||||
if view:
|
if view:
|
||||||
passed_config['proxy_options']['proxy_cert_download_view'] = view
|
passed_config['proxy_options']['proxy_cert_download_view'] = view
|
||||||
@ -257,11 +249,6 @@ def create_wb_router(passed_config={}):
|
|||||||
# Finally, create wb router
|
# Finally, create wb router
|
||||||
return router(
|
return router(
|
||||||
routes,
|
routes,
|
||||||
# Specify hostnames that pywb will be running on
|
|
||||||
# This will help catch occasionally missed rewrites that
|
|
||||||
# fall-through to the host
|
|
||||||
# (See archivalrouter.ReferRedirect)
|
|
||||||
hostpaths=hostpaths,
|
|
||||||
port=port,
|
port=port,
|
||||||
|
|
||||||
abs_path=config.get('absolute_paths', True),
|
abs_path=config.get('absolute_paths', True),
|
||||||
|
@ -141,7 +141,7 @@ class QueryHandler(object):
|
|||||||
'limit': limit,
|
'limit': limit,
|
||||||
'fl': ('urlkey,original,timestamp,' +
|
'fl': ('urlkey,original,timestamp,' +
|
||||||
'endtimestamp,groupcount,uniqcount'),
|
'endtimestamp,groupcount,uniqcount'),
|
||||||
'filter':[],
|
'filter': [],
|
||||||
},
|
},
|
||||||
|
|
||||||
wburl.REPLAY:
|
wburl.REPLAY:
|
||||||
|
@ -18,7 +18,7 @@ class RangeCache(object):
|
|||||||
atexit.register(self.cleanup)
|
atexit.register(self.cleanup)
|
||||||
|
|
||||||
def cleanup(self):
|
def cleanup(self):
|
||||||
if self.temp_dir: # pragma: no cover
|
if self.temp_dir: # pragma: no cover
|
||||||
import shutil
|
import shutil
|
||||||
print('Removing: ' + self.temp_dir)
|
print('Removing: ' + self.temp_dir)
|
||||||
shutil.rmtree(self.temp_dir, True)
|
shutil.rmtree(self.temp_dir, True)
|
||||||
@ -28,7 +28,7 @@ class RangeCache(object):
|
|||||||
url, start, end, use_206):
|
url, start, end, use_206):
|
||||||
|
|
||||||
key = digest
|
key = digest
|
||||||
if not key in self.cache:
|
if key not in self.cache:
|
||||||
wbrequest.custom_params['noredir'] = True
|
wbrequest.custom_params['noredir'] = True
|
||||||
response = wbresponse_func()
|
response = wbresponse_func()
|
||||||
|
|
||||||
|
@ -229,8 +229,8 @@ class ReplayView(object):
|
|||||||
url=cdx['original']))
|
url=cdx['original']))
|
||||||
|
|
||||||
if wbrequest.method == 'POST':
|
if wbrequest.method == 'POST':
|
||||||
# FF shows a confirm dialog, so can't use 307 effectively
|
# FF shows a confirm dialog, so can't use 307 effectively
|
||||||
# statusline = '307 Same-Method Internal Redirect'
|
# was: statusline = '307 Same-Method Internal Redirect'
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
statusline = '302 Internal Redirect'
|
statusline = '302 Internal Redirect'
|
||||||
@ -252,7 +252,7 @@ class ReplayView(object):
|
|||||||
|
|
||||||
# skip all 304s
|
# skip all 304s
|
||||||
if (status_headers.statusline.startswith('304') and
|
if (status_headers.statusline.startswith('304') and
|
||||||
not wbrequest.wb_url.is_identity):
|
not wbrequest.wb_url.is_identity):
|
||||||
|
|
||||||
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
|
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user