mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
pep8 pass: fix spacing, line length, issues
also remove references to obsolete cached_replay, hostnames in pywb_init
This commit is contained in:
parent
51919ed1e7
commit
181c18a1b8
@ -106,7 +106,6 @@ class FuzzyQuery:
|
||||
if inx > 0:
|
||||
url = url[:inx + 1]
|
||||
|
||||
|
||||
if matched_rule.match_type == 'domain':
|
||||
host = urlparse.urlsplit(url).netloc
|
||||
# remove the subdomain
|
||||
@ -174,8 +173,8 @@ class CDXDomainSpecificRule(BaseRule):
|
||||
@staticmethod
|
||||
def make_query_match_regex(params_list):
|
||||
r"""
|
||||
>>> CDXDomainSpecificRule.make_query_match_regex(['param1', 'id', 'abc'])
|
||||
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](param1=[^&]+)'
|
||||
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
|
||||
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||
|
||||
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
|
||||
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
|
||||
|
@ -44,7 +44,7 @@ class CDXObject(OrderedDict):
|
||||
["urlkey", "timestamp", "original", "mimetype", "statuscode",
|
||||
"digest", "redirect", "offset", "filename",
|
||||
"orig.length", "orig.offset", "orig.filename"]
|
||||
]
|
||||
]
|
||||
|
||||
def __init__(self, cdxline=''):
|
||||
OrderedDict.__init__(self)
|
||||
|
@ -169,8 +169,8 @@ def cdx_filter(cdx_iter, filter_strings):
|
||||
# no field set, apply filter to entire cdx
|
||||
if len(parts) == 1:
|
||||
self.field = ''
|
||||
else:
|
||||
# apply filter to cdx[field]
|
||||
else:
|
||||
self.field = parts[0]
|
||||
string = parts[1]
|
||||
|
||||
|
@ -194,7 +194,7 @@ def main(args=None):
|
||||
help=('use specified root cert (.pem file) ' +
|
||||
'to create signed cert'))
|
||||
|
||||
parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
|
||||
parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
|
||||
help='name for root certificate')
|
||||
|
||||
parser.add_argument('-d', '--certs-dir', default=CERTS_DIR)
|
||||
|
@ -159,7 +159,7 @@ class ProxyRouter(object):
|
||||
if env['pywb.proxy_host'] == self.magic_name:
|
||||
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
|
||||
|
||||
# special case for proxy install
|
||||
# special case for proxy install
|
||||
response = self.handle_cert_install(env)
|
||||
if response:
|
||||
return response
|
||||
@ -307,7 +307,7 @@ class ProxyRouter(object):
|
||||
|
||||
name = name.replace('-', '_').upper()
|
||||
|
||||
if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
|
||||
if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
|
||||
name = 'HTTP_' + name
|
||||
|
||||
env[name] = value
|
||||
|
@ -83,8 +83,8 @@ class WbRequest(object):
|
||||
rewrite_opts)
|
||||
|
||||
self.urlrewriter.deprefix_url()
|
||||
else:
|
||||
# no wb_url, just store blank wb_url
|
||||
else:
|
||||
self.wb_url = None
|
||||
self.urlrewriter = None
|
||||
|
||||
@ -113,6 +113,7 @@ class WbRequest(object):
|
||||
return False
|
||||
|
||||
RANGE_ARG_RX = re.compile('.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')
|
||||
|
||||
RANGE_HEADER = re.compile('bytes=(\d+)-(\d+)?')
|
||||
|
||||
def extract_range(self):
|
||||
|
@ -73,6 +73,8 @@ class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
|
||||
self._remove_age_opts(morsel)
|
||||
return morsel
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
"""
|
||||
|
@ -33,7 +33,8 @@ class HeaderRewriter:
|
||||
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
||||
}
|
||||
|
||||
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range', 'accept-ranges']
|
||||
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range',
|
||||
'accept-ranges']
|
||||
|
||||
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
|
||||
|
||||
|
@ -172,7 +172,7 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
# special case: inline JS/event handler
|
||||
if ((attr_value and attr_value.startswith('javascript:'))
|
||||
or attr_name.startswith('on')):
|
||||
or attr_name.startswith('on')):
|
||||
attr_value = self._rewrite_script(attr_value)
|
||||
|
||||
# special case: inline CSS/style attribute
|
||||
@ -193,7 +193,7 @@ class HTMLRewriterMixin(object):
|
||||
# don't rewrite rel=canonical
|
||||
elif tag == 'link' and attr_name == 'href':
|
||||
if (self.opts.get('rewrite_rel_canon', True) or
|
||||
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
|
||||
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
|
||||
rw_mod = handler.get(attr_name)
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
|
@ -123,7 +123,6 @@ class JSLinkRewriterMixin(object):
|
||||
|
||||
#=================================================================
|
||||
class JSLocationRewriterMixin(object):
|
||||
#class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
|
||||
"""
|
||||
JS Rewriter mixin which rewrites location and domain to the
|
||||
specified prefix (default: 'WB_wombat_')
|
||||
@ -131,23 +130,23 @@ class JSLocationRewriterMixin(object):
|
||||
|
||||
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
|
||||
rules = rules + [
|
||||
(r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
|
||||
|
||||
#todo: move to mixin?
|
||||
(r'(?:[\s=(){]|^)(top)(?:[\s!=}()]|$)',
|
||||
RegexRewriter.add_prefix(prefix), 1),
|
||||
(r'(?:[\s=(){]|^)(top)(?:[\s!=}()]|$)',
|
||||
RegexRewriter.add_prefix(prefix), 1),
|
||||
|
||||
(r'(?<=window\.)top',
|
||||
RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=window\.)top',
|
||||
RegexRewriter.add_prefix(prefix), 0),
|
||||
|
||||
# (r'\b(top)\b[!=\W]+(?:self|window)',
|
||||
# RegexRewriter.add_prefix(prefix), 1),
|
||||
# (r'\b(top)\b[!=\W]+(?:self|window)',
|
||||
# RegexRewriter.add_prefix(prefix), 1),
|
||||
|
||||
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
|
||||
#RegexRewriter.add_prefix(prefix), 1),
|
||||
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
|
||||
#RegexRewriter.add_prefix(prefix), 1),
|
||||
]
|
||||
super(JSLocationRewriterMixin, self).__init__(rewriter, rules)
|
||||
|
||||
@ -161,6 +160,7 @@ class JSLocationOnlyRewriter(JSLocationRewriterMixin, RegexRewriter):
|
||||
class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter):
|
||||
pass
|
||||
|
||||
|
||||
#=================================================================
|
||||
class JSLinkAndLocationRewriter(JSLocationRewriterMixin,
|
||||
JSLinkRewriterMixin,
|
||||
@ -190,9 +190,9 @@ class XMLRewriter(RegexRewriter):
|
||||
|
||||
def _create_rules(self, rewriter):
|
||||
return [
|
||||
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
|
||||
RegexRewriter.HTTPX_MATCH_STR + ')',
|
||||
RegexRewriter.archival_rewrite(rewriter), 2),
|
||||
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
|
||||
RegexRewriter.HTTPX_MATCH_STR + ')',
|
||||
RegexRewriter.archival_rewrite(rewriter), 2),
|
||||
]
|
||||
|
||||
|
||||
@ -210,9 +210,9 @@ class CSSRewriter(RegexRewriter):
|
||||
|
||||
def _create_rules(self, rewriter):
|
||||
return [
|
||||
(CSSRewriter.CSS_URL_REGEX,
|
||||
RegexRewriter.archival_rewrite(rewriter), 1),
|
||||
(CSSRewriter.CSS_URL_REGEX,
|
||||
RegexRewriter.archival_rewrite(rewriter), 1),
|
||||
|
||||
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
|
||||
RegexRewriter.archival_rewrite(rewriter), 1),
|
||||
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
|
||||
RegexRewriter.archival_rewrite(rewriter), 1),
|
||||
]
|
||||
|
@ -76,7 +76,7 @@ class RewriteContent:
|
||||
wb_url = urlrewriter.wburl
|
||||
|
||||
if (wb_url.is_identity or
|
||||
(not head_insert_func and wb_url.is_banner_only)):
|
||||
(not head_insert_func and wb_url.is_banner_only)):
|
||||
status_headers, stream = self.sanitize_content(headers, stream)
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
from pywb.utils.dsrules import BaseRule
|
||||
|
||||
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter, JSLocationOnlyRewriter
|
||||
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||
from regex_rewriters import JSLocationOnlyRewriter
|
||||
|
||||
from header_rewriter import HeaderRewriter
|
||||
from html_rewriter import HTMLRewriter
|
||||
|
@ -35,13 +35,13 @@ class UrlRewriter(object):
|
||||
return url
|
||||
|
||||
if (self.prefix and
|
||||
self.prefix != '/' and
|
||||
url.startswith(self.prefix)):
|
||||
self.prefix != '/' and
|
||||
url.startswith(self.prefix)):
|
||||
return url
|
||||
|
||||
if (self.full_prefix and
|
||||
self.full_prefix != self.prefix and
|
||||
url.startswith(self.full_prefix)):
|
||||
self.full_prefix != self.prefix and
|
||||
url.startswith(self.full_prefix)):
|
||||
return url
|
||||
|
||||
wburl = self.wburl
|
||||
|
@ -41,6 +41,7 @@ wayback url format.
|
||||
import re
|
||||
import urllib
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BaseWbUrl(object):
|
||||
QUERY = 'query'
|
||||
@ -107,7 +108,8 @@ class WbUrl(BaseWbUrl):
|
||||
m = self.PARTIAL_ENC_RX.match(self.url)
|
||||
if m:
|
||||
len_ = len(m.group(0))
|
||||
self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
|
||||
self.url = (urllib.unquote_plus(self.url[:len_]) +
|
||||
self.url[len_:])
|
||||
inx = self.url.find(':/')
|
||||
|
||||
if inx < 0:
|
||||
@ -160,7 +162,6 @@ class WbUrl(BaseWbUrl):
|
||||
self.timestamp = timestamp
|
||||
self.type = self.REPLAY
|
||||
|
||||
|
||||
def deprefix_url(self, prefix):
|
||||
prefix = urllib.quote_plus(prefix)
|
||||
rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?'
|
||||
|
@ -173,7 +173,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
||||
|
||||
# if tld, use com, as start_key
|
||||
# otherwise, stick with com,example)/
|
||||
if not ',' in host:
|
||||
if ',' not in host:
|
||||
start_key = host + ','
|
||||
else:
|
||||
start_key = host + ')/'
|
||||
|
@ -277,7 +277,7 @@ def create_record_iter(arcv_iter, options):
|
||||
compute_digest = False
|
||||
|
||||
if (entry.digest == '-' and
|
||||
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
||||
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
||||
|
||||
compute_digest = True
|
||||
|
||||
@ -315,11 +315,11 @@ def join_request_records(entry_iter, options):
|
||||
|
||||
# check for concurrency also
|
||||
elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
|
||||
prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
|
||||
prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
|
||||
pass
|
||||
|
||||
elif (entry.merge_request_data(prev_entry, options) or
|
||||
prev_entry.merge_request_data(entry, options)):
|
||||
prev_entry.merge_request_data(entry, options)):
|
||||
yield prev_entry
|
||||
yield entry
|
||||
prev_entry = None
|
||||
@ -423,7 +423,7 @@ def create_index_iter(fh, **options):
|
||||
|
||||
for entry in entry_iter:
|
||||
if (entry.record.rec_type in ('request', 'warcinfo') and
|
||||
not options.get('include_all')):
|
||||
not options.get('include_all')):
|
||||
continue
|
||||
|
||||
yield entry
|
||||
|
@ -42,7 +42,7 @@ class CDXAPIHandler(BaseHandler):
|
||||
if name != 'filter':
|
||||
params[name] = val[0]
|
||||
|
||||
if not 'output' in params:
|
||||
if 'output' not in params:
|
||||
params['output'] = 'text'
|
||||
elif params['output'] not in ('text'):
|
||||
params['output'] = 'text'
|
||||
|
@ -16,7 +16,6 @@ from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
from views import J2TemplateView
|
||||
from replay_views import ReplayView
|
||||
from cached_replay import CachedReplayView
|
||||
from pywb.framework.memento import MementoResponse
|
||||
from pywb.utils.timeutils import datetime_to_timestamp
|
||||
|
||||
@ -65,8 +64,8 @@ class SearchPageWbUrlHandler(WbUrlHandler):
|
||||
# render top level frame if in frame mode
|
||||
# (not supported in proxy mode)
|
||||
if (self.is_frame_mode and wbrequest.wb_url and
|
||||
not wbrequest.wb_url.is_query() and
|
||||
not wbrequest.options['is_proxy']):
|
||||
not wbrequest.wb_url.is_query() and
|
||||
not wbrequest.options['is_proxy']):
|
||||
|
||||
if wbrequest.wb_url.is_top_frame:
|
||||
return self.get_top_frame_response(wbrequest)
|
||||
@ -154,8 +153,8 @@ class WBHandler(SearchPageWbUrlHandler):
|
||||
|
||||
def handle_not_found(self, wbrequest, nfe):
|
||||
if (not self.fallback_handler or
|
||||
wbrequest.wb_url.is_query() or
|
||||
wbrequest.wb_url.is_identity):
|
||||
wbrequest.wb_url.is_query() or
|
||||
wbrequest.wb_url.is_identity):
|
||||
raise
|
||||
|
||||
return self.fallback_handler(wbrequest)
|
||||
|
@ -89,7 +89,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
if rangeres:
|
||||
url, start, end, use_206 = rangeres
|
||||
|
||||
# if bytes=0- Range request, simply remove the range and still proxy
|
||||
# if bytes=0- Range request,
|
||||
# simply remove the range and still proxy
|
||||
if start == 0 and not end and use_206:
|
||||
wbrequest.wb_url.url = url
|
||||
del wbrequest.env['HTTP_RANGE']
|
||||
@ -111,10 +112,12 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
wbresponse = self._make_response(wbrequest, *result)
|
||||
|
||||
if readd_range:
|
||||
content_length = wbresponse.status_headers.get_header('Content-Length')
|
||||
content_length = (wbresponse.status_headers.
|
||||
get_header('Content-Length'))
|
||||
try:
|
||||
content_length = int(content_length)
|
||||
wbresponse.status_headers.add_range(0, content_length, content_length)
|
||||
wbresponse.status_headers.add_range(0, content_length,
|
||||
content_length)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
@ -165,7 +168,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
verify=False,
|
||||
stream=True)
|
||||
|
||||
# don't actually read whole response, proxy response for writing it
|
||||
# don't actually read whole response,
|
||||
# proxy response for writing it
|
||||
resp.close()
|
||||
except:
|
||||
del self._cache[key]
|
||||
@ -176,6 +180,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
resp = self.get_video_info(wbrequest,
|
||||
info_url=referrer,
|
||||
video_url=url)
|
||||
|
||||
def wrap_buff_gen(gen):
|
||||
for x in gen:
|
||||
yield x
|
||||
|
@ -24,7 +24,6 @@ import logging
|
||||
|
||||
#=================================================================
|
||||
DEFAULTS = {
|
||||
'hostpaths': ['http://localhost:8080'],
|
||||
'collections': {'pywb': './sample_archive/cdx/'},
|
||||
'archive_paths': './sample_archive/warcs/',
|
||||
|
||||
@ -153,13 +152,6 @@ def create_wb_router(passed_config={}):
|
||||
|
||||
routes = []
|
||||
|
||||
# TODO: examine this more
|
||||
hostname = os.environ.get('PYWB_HOST_NAME')
|
||||
if hostname:
|
||||
hostpaths = [hostname]
|
||||
else:
|
||||
hostpaths = config.get('hostpaths')
|
||||
|
||||
port = config.get('port')
|
||||
|
||||
# collections based on cdx source
|
||||
@ -238,18 +230,18 @@ def create_wb_router(passed_config={}):
|
||||
router = ProxyArchivalRouter
|
||||
|
||||
view = J2TemplateView.create_template(
|
||||
config.get('proxy_select_html'),
|
||||
'Proxy Coll Selector')
|
||||
config.get('proxy_select_html'),
|
||||
'Proxy Coll Selector')
|
||||
|
||||
if not 'proxy_options' in passed_config:
|
||||
if 'proxy_options' not in passed_config:
|
||||
passed_config['proxy_options'] = {}
|
||||
|
||||
if view:
|
||||
passed_config['proxy_options']['proxy_select_view'] = view
|
||||
|
||||
view = J2TemplateView.create_template(
|
||||
config.get('proxy_cert_download_html'),
|
||||
'Proxy Cert Download')
|
||||
config.get('proxy_cert_download_html'),
|
||||
'Proxy Cert Download')
|
||||
|
||||
if view:
|
||||
passed_config['proxy_options']['proxy_cert_download_view'] = view
|
||||
@ -257,11 +249,6 @@ def create_wb_router(passed_config={}):
|
||||
# Finally, create wb router
|
||||
return router(
|
||||
routes,
|
||||
# Specify hostnames that pywb will be running on
|
||||
# This will help catch occasionally missed rewrites that
|
||||
# fall-through to the host
|
||||
# (See archivalrouter.ReferRedirect)
|
||||
hostpaths=hostpaths,
|
||||
port=port,
|
||||
|
||||
abs_path=config.get('absolute_paths', True),
|
||||
|
@ -141,7 +141,7 @@ class QueryHandler(object):
|
||||
'limit': limit,
|
||||
'fl': ('urlkey,original,timestamp,' +
|
||||
'endtimestamp,groupcount,uniqcount'),
|
||||
'filter':[],
|
||||
'filter': [],
|
||||
},
|
||||
|
||||
wburl.REPLAY:
|
||||
|
@ -18,7 +18,7 @@ class RangeCache(object):
|
||||
atexit.register(self.cleanup)
|
||||
|
||||
def cleanup(self):
|
||||
if self.temp_dir: # pragma: no cover
|
||||
if self.temp_dir: # pragma: no cover
|
||||
import shutil
|
||||
print('Removing: ' + self.temp_dir)
|
||||
shutil.rmtree(self.temp_dir, True)
|
||||
@ -28,7 +28,7 @@ class RangeCache(object):
|
||||
url, start, end, use_206):
|
||||
|
||||
key = digest
|
||||
if not key in self.cache:
|
||||
if key not in self.cache:
|
||||
wbrequest.custom_params['noredir'] = True
|
||||
response = wbresponse_func()
|
||||
|
||||
|
@ -229,8 +229,8 @@ class ReplayView(object):
|
||||
url=cdx['original']))
|
||||
|
||||
if wbrequest.method == 'POST':
|
||||
# FF shows a confirm dialog, so can't use 307 effectively
|
||||
# statusline = '307 Same-Method Internal Redirect'
|
||||
# FF shows a confirm dialog, so can't use 307 effectively
|
||||
# was: statusline = '307 Same-Method Internal Redirect'
|
||||
return None
|
||||
else:
|
||||
statusline = '302 Internal Redirect'
|
||||
@ -252,7 +252,7 @@ class ReplayView(object):
|
||||
|
||||
# skip all 304s
|
||||
if (status_headers.statusline.startswith('304') and
|
||||
not wbrequest.wb_url.is_identity):
|
||||
not wbrequest.wb_url.is_identity):
|
||||
|
||||
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user