1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

PEP 8 pass: fix spacing, line length, and other style issues

Also remove references to the obsolete cached_replay module and the hostname/hostpaths handling in pywb_init.
This commit is contained in:
Ilya Kreymer 2014-12-23 15:14:03 -08:00
parent 51919ed1e7
commit 181c18a1b8
23 changed files with 75 additions and 79 deletions

View File

@ -106,7 +106,6 @@ class FuzzyQuery:
if inx > 0:
url = url[:inx + 1]
if matched_rule.match_type == 'domain':
host = urlparse.urlsplit(url).netloc
# remove the subdomain
@ -174,8 +173,8 @@ class CDXDomainSpecificRule(BaseRule):
@staticmethod
def make_query_match_regex(params_list):
r"""
>>> CDXDomainSpecificRule.make_query_match_regex(['param1', 'id', 'abc'])
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](param1=[^&]+)'
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'

View File

@ -44,7 +44,7 @@ class CDXObject(OrderedDict):
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "offset", "filename",
"orig.length", "orig.offset", "orig.filename"]
]
]
def __init__(self, cdxline=''):
OrderedDict.__init__(self)

View File

@ -169,8 +169,8 @@ def cdx_filter(cdx_iter, filter_strings):
# no field set, apply filter to entire cdx
if len(parts) == 1:
self.field = ''
else:
# apply filter to cdx[field]
else:
self.field = parts[0]
string = parts[1]

View File

@ -194,7 +194,7 @@ def main(args=None):
help=('use specified root cert (.pem file) ' +
'to create signed cert'))
parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
help='name for root certificate')
parser.add_argument('-d', '--certs-dir', default=CERTS_DIR)

View File

@ -159,7 +159,7 @@ class ProxyRouter(object):
if env['pywb.proxy_host'] == self.magic_name:
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
# special case for proxy install
# special case for proxy install
response = self.handle_cert_install(env)
if response:
return response
@ -307,7 +307,7 @@ class ProxyRouter(object):
name = name.replace('-', '_').upper()
if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
name = 'HTTP_' + name
env[name] = value

View File

@ -83,8 +83,8 @@ class WbRequest(object):
rewrite_opts)
self.urlrewriter.deprefix_url()
else:
# no wb_url, just store blank wb_url
else:
self.wb_url = None
self.urlrewriter = None
@ -113,6 +113,7 @@ class WbRequest(object):
return False
RANGE_ARG_RX = re.compile('.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')
RANGE_HEADER = re.compile('bytes=(\d+)-(\d+)?')
def extract_range(self):

View File

@ -73,6 +73,8 @@ class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
self._remove_age_opts(morsel)
return morsel
#=================================================================
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
"""

View File

@ -33,7 +33,8 @@ class HeaderRewriter:
'xml': ['/xml', '+xml', '.xml', '.rss'],
}
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range', 'accept-ranges']
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range',
'accept-ranges']
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']

View File

@ -172,7 +172,7 @@ class HTMLRewriterMixin(object):
# special case: inline JS/event handler
if ((attr_value and attr_value.startswith('javascript:'))
or attr_name.startswith('on')):
or attr_name.startswith('on')):
attr_value = self._rewrite_script(attr_value)
# special case: inline CSS/style attribute
@ -193,7 +193,7 @@ class HTMLRewriterMixin(object):
# don't rewrite rel=canonical
elif tag == 'link' and attr_name == 'href':
if (self.opts.get('rewrite_rel_canon', True) or
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
rw_mod = handler.get(attr_name)
attr_value = self._rewrite_url(attr_value, rw_mod)

View File

@ -123,7 +123,6 @@ class JSLinkRewriterMixin(object):
#=================================================================
class JSLocationRewriterMixin(object):
#class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
"""
JS Rewriter mixin which rewrites location and domain to the
specified prefix (default: 'WB_wombat_')
@ -131,23 +130,23 @@ class JSLocationRewriterMixin(object):
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
rules = rules + [
(r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
(r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
#todo: move to mixin?
(r'(?:[\s=(){]|^)(top)(?:[\s!=}()]|$)',
RegexRewriter.add_prefix(prefix), 1),
(r'(?:[\s=(){]|^)(top)(?:[\s!=}()]|$)',
RegexRewriter.add_prefix(prefix), 1),
(r'(?<=window\.)top',
RegexRewriter.add_prefix(prefix), 0),
(r'(?<=window\.)top',
RegexRewriter.add_prefix(prefix), 0),
# (r'\b(top)\b[!=\W]+(?:self|window)',
# RegexRewriter.add_prefix(prefix), 1),
# (r'\b(top)\b[!=\W]+(?:self|window)',
# RegexRewriter.add_prefix(prefix), 1),
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
#RegexRewriter.add_prefix(prefix), 1),
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
#RegexRewriter.add_prefix(prefix), 1),
]
super(JSLocationRewriterMixin, self).__init__(rewriter, rules)
@ -161,6 +160,7 @@ class JSLocationOnlyRewriter(JSLocationRewriterMixin, RegexRewriter):
class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter):
pass
#=================================================================
class JSLinkAndLocationRewriter(JSLocationRewriterMixin,
JSLinkRewriterMixin,
@ -190,9 +190,9 @@ class XMLRewriter(RegexRewriter):
def _create_rules(self, rewriter):
return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
RegexRewriter.HTTPX_MATCH_STR + ')',
RegexRewriter.archival_rewrite(rewriter), 2),
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
RegexRewriter.HTTPX_MATCH_STR + ')',
RegexRewriter.archival_rewrite(rewriter), 2),
]
@ -210,9 +210,9 @@ class CSSRewriter(RegexRewriter):
def _create_rules(self, rewriter):
return [
(CSSRewriter.CSS_URL_REGEX,
RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_URL_REGEX,
RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
RegexRewriter.archival_rewrite(rewriter), 1),
]

View File

@ -76,7 +76,7 @@ class RewriteContent:
wb_url = urlrewriter.wburl
if (wb_url.is_identity or
(not head_insert_func and wb_url.is_banner_only)):
(not head_insert_func and wb_url.is_banner_only)):
status_headers, stream = self.sanitize_content(headers, stream)
return (status_headers, self.stream_to_gen(stream), False)

View File

@ -1,7 +1,8 @@
from pywb.utils.dsrules import BaseRule
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter, JSLocationOnlyRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from regex_rewriters import JSLocationOnlyRewriter
from header_rewriter import HeaderRewriter
from html_rewriter import HTMLRewriter

View File

@ -35,13 +35,13 @@ class UrlRewriter(object):
return url
if (self.prefix and
self.prefix != '/' and
url.startswith(self.prefix)):
self.prefix != '/' and
url.startswith(self.prefix)):
return url
if (self.full_prefix and
self.full_prefix != self.prefix and
url.startswith(self.full_prefix)):
self.full_prefix != self.prefix and
url.startswith(self.full_prefix)):
return url
wburl = self.wburl

View File

@ -41,6 +41,7 @@ wayback url format.
import re
import urllib
#=================================================================
class BaseWbUrl(object):
QUERY = 'query'
@ -107,7 +108,8 @@ class WbUrl(BaseWbUrl):
m = self.PARTIAL_ENC_RX.match(self.url)
if m:
len_ = len(m.group(0))
self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
self.url = (urllib.unquote_plus(self.url[:len_]) +
self.url[len_:])
inx = self.url.find(':/')
if inx < 0:
@ -160,7 +162,6 @@ class WbUrl(BaseWbUrl):
self.timestamp = timestamp
self.type = self.REPLAY
def deprefix_url(self, prefix):
prefix = urllib.quote_plus(prefix)
rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?'

View File

@ -173,7 +173,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
# if tld, use com, as start_key
# otherwise, stick with com,example)/
if not ',' in host:
if ',' not in host:
start_key = host + ','
else:
start_key = host + ')/'

View File

@ -277,7 +277,7 @@ def create_record_iter(arcv_iter, options):
compute_digest = False
if (entry.digest == '-' and
record.rec_type not in ('revisit', 'request', 'warcinfo')):
record.rec_type not in ('revisit', 'request', 'warcinfo')):
compute_digest = True
@ -315,11 +315,11 @@ def join_request_records(entry_iter, options):
# check for concurrency also
elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
pass
elif (entry.merge_request_data(prev_entry, options) or
prev_entry.merge_request_data(entry, options)):
prev_entry.merge_request_data(entry, options)):
yield prev_entry
yield entry
prev_entry = None
@ -423,7 +423,7 @@ def create_index_iter(fh, **options):
for entry in entry_iter:
if (entry.record.rec_type in ('request', 'warcinfo') and
not options.get('include_all')):
not options.get('include_all')):
continue
yield entry

View File

@ -42,7 +42,7 @@ class CDXAPIHandler(BaseHandler):
if name != 'filter':
params[name] = val[0]
if not 'output' in params:
if 'output' not in params:
params['output'] = 'text'
elif params['output'] not in ('text'):
params['output'] = 'text'

View File

@ -16,7 +16,6 @@ from pywb.warc.resolvingloader import ResolvingLoader
from views import J2TemplateView
from replay_views import ReplayView
from cached_replay import CachedReplayView
from pywb.framework.memento import MementoResponse
from pywb.utils.timeutils import datetime_to_timestamp
@ -65,8 +64,8 @@ class SearchPageWbUrlHandler(WbUrlHandler):
# render top level frame if in frame mode
# (not supported in proxy mode)
if (self.is_frame_mode and wbrequest.wb_url and
not wbrequest.wb_url.is_query() and
not wbrequest.options['is_proxy']):
not wbrequest.wb_url.is_query() and
not wbrequest.options['is_proxy']):
if wbrequest.wb_url.is_top_frame:
return self.get_top_frame_response(wbrequest)
@ -154,8 +153,8 @@ class WBHandler(SearchPageWbUrlHandler):
def handle_not_found(self, wbrequest, nfe):
if (not self.fallback_handler or
wbrequest.wb_url.is_query() or
wbrequest.wb_url.is_identity):
wbrequest.wb_url.is_query() or
wbrequest.wb_url.is_identity):
raise
return self.fallback_handler(wbrequest)

View File

@ -89,7 +89,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
if rangeres:
url, start, end, use_206 = rangeres
# if bytes=0- Range request, simply remove the range and still proxy
# if bytes=0- Range request,
# simply remove the range and still proxy
if start == 0 and not end and use_206:
wbrequest.wb_url.url = url
del wbrequest.env['HTTP_RANGE']
@ -111,10 +112,12 @@ class RewriteHandler(SearchPageWbUrlHandler):
wbresponse = self._make_response(wbrequest, *result)
if readd_range:
content_length = wbresponse.status_headers.get_header('Content-Length')
content_length = (wbresponse.status_headers.
get_header('Content-Length'))
try:
content_length = int(content_length)
wbresponse.status_headers.add_range(0, content_length, content_length)
wbresponse.status_headers.add_range(0, content_length,
content_length)
except (ValueError, TypeError):
pass
@ -165,7 +168,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
verify=False,
stream=True)
# don't actually read whole response, proxy response for writing it
# don't actually read whole response,
# proxy response for writing it
resp.close()
except:
del self._cache[key]
@ -176,6 +180,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
resp = self.get_video_info(wbrequest,
info_url=referrer,
video_url=url)
def wrap_buff_gen(gen):
for x in gen:
yield x

View File

@ -24,7 +24,6 @@ import logging
#=================================================================
DEFAULTS = {
'hostpaths': ['http://localhost:8080'],
'collections': {'pywb': './sample_archive/cdx/'},
'archive_paths': './sample_archive/warcs/',
@ -153,13 +152,6 @@ def create_wb_router(passed_config={}):
routes = []
# TODO: examine this more
hostname = os.environ.get('PYWB_HOST_NAME')
if hostname:
hostpaths = [hostname]
else:
hostpaths = config.get('hostpaths')
port = config.get('port')
# collections based on cdx source
@ -238,18 +230,18 @@ def create_wb_router(passed_config={}):
router = ProxyArchivalRouter
view = J2TemplateView.create_template(
config.get('proxy_select_html'),
'Proxy Coll Selector')
config.get('proxy_select_html'),
'Proxy Coll Selector')
if not 'proxy_options' in passed_config:
if 'proxy_options' not in passed_config:
passed_config['proxy_options'] = {}
if view:
passed_config['proxy_options']['proxy_select_view'] = view
view = J2TemplateView.create_template(
config.get('proxy_cert_download_html'),
'Proxy Cert Download')
config.get('proxy_cert_download_html'),
'Proxy Cert Download')
if view:
passed_config['proxy_options']['proxy_cert_download_view'] = view
@ -257,11 +249,6 @@ def create_wb_router(passed_config={}):
# Finally, create wb router
return router(
routes,
# Specify hostnames that pywb will be running on
# This will help catch occasionally missed rewrites that
# fall-through to the host
# (See archivalrouter.ReferRedirect)
hostpaths=hostpaths,
port=port,
abs_path=config.get('absolute_paths', True),

View File

@ -141,7 +141,7 @@ class QueryHandler(object):
'limit': limit,
'fl': ('urlkey,original,timestamp,' +
'endtimestamp,groupcount,uniqcount'),
'filter':[],
'filter': [],
},
wburl.REPLAY:

View File

@ -18,7 +18,7 @@ class RangeCache(object):
atexit.register(self.cleanup)
def cleanup(self):
if self.temp_dir: # pragma: no cover
if self.temp_dir: # pragma: no cover
import shutil
print('Removing: ' + self.temp_dir)
shutil.rmtree(self.temp_dir, True)
@ -28,7 +28,7 @@ class RangeCache(object):
url, start, end, use_206):
key = digest
if not key in self.cache:
if key not in self.cache:
wbrequest.custom_params['noredir'] = True
response = wbresponse_func()

View File

@ -229,8 +229,8 @@ class ReplayView(object):
url=cdx['original']))
if wbrequest.method == 'POST':
# FF shows a confirm dialog, so can't use 307 effectively
# statusline = '307 Same-Method Internal Redirect'
# FF shows a confirm dialog, so can't use 307 effectively
# was: statusline = '307 Same-Method Internal Redirect'
return None
else:
statusline = '302 Internal Redirect'
@ -252,7 +252,7 @@ class ReplayView(object):
# skip all 304s
if (status_headers.statusline.startswith('304') and
not wbrequest.wb_url.is_identity):
not wbrequest.wb_url.is_identity):
raise CaptureException('Skipping 304 Modified: ' + str(cdx))