From 51d61a873891432762362a82d58dfcc4b2c5cdb5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Feb 2014 03:00:01 -0800 Subject: [PATCH 1/7] package reorg! split up remaining parts of pywb root pkg into core, dispatch and bootstrap --- pywb/bootstrap/__init__.py | 0 pywb/{ => bootstrap}/config_utils.py | 14 ++++---- pywb/{ => bootstrap}/pywb_init.py | 24 +++++++------- pywb/{ => bootstrap}/wbapp.py | 10 ++++-- pywb/core/__init__.py | 0 pywb/{ => core}/handlers.py | 15 +++++---- pywb/{ => core}/indexreader.py | 0 pywb/{ => core}/replay_views.py | 33 +++++++++++-------- .../{ => core}/test/test_wbrequestresponse.py | 2 +- pywb/{ => core}/views.py | 14 ++++---- pywb/{ => core}/wbexceptions.py | 0 pywb/{ => core}/wbrequestresponse.py | 0 pywb/dispatch/__init__.py | 0 pywb/{ => dispatch}/archivalrouter.py | 2 +- pywb/{ => dispatch}/proxy.py | 2 +- .../test/test_archivalrouter.py | 4 +-- run.sh | 4 +-- setup.py | 4 +-- tests/test_integration.py | 4 +-- 19 files changed, 73 insertions(+), 59 deletions(-) create mode 100644 pywb/bootstrap/__init__.py rename pywb/{ => bootstrap}/config_utils.py (81%) rename pywb/{ => bootstrap}/pywb_init.py (82%) rename pywb/{ => bootstrap}/wbapp.py (92%) create mode 100644 pywb/core/__init__.py rename pywb/{ => core}/handlers.py (92%) rename pywb/{ => core}/indexreader.py (100%) rename pywb/{ => core}/replay_views.py (85%) rename pywb/{ => core}/test/test_wbrequestresponse.py (98%) rename pywb/{ => core}/views.py (85%) rename pywb/{ => core}/wbexceptions.py (100%) rename pywb/{ => core}/wbrequestresponse.py (100%) create mode 100644 pywb/dispatch/__init__.py rename pywb/{ => dispatch}/archivalrouter.py (98%) rename pywb/{ => dispatch}/proxy.py (98%) rename pywb/{ => dispatch}/test/test_archivalrouter.py (97%) diff --git a/pywb/bootstrap/__init__.py b/pywb/bootstrap/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/config_utils.py b/pywb/bootstrap/config_utils.py similarity index 81% rename from 
pywb/config_utils.py rename to pywb/bootstrap/config_utils.py index 672e8735..2307022a 100644 --- a/pywb/config_utils.py +++ b/pywb/bootstrap/config_utils.py @@ -1,16 +1,16 @@ -import views -import handlers -import replay_views import logging from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader from pywb.rewrite.rewrite_content import RewriteContent +from pywb.core.views import J2TemplateView, J2HtmlCapturesView +from pywb.core.handlers import WBHandler +from pywb.core.replay_views import ReplayView #================================================================= # Config Loading #================================================================= -def load_template_file(file, desc = None, view_class = views.J2TemplateView): +def load_template_file(file, desc = None, view_class = J2TemplateView): if file: logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) file = view_class(file) @@ -25,7 +25,7 @@ def create_wb_handler(cdx_server, config): resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) - replayer = replay_views.ReplayView( + replayer = ReplayView( content_loader = resolving_loader, content_rewriter = RewriteContent(), @@ -40,12 +40,12 @@ def create_wb_handler(cdx_server, config): ) - wb_handler = handlers.WBHandler( + wb_handler = WBHandler( cdx_server, replayer, - html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView), + html_view = load_template_file(config.get('query_html'), 'Captures Page', J2HtmlCapturesView), search_view = load_template_file(config.get('search_html'), 'Search Page'), ) diff --git a/pywb/pywb_init.py b/pywb/bootstrap/pywb_init.py similarity index 82% rename from pywb/pywb_init.py rename to pywb/bootstrap/pywb_init.py index be4bdded..7465ba64 100644 --- a/pywb/pywb_init.py +++ b/pywb/bootstrap/pywb_init.py @@ -1,8 +1,10 @@ -import handlers -import archivalrouter +from 
pywb.core.handlers import CDXHandler, StaticHandler +from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler +from pywb.dispatch.archivalrouter import ArchivalRouter, Route +from pywb.dispatch.proxy import ProxyArchivalRouter +from pywb.core.indexreader import IndexReader + import config_utils -import proxy -from indexreader import IndexReader import os import yaml @@ -67,32 +69,32 @@ def pywb_config_manual(passed_config = {}): logging.debug('Adding Collection: ' + name) - route_class = route_config.get('route_class', archivalrouter.Route) + route_class = route_config.get('route_class', Route) routes.append(route_class(name, wb_handler, config = route_config)) # cdx query handler if route_config.get('enable_cdx_api', False): - routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_server))) + routes.append(Route(name + '-cdx', CDXHandler(cdx_server))) if config.get('debug_echo_env', False): - routes.append(archivalrouter.Route('echo_env', handlers.DebugEchoEnvHandler())) + routes.append(Route('echo_env', DebugEchoEnvHandler())) if config.get('debug_echo_req', False): - routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler())) + routes.append(Route('echo_req', DebugEchoHandler())) static_routes = config.get('static_routes') for static_name, static_path in static_routes.iteritems(): - routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path))) + routes.append(Route(static_name, StaticHandler(static_path))) # Check for new proxy mode! 
if config.get('enable_http_proxy', False): - router = proxy.ProxyArchivalRouter + router = ProxyArchivalRouter else: - router = archivalrouter.ArchivalRouter + router = ArchivalRouter # Finally, create wb router return router( diff --git a/pywb/wbapp.py b/pywb/bootstrap/wbapp.py similarity index 92% rename from pywb/wbapp.py rename to pywb/bootstrap/wbapp.py index 0befa172..f9a6d359 100644 --- a/pywb/wbapp.py +++ b/pywb/bootstrap/wbapp.py @@ -1,5 +1,5 @@ -from wbexceptions import WbException, NotFoundException, InternalRedirect -from wbrequestresponse import WbResponse, StatusAndHeaders +from pywb.core.wbexceptions import WbException, NotFoundException, InternalRedirect +from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders from pywb.cdx.cdxserver import CDXException from pywb.warc.recordloader import ArchiveLoadFailed @@ -91,6 +91,10 @@ def handle_exception(env, error_view, exc, print_trace): #================================================================= DEFAULT_CONFIG_FILE = 'config.yaml' +DEFAULT_INIT_MODULE = 'pywb.bootstrap.pywb_init' + + +#================================================================= def main(): try: logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) @@ -100,7 +104,7 @@ def main(): if not config_name: # use default module - config_name = 'pywb.pywb_init' + config_name = DEFAULT_INIT_MODULE logging.info('Loading from default config module "{0}"'.format(config_name)) logging.info('') diff --git a/pywb/core/__init__.py b/pywb/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/handlers.py b/pywb/core/handlers.py similarity index 92% rename from pywb/handlers.py rename to pywb/core/handlers.py index c82db7fe..d166e640 100644 --- a/pywb/handlers.py +++ b/pywb/core/handlers.py @@ -33,11 +33,15 @@ class WBHandler(WbUrlHandler): html_view=None, search_view=None): self.index_reader = index_reader + self.replay = replay - self.text_view = 
TextCapturesView() + self.text_query_view = TextCapturesView() + + self.query_view = html_view + if not self.query_view: + self.query_view = text_query_view - self.html_view = html_view self.search_view = search_view def __call__(self, wbrequest): @@ -49,11 +53,10 @@ class WBHandler(WbUrlHandler): # new special modifier to always show cdx index if wbrequest.wb_url.mod == 'cdx_': - return self.text_view.render_response(wbrequest, cdx_lines) + return self.text_query_view.render_response(wbrequest, cdx_lines) if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY): - query_view = self.html_view if self.html_view else self.text_view - return query_view.render_response(wbrequest, cdx_lines) + return self.query_view.render_response(wbrequest, cdx_lines) with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: return self.replay(wbrequest, cdx_lines) @@ -92,7 +95,7 @@ class CDXHandler(BaseHandler): # Static Content Handler #================================================================= class StaticHandler(BaseHandler): - def __init__(self, static_path, pkg = __package__): + def __init__(self, static_path, pkg = 'pywb'): mimetypes.init() self.static_path = static_path diff --git a/pywb/indexreader.py b/pywb/core/indexreader.py similarity index 100% rename from pywb/indexreader.py rename to pywb/core/indexreader.py diff --git a/pywb/replay_views.py b/pywb/core/replay_views.py similarity index 85% rename from pywb/replay_views.py rename to pywb/core/replay_views.py index 4c6907eb..dd11ed4c 100644 --- a/pywb/replay_views.py +++ b/pywb/core/replay_views.py @@ -49,7 +49,7 @@ class ReplayView: self._redirect_if_needed(wbrequest, cdx) # one more check for referrer-based self-redirect - self._reject_referrer_self_redirect(wbrequest, status_headers) + self._reject_referrer_self_redirect(wbrequest) response = None @@ -150,25 +150,30 @@ class ReplayView: def _reject_self_redirect(self, wbrequest, cdx, status_headers): - # 
self-redirect via location + """ + Check if response is a 3xx redirect to the same url + If so, reject this capture to avoid causing redirect loop + """ if status_headers.statusline.startswith('3'): request_url = wbrequest.wb_url.url.lower() location_url = status_headers.get_header('Location').lower() - #TODO: canonicalize before testing? if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): raise CaptureException('Self Redirect: ' + str(cdx)) - def _reject_referrer_self_redirect(self, wbrequest, status_headers): - # at correct timestamp now, but must check for referrer redirect - # indirect self-redirect, via meta-refresh, if referrer is same as current url - if status_headers.statusline.startswith('2'): - # build full url even if using relative-rewriting - request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url) - referrer_url = wbrequest.referrer - if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)): - raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) - - + def _reject_referrer_self_redirect(self, wbrequest): + """ + Perform final check for referrer based self-redirect. + This method should be called after verifying request timestamp matches capture. 
+ if referrer is same as current url, reject this response and try another capture + """ + if not wbrequest.referrer: + return + # build full url even if using relative-rewriting + request_url = (wbrequest.host_prefix + + wbrequest.rel_prefix + str(wbrequest.wb_url)) + if (UrlRewriter.strip_protocol(request_url) == + UrlRewriter.strip_protocol(wbrequest.referrer)): + raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) diff --git a/pywb/test/test_wbrequestresponse.py b/pywb/core/test/test_wbrequestresponse.py similarity index 98% rename from pywb/test/test_wbrequestresponse.py rename to pywb/core/test/test_wbrequestresponse.py index 600ec926..09017564 100644 --- a/pywb/test/test_wbrequestresponse.py +++ b/pywb/core/test/test_wbrequestresponse.py @@ -41,7 +41,7 @@ from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.wbrequestresponse import WbRequest, WbResponse +from pywb.core.wbrequestresponse import WbRequest, WbResponse def print_req_from_uri(request_uri, env={}, use_abs_prefix=False): diff --git a/pywb/views.py b/pywb/core/views.py similarity index 85% rename from pywb/views.py rename to pywb/core/views.py index f693d1e6..961d1af7 100644 --- a/pywb/views.py +++ b/pywb/core/views.py @@ -1,6 +1,6 @@ -import pywb.utils.timeutils as timeutils +from pywb.utils.timeutils import timestamp_to_datetime +from wbrequestresponse import WbResponse -import wbrequestresponse import urlparse import time @@ -18,7 +18,7 @@ class StaticTextView: return self.text def render_response(self, **kwargs): - return wbrequestresponse.WbResponse.text_stream(self.text) + return WbResponse.text_stream(self.text) #================================================================= class J2TemplateView: @@ -34,7 +34,7 @@ class J2TemplateView: if template_dir.startswith('.') or template_dir.startswith('file://'): loader = FileSystemLoader(template_dir) else: - loader = 
PackageLoader(__package__, template_dir) + loader = PackageLoader('pywb', template_dir) jinja_env = Environment(loader = loader, trim_blocks = True) jinja_env.filters['format_ts'] = J2TemplateView.format_ts @@ -51,13 +51,13 @@ class J2TemplateView: def render_response(self, **kwargs): template_result = self.render_to_string(**kwargs) status = kwargs.get('status', '200 OK') - return wbrequestresponse.WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8') + return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8') # Filters @staticmethod def format_ts(value, format='%a, %b %d %Y %H:%M:%S'): - value = timeutils.timestamp_to_datetime(value) + value = timestamp_to_datetime(value) return time.strftime(format, value) @staticmethod @@ -90,7 +90,7 @@ class TextCapturesView: cdx += '\n' return cdx cdx_lines = imap(to_str, cdx_lines) - return wbrequestresponse.WbResponse.text_stream(cdx_lines) + return WbResponse.text_stream(cdx_lines) diff --git a/pywb/wbexceptions.py b/pywb/core/wbexceptions.py similarity index 100% rename from pywb/wbexceptions.py rename to pywb/core/wbexceptions.py diff --git a/pywb/wbrequestresponse.py b/pywb/core/wbrequestresponse.py similarity index 100% rename from pywb/wbrequestresponse.py rename to pywb/core/wbrequestresponse.py diff --git a/pywb/dispatch/__init__.py b/pywb/dispatch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/archivalrouter.py b/pywb/dispatch/archivalrouter.py similarity index 98% rename from pywb/archivalrouter.py rename to pywb/dispatch/archivalrouter.py index 4d28b57e..f548969b 100644 --- a/pywb/archivalrouter.py +++ b/pywb/dispatch/archivalrouter.py @@ -1,7 +1,7 @@ import urlparse import re -from wbrequestresponse import WbRequest, WbResponse +from pywb.core.wbrequestresponse import WbRequest, WbResponse from pywb.rewrite.url_rewriter import UrlRewriter diff --git a/pywb/proxy.py 
b/pywb/dispatch/proxy.py similarity index 98% rename from pywb/proxy.py rename to pywb/dispatch/proxy.py index fc14d1e5..ffc74c47 100644 --- a/pywb/proxy.py +++ b/pywb/dispatch/proxy.py @@ -1,4 +1,4 @@ -from wbrequestresponse import WbResponse, WbRequest +from pywb.core.wbrequestresponse import WbResponse, WbRequest from archivalrouter import ArchivalRouter import urlparse diff --git a/pywb/test/test_archivalrouter.py b/pywb/dispatch/test/test_archivalrouter.py similarity index 97% rename from pywb/test/test_archivalrouter.py rename to pywb/dispatch/test/test_archivalrouter.py index 4379fbfd..82b0d147 100644 --- a/pywb/test/test_archivalrouter.py +++ b/pywb/dispatch/test/test_archivalrouter.py @@ -70,8 +70,8 @@ False """ -from pywb.archivalrouter import Route, ReferRedirect -from pywb.handlers import BaseHandler, WbUrlHandler +from pywb.dispatch.archivalrouter import Route, ReferRedirect +from pywb.core.handlers import BaseHandler, WbUrlHandler import pprint def print_req(req): diff --git a/run.sh b/run.sh index d6e484b9..6232c030 100755 --- a/run.sh +++ b/run.sh @@ -10,14 +10,14 @@ mypath=$(cd `dirname $0` && pwd) # ex: my_pywb.pywb_config() #export 'PYWB_CONFIG=my_pywb' -app="pywb.wbapp" +app="pywb.bootstrap.wbapp" params="--http-socket :8080 -b 65536" #params="--static-map /static=$mypath/static --http-socket :8080 -b 65536" if [ -z "$1" ]; then # Standard root config - params="$params --wsgi pywb.wbapp" + params="$params --wsgi $app" else # run with --mount # requires a file not a package, so creating a mount_run.py to load the package diff --git a/setup.py b/setup.py index 20ac8518..982e067d 100755 --- a/setup.py +++ b/setup.py @@ -11,8 +11,8 @@ setuptools.setup(name='pywb', author_email='ilya@archive.org', long_description=open('README.md').read(), license='GPL', - packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], - provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], + 
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'], + provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'], package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], diff --git a/tests/test_integration.py b/tests/test_integration.py index 1a7a943c..bede0e2b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,6 @@ import webtest -from pywb.pywb_init import pywb_config -from pywb.wbapp import create_wb_app +from pywb.bootstrap.pywb_init import pywb_config +from pywb.bootstrap.wbapp import create_wb_app from pywb.cdx.cdxobject import CDXObject class TestWb: From f24b2e77673bf02c9d9a221d02fbb369a188ced1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Feb 2014 23:40:32 -0800 Subject: [PATCH 2/7] fix typo from merge --- pywb/core/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/core/views.py b/pywb/core/views.py index e6ca5635..520faa78 100644 --- a/pywb/core/views.py +++ b/pywb/core/views.py @@ -57,7 +57,7 @@ class J2TemplateView: # Filters @staticmethod def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): - value = timeutils.timestamp_to_datetime(value) + value = timestamp_to_datetime(value) return value.strftime(format_) @staticmethod From 47271bbfab2df2115f3356a5248e2b0dcbc23a48 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 2 Mar 2014 08:55:26 -0800 Subject: [PATCH 3/7] remove extra .gz file, change test to use zipnum file instead --- pywb/utils/test/loaders_test.py | 8 ++++---- sample_archive/cdx/iana.cdx.gz | Bin 3785 -> 0 bytes 2 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 sample_archive/cdx/iana.cdx.gz diff --git a/pywb/utils/test/loaders_test.py b/pywb/utils/test/loaders_test.py index 
7dc42d83..a8454816 100644 --- a/pywb/utils/test/loaders_test.py +++ b/pywb/utils/test/loaders_test.py @@ -30,9 +30,9 @@ >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() ' CDX N b a m s k r M S V g\\n' -#DecompressingBufferedReader readline() with decompression ->>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() -' CDX N b a m s k r M S V g\\n' +#DecompressingBufferedReader readline() with decompression (zipnum file, no header) +>>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline() +'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\\n' >>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() 'Example Domain' @@ -60,7 +60,7 @@ from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb import get_test_dir #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' test_cdx_dir = get_test_dir() + 'cdx/' - +test_zip_dir = get_test_dir() + 'zipcdx/' def read_multiple(reader, inc_reads): result = None diff --git a/sample_archive/cdx/iana.cdx.gz b/sample_archive/cdx/iana.cdx.gz deleted file mode 100644 index 11499ca5a26b931c02b70f48030e7b2d30fee9ed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3785 zcmV;)4mR;0iwFoRCHztV18HGyVJ>52cmTbfTXUm2635^7Q}DXAn;6|;-!Q&kunpK? 
zUQT%uPmbp#cGmV>_I&zn?95D@AS^(nrq0xW7FiA8h;*-&;MyOzJLFIn_h4WS2m2}`p2>H z=ke(gt~QNH5IEClfXRsQ{&L|*q}ypbJ{`M*ad#RvjT`ubF|!OpsBwDA_tpOH_TleW z?f!9hI5ePba*GIkhKKfeaEFwX>g|B(Khhpe8TA2)Za<7T&Q z?Cy^1=4d%Sd<^hs*?6L?IJKBoEeF}ag?*_r#y8TF&V zI9l+I<8=Jd!k>9@GV8FOADzrWNFdG)WLwzMW)aJdmMRv#84@KM7_|&)VMYtY)La9E zsTx%^8NjeXZH6I)XfgmN2IDa9SeF6hgAITYV8Rl#KuF3pz%t9v>T)B%3}!=0 z*5yJL`sg_T8O^kyd0hr5OCwzdmthR9c4mSCI)f(@pC{hVO!cpKv`XD zoC5-hB4t^w0mMAJ(BVUez#=e_OIWa9KYV>$?cWy(g_D{8>PfkELU$2Nu#3a-bh=1r z7%w}qv&6z%Mq$6*ABmSbfl}KxEP_krHETHB>i)Lg4@bi)%Eacl8a2pAY>99m)p(Eh(bd^kKGows> zp~DP;5a(rzJc$`4sthe=5R-F#k*Z9sQP~V4pxhw5)Vi6>jN%nX_0a&$&X{HE%i?9~ z4lPc_ZLU#_l&U^-L|kD}!OG;n!}@8XX;D9T{ixg$g!kg!qSx}=Y3$;pIh-{kCrpyK z&8A(~qrpoF0#OSYX$4ZM29a4-LcL#D_GXBbY+;~LsZEhxX5vc?DX!nO7{WlMm}T;b z)R5u|UV|aF!GtaBG7?^9NC^#J-H@~vu`wy~cJ;a;#Z|lpLpaEIL5R*dQbNb;F@%8z zE+gb+hLlk98VrHLidzWjbgdFvUdxCG(jlG#PN&r4(YEv4(V#9sj! zj46e!=ly1TyoG-#y4&dYji>F6=O>uP!fVF8@!X3BGnY-GsGE>jbjQpUF;AQ(Jt0!g zu>oq(QcBv{h){;9BfjZJCCGI59KoC*C9RhUsZumlhoqLUEA*sLP;FHqL2PnJKoxot z;!NF;9Hc#EmXd7AgtW}-)o)cuD5NaV*-AxwHsl8kw*X0wp{4SWg#u7np9Gh*)wY3T zLaK=C>OKjf(pIaRvmx>IT&iKXL7%HFh-E<{Q(@VtA|y_=ykKgZW-=iWt~~^S*9*+4 zRUQ(503;|rVo|k%I~$VJ?vv?oGe~VpYAzd+UM~x(hakZTRohCJ2??|Ay1=YTnF?b; zg_4{+-0j!vZ5ciJ>dp&6*_=D=X`FZ-rA|L4-YA}Ubkm$bP(;A|K~;KFL04AA z#Gu(AtYammk&UUMEGuIQkd`)9+mI%~{D47+-sC1hYG2k!FpGtwvc1U>wS|%f&c;;H zmz8IFy6pm4YT8YLsh}_`W5S@@$W(iEoQbKVF)Ifo0wR;s3cabMGIe04A3~L%LP|PQ z2PPC~y!;eWQks=9)4kM4kSdr8T2lumhcQ_5mF1fRAbf^ zVZhVvobtB_x=eT0HBGtA5R)$`siVuaXf5ahj zbx0%?fmMR}<=t=GLy0y$iHI_h%A`uTN;QQJTn5zyMv6g1j*Aq+hYs9yhqA@7TDX?u z$~Gm}fGY&g*@vuzt5h%Oz!gv_P?~f0jvB5?ji9G@IRv7dkI~d{Rq6sgQ;j1VoUaU^ezU)M zT0NJMkFRfiFLqrz3dh0;{bZSpNQCEZyB%`}^AVr+L;9l639>-#GRxxmz58jm`|D}7 z|Lc#<-EMmee_g0N-sS!bDjz7G=^^Gqmfu8Dp3tU$&&%gY7M{>I2i`o{M_jkSS)gL!sR^L{KyZz?* zcS9li|`?25!t|&E?r~X?nP46Uz5}ZD+tedXPpZx-9S?dEQDBQ37qrgKtAlp zx`7SF4(Pp-ZXn%EQ`ZFJkPU44szNTTw8W_^bU~eC6M5y34=a1YQCrX280Bp1&4ZP{ z(x|Iv1r$^{=cDJsN;lcm6}kkbEa%YGTv+K=o4VWp0z0)z_N??pMQuGxZ6V6`tZds& 
zZ9U7O$&#@0ml5^Q5<>{ia<(P>ZuZN6zpnTHD=uh%-FFBi269G(Dh? zqMY}k*AR<@pt1wa5w~-W3B7`-AV@XzA}6*|m6wfpU4~Xg1f6VCl=J05KB9{Ftlo>l zwDNvD;tHaIE>ZGXW``K z>5bL?d8HE8F0Q{snoT-M;(L71A37sE@JBpx@nX@XIG7H*z3>Go{k|BaD3M0opG|D; zw>QVtr^mHYV$Hbg60hgE^G-MzPiB*@(_ZqB@!`THp$HeFVYhugrr$ych7K`HNQ20) zhPFqQRQ3AuOQ4wGX4CT~{(=V6b}yWD*s#yMPA?dS$&k;^{gob?BPbX0sP?mk{ra%p ze_P+HD5ZDzT%!5RaoV$(#Y+~mR+sZ8CX<%OTJvV)JNWEfgo*U~H$>$p5B+Lpw>zrH z;EU@ok(QBzeLi1A?$YZulfkeVPM6b!xr1of?at48Dlxayp;6NujQXI!oiV-{HcWc+!{WskLbu_G|+M{F?|bo?-C&O+X7Cqut2dQ7~WpK1}# zfDB{#{+x?uPX4l25|WwsT{1eIh)|CN)Ei=dz-Q5TF>SIocKY*n==A1iM?>jHVuV=o zL(jKgpFUkunVe<6yZ(9Uo9GaQY@Q6~{&EztnHMb*H|n^K*KLpIB96{4)d{sNgJ4AD zzlv8`T&T*56gp)6Ss3$Hv|vN0+3itA-DSHK4gEe_V$yzzxHPT(AsI=hX+=e$rX@}x zZ_*i$SmZTl<1md*V#ad)?Se6tiK>Cs;o zU&8eL`5^zcQK~PxK9^RIG+w;9uLBEzTzZ3PMm_alz&fg+&rys{@!iZO3O~K zuD;}<2^QmJGzsD*j$6Jn@FNlVW5`pZxYeTmOFp3KdYH}M9 Date: Sun, 2 Mar 2014 19:26:06 -0800 Subject: [PATCH 4/7] wsgi wrapper reorg! support pluggable wsgi apps utils: BlockLoader() supports loading from package exceptions: base WbException moved to utils --- pywb/apps/__init__.py | 0 pywb/apps/wayback.py | 10 ++ pywb/bootstrap/config_utils.py | 56 --------- pywb/bootstrap/pywb_init.py | 101 +++++++++++---- pywb/bootstrap/{wbapp.py => wsgi_wrappers.py} | 115 ++++++++++++------ pywb/cdx/cdxdomainspecific.py | 1 + pywb/cdx/cdxobject.py | 4 +- pywb/cdx/cdxops.py | 3 +- pywb/cdx/cdxserver.py | 11 +- pywb/cdx/cdxsource.py | 2 +- pywb/core/wbexceptions.py | 4 +- pywb/utils/canonicalize.py | 8 +- pywb/utils/loaders.py | 21 +++- pywb/utils/timeutils.py | 1 - pywb/utils/wbexception.py | 3 + pywb/warc/recordloader.py | 11 +- run.sh | 2 +- setup.py | 2 +- tests/test_integration.py | 13 +- 19 files changed, 217 insertions(+), 151 deletions(-) create mode 100644 pywb/apps/__init__.py create mode 100644 pywb/apps/wayback.py delete mode 100644 pywb/bootstrap/config_utils.py rename pywb/bootstrap/{wbapp.py => wsgi_wrappers.py} (52%) create mode 100644 pywb/utils/wbexception.py diff --git 
a/pywb/apps/__init__.py b/pywb/apps/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/apps/wayback.py b/pywb/apps/wayback.py new file mode 100644 index 00000000..beaf0b0c --- /dev/null +++ b/pywb/apps/wayback.py @@ -0,0 +1,10 @@ +from pywb.bootstrap.wsgi_wrappers import init_app, start_wsgi_server +from pywb.bootstrap.pywb_init import create_wb_router + +#================================================================= +# init pywb app +#================================================================= +application = init_app(create_wb_router, load_yaml=True) + +if __name__ == "__main__": + start_wsgi_server(application) diff --git a/pywb/bootstrap/config_utils.py b/pywb/bootstrap/config_utils.py deleted file mode 100644 index 686a6bbb..00000000 --- a/pywb/bootstrap/config_utils.py +++ /dev/null @@ -1,56 +0,0 @@ -import logging - -from pywb.warc.recordloader import ArcWarcRecordLoader -from pywb.warc.resolvingloader import ResolvingLoader -from pywb.rewrite.rewrite_content import RewriteContent -from pywb.core.views import J2TemplateView, J2HtmlCapturesView -from pywb.core.handlers import WBHandler -from pywb.core.replay_views import ReplayView - -#================================================================= -# Config Loading -#================================================================= -def load_template_file(file, desc = None, view_class = J2TemplateView): - if file: - logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) - file = view_class(file) - - return file - -#================================================================= -def create_wb_handler(cdx_server, config, ds_rules_file=None): - - record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) - paths = config.get('archive_paths') - - resolving_loader = ResolvingLoader(paths=paths, - cdx_server=cdx_server, - record_loader=record_loader) - - replayer = ReplayView( - content_loader = resolving_loader, - - content_rewriter = 
RewriteContent(ds_rules_file=ds_rules_file), - - head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), - - buffer_response = config.get('buffer_response', True), - - redir_to_exact = config.get('redir_to_exact', True), - - reporter = config.get('reporter') - ) - - - wb_handler = WBHandler( - cdx_server, - - replayer, - - html_view = load_template_file(config.get('query_html'), 'Captures Page', J2HtmlCapturesView), - - search_view = load_template_file(config.get('search_html'), 'Search Page'), - ) - - return wb_handler - diff --git a/pywb/bootstrap/pywb_init.py b/pywb/bootstrap/pywb_init.py index 1fe33ddc..d4382204 100644 --- a/pywb/bootstrap/pywb_init.py +++ b/pywb/bootstrap/pywb_init.py @@ -1,10 +1,20 @@ -from pywb.core.handlers import CDXHandler, StaticHandler -from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler from pywb.dispatch.archivalrouter import ArchivalRouter, Route from pywb.dispatch.proxy import ProxyArchivalRouter -from pywb.core.indexreader import IndexReader -import config_utils +from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.warc.resolvingloader import ResolvingLoader + +from pywb.rewrite.rewrite_content import RewriteContent + +from pywb.core.indexreader import IndexReader +from pywb.core.views import J2TemplateView, J2HtmlCapturesView +from pywb.core.handlers import WBHandler +from pywb.core.replay_views import ReplayView + +from pywb.core.handlers import CDXHandler, StaticHandler +from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler + +from pywb.utils.loaders import BlockLoader import os import yaml @@ -27,6 +37,7 @@ DEFAULTS = { 'domain_specific_rules': 'rules.yaml', } +#================================================================= class DictChain: def __init__(self, *dicts): self.dicts = dicts @@ -40,9 +51,63 @@ class DictChain: #================================================================= -## Reference non-YAML config +def load_template_file(file, 
desc=None, view_class=J2TemplateView): + if file: + logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) + file = view_class(file) + + return file + + #================================================================= -def pywb_config_manual(passed_config = {}): +def create_wb_handler(cdx_server, config, ds_rules_file=None): + + cookie_maker=config.get('cookie_maker') + record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) + + paths = config.get('archive_paths') + + resolving_loader = ResolvingLoader(paths=paths, + cdx_server=cdx_server, + record_loader=record_loader) + + head_insert_view = load_template_file(config.get('head_insert_html'), + 'Head Insert') + + replayer = ReplayView( + content_loader=resolving_loader, + + content_rewriter=RewriteContent(ds_rules_file=ds_rules_file), + + head_insert_view=head_insert_view, + + buffer_response=config.get('buffer_response', True), + + redir_to_exact=config.get('redir_to_exact', True), + + reporter=config.get('reporter') + ) + + html_view = load_template_file(config.get('query_html'), + 'Captures Page', + J2HtmlCapturesView) + + + search_view = load_template_file(config.get('search_html'), + 'Search Page') + + wb_handler = WBHandler( + cdx_server, + replayer, + html_view=html_view, + search_view=search_view, + ) + + return wb_handler + + +#================================================================= +def create_wb_router(passed_config = {}): config = DictChain(passed_config, DEFAULTS) @@ -62,7 +127,7 @@ def pywb_config_manual(passed_config = {}): ds_rules_file = route_config.get('domain_specific_rules', None) cdx_server = IndexReader(route_config, ds_rules_file) - wb_handler = config_utils.create_wb_handler( + wb_handler = create_wb_handler( cdx_server=cdx_server, config=route_config, ds_rules_file=ds_rules_file, @@ -107,24 +172,6 @@ def pywb_config_manual(passed_config = {}): abs_path = config.get('absolute_paths', True), - home_view = 
config_utils.load_template_file(config.get('home_html'), 'Home Page'), - error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page') + home_view = load_template_file(config.get('home_html'), 'Home Page'), + error_view = load_template_file(config.get('error_html'), 'Error Page') ) - - - -#================================================================= -# YAML config loader -#================================================================= -DEFAULT_CONFIG_FILE = 'config.yaml' - - -def pywb_config(config_file = None): - if not config_file: - config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE) - - with open(config_file) as fh: - config = yaml.load(fh) - - return pywb_config_manual(config) - diff --git a/pywb/bootstrap/wbapp.py b/pywb/bootstrap/wsgi_wrappers.py similarity index 52% rename from pywb/bootstrap/wbapp.py rename to pywb/bootstrap/wsgi_wrappers.py index e7ea0c82..4dd04115 100644 --- a/pywb/bootstrap/wbapp.py +++ b/pywb/bootstrap/wsgi_wrappers.py @@ -1,20 +1,19 @@ -from pywb.core.wbexceptions import WbException, NotFoundException, InternalRedirect +from pywb.utils.wbexception import WbException +from pywb.core.wbexceptions import NotFoundException, InternalRedirect from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders -from pywb.cdx.cdxserver import CDXException -from pywb.utils.canonicalize import UrlCanonicalizeException -from pywb.warc.recordloader import ArchiveLoadFailed +from pywb.utils.loaders import BlockLoader import os import importlib import logging - #================================================================= -# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters +# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters # allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 -# explained here: 
http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links +# explained here: +# http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links def rel_request_uri(environ, include_query=1): """ Return the requested path, optionally including the query string @@ -35,9 +34,9 @@ def rel_request_uri(environ, include_query=1): return url + #================================================================= def create_wb_app(wb_router): - # Top-level wsgi application def application(env, start_response): if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): @@ -56,8 +55,7 @@ def create_wb_app(wb_router): except InternalRedirect as ir: response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) - except (WbException, CDXException, - UrlCanonicalizeException, ArchiveLoadFailed) as e: + except WbException as e: response = handle_exception(env, wb_router.error_view, e, False) except Exception as e: @@ -69,6 +67,7 @@ def create_wb_app(wb_router): return application +#================================================================= def handle_exception(env, error_view, exc, print_trace): if hasattr(exc, 'status'): status = exc.status() @@ -85,44 +84,82 @@ def handle_exception(env, error_view, exc, print_trace): if error_view: import traceback - return error_view.render_response(err_msg = str(exc), err_details = err_details, status = status) + return error_view.render_response(err_msg=str(exc), + err_details=err_details, + status=status) else: - return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) - + return WbResponse.text_response(status + ' Error: ' + str(exc), + status=status) #================================================================= DEFAULT_CONFIG_FILE = 'config.yaml' -DEFAULT_INIT_MODULE = 'pywb.bootstrap.pywb_init' +def load_yaml_config(config_file=None): + import yaml + + if not config_file: + config_file = DEFAULT_CONFIG_FILE + + 
configdata = BlockLoader().load(config_file) + config = yaml.load(configdata) + return config #================================================================= -def main(): +def init_app(init_func, load_yaml=True, config_file=None): + logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', + level=logging.DEBUG) + logging.info('') + + if load_yaml: + if not config_file: + config_file = os.environ.get('PYWB_CONFIG_FILE') + config = load_yaml_config(config_file) + try: - logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) - - # see if there's a custom init module - config_name = os.environ.get('PYWB_CONFIG_MODULE') - - if not config_name: - # use default module - config_name = DEFAULT_INIT_MODULE - logging.info('Loading from default config module "{0}"'.format(config_name)) - logging.info('') - - module = importlib.import_module(config_name) - - app = create_wb_app(module.pywb_config()) - logging.info('') - logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name)) - return app - - except Exception: - logging.exception('*** pywb could not init with settings from {0}.pywb_config()!\n'.format(config_name)) + if load_yaml: + wb_router = init_func(config) + else: + wb_router = init_func() + except: + msg = '*** pywb app init FAILED config from "%s"!\n' + logging.exception(msg, init_func.__name__) raise + else: + msg = '*** pywb app inited with config from "%s"!\n' + logging.info(msg, init_func.__name__) + + return create_wb_app(wb_router) + #================================================================= -if __name__ == "__main__": - pass -else: - application = main() +DEFAULT_PORT = 8080 + +def start_wsgi_server(the_app): + from wsgiref.simple_server import make_server + from optparse import OptionParser + + opt = OptionParser('%prog [OPTIONS]') + opt.add_option('-p', '--port', type='int', default=None) + + options, args = opt.parse_args() + + port = options.port + + if port 
is None: + try: + config = load_default_config() + port = config.get('port', DEFAULT_PORT) + except: + port = DEFAULT_PORT + + + logging.debug('Starting CDX Server on port %s', port) + + try: + httpd = make_server('', port, the_app) + httpd.serve_forever() + except KeyboardInterrupt as ex: + pass + + logging.debug('Stopping CDX Server') diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 2e8a3855..e77c4666 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -9,6 +9,7 @@ from pywb.utils.canonicalize import unsurt, UrlCanonicalizer from query import CDXQuery + #================================================================= def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): """ diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 3915f169..49cd74c5 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -4,9 +4,11 @@ import itertools from urllib import urlencode from urlparse import parse_qs +from pywb.utils.wbexception import WbException + #================================================================= -class CDXException(Exception): +class CDXException(WbException): def status(self): return '400 Bad Request' diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index c4f865c2..e3a1a13b 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -33,6 +33,7 @@ def cdx_load(sources, query, perms_checker=None, process=True): return cdx_iter + #================================================================= def restrict_cdx(cdx_iter, query, perms_checker): """ @@ -56,6 +57,7 @@ def restrict_cdx(cdx_iter, query, perms_checker): yield cdx + #================================================================= def process_cdx(cdx_iter, query): if query.resolve_revisits: @@ -255,7 +257,6 @@ def cdx_resolve_revisits(cdx_iter): originals = {} for cdx in cdx_iter: - is_revisit = cdx.is_revisit() digest = cdx['digest'] diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py 
index 54d46f4b..2e5ec8ad 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -126,14 +126,19 @@ class CDXServer(BaseCDXServer): logging.warn('No CDX Sources configured from paths=%s', paths) def _add_cdx_source(self, source): - if source is None: return + if source is None: + return + logging.debug('Adding CDX Source: %s', source) self.sources.append(source) def add_cdx_source(self, source, config): - if source is None: return + if source is None: + return + if isinstance(source, CDXSource): self._add_cdx_source(source) + elif isinstance(source, str): if os.path.isdir(source): for fn in os.listdir(source): @@ -213,5 +218,3 @@ def create_cdx_server(config, ds_rules_file=None): surt_ordered=surt_ordered, ds_rules_file=ds_rules_file, perms_checker=perms_checker) - - diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 0923fba9..dfab0f25 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -8,6 +8,7 @@ import urllib import urllib2 import itertools + #================================================================= class CDXSource(object): """ @@ -92,7 +93,6 @@ class RedisCDXSource(CDXSource): if config: self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - def load_cdx(self, query): """ Load cdx from redis cache, from an ordered list diff --git a/pywb/core/wbexceptions.py b/pywb/core/wbexceptions.py index afacc325..e9b07ad3 100644 --- a/pywb/core/wbexceptions.py +++ b/pywb/core/wbexceptions.py @@ -1,8 +1,6 @@ +from pywb.utils.wbexception import WbException -class WbException(Exception): - pass - class NotFoundException(WbException): def status(self): return '404 Not Found' diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index 73555ca6..6979a323 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -4,6 +4,9 @@ import surt import urlparse +from wbexception import WbException + + #================================================================= class 
UrlCanonicalizer(object): def __init__(self, surt_ordered=True): @@ -14,7 +17,7 @@ class UrlCanonicalizer(object): #================================================================= -class UrlCanonicalizeException(Exception): +class UrlCanonicalizeException(WbException): def status(self): return '400 Bad Request' @@ -164,7 +167,8 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): elif match_type == 'domain': if not surt_ordered: - raise UrlCanonicalizeException('matchType=domain unsupported for non-surt') + msg = 'matchType=domain unsupported for non-surt' + raise UrlCanonicalizeException(msg) host = start_key.split(')/')[0] diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 7813ded8..6f2fa6c9 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -7,6 +7,7 @@ import os import hmac import urllib2 import time +from pkg_resources import resource_stream #================================================================= @@ -24,16 +25,16 @@ class BlockLoader(object): def __init__(self, cookie_maker=None): self.cookie_maker = cookie_maker - def load(self, url, offset, length): + def load(self, url, offset=0, length=-1): """ Determine loading method based on uri """ if is_http(url): return self.load_http(url, offset, length) else: - return self.load_file(url, offset, length) + return self.load_file_or_resource(url, offset, length) - def load_file(self, url, offset, length): + def load_file_or_resource(self, url, offset, length): """ Load a file-like reader from the local file system """ @@ -41,10 +42,18 @@ class BlockLoader(object): if url.startswith('file://'): url = url[len('file://'):] - afile = open(url, 'rb') - afile.seek(offset) + try: + # first, try as file + afile = open(url, 'rb') + except IOError as file_err: + # then, try as package.path/file + pkg_split = url.split('/', 1) + afile = resource_stream(pkg_split[0], pkg_split[1]) - if length > 0: + if offset > 0: + afile.seek(offset) + + if length >= 0: return 
LimitReader(afile, length) else: return afile diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index f93f324d..a89424aa 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -171,7 +171,6 @@ def timestamp_to_datetime(string): # pad to 6 digits string = _pad_timestamp(string, PAD_6) - def clamp(val, min_, max_): try: val = int(val) diff --git a/pywb/utils/wbexception.py b/pywb/utils/wbexception.py new file mode 100644 index 00000000..a8757935 --- /dev/null +++ b/pywb/utils/wbexception.py @@ -0,0 +1,3 @@ +class WbException(Exception): + def status(self): + return '500 Internal Server Error' diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 446e0da3..fb3af38c 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -9,6 +9,9 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.loaders import BlockLoader from pywb.utils.bufferedreaders import DecompressingBufferedReader +from pywb.utils.wbexception import WbException + + #================================================================= ArcWarcRecord = collections.namedtuple('ArchiveRecord', 'type, rec_headers, ' + @@ -16,7 +19,7 @@ ArcWarcRecord = collections.namedtuple('ArchiveRecord', #================================================================= -class ArchiveLoadFailed(Exception): +class ArchiveLoadFailed(WbException): def __init__(self, reason, filename=''): super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason)) #self.filename = filename @@ -62,9 +65,9 @@ class ArcWarcRecordLoader: decomp_type = 'gzip' # Create decompressing stream - stream = DecompressingBufferedReader(stream = raw, - decomp_type = decomp_type, - block_size = self.block_size) + stream = DecompressingBufferedReader(stream=raw, + decomp_type=decomp_type, + block_size=self.block_size) (the_format, rec_headers) = self._detect_type_load_headers(stream) diff --git a/run.sh b/run.sh index 6232c030..77964b32 100755 --- a/run.sh 
+++ b/run.sh @@ -10,7 +10,7 @@ mypath=$(cd `dirname $0` && pwd) # ex: my_pywb.pywb_config() #export 'PYWB_CONFIG=my_pywb' -app="pywb.bootstrap.wbapp" +app="pywb.apps.wayback" params="--http-socket :8080 -b 65536" #params="--static-map /static=$mypath/static --http-socket :8080 -b 65536" diff --git a/setup.py b/setup.py index 4c2cad20..889fe2a8 100755 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ setup( 'pywb.core', 'pywb.dispatch', 'pywb.bootstrap' + 'pywb.apps' ], package_data={ 'pywb': ['ui/*', 'static/*', '*.yaml'], @@ -41,7 +42,6 @@ setup( 'pyyaml', 'WebTest', 'pytest', - 'werkzeug>=0.9.4', ], # tests_require=['WebTest', 'pytest'], zip_safe=False diff --git a/tests/test_integration.py b/tests/test_integration.py index b9b20e06..b71e8574 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,6 @@ import webtest -from pywb.bootstrap.pywb_init import pywb_config -from pywb.bootstrap.wbapp import create_wb_app +from pywb.bootstrap.pywb_init import create_wb_router +from pywb.bootstrap.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject from fixture import TestExclusionPerms @@ -11,8 +11,13 @@ class TestWb: def setup(self): #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) # save it in self - useful for debugging - self.router = pywb_config(self.TEST_CONFIG) - self.app = create_wb_app(self.router) + self.app = init_app(create_wb_router, + load_yaml=True, + config_file=self.TEST_CONFIG) + + #self.router = pywb_config(self.TEST_CONFIG) + #self.app = create_wb_app(self.router) + self.testapp = webtest.TestApp(self.app) def _assert_basic_html(self, resp): From f0a09760385dc6ac0803f75dd19c9d1ed6092695 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 2 Mar 2014 21:42:05 -0800 Subject: [PATCH 5/7] more refactoring! create 'framework' subpackage for general purpose components! 
contains routing, request/response, exceptions and wsgi wrappers update framework package for pep8 dsrules: using load_config_yaml() (pushed to utils) to init default config --- pywb/apps/wayback.py | 4 +- pywb/core/handlers.py | 2 +- pywb/{bootstrap => core}/pywb_init.py | 20 +++---- pywb/core/replay_views.py | 4 +- pywb/core/views.py | 2 +- pywb/dispatch/__init__.py | 0 pywb/{bootstrap => framework}/__init__.py | 0 .../{dispatch => framework}/archivalrouter.py | 52 +++++++++++-------- pywb/{dispatch => framework}/proxy.py | 33 ++++++------ .../test/test_archivalrouter.py | 2 +- .../test/test_wbrequestresponse.py | 2 +- pywb/{core => framework}/wbexceptions.py | 0 pywb/{core => framework}/wbrequestresponse.py | 39 ++++++++------ .../{bootstrap => framework}/wsgi_wrappers.py | 37 ++++++------- pywb/utils/dsrules.py | 31 ++++------- pywb/utils/loaders.py | 27 ++++++++-- tests/test_integration.py | 4 +- 17 files changed, 138 insertions(+), 121 deletions(-) rename pywb/{bootstrap => core}/pywb_init.py (90%) delete mode 100644 pywb/dispatch/__init__.py rename pywb/{bootstrap => framework}/__init__.py (100%) rename pywb/{dispatch => framework}/archivalrouter.py (77%) rename pywb/{dispatch => framework}/proxy.py (78%) rename pywb/{dispatch => framework}/test/test_archivalrouter.py (98%) rename pywb/{core => framework}/test/test_wbrequestresponse.py (98%) rename pywb/{core => framework}/wbexceptions.py (100%) rename pywb/{core => framework}/wbrequestresponse.py (76%) rename pywb/{bootstrap => framework}/wsgi_wrappers.py (84%) diff --git a/pywb/apps/wayback.py b/pywb/apps/wayback.py index beaf0b0c..0cda072b 100644 --- a/pywb/apps/wayback.py +++ b/pywb/apps/wayback.py @@ -1,5 +1,5 @@ -from pywb.bootstrap.wsgi_wrappers import init_app, start_wsgi_server -from pywb.bootstrap.pywb_init import create_wb_router +from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server +from pywb.core.pywb_init import create_wb_router 
#================================================================= # init pywb app diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index cbf2d71f..1984a4df 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -5,7 +5,7 @@ import time from pywb.rewrite.wburl import WbUrl from pywb.cdx.query import CDXQuery -from wbrequestresponse import WbResponse +from pywb.framework.wbrequestresponse import WbResponse from wbexceptions import WbException, NotFoundException from views import TextCapturesView diff --git a/pywb/bootstrap/pywb_init.py b/pywb/core/pywb_init.py similarity index 90% rename from pywb/bootstrap/pywb_init.py rename to pywb/core/pywb_init.py index d4382204..52df9f5f 100644 --- a/pywb/bootstrap/pywb_init.py +++ b/pywb/core/pywb_init.py @@ -1,25 +1,25 @@ -from pywb.dispatch.archivalrouter import ArchivalRouter, Route -from pywb.dispatch.proxy import ProxyArchivalRouter +from pywb.framework.archivalrouter import ArchivalRouter, Route +from pywb.framework.proxy import ProxyArchivalRouter from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader from pywb.rewrite.rewrite_content import RewriteContent -from pywb.core.indexreader import IndexReader -from pywb.core.views import J2TemplateView, J2HtmlCapturesView -from pywb.core.handlers import WBHandler -from pywb.core.replay_views import ReplayView +from indexreader import IndexReader +from views import J2TemplateView, J2HtmlCapturesView +from replay_views import ReplayView -from pywb.core.handlers import CDXHandler, StaticHandler -from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler +from handlers import WBHandler +from handlers import CDXHandler, StaticHandler +from handlers import DebugEchoHandler, DebugEchoEnvHandler -from pywb.utils.loaders import BlockLoader import os import yaml import logging + #================================================================= DEFAULTS = { 'hostpaths': ['http://localhost:8080'], @@ 
-34,7 +34,7 @@ DEFAULTS = { 'static_routes': {'static/default': 'static/'}, - 'domain_specific_rules': 'rules.yaml', + 'domain_specific_rules': 'pywb/rules.yaml', } #================================================================= diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index bf046416..07997396 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -2,9 +2,9 @@ import StringIO from pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.bufferedreaders import ChunkedDataReader -from wbrequestresponse import WbResponse +from pywb.framework.wbrequestresponse import WbResponse -from wbexceptions import CaptureException, InternalRedirect +from pywb.framework.wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed from pywb.utils.loaders import LimitReader diff --git a/pywb/core/views.py b/pywb/core/views.py index 520faa78..3be55eae 100644 --- a/pywb/core/views.py +++ b/pywb/core/views.py @@ -1,5 +1,5 @@ from pywb.utils.timeutils import timestamp_to_datetime -from wbrequestresponse import WbResponse +from pywb.framework.wbrequestresponse import WbResponse import urlparse import time diff --git a/pywb/dispatch/__init__.py b/pywb/dispatch/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pywb/bootstrap/__init__.py b/pywb/framework/__init__.py similarity index 100% rename from pywb/bootstrap/__init__.py rename to pywb/framework/__init__.py diff --git a/pywb/dispatch/archivalrouter.py b/pywb/framework/archivalrouter.py similarity index 77% rename from pywb/dispatch/archivalrouter.py rename to pywb/framework/archivalrouter.py index fb09fa1a..2ae3bb5f 100644 --- a/pywb/dispatch/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -1,15 +1,17 @@ import urlparse import re -from pywb.core.wbrequestresponse import WbRequest, WbResponse from pywb.rewrite.url_rewriter import UrlRewriter +from wbrequestresponse import WbRequest, WbResponse 
#================================================================= # ArchivalRouter -- route WB requests in archival mode #================================================================= -class ArchivalRouter: - def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None): +class ArchivalRouter(object): + def __init__(self, routes, hostpaths=None, abs_path=True, + home_view=None, error_view=None): + self.routes = routes self.fallback = ReferRedirect(hostpaths) self.abs_path = abs_path @@ -29,26 +31,27 @@ class ArchivalRouter: return self.fallback(env, self.routes) if self.fallback else None - def render_home_page(self): # render the homepage! if self.home_view: - return self.home_view.render_response(routes = self.routes) + return self.home_view.render_response(routes=self.routes) else: # default home page template text = '\n'.join(map(str, self.routes)) return WbResponse.text_response(text) + #================================================================= # Route by matching regex (or fixed prefix) # of request uri (excluding first '/') #================================================================= -class Route: +class Route(object): # match upto next / or ? 
or end - SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)' + SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)' + def __init__(self, regex, handler, coll_group=0, config={}, + lookahead=SLASH_QUERY_LOOKAHEAD): - def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD): self.path = regex if regex: self.regex = re.compile(regex + lookahead) @@ -59,12 +62,11 @@ class Route: self.coll_group = coll_group self._custom_init(config) - def __call__(self, env, use_abs_prefix): wbrequest = self.parse_request(env, use_abs_prefix) return self.handler(wbrequest) if wbrequest else None - def parse_request(self, env, use_abs_prefix, request_uri = None): + def parse_request(self, env, use_abs_prefix, request_uri=None): if not request_uri: request_uri = env['REL_REQUEST_URI'] @@ -75,10 +77,12 @@ class Route: matched_str = matcher.group(0) if matched_str: rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' - wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri + # remove the '/' + rel_prefix part of uri + wb_url_str = request_uri[len(matched_str) + 2:] else: rel_prefix = env['SCRIPT_NAME'] + '/' - wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll + # the request_uri is the wb_url, since no coll + wb_url_str = request_uri[1:] coll = matcher.group(self.coll_group) @@ -88,20 +92,19 @@ class Route: rel_prefix=rel_prefix, coll=coll, use_abs_prefix=use_abs_prefix, - wburl_class = self.handler.get_wburl_type(), + wburl_class=self.handler.get_wburl_type(), urlrewriter_class=UrlRewriter) - # Allow for applying of additional filters self._apply_filters(wbrequest, matcher) return wbrequest - def _apply_filters(self, wbrequest, matcher): for filter in self.filters: last_grp = len(matcher.groups()) - wbrequest.query_filter.append(filter.format(matcher.group(last_grp))) + filter_str = filter.format(matcher.group(last_grp)) + wbrequest.query_filter.append(filter_str) def _custom_init(self, config): self.filters = 
config.get('filters', []) @@ -112,7 +115,8 @@ class Route: #================================================================= -# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings +# ReferRedirect -- redirect urls that have 'fallen through' +# based on the referrer settings #================================================================= class ReferRedirect: def __init__(self, match_prefixs): @@ -121,7 +125,6 @@ class ReferRedirect: else: self.match_prefixs = [match_prefixs] - def __call__(self, env, routes): referrer = env.get('HTTP_REFERER') @@ -133,7 +136,7 @@ class ReferRedirect: ref_split = urlparse.urlsplit(referrer) # ensure referrer starts with one of allowed hosts - if not any (referrer.startswith(i) for i in self.match_prefixs): + if not any(referrer.startswith(i) for i in self.match_prefixs): if ref_split.netloc != env.get('HTTP_HOST'): return None @@ -144,13 +147,12 @@ class ReferRedirect: if app_path: # must start with current app name, if not root if not path.startswith(app_path): - return None + return None path = path[len(app_path):] - for route in routes: - ref_request = route.parse_request(env, False, request_uri = path) + ref_request = route.parse_request(env, False, request_uri=path) if ref_request: break @@ -174,6 +176,10 @@ class ReferRedirect: # 2013/path.html -> /path.html rel_request_uri = rel_request_uri[len(timestamp_path) - 1:] - final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', '')) + final_url = urlparse.urlunsplit((ref_split.scheme, + ref_split.netloc, + rewriter.rewrite(rel_request_uri), + '', + '')) return WbResponse.redir_response(final_url) diff --git a/pywb/dispatch/proxy.py b/pywb/framework/proxy.py similarity index 78% rename from pywb/dispatch/proxy.py rename to pywb/framework/proxy.py index ffc74c47..cbebf4ae 100644 --- a/pywb/dispatch/proxy.py +++ b/pywb/framework/proxy.py @@ -1,15 +1,19 @@ -from pywb.core.wbrequestresponse 
import WbResponse, WbRequest +from wbrequestresponse import WbResponse, WbRequest from archivalrouter import ArchivalRouter import urlparse + #================================================================= # An experimental router which combines both archival and proxy modes -# http proxy mode support is very simple: only latest capture is available currently +# http proxy mode support is very simple so far: +# only latest capture is available currently #================================================================= - class ProxyArchivalRouter: - def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None): - self.archival = ArchivalRouter(routes, hostpaths, abs_path, home_view, error_view) + def __init__(self, routes, hostpaths=None, abs_path=True, + home_view=None, error_view=None): + + self.archival = ArchivalRouter(routes, hostpaths, abs_path, + home_view, error_view) self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view) self.error_view = error_view @@ -29,7 +33,7 @@ class ProxyArchivalRouter: # Only supports latest capture replay at the moment #================================================================= class ProxyRouter: - def __init__(self, handler, hostpaths = None, error_view = None): + def __init__(self, handler, hostpaths=None, error_view=None): self.handler = handler self.hostpaths = hostpaths @@ -56,27 +60,26 @@ class ProxyRouter: return self.handler(wbrequest) - # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] buff = 'function FindProxyForURL (url, host) {\n' - direct_cond =' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n' + direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n' for hostpath in self.hostpaths: parts = urlparse.urlsplit(hostpath).netloc.split(':') - buff += direct_cond.format(parts[0]) + buff += direct.format(parts[0]) - buff += 
direct_cond.format(env['SERVER_NAME']) + buff += direct.format(env['SERVER_NAME']) #buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0]) buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport) - return WbResponse.text_response(buff, content_type = 'application/x-ns-proxy-autoconfig') - + content_type = 'application/x-ns-proxy-autoconfig' + return WbResponse.text_response(buff, content_type=content_type) #================================================================= @@ -85,10 +88,11 @@ class ProxyRouter: class ProxyHttpsUrlRewriter: HTTP = 'http://' HTTPS = 'https://' + def __init__(self, wbrequest, prefix): pass - def rewrite(self, url, mod = None): + def rewrite(self, url, mod=None): if url.startswith(self.HTTPS): return self.HTTP + url[len(self.HTTPS):] else: @@ -97,6 +101,5 @@ class ProxyHttpsUrlRewriter: def get_timestamp_url(self, timestamp, url): return url - def get_abs_url(self, url = ''): + def get_abs_url(self, url=''): return url - diff --git a/pywb/dispatch/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py similarity index 98% rename from pywb/dispatch/test/test_archivalrouter.py rename to pywb/framework/test/test_archivalrouter.py index a076c015..86df528a 100644 --- a/pywb/dispatch/test/test_archivalrouter.py +++ b/pywb/framework/test/test_archivalrouter.py @@ -84,7 +84,7 @@ False """ -from pywb.dispatch.archivalrouter import Route, ReferRedirect +from pywb.framework.archivalrouter import Route, ReferRedirect from pywb.core.handlers import BaseHandler, WbUrlHandler import pprint diff --git a/pywb/core/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py similarity index 98% rename from pywb/core/test/test_wbrequestresponse.py rename to pywb/framework/test/test_wbrequestresponse.py index 09017564..977a8863 100644 --- a/pywb/core/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -41,7 +41,7 @@ from pywb.rewrite.wburl import WbUrl from 
pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.core.wbrequestresponse import WbRequest, WbResponse +from pywb.framework.wbrequestresponse import WbRequest, WbResponse def print_req_from_uri(request_uri, env={}, use_abs_prefix=False): diff --git a/pywb/core/wbexceptions.py b/pywb/framework/wbexceptions.py similarity index 100% rename from pywb/core/wbexceptions.py rename to pywb/framework/wbexceptions.py diff --git a/pywb/core/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py similarity index 76% rename from pywb/core/wbrequestresponse.py rename to pywb/framework/wbrequestresponse.py index 4a459c4b..3ef091d9 100644 --- a/pywb/core/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -26,7 +26,6 @@ class WbRequest: except KeyError: return '' - def __init__(self, env, request_uri=None, rel_prefix='', @@ -40,7 +39,10 @@ class WbRequest: self.env = env - self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') + if request_uri: + self.request_uri = request_uri + else: + self.request_uri = env.get('REL_REQUEST_URI') self.coll = coll @@ -55,7 +57,6 @@ class WbRequest: else: self.wb_prefix = rel_prefix - if not wb_url_str: wb_url_str = '/' @@ -83,7 +84,6 @@ class WbRequest: # PERF env['X_PERF'] = {} - def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') if not value: @@ -96,7 +96,6 @@ class WbRequest: return True return False - def __repr__(self): varlist = vars(self) varstr = pprint.pformat(varlist) @@ -111,32 +110,39 @@ class WbResponse: Holds a status_headers object and a response iter, to be returned to wsgi container. 
""" - def __init__(self, status_headers, value = []): + def __init__(self, status_headers, value=[]): self.status_headers = status_headers self.body = value @staticmethod - def text_stream(text, status = '200 OK', content_type = 'text/plain'): - return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = text) + def text_stream(stream, status='200 OK', content_type='text/plain'): + status_headers = StatusAndHeaders(status, + [('Content-Type', content_type)]) + + return WbResponse(status_headers, value=stream) @staticmethod - def text_response(text, status = '200 OK', content_type = 'text/plain'): - return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = [text]) + def text_response(text, status='200 OK', content_type='text/plain'): + status_headers = StatusAndHeaders(status, + [('Content-Type', content_type)]) + + return WbResponse(status_headers, value=[text]) @staticmethod - def redir_response(location, status = '302 Redirect'): - return WbResponse(StatusAndHeaders(status, [('Location', location)])) - + def redir_response(location, status='302 Redirect'): + return WbResponse(StatusAndHeaders(status, + [('Location', location)])) def __call__(self, env, start_response): # PERF perfstats = env.get('X_PERF') if perfstats: - self.status_headers.headers.append(('X-Archive-Perf-Stats', str(perfstats))) + self.status_headers.headers.append(('X-Archive-Perf-Stats', + str(perfstats))) - - start_response(self.status_headers.statusline, self.status_headers.headers) + start_response(self.status_headers.statusline, + self.status_headers.headers) if env['REQUEST_METHOD'] == 'HEAD': if hasattr(self.body, 'close'): @@ -148,6 +154,5 @@ class WbResponse: else: return [str(self.body)] - def __repr__(self): return str(vars(self)) diff --git a/pywb/bootstrap/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py similarity index 84% rename from pywb/bootstrap/wsgi_wrappers.py rename to pywb/framework/wsgi_wrappers.py index 
4dd04115..2811aa92 100644 --- a/pywb/bootstrap/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -1,8 +1,9 @@ from pywb.utils.wbexception import WbException -from pywb.core.wbexceptions import NotFoundException, InternalRedirect -from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders +from pywb.utils.loaders import load_yaml_config + +from wbexceptions import NotFoundException, InternalRedirect +from wbrequestresponse import WbResponse, StatusAndHeaders -from pywb.utils.loaders import BlockLoader import os import importlib @@ -10,10 +11,13 @@ import logging #================================================================= -# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters -# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 +# adapted from wsgiref.request_uri, but doesn't include domain name +# and allows all characters which are allowed in the path segment +# according to: http://tools.ietf.org/html/rfc3986#section-3.3 # explained here: -# http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links +# http://stackoverflow.com/questions/4669692/ +# valid-characters-for-directory-part-of-a-url-for-short-links + def rel_request_uri(environ, include_query=1): """ Return the requested path, optionally including the query string @@ -28,7 +32,7 @@ def rel_request_uri(environ, include_query=1): "/web/example.com/0~!+$&'()*+,;=:%22" """ from urllib import quote - url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@') + url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@') if include_query and environ.get('QUERY_STRING'): url += '?' 
+ environ['QUERY_STRING'] @@ -50,7 +54,8 @@ def create_wb_app(wb_router): response = wb_router(env) if not response: - raise NotFoundException('No handler for "{0}"'.format(env['REL_REQUEST_URI'])) + msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI']) + raise NotFoundException(msg) except InternalRedirect as ir: response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) @@ -63,7 +68,6 @@ def create_wb_app(wb_router): return response(env, start_response) - return application @@ -94,16 +98,6 @@ def handle_exception(env, error_view, exc, print_trace): #================================================================= DEFAULT_CONFIG_FILE = 'config.yaml' -def load_yaml_config(config_file=None): - import yaml - - if not config_file: - config_file = DEFAULT_CONFIG_FILE - - configdata = BlockLoader().load(config_file) - config = yaml.load(configdata) - return config - #================================================================= def init_app(init_func, load_yaml=True, config_file=None): @@ -114,6 +108,9 @@ def init_app(init_func, load_yaml=True, config_file=None): if load_yaml: if not config_file: config_file = os.environ.get('PYWB_CONFIG_FILE') + if not config_file: + config_file = DEFAULT_CONFIG_FILE + config = load_yaml_config(config_file) try: @@ -135,6 +132,7 @@ def init_app(init_func, load_yaml=True, config_file=None): #================================================================= DEFAULT_PORT = 8080 + def start_wsgi_server(the_app): from wsgiref.simple_server import make_server from optparse import OptionParser @@ -153,7 +151,6 @@ def start_wsgi_server(the_app): except: port = DEFAULT_PORT - logging.debug('Starting CDX Server on port %s', port) try: diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py index 2e6f9626..bfbb5a1a 100644 --- a/pywb/utils/dsrules.py +++ b/pywb/utils/dsrules.py @@ -1,11 +1,10 @@ -import yaml import pkgutil +from loaders import load_yaml_config + 
#================================================================= -DEFAULT_RULES_FILE = 'rules.yaml' -DEFAULT_RULES_PKG = 'pywb' - +DEFAULT_RULES_FILE = 'pywb/rules.yaml' #================================================================= class RuleSet(object): @@ -23,10 +22,14 @@ class RuleSet(object): self.rules = [] - ds_rules_file = kwargs.get('ds_rules_file') default_rule_config = kwargs.get('default_rule_config') - config = self.load_default_rules(ds_rules_file) + ds_rules_file = kwargs.get('ds_rules_file') + + if not ds_rules_file: + ds_rules_file = DEFAULT_RULES_FILE + + config = load_yaml_config(ds_rules_file) rulesmap = config.get('rules') if config else None @@ -53,22 +56,6 @@ class RuleSet(object): if not def_key_found and default_rule_config is not None: self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config)) - @staticmethod - def load_default_rules(filename=None, pkg=None): - config = None - - if not filename: - filename = DEFAULT_RULES_FILE - - if not pkg: - pkg = DEFAULT_RULES_PKG - - if filename: - yaml_str = pkgutil.get_data(pkg, filename) - config = yaml.load(yaml_str) - - return config - def iter_matching(self, urlkey): """ Iterate over all matching rules for given urlkey diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 6f2fa6c9..0f925105 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -7,12 +7,20 @@ import os import hmac import urllib2 import time -from pkg_resources import resource_stream +import pkg_resources #================================================================= def is_http(filename): - return any(filename.startswith(x) for x in ['http://', 'https://']) + return filename.startswith(('http://', 'https://')) + + +#================================================================= +def load_yaml_config(config_file): + import yaml + configdata = BlockLoader().load(config_file) + config = yaml.load(configdata) + return config #================================================================= 
@@ -39,16 +47,27 @@ class BlockLoader(object): Load a file-like reader from the local file system """ + file_only = False + if url.startswith('file://'): url = url[len('file://'):] + file_only = True try: # first, try as file afile = open(url, 'rb') - except IOError as file_err: + + except IOError: + #if file_only: + # raise + # then, try as package.path/file pkg_split = url.split('/', 1) - afile = resource_stream(pkg_split[0], pkg_split[1]) + #if len(pkg_split) == 1: + # raise + + afile = pkg_resources.resource_stream(pkg_split[0], + pkg_split[1]) if offset > 0: afile.seek(offset) diff --git a/tests/test_integration.py b/tests/test_integration.py index b71e8574..6e539c31 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,6 @@ import webtest -from pywb.bootstrap.pywb_init import create_wb_router -from pywb.bootstrap.wsgi_wrappers import init_app +from pywb.core.pywb_init import create_wb_router +from pywb.framework.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject from fixture import TestExclusionPerms From 0bf651c2e30e6715001dafe5182961b8f67722d4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 2 Mar 2014 23:41:44 -0800 Subject: [PATCH 6/7] add cdx_server app! port wsgi cdx server tests to test new app! 
move base handlers to basehandlers in framework pkg (remove werkzeug dependency) --- pywb/apps/cdx_server.py | 30 +++++ pywb/cdx/cdxobject.py | 15 ++- pywb/cdx/cdxops.py | 9 ++ pywb/cdx/query.py | 11 +- pywb/cdx/test/cdxserver_test.py | 1 + pywb/cdx/test/wsgi_cdxserver_test.py | 15 --- pywb/cdx/wsgi_cdxserver.py | 103 ------------------ pywb/core/handlers.py | 19 +--- pywb/core/indexreader.py | 1 + pywb/framework/archivalrouter.py | 7 +- pywb/framework/basehandlers.py | 23 ++++ pywb/framework/test/test_archivalrouter.py | 2 +- pywb/utils/dsrules.py | 2 +- pywb/utils/loaders.py | 8 +- pywb/warc/resolvingloader.py | 2 +- setup.py | 3 +- ...gi_cdxserver.py => test_cdx_server_app.py} | 86 ++++++++------- 17 files changed, 147 insertions(+), 190 deletions(-) create mode 100644 pywb/apps/cdx_server.py delete mode 100644 pywb/cdx/test/wsgi_cdxserver_test.py delete mode 100644 pywb/cdx/wsgi_cdxserver.py create mode 100644 pywb/framework/basehandlers.py rename tests/{test_wsgi_cdxserver.py => test_cdx_server_app.py} (73%) diff --git a/pywb/apps/cdx_server.py b/pywb/apps/cdx_server.py new file mode 100644 index 00000000..893531b7 --- /dev/null +++ b/pywb/apps/cdx_server.py @@ -0,0 +1,30 @@ +from pywb.cdx.cdxserver import create_cdx_server + +from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server +from pywb.framework.archivalrouter import ArchivalRouter, Route + +from pywb.core.handlers import CDXHandler + +DEFAULT_RULES = 'pywb/rules.yaml' + +# cdx-server only config +DEFAULT_CONFIG = 'pywb/cdx/config.yaml' + +#================================================================= +# create simple cdx server under '/cdx' using config file +# TODO: support multiple collections like full wayback? 
+ +def create_cdx_server_app(config): + cdx_server = create_cdx_server(config, DEFAULT_RULES) + routes = [Route('cdx', CDXHandler(cdx_server))] + return ArchivalRouter(routes) + +#================================================================= +# init pywb app +#================================================================= +application = init_app(create_cdx_server_app, + load_yaml=True, + config_file=DEFAULT_CONFIG) + +if __name__ == "__main__": + start_wsgi_server(application) diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 49cd74c5..9ea4a92e 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -63,7 +63,7 @@ class CDXObject(OrderedDict): cdxformat = i if not cdxformat: - raise Exception('unknown {0}-field cdx format'.format(len(fields))) + raise CDXException('unknown {0}-field cdx format'.format(len(fields))) for header, field in itertools.izip(cdxformat, fields): self[header] = field @@ -87,8 +87,15 @@ class CDXObject(OrderedDict): """ if fields is None: return str(self) + '\n' - else: - return ' '.join(self[x] for x in fields) + '\n' + + try: + result = ' '.join(self[x] for x in fields) + '\n' + except KeyError as ke: + msg = 'Invalid field "{0}" found in fields= argument' + msg = msg.format(ke.message) + raise CDXException(msg) + + return result def __str__(self): if self.cdxline: @@ -111,7 +118,7 @@ class IDXObject(OrderedDict): if len(fields) < self.NUM_REQ_FIELDS: msg = 'invalid idx format: {0} fields found, {1} required' - raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS)) + raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS)) for header, field in itertools.izip(self.FORMAT, fields): self[header] = field diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index e3a1a13b..6963b28c 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -31,9 +31,18 @@ def cdx_load(sources, query, perms_checker=None, process=True): if perms_checker: cdx_iter = restrict_cdx(cdx_iter, query, perms_checker) + if 
query.output == 'text': + cdx_iter = cdx_to_text(cdx_iter, query.fields) + return cdx_iter +#================================================================= +def cdx_to_text(cdx_iter, fields): + for cdx in cdx_iter: + yield cdx.to_text(fields) + + #================================================================= def restrict_cdx(cdx_iter, query, perms_checker): """ diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py index dc480836..6449223a 100644 --- a/pywb/cdx/query.py +++ b/pywb/cdx/query.py @@ -1,5 +1,6 @@ from urllib import urlencode from urlparse import parse_qs +from cdxobject import CDXException #================================================================= @@ -62,6 +63,9 @@ class CDXQuery(object): @property def fields(self): v = self.params.get('fields') + # check old param name + if not v: + v = self.params.get('fl') return v.split(',') if v else None @property @@ -105,9 +109,6 @@ class CDXQuery(object): """ params = parse_qs(env['QUERY_STRING']) - if not 'output' in params: - params['output'] = 'text' - # parse_qs produces arrays for single values # cdx processing expects singleton params for all params, # except filters, so convert here @@ -116,4 +117,8 @@ class CDXQuery(object): if name != 'filter': params[name] = val[0] + if not 'output' in params: + params['output'] = 'text' + + return params diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index e261ead4..f0a3398d 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -187,6 +187,7 @@ import pytest def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): kwparams['url'] = url + kwparams['output'] = 'cdxobject' fields = kwparams.get('fields') if fields: fields = fields.split(',') diff --git a/pywb/cdx/test/wsgi_cdxserver_test.py b/pywb/cdx/test/wsgi_cdxserver_test.py deleted file mode 100644 index a7d1ecdb..00000000 --- a/pywb/cdx/test/wsgi_cdxserver_test.py +++ /dev/null @@ -1,15 +0,0 @@ -import webtest -from 
pywb.cdx.wsgi_cdxserver import create_app -from pywb import get_test_dir - -class TestCdx: - def setup(self): - self.app = create_app(get_test_dir() + 'cdx/') - self.testapp = webtest.TestApp(self.app) - - def test_cdx(self): - resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/screen.css') - assert resp.content_type == 'text/plain' - assert resp.content_length > 0 - - diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py deleted file mode 100644 index c9fe11d7..00000000 --- a/pywb/cdx/wsgi_cdxserver.py +++ /dev/null @@ -1,103 +0,0 @@ -from werkzeug.wrappers import BaseResponse -from cdxserver import create_cdx_server -from pywb import get_test_dir -from query import CDXQuery - -import logging -import os -import yaml -import pkg_resources - -#================================================================= -CONFIG_FILE = 'config.yaml' - -RULES_FILE = 'rules.yaml' - -DEFAULT_PORT = 8080 - -#================================================================= - -class CDXQueryRequest(object): - def __init__(self, environ): - self.query = CDXQuery.from_wsgi_env(environ) - - -class WSGICDXServer(object): - def __init__(self, config, rules_file): - self.cdxserver = create_cdx_server(config, rules_file) - - def __call__(self, environ, start_response): - request = CDXQueryRequest(environ) - try: - logging.debug('request.args=%s', request.query) - result = self.cdxserver.load_cdx_query(request.query) - - # TODO: select response type by "output" parameter - response = PlainTextResponse(result, request.query.fields) - return response(environ, start_response) - except Exception as exc: - logging.error('load_cdx failed', exc_info=1) - # TODO: error response should be different for each response - # type - start_response('400 Error', [('Content-Type', 'text/plain')]) - return [str(exc)] - -def cdx_text_out(cdx, fields): - if not fields: - return str(cdx) + '\n' - else: - logging.info('cdx fields=%s', cdx.keys) - # TODO: this will results in an 
exception if fields contain - # non-existent field name. - return ' '.join(cdx[x] for x in fields) + '\n' - -class PlainTextResponse(BaseResponse): - def __init__(self, cdxitr, fields, status=200, content_type='text/plain'): - super(PlainTextResponse, self).__init__( - response=( - cdx.to_text(fields) for cdx in cdxitr - ), - status=status, content_type=content_type) - -# class JsonResponse(Response): -# pass -# class MementoResponse(Response): -# pass - -def create_app(config=None): - logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', - level=logging.DEBUG) - - if not config: - index_paths = get_test_dir() + 'cdx/' - config = dict(index_paths=index_paths) - - return WSGICDXServer(config, RULES_FILE) - -if __name__ == "__main__": - from optparse import OptionParser - from werkzeug.serving import run_simple - - opt = OptionParser('%prog [OPTIONS]') - opt.add_option('-p', '--port', type='int', default=None) - - options, args = opt.parse_args() - - configdata = pkg_resources.resource_string(__name__, CONFIG_FILE) - config = yaml.load(configdata) - - port = options.port - if port is None: - port = (config and config.get('port')) or DEFAULT_PORT - - app = create_app(config) - - logging.debug('Starting CDX Server on port %s', port) - try: - run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True) - except KeyboardInterrupt as ex: - pass - logging.debug('Stopping CDX Server') -else: - # XXX pass production config - application = create_app() diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 1984a4df..18bd0fc9 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -3,28 +3,13 @@ import pkgutil import mimetypes import time -from pywb.rewrite.wburl import WbUrl from pywb.cdx.query import CDXQuery +from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse -from wbexceptions import WbException, NotFoundException +from pywb.framework.wbexceptions import 
WbException, NotFoundException from views import TextCapturesView -#================================================================= -class BaseHandler(object): - def __call__(self, wbrequest): - return wbrequest - - def get_wburl_type(self): - return None - - -#================================================================= -class WbUrlHandler(BaseHandler): - def get_wburl_type(self): - return WbUrl - - #================================================================= # Standard WB Handler #================================================================= diff --git a/pywb/core/indexreader.py b/pywb/core/indexreader.py index a422d0b4..b77f8590 100644 --- a/pywb/core/indexreader.py +++ b/pywb/core/indexreader.py @@ -29,6 +29,7 @@ class IndexReader(object): params.update(wbrequest.custom_params) params['allowFuzzy'] = True + params['output'] = 'cdxobject' cdxlines = self.load_cdx(url=wburl.url, **params) diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 2ae3bb5f..29701fa8 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -13,7 +13,12 @@ class ArchivalRouter(object): home_view=None, error_view=None): self.routes = routes - self.fallback = ReferRedirect(hostpaths) + + if hostpaths: + self.fallback = ReferRedirect(hostpaths) + else: + self.fallback = None + self.abs_path = abs_path self.home_view = home_view diff --git a/pywb/framework/basehandlers.py b/pywb/framework/basehandlers.py new file mode 100644 index 00000000..8ae4d662 --- /dev/null +++ b/pywb/framework/basehandlers.py @@ -0,0 +1,23 @@ +from pywb.rewrite.wburl import WbUrl + + +#================================================================= +class BaseHandler(object): + """ + Represents a base handler class that handles any request + """ + def __call__(self, wbrequest): + return wbrequest + + def get_wburl_type(self): + return None + + +#================================================================= +class 
WbUrlHandler(BaseHandler): + """ + Represents a handler which assumes the request contains a WbUrl + Ensure that the WbUrl is parsed in the request + """ + def get_wburl_type(self): + return WbUrl diff --git a/pywb/framework/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py index 86df528a..706027ba 100644 --- a/pywb/framework/test/test_archivalrouter.py +++ b/pywb/framework/test/test_archivalrouter.py @@ -85,7 +85,7 @@ False """ from pywb.framework.archivalrouter import Route, ReferRedirect -from pywb.core.handlers import BaseHandler, WbUrlHandler +from pywb.framework.basehandlers import BaseHandler, WbUrlHandler import pprint def print_req(req): diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py index bfbb5a1a..672ce738 100644 --- a/pywb/utils/dsrules.py +++ b/pywb/utils/dsrules.py @@ -3,9 +3,9 @@ from loaders import load_yaml_config #================================================================= - DEFAULT_RULES_FILE = 'pywb/rules.yaml' + #================================================================= class RuleSet(object): DEFAULT_KEY = '' diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 0f925105..d2ca827f 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -58,13 +58,13 @@ class BlockLoader(object): afile = open(url, 'rb') except IOError: - #if file_only: - # raise + if file_only: + raise # then, try as package.path/file pkg_split = url.split('/', 1) - #if len(pkg_split) == 1: - # raise + if len(pkg_split) == 1: + raise afile = pkg_resources.resource_stream(pkg_split[0], pkg_split[1]) diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 041024e7..6a44739d 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -176,6 +176,6 @@ class ResolvingLoader: params = {'url': url, 'closest': timestamp, 'filter': 'digest:' + digest, - 'output': 'raw'} + 'output': 'cdxobject'} return self.cdx_server.load_cdx(**params) diff --git a/setup.py b/setup.py index 
889fe2a8..54f136b4 100755 --- a/setup.py +++ b/setup.py @@ -19,9 +19,8 @@ setup( 'pywb.cdx', 'pywb.warc', 'pywb.rewrite', + 'pywb.framework' 'pywb.core', - 'pywb.dispatch', - 'pywb.bootstrap' 'pywb.apps' ], package_data={ diff --git a/tests/test_wsgi_cdxserver.py b/tests/test_cdx_server_app.py similarity index 73% rename from tests/test_wsgi_cdxserver.py rename to tests/test_cdx_server_app.py index 8eee2484..613273b5 100644 --- a/tests/test_wsgi_cdxserver.py +++ b/tests/test_cdx_server_app.py @@ -1,32 +1,26 @@ -import os import re +import webtest -import pytest from urllib import urlencode -from werkzeug.test import Client -from werkzeug.wrappers import BaseResponse, Response - -import yaml - from pywb.cdx.cdxobject import CDXObject -from pywb.cdx.wsgi_cdxserver import create_app +from pywb.apps.cdx_server import application -from tests.fixture import testconfig +import pytest +#================================================================ @pytest.fixture -def client(testconfig): - app = create_app(testconfig) - return Client(app, Response) +def client(): + return webtest.TestApp(application) -# ================================================================ -def query(client, url, **params): +#================================================================ +def query(client, url, is_error=False, **params): params['url'] = url - return client.get('/cdx?' + urlencode(params, doseq=1)) + return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error) -# ================================================================ +#================================================================ def test_exact_url(client): """ basic exact match, no filters, etc. 
@@ -34,48 +28,54 @@ def test_exact_url(client): resp = query(client, 'http://www.iana.org/') assert resp.status_code == 200 - print resp.data + print resp.body + +#================================================================ def test_prefix_match(client): """ prefix match test """ resp = query(client, 'http://www.iana.org/', matchType='prefix') - print resp.data.splitlines() + print resp.body.splitlines() assert resp.status_code == 200 suburls = 0 - for l in resp.data.splitlines(): + for l in resp.body.splitlines(): fields = l.split(' ') if len(fields[0]) > len('org,iana)/'): suburls += 1 assert suburls > 0 - + + +#================================================================ def test_filters(client): """ filter cdxes by mimetype and filename field, exact match. """ resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz')) - - assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' - for l in resp.data.splitlines(): + assert resp.status_code == 200 + assert resp.content_type == 'text/plain' + + for l in resp.body.splitlines(): fields = l.split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' assert fields[3] == 'warc/revisit' assert fields[10] == 'dupes.warc.gz' + +#================================================================ def test_limit(client): resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', limit='1') assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() assert len(cdxes) == 1 fields = cdxes[0].split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' @@ -86,15 +86,17 @@ def test_limit(client): limit='1', reverse='1') assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() assert 
len(cdxes) == 1 fields = cdxes[0].split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' assert fields[1] == '20140127171239' assert fields[3] == 'warc/revisit' + +#================================================================ def test_fields(client): """ retrieve subset of fields with ``fields`` parameter. @@ -104,7 +106,7 @@ def test_fields(client): assert resp.status_code == 200 - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() for cdx in cdxes: fields = cdx.split(' ') @@ -113,16 +115,21 @@ def test_fields(client): assert re.match(r'\d{14}$', fields[1]) assert re.match(r'\d{3}|-', fields[2]) + +#================================================================ def test_fields_undefined(client): """ - server shall respond with Bad Request (TODO: with proper explanation), + server shall respond with Bad Request and name of undefined when ``fields`` parameter contains undefined name(s). """ resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + is_error=True, fields='urlkey,nosuchfield') resp.status_code == 400 - + + +#================================================================ def test_resolveRevisits(client): """ with ``resolveRevisits=true``, server adds three fields pointing to @@ -132,9 +139,9 @@ def test_resolveRevisits(client): resolveRevisits='true' ) assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() originals = {} for cdx in cdxes: fields = cdx.split(' ') @@ -151,6 +158,8 @@ def test_resolveRevisits(client): orig = originals.get(sha) assert orig == (int(orig_size), int(orig_offset), orig_fn) + +#================================================================ def test_resolveRevisits_orig_fields(client): """ when resolveRevisits=true, extra three fields are named @@ -162,9 +171,9 @@ def test_resolveRevisits_orig_fields(client): fields='urlkey,orig.length,orig.offset,orig.filename' ) 
assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() for cdx in cdxes: fields = cdx.split(' ') assert len(fields) == 4 @@ -172,6 +181,8 @@ def test_resolveRevisits_orig_fields(client): assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or (int(orig_len), int(orig_offset), orig_fn)) + +#================================================================ def test_collapseTime_resolveRevisits_reverse(client): resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', collapseTime='11', @@ -179,11 +190,10 @@ def test_collapseTime_resolveRevisits_reverse(client): reverse='true' ) - cdxes = [CDXObject(l) for l in resp.data.splitlines()] - + cdxes = [CDXObject(l) for l in resp.body.splitlines()] + assert len(cdxes) == 3 # timestamp is in descending order for i in range(len(cdxes) - 1): assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp'] - From 2d4ae62fbe45710402fbdd28c91fb5bea1a0e0ad Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 3 Mar 2014 10:35:57 -0800 Subject: [PATCH 7/7] - cdx handler refactoring: factor out CDXHandler and init to separate cdx_handler module - Make wsgi app a class, add port as an optional field in wsgi app and router.
(not required to be specified) --- pywb/apps/cdx_server.py | 21 +++------------- pywb/core/cdx_handler.py | 43 ++++++++++++++++++++++++++++++++ pywb/core/handlers.py | 20 --------------- pywb/core/pywb_init.py | 6 ++++- pywb/framework/archivalrouter.py | 11 ++++++-- pywb/framework/proxy.py | 26 +++++++++++++------ pywb/framework/wbexceptions.py | 5 ++-- pywb/framework/wsgi_wrappers.py | 31 ++++++++++++----------- test_config.yaml | 3 +++ 9 files changed, 102 insertions(+), 64 deletions(-) create mode 100644 pywb/core/cdx_handler.py diff --git a/pywb/apps/cdx_server.py b/pywb/apps/cdx_server.py index 893531b7..a16df1fe 100644 --- a/pywb/apps/cdx_server.py +++ b/pywb/apps/cdx_server.py @@ -1,27 +1,14 @@ -from pywb.cdx.cdxserver import create_cdx_server - from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server -from pywb.framework.archivalrouter import ArchivalRouter, Route -from pywb.core.handlers import CDXHandler +from pywb.core.cdx_handler import create_cdx_server_app -DEFAULT_RULES = 'pywb/rules.yaml' +#================================================================= +# init cdx server app +#================================================================= # cdx-server only config DEFAULT_CONFIG = 'pywb/cdx/config.yaml' -#================================================================= -# create simple cdx server under '/cdx' using config file -# TODO: support multiple collections like full wayback? 
- -def create_cdx_server_app(config): - cdx_server = create_cdx_server(config, DEFAULT_RULES) - routes = [Route('cdx', CDXHandler(cdx_server))] - return ArchivalRouter(routes) - -#================================================================= -# init pywb app -#================================================================= application = init_app(create_cdx_server_app, load_yaml=True, config_file=DEFAULT_CONFIG) diff --git a/pywb/core/cdx_handler.py b/pywb/core/cdx_handler.py new file mode 100644 index 00000000..3f5bb2a8 --- /dev/null +++ b/pywb/core/cdx_handler.py @@ -0,0 +1,43 @@ +from pywb.cdx.query import CDXQuery +from pywb.cdx.cdxserver import create_cdx_server + +from pywb.framework.archivalrouter import ArchivalRouter, Route +from pywb.framework.basehandlers import BaseHandler + +from views import TextCapturesView + + +#================================================================= +class CDXHandler(BaseHandler): + """ + Handler which passes wsgi request to cdx server and + returns a text-based cdx response + """ + def __init__(self, index_reader, view=None): + self.index_reader = index_reader + self.view = view if view else TextCapturesView() + + def __call__(self, wbrequest): + params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) + cdx_lines = self.index_reader.load_cdx(**params) + + return self.view.render_response(wbrequest, cdx_lines) + + def __str__(self): + return 'CDX Handler: ' + str(self.index_reader) + + +#================================================================= +DEFAULT_RULES = 'pywb/rules.yaml' + +#================================================================= +def create_cdx_server_app(config): + """ + Create a cdx server config to be wrapped in a wsgi app + Currently using single access point '/cdx' + TODO: more complex example with multiple collections? 
+ """ + cdx_server = create_cdx_server(config, DEFAULT_RULES) + port = config.get('port') + routes = [Route('cdx', CDXHandler(cdx_server))] + return ArchivalRouter(routes, port=port) diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 18bd0fc9..049888df 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -1,9 +1,7 @@ -import urlparse import pkgutil import mimetypes import time -from pywb.cdx.query import CDXQuery from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbexceptions import WbException, NotFoundException @@ -58,24 +56,6 @@ class WBHandler(WbUrlHandler): return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) -#================================================================= -# CDX-Server Handler -- pass all params to cdx server -#================================================================= -class CDXHandler(BaseHandler): - def __init__(self, index_reader, view = None): - self.index_reader = index_reader - self.view = view if view else TextCapturesView() - - def __call__(self, wbrequest): - params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) - cdx_lines = self.index_reader.load_cdx(**params) - - return self.view.render_response(wbrequest, cdx_lines) - - def __str__(self): - return 'Index Reader: ' + str(self.index_reader) - - #================================================================= # Static Content Handler #================================================================= diff --git a/pywb/core/pywb_init.py b/pywb/core/pywb_init.py index 52df9f5f..10c7b999 100644 --- a/pywb/core/pywb_init.py +++ b/pywb/core/pywb_init.py @@ -11,7 +11,8 @@ from views import J2TemplateView, J2HtmlCapturesView from replay_views import ReplayView from handlers import WBHandler -from handlers import CDXHandler, StaticHandler +from handlers import StaticHandler +from cdx_handler import CDXHandler from handlers import 
DebugEchoHandler, DebugEchoEnvHandler @@ -115,6 +116,8 @@ def create_wb_router(passed_config = {}): hostpaths = config.get('hostpaths') + port = config.get('port') + # collections based on cdx source collections = config.get('collections') @@ -169,6 +172,7 @@ def create_wb_router(passed_config = {}): # This will help catch occasionally missed rewrites that fall-through to the host # (See archivalrouter.ReferRedirect) hostpaths = hostpaths, + port = port, abs_path = config.get('absolute_paths', True), diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 29701fa8..6c901fac 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -9,11 +9,18 @@ from wbrequestresponse import WbRequest, WbResponse # ArchivalRouter -- route WB requests in archival mode #================================================================= class ArchivalRouter(object): - def __init__(self, routes, hostpaths=None, abs_path=True, - home_view=None, error_view=None): + def __init__(self, routes, + hostpaths=None, + port=None, + abs_path=True, + home_view=None, + error_view=None): self.routes = routes + # optional port setting may be ignored by wsgi container + self.port = port + if hostpaths: self.fallback = ReferRedirect(hostpaths) else: diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index cbebf4ae..d27b922e 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -8,21 +8,31 @@ import urlparse # http proxy mode support is very simple so far: # only latest capture is available currently #================================================================= -class ProxyArchivalRouter: - def __init__(self, routes, hostpaths=None, abs_path=True, - home_view=None, error_view=None): +class ProxyArchivalRouter(ArchivalRouter): + def __init__(self, routes, + hostpaths=None, + port=None, + abs_path=True, + home_view=None, + error_view=None): + + (super(ProxyArchivalRouter, self). 
+ __init__(routes, + hostpaths=hostpaths, + port=port, + abs_path=abs_path, + home_view=home_view, + error_view=error_view)) - self.archival = ArchivalRouter(routes, hostpaths, abs_path, - home_view, error_view) self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view) - self.error_view = error_view + #self.error_view = error_view def __call__(self, env): - response = self.archival(env) + response = self.proxy(env) if response: return response - response = self.proxy(env) + response = super(ProxyArchivalRouter, self).__call__(env) if response: return response diff --git a/pywb/framework/wbexceptions.py b/pywb/framework/wbexceptions.py index e9b07ad3..6d437a4e 100644 --- a/pywb/framework/wbexceptions.py +++ b/pywb/framework/wbexceptions.py @@ -5,17 +5,18 @@ class NotFoundException(WbException): def status(self): return '404 Not Found' + # Exceptions that effect a specific capture and result in a retry class CaptureException(WbException): def status(self): return '500 Internal Server Error' + class InternalRedirect(WbException): - def __init__(self, location, status = '302 Internal Redirect'): + def __init__(self, location, status='302 Internal Redirect'): WbException.__init__(self, 'Redirecting -> ' + location) self.status = status self.httpHeaders = [('Location', location)] def status(self): return self.status - diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 2811aa92..1dd433de 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -10,6 +10,8 @@ import importlib import logging +DEFAULT_PORT = 8080 + #================================================================= # adapted from wsgiref.request_uri, but doesn't include domain name # and allows all characters which are allowed in the path segment @@ -18,6 +20,7 @@ import logging # http://stackoverflow.com/questions/4669692/ # valid-characters-for-directory-part-of-a-url-for-short-links + def rel_request_uri(environ, include_query=1): """ 
Return the requested path, optionally including the query string @@ -40,14 +43,21 @@ def rel_request_uri(environ, include_query=1): #================================================================= -def create_wb_app(wb_router): +class WSGIApp(object): + def __init__(self, wb_router): + self.wb_router = wb_router + self.port = DEFAULT_PORT + if hasattr(wb_router, 'port'): + self.port = wb_router.port + # Top-level wsgi application - def application(env, start_response): + def __call__(self, env, start_response): if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): env['REL_REQUEST_URI'] = rel_request_uri(env) else: env['REL_REQUEST_URI'] = env['REQUEST_URI'] + wb_router = self.wb_router response = None try: @@ -68,8 +78,6 @@ def create_wb_app(wb_router): return response(env, start_response) - return application - #================================================================= def handle_exception(env, error_view, exc, print_trace): @@ -126,13 +134,10 @@ def init_app(init_func, load_yaml=True, config_file=None): msg = '*** pywb app inited with config from "%s"!\n' logging.info(msg, init_func.__name__) - return create_wb_app(wb_router) + return WSGIApp(wb_router) #================================================================= -DEFAULT_PORT = 8080 - - def start_wsgi_server(the_app): from wsgiref.simple_server import make_server from optparse import OptionParser @@ -144,12 +149,10 @@ def start_wsgi_server(the_app): port = options.port - if port is None: - try: - config = load_default_config() - port = config.get('port', DEFAULT_PORT) - except: - port = DEFAULT_PORT + port = the_app.port + + if not port: + port = DEFAULT_PORT logging.debug('Starting CDX Server on port %s', port) diff --git a/test_config.yaml b/test_config.yaml index 20e52933..d6c75650 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -90,6 +90,9 @@ enable_http_proxy: true # enable cdx server api for querying cdx directly (experimental) enable_cdx_api: true +# test different port +port: 
9000 + # optional reporter callback func # if set, called with request and cdx object reporter: !!python/object/new:tests.fixture.PrintReporter []