diff --git a/README.md b/README.md index d1f6979a..c35486dd 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,10 @@ pywb is a Python re-implementation of the Wayback Machine software. The goal is to provide a brand new, clean implementation of Wayback. -This involves playing back archival web content (usually in WARC or ARC files) as best or accurately -as possible, in straightforward by highly customizable way. +The focus is on providing the best and most accurate replay of archival web content (usually in WARC or ARC files), +and on new ways of handling dynamic and difficult content. -It should be easy to deploy and hack! +pywb should also be easy to deploy and modify! ### Wayback Machine @@ -72,9 +72,16 @@ If everything worked, the following pages should be loading (served from *sample ### Automated Tests Currently pywb consists of numerous doctests against the sample archive. -Additional testing is in the works. -The current set of tests can be run with Nose: +The `run-tests.py` file currently contains a few basic integration tests against the default config. + + +The current set of tests can be run with py.test: + +`py.test run-tests.py ./pywb/ --doctest-modules` + + +or with Nose: `nosetests --with-doctest` @@ -85,31 +92,21 @@ pywb is configurable via yaml. The simplest [config.yaml](config.yaml) is roughly as follows: -``` yaml +```yaml -routes: - - name: pywb - - index_paths: - - ./sample_archive/cdx/ - - archive_paths: - - ./sample_archive/warcs/ - - head_insert_html_template: ./ui/head_insert.html - - calendar_html_template: ./ui/query.html +collections: + pywb: ./sample_archive/cdx/ -hostpaths: ['http://localhost:8080/'] +archive_paths: ./sample_archive/warcs/ ``` -The optional ui elements, the query/calendar and header insert are specifyable via html/Jinja2 templates. 
+This sets up pywb with a single route for the collection /pywb. -(Refer to [full version of config.yaml](config.yaml) for additional documentation) - +(The [full version of config.yaml](config.yaml) contains additional documentation and specifies +all the optional properties, such as ui filenames for Jinja2/html template files.) For more advanced use, the pywb init path can be customized further: diff --git a/config.yaml b/config.yaml index 838e6b72..8f38f1ed 100644 --- a/config.yaml +++ b/config.yaml @@ -1,80 +1,56 @@ # pywb config file # ======================================== # -# Settings for each route are defined below -# Each route may be an archival collection or other handler +# Settings for each collection + +collections: + # <name>: <cdx_path> + # collection will be accessed via /<name> + # <cdx_path> is a string or list of: + # - string or list of one or more local .cdx files + # - string or list of one or more local dirs with .cdx files + # - a string value indicating remote http cdx server + pywb: ./sample_archive/cdx/ + +# indicate if cdx files are sorted by SURT keys -- eg: com,example)/ +# SURT keys are recommended for future indices, but non-SURT cdxs +# are also supported # -routes: - # route name (eg /pywb) - - name: pywb +# * Set to true if cdxs start with surts: com,example)/ +# * Set to false if cdx start with urls: example.com)/ +surt_ordered: true - # list of paths to search cdx files - # * local .cdx file - # * local dir, will include all .cdx files in dir - # - # or a string value indicating remote http cdx server - index_paths: - - ./sample_archive/cdx/ +# list of path prefixes that pywb uses to 'resolve' WARC and ARC filenames +# in the cdx to their absolute path +# +# if path is: +# * local dir, use path as prefix +# * local file, lookup prefix in tab-delimited sorted index +# * http:// path, use path as remote prefix +# * redis:// path, use redis to lookup full path for w: as key - # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ - # SURT keys are 
recommended for future indices, but non-SURT cdxs - # are also supported - # - # * Set to true if cdxs start with surts: com,example)/ - # * Set to false if cdx start with urls: example.com)/ - surt_ordered: True +archive_paths: ./sample_archive/warcs/ - # list of paths prefixes for pywb look to 'resolve' WARC and ARC filenames - # in the cdx to their absolute path - # - # if path is: - # * local dir, use path as prefix - # * local file, lookup prefix in tab-delimited sorted index - # * http:// path, use path as remote prefix - # * redis:// path, use redis to lookup full path for w: as key +# ui: optional Jinja2 template to insert into <head> of each replay +head_insert_html: ./ui/head_insert.html - archive_paths: - - ./sample_archive/warcs/ +# ui: optional text to directly insert into <head> +# only loaded if head_insert_html is not specified - # ui: optional Jinja2 template to insert into of each replay - head_insert_html_template: ./ui/head_insert.html +#head_insert_text: - # ui: optional text to directly insert into - # only loaded if ui_head_insert_template_file is not specified - - #head_insert_text: - - - # ui: optional Jinja2 template to use for 'calendar' query, - # eg, a listing of captures in response to a ../*/ - # - # may be a simple listing or a more complex 'calendar' UI - # if omitted, the capture listing lists raw index - calendar_html_template: ./ui/query.html - - # ui: optional Jinja2 template to use for 'search' page - # this page is displayed when no search url is entered - search_html_template: ./ui/search.html - - # Sample Debug Handlers (subject to change) - # Echo Request - - name: echo_req - - type: echo_req - - # Echo WSGI Env - - name: echo_env - - type: echo_env - - # CDX Server - - name: cdx - - index_paths: ['./sample_archive/cdx/'] - - type: 'cdx' +#static_path: /static2/ +# ui: optional Jinja2 template to use for 'calendar' query, +# eg, a listing of captures in response to a ../*/ +# +# may be a simple listing or a more complex 
'calendar' UI +# if omitted, the capture listing lists raw index +query_html: ./ui/query.html +# ui: optional Jinja2 template to use for 'search' page +# this page is displayed when no search url is entered +search_html: ./ui/search.html # list of host names that pywb will be running from to detect # 'fallthrough' requests based on referrer @@ -89,10 +65,10 @@ hostpaths: ['http://localhost:8080/'] # ui: optional Jinja2 template for home page # if no other route is set to home page, this template will # be rendered at /, /index.htm and /index.html -home_html_template: ./ui/index.html +home_html: ./ui/index.html # ui: optional Jinja2 template for rendering any errors # the error page may print a detailed error message -error_html_template: ./ui/error.html +error_html: ./ui/error.html diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index b16897e9..8fb17ec3 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -10,13 +10,13 @@ from wburl import WbUrl # ArchivalRequestRouter -- route WB requests in archival mode #================================================================= class ArchivalRequestRouter: - def __init__(self, routes, hostpaths = None, abs_path = True, homepage = None, errorpage = None): + def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None): self.routes = routes self.fallback = ReferRedirect(hostpaths) self.abs_path = abs_path - self.homepage = homepage - self.errorpage = errorpage + self.home_view = home_view + self.error_view = error_view def __call__(self, env): for route in self.routes: @@ -26,7 +26,7 @@ class ArchivalRequestRouter: # Home Page if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']: - return self.render_homepage() + return self.render_home_page() if not self.fallback: return None @@ -34,10 +34,10 @@ class ArchivalRequestRouter: return self.fallback(WbRequest.from_uri(None, env)) - def render_homepage(self): + def render_home_page(self): # render the 
homepage! - if self.homepage: - return self.homepage.render_response(routes = self.routes) + if self.home_view: + return self.home_view.render_response(routes = self.routes) else: # default home page template text = '\n'.join(map(str, self.routes)) diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py index 7f661eef..c1193dce 100644 --- a/pywb/archiveloader.py +++ b/pywb/archiveloader.py @@ -126,7 +126,7 @@ class ArchiveLoader: ('x-ec-custom-error', '1'), ('Content-Length', '1270'), ('Connection', 'close')])) - + >>> load_test_archive('example.warc.gz', '1864', '553') (('warc', 'revisit'), @@ -168,8 +168,8 @@ class ArchiveLoader: } @staticmethod - def create_default_loaders(): - http = HttpLoader() + def create_default_loaders(hmac = None): + http = HttpLoader(hmac) file = FileLoader() return { 'http': http, @@ -179,8 +179,8 @@ class ArchiveLoader: } - def __init__(self, loaders = {}, chunk_size = 8192): - self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders() + def __init__(self, loaders = {}, hmac = None, chunk_size = 8192): + self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders(hmac) self.chunk_size = chunk_size self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS) diff --git a/pywb/config_utils.py b/pywb/config_utils.py new file mode 100644 index 00000000..c4755eec --- /dev/null +++ b/pywb/config_utils.py @@ -0,0 +1,52 @@ +import archiveloader +import views +import handlers +import indexreader +import replay_views +import replay_resolvers +from archivalrouter import ArchivalRequestRouter, Route +import logging + + +#================================================================= +# Config Loading +#================================================================= +def load_template_file(file, desc = None, view_class = views.J2TemplateView): + if file: + logging.info('Adding {0}: {1}'.format(desc if desc else name, file)) + file = view_class(file) + + return file + + 
+#================================================================= +def create_wb_handler(**config): + replayer = replay_views.RewritingReplayView( + + resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')), + + loader = archiveloader.ArchiveLoader(hmac = config.get('hmac', None)), + + head_insert_view = load_template_file(config.get('head_html'), 'Head Insert'), + + buffer_response = config.get('buffer_response', True), + + redir_to_exact = config.get('redir_to_exact', True), + ) + + + wb_handler = handlers.WBHandler( + config['cdx_source'], + + replayer, + + html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView), + + search_view = load_template_file(config.get('search_html'), 'Search Page'), + + static_path = config.get('static_path'), + ) + + return wb_handler + + diff --git a/pywb/handlers.py b/pywb/handlers.py index 3708d6e4..37a7eb9c 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -19,19 +19,22 @@ class BaseHandler: # Standard WB Handler #================================================================= class WBHandler(BaseHandler): - def __init__(self, cdx_reader, replay, capturespage = None, searchpage = None): + def __init__(self, cdx_reader, replay, html_view = None, search_view = None, static_path = '/static/'): self.cdx_reader = cdx_reader self.replay = replay self.text_view = views.TextCapturesView() - self.html_view = capturespage - self.searchpage = searchpage + + self.html_view = html_view + self.search_view = search_view + + self.static_path = static_path def __call__(self, wbrequest): if wbrequest.wb_url_str == '/': - return self.render_searchpage(wbrequest) + return self.render_search_page(wbrequest) with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True) @@ -45,22 +48,19 @@ class WBHandler(BaseHandler): return query_view.render_response(wbrequest, cdx_lines) with 
utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: - return self.replay(wbrequest, cdx_lines, self.cdx_reader) + return self.replay(wbrequest, cdx_lines, self.cdx_reader, self.static_path) - def render_searchpage(self, wbrequest): - if self.searchpage: - return self.searchpage.render_response(wbrequest = wbrequest) + def render_search_page(self, wbrequest): + if self.search_view: + return self.search_view.render_response(wbrequest = wbrequest) else: return WbResponse.text_response('No Lookup Url Specified') - def __str__(self): return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay) - - #================================================================= # CDX-Server Handler -- pass all params to cdx server #================================================================= diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 2907ca05..23995fbd 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -44,6 +44,32 @@ class IndexReader: def load_cdx(self, url, params = {}, parsed_cdx = True): raise NotImplementedError('Override in subclasses') + @staticmethod + def make_best_cdx_source(paths, **config): + # may be a string or list + surt_ordered = config.get('surt_ordered', True) + + # support mixed cdx streams and remote servers? 
+ # for now, list implies local sources + if isinstance(paths, list): + if len(paths) > 1: + return LocalCDXServer(paths, surt_ordered) + else: + # treat as non-list + paths = paths[0] + + # a single uri + uri = paths + + # Check for remote cdx server + if (uri.startswith('http://') or uri.startswith('https://')) and not uri.endswith('.cdx'): + cookie = config.get('cookie', None) + return RemoteCDXServer(uri, cookie = cookie) + else: + return LocalCDXServer([uri], surt_ordered) + + + #================================================================= class LocalCDXServer(IndexReader): diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index eebf4a1c..1f8f6184 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -1,69 +1,71 @@ -import archiveloader -import views import handlers import indexreader -import replay_views -import replay_resolvers -import cdxserve from archivalrouter import ArchivalRequestRouter, Route import os import yaml -import utils +import config_utils import logging + #================================================================= ## Reference non-YAML config #================================================================= -def pywb_config_manual(): - default_head_insert = """ +def pywb_config_manual(config = {}): - - - - - """ + routes = [] - # Current test dir - #test_dir = utils.test_data_dir() - test_dir = './sample_archive/' + hostpaths = config.get('hostpaths', ['http://localhost:8080/']) - # Standard loader which supports WARC/ARC files - aloader = archiveloader.ArchiveLoader() + # collections based on cdx source + collections = config.get('collections', {'pywb': './sample_archive/cdx/'}) - # Source for cdx source - #query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx')) - #test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx'] - indexs = indexreader.LocalCDXServer([test_dir + 'cdx/']) + for name, value in collections.iteritems(): + if isinstance(value, dict): + # if a 
dict, extend with base properties + index_paths = value['index_paths'] + value.extend(config) + config = value + else: + index_paths = str(value) - # Loads warcs specified in cdx from these locations - prefixes = [replay_resolvers.PrefixResolver(test_dir + 'warcs/')] + cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config) - # Jinja2 head insert - head_insert = views.J2TemplateView('./ui/head_insert.html') + # cdx query handler + if config.get('enable_cdx_api', True): + routes.append(Route(name + '-cdx', handlers.CDXHandler(cdx_source))) - # Create rewriting replay handler to rewrite records - replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, head_insert_view = head_insert, buffer_response = True) + wb_handler = config_utils.create_wb_handler( + cdx_source = cdx_source, + archive_paths = config.get('archive_paths', './sample_archive/warcs/'), + head_html = config.get('head_insert_html', './ui/head_insert.html'), + query_html = config.get('query_html', './ui/query.html'), + search_html = config.get('search_html', './ui/search.html'), + static_path = config.get('static_path', hostpaths[0] + 'static/') + ) - # Create Jinja2 based html query view - html_view = views.J2HtmlCapturesView('./ui/query.html') + logging.info('Adding Collection: ' + name) - # WB handler which uses the index reader, replayer, and html_view - wb_handler = handlers.WBHandler(indexs, replayer, html_view) + routes.append(Route(name, wb_handler)) + + + if config.get('debug_echo_env', False): + routes.append(Route('echo_env', handlers.DebugEchoEnvHandler())) + + if config.get('debug_echo_req', False): + routes.append(Route('echo_req', handlers.DebugEchoHandler())) - # cdx handler - cdx_handler = handlers.CDXHandler(indexs) # Finally, create wb router return ArchivalRequestRouter( - { - Route('echo_req', handlers.DebugEchoHandler()), # Debug ex: just echo parsed request - Route('pywb', wb_handler), - Route('cdx', cdx_handler), - }, + 
routes, # Specify hostnames that pywb will be running on # This will help catch occasionally missed rewrites that fall-through to the host # (See archivalrouter.ReferRedirect) - hostpaths = ['http://localhost:8080/']) + hostpaths = hostpaths, + + home_view = config_utils.load_template_file(config.get('home_html', './ui/index.html'), 'Home Page'), + error_view = config_utils.load_template_file(config.get('error_html', './ui/error.html'), 'Error Page') + ) @@ -79,119 +81,13 @@ def pywb_config(config_file = None): config = yaml.load(open(config_file)) - routes = map(yaml_parse_route, config['routes']) - - homepage = yaml_load_template(config, 'home_html_template', 'Home Page Template') - errorpage = yaml_load_template(config, 'error_html_template', 'Error Page Template') - - hostpaths = config.get('hostpaths', ['http://localhost:8080/']) - - return ArchivalRequestRouter(routes, hostpaths, homepage = homepage, errorpage = errorpage) - - -def yaml_load_template(config, name, desc = None): - file = config.get(name) - if file: - logging.info('Adding {0}: {1}'.format(desc if desc else name, file)) - file = views.J2TemplateView(file) - return file - - - -def yaml_parse_index_loader(config): - index_config = config['index_paths'] - surt_ordered = config.get('surt_ordered', True) - - # support mixed cdx streams and remote servers? 
- # for now, list implies local sources - if isinstance(index_config, list): - if len(index_config) > 1: - return indexreader.LocalCDXServer(index_config, surt_ordered) - else: - # treat as non-list - index_config = index_config[0] - - if isinstance(index_config, str): - uri = index_config - cookie = None - elif isinstance(index_config, dict): - uri = index_config['url'] - cookie = index_config['cookie'] - else: - raise Exception('Invalid Index Reader Config: ' + str(index_config)) - - # Check for remote cdx server - if (uri.startswith('http://') or uri.startswith('https://')) and not uri.endswith('.cdx'): - return indexreader.RemoteCDXServer(uri, cookie = cookie) - else: - return indexreader.LocalCDXServer([uri]) - - - - -def yaml_parse_head_insert(config): - # First, try a template file - head_insert_file = config.get('head_insert_html_template') - if head_insert_file: - logging.info('Adding Head-Insert Template: ' + head_insert_file) - return views.J2TemplateView(head_insert_file) - - # Then, static head_insert text - head_insert_text = config.get('head_insert_text', '') - logging.info('Adding Head-Insert Text: ' + head_insert_text) - return views.StaticTextView(head_insert_text) - - -def yaml_parse_calendar_view(config): - html_view_file = config.get('calendar_html_template') - if html_view_file: - logging.info('Adding HTML Calendar Template: ' + html_view_file) - else: - logging.info('No HTML Calendar View Present') - - return views.J2HtmlCapturesView(html_view_file) if html_view_file else None - - - -def yaml_parse_route(config): - name = config['name'] - type = config.get('type', 'wb') - - if type == 'echo_env': - return Route(name, handlers.DebugEchoEnvHandler()) - - if type == 'echo_req': - return Route(name, handlers.DebugEchoHandler()) - - archive_loader = archiveloader.ArchiveLoader() - - index_loader = yaml_parse_index_loader(config) - - if type == 'cdx': - handler = handlers.CDXHandler(index_loader) - return Route(name, handler) - - archive_resolvers 
= map(replay_resolvers.make_best_resolver, config['archive_paths']) - - head_insert = yaml_parse_head_insert(config) - - replayer = replay_views.RewritingReplayView(resolvers = archive_resolvers, - archiveloader = archive_loader, - head_insert_view = head_insert, - buffer_response = config.get('buffer_response', False)) - - html_view = yaml_parse_calendar_view(config) - - searchpage = yaml_load_template(config, 'search_html_template', 'Search Page Template') - - wb_handler = handlers.WBHandler(index_loader, replayer, html_view, searchpage = searchpage) - - return Route(name, wb_handler) + return pywb_config_manual(config) +import utils if __name__ == "__main__" or utils.enable_doctests(): # Just test for execution for now - pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml') + #pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml') pywb_config_manual() diff --git a/pywb/regex_rewriters.py b/pywb/regex_rewriters.py index 300c248f..9ec3a3aa 100644 --- a/pywb/regex_rewriters.py +++ b/pywb/regex_rewriters.py @@ -30,9 +30,9 @@ class RegexRewriter: @staticmethod def replacer(string): - return lambda x: string + return lambda x: string - HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+' + HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' DEFAULT_OP = add_prefix @@ -95,6 +95,18 @@ class JSRewriter(RegexRewriter): >>> test_js('location = "http://example.com/abc.html"') 'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"' + >>> test_js(r'location = "http:\/\/example.com/abc.html"') + 'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"' + + >>> test_js(r'location = "http:\\/\\/example.com/abc.html"') + 'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"' + + >>> test_js(r'location = /http:\/\/example.com/abc.html/') + 'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/' + + >>> test_js('"/location" == some_location_val; locations = location;') + 
'"/location" == some_location_val; locations = WB_wombat_location;' + >>> test_js('cool_Location = "http://example.com/abc.html"') 'cool_Location = "/web/20131010im_/http://example.com/abc.html"' @@ -119,9 +131,9 @@ class JSRewriter(RegexRewriter): def _create_rules(self, http_prefix): return [ - (RegexRewriter.HTTPX_MATCH_STR, http_prefix, 0), - ('location', 'WB_wombat_', 0), - ('(?<=document\.)domain', 'WB_wombat_', 0), + (r'(?>> make_best_resolver('http://myhost.example.com/warcs/') PrefixResolver('http://myhost.example.com/warcs/') + # http path w/ contains param + >>> make_best_resolver(('http://myhost.example.com/warcs/', '/')) + PrefixResolver('http://myhost.example.com/warcs/', contains = '/') + # redis path >>> make_best_resolver('redis://myhost.example.com:1234/1') RedisResolver('redis://myhost.example.com:1234/1') @@ -85,11 +89,18 @@ def make_best_resolver(path): """ + if isinstance(param, tuple): + path = param[0] + arg = param[1] + else: + path = param + arg = None + url_parts = urlparse.urlsplit(path) if url_parts.scheme == 'redis': logging.info('Adding Redis Index: ' + path) - return RedisResolver(path) + return RedisResolver(path, arg) if url_parts.scheme == 'file': path = url_parts.path @@ -101,7 +112,17 @@ def make_best_resolver(path): # non-file paths always treated as prefix for now else: logging.info('Adding Archive Path Source: ' + path) - return PrefixResolver(path) + return PrefixResolver(path, arg) + + +#================================================================= +def make_best_resolvers(*paths): + """ + >>> make_best_resolvers('http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1') + [PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')] + """ + return map(make_best_resolver, paths) + import utils #================================================================= diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 45ea3b7b..d1c36ead 100644 --- 
a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -18,11 +18,12 @@ import wbexceptions #================================================================= class ReplayView: - def __init__(self, resolvers, archiveloader): + def __init__(self, resolvers, loader = None): self.resolvers = resolvers - self.loader = archiveloader + self.loader = loader if loader else archiveloader.ArchiveLoader() - def __call__(self, wbrequest, cdx_lines, cdx_reader): + + def __call__(self, wbrequest, cdx_lines, cdx_reader, static_path): last_e = None first = True @@ -33,16 +34,15 @@ class ReplayView: # The cdx should already be sorted in closest-to-timestamp order (from the cdx server) for cdx in cdx_lines: try: - # ability to intercept and redirect + # optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data if first: - self._check_redir(wbrequest, cdx) + self._redirect_if_needed(wbrequest, cdx) first = False - response = self.do_replay(cdx, wbrequest, cdx_reader, failed_files) + (cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files) + + return self.make_response(wbrequest, cdx, status_headers, stream, static_path) - if response: - response.cdx = cdx - return response except wbexceptions.CaptureException as ce: import traceback @@ -55,8 +55,12 @@ class ReplayView: else: raise wbexceptions.UnresolvedArchiveFileException() - def _check_redir(self, wbrequest, cdx): - return None + + # callback to issue a redirect to another request + # subclasses may provide custom logic + def _redirect_if_needed(self, wbrequest, cdx): + pass + def _load(self, cdx, revisit, failed_files): if revisit: @@ -94,7 +98,7 @@ class ReplayView: raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '') - def do_replay(self, cdx, wbrequest, cdx_reader, failed_files): + def resolve_headers_and_payload(self, cdx, wbrequest, cdx_reader, failed_files): has_curr = (cdx['filename'] != '-') has_orig = 
(cdx.get('orig.filename','-') != '-') @@ -131,11 +135,21 @@ class ReplayView: raise wbexceptions.CaptureException('Invalid CDX' + str(cdx)) - response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream)) - response._stream = payload_record.stream - return response + #response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream)) + #response._stream = payload_record.stream + return (cdx, headers_record.status_headers, payload_record.stream) + # done here! just return response + # subclasses may override to do additional processing + def make_response(self, wbrequest, cdx, status_headers, stream, static_path): + return self.create_stream_response(status_headers, stream) + + + # create response from headers and wrapping stream in generator + def create_stream_response(self, status_headers, stream): + return WbResponse(status_headers, self.create_stream_gen(stream)) + # Handle the case where a duplicate of a capture with same digest exists at a different url # Must query the index at that url filtering by matching digest @@ -189,6 +203,7 @@ class ReplayView: raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename) + # Create a generator reading from a stream, with optional rewriting and final read call @staticmethod def create_stream_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None): @@ -216,8 +231,8 @@ class ReplayView: #================================================================= class RewritingReplayView(ReplayView): - def __init__(self, resolvers, archiveloader, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False): - ReplayView.__init__(self, resolvers, archiveloader) + def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False): + ReplayView.__init__(self, resolvers, loader) self.head_insert_view = 
head_insert_view self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter() self.redir_to_exact = redir_to_exact @@ -226,6 +241,7 @@ class RewritingReplayView(ReplayView): self.buffer_response = buffer_response + def _text_content_type(self, content_type): for ctype, mimelist in self.REWRITE_TYPES.iteritems(): if any ((mime in content_type) for mime in mimelist): @@ -234,19 +250,16 @@ class RewritingReplayView(ReplayView): return None - def __call__(self, wbrequest, cdx_list, cdx_reader): - urlrewriter = UrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix) - wbrequest.urlrewriter = urlrewriter + def make_response(self, wbrequest, cdx, status_headers, stream, static_path): + # check and reject self-redirect + self._reject_self_redirect(wbrequest, cdx, status_headers) - response = ReplayView.__call__(self, wbrequest, cdx_list, cdx_reader) + # check if redir is needed + self._redirect_if_needed(wbrequest, cdx) - if response and response.cdx: - self._check_redir(wbrequest, response.cdx) + urlrewriter = wbrequest.urlrewriter - rewritten_headers = self.header_rewriter.rewrite(response.status_headers, urlrewriter) - - # TODO: better way to pass this? 
- stream = response._stream + rewritten_headers = self.header_rewriter.rewrite(status_headers, urlrewriter) # de_chunking in case chunk encoding is broken # TODO: investigate further @@ -257,23 +270,19 @@ class RewritingReplayView(ReplayView): stream = archiveloader.ChunkedLineReader(stream) de_chunk = True - # Transparent, though still may need to dechunk + # transparent, though still may need to dechunk if wbrequest.wb_url.mod == 'id_': if de_chunk: - response.status_headers.remove_header('transfer-encoding') - response.body = self.create_stream_gen(stream) + status_headers.remove_header('transfer-encoding') - return response + return self.create_stream_response(status_headers, stream) # non-text content type, just send through with rewritten headers # but may need to dechunk if rewritten_headers.text_type is None: - response.status_headers = rewritten_headers.status_headers + status_headers = rewritten_headers.status_headers - if de_chunk: - response.body = self.create_stream_gen(stream) - - return response + return self.create_stream_response(status_headers, stream) # Handle text rewriting @@ -303,7 +312,7 @@ class RewritingReplayView(ReplayView): status_headers = rewritten_headers.status_headers if text_type == 'html': - head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = response.cdx) if self.head_insert_view else None + head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx, static_path = static_path) if self.head_insert_view else None rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str) elif text_type == 'css': rewriter = regex_rewriters.CSSRewriter(urlrewriter) @@ -384,30 +393,22 @@ class RewritingReplayView(ReplayView): return (result['encoding'], buff) - def _check_redir(self, wbrequest, cdx): - if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): + def _redirect_if_needed(self, wbrequest, cdx): + is_proxy = 
wbrequest.is_proxy + if self.redir_to_exact and not is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original']) raise wbexceptions.InternalRedirect(new_url) - #return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp']) return None - def do_replay(self, cdx, wbrequest, index, failed_files): - wbresponse = ReplayView.do_replay(self, cdx, wbrequest, index, failed_files) + def _reject_self_redirect(self, wbrequest, cdx, status_headers): + if status_headers.statusline.startswith('3'): + request_url = wbrequest.wb_url.url.lower() + location_url = status_headers.get_header('Location').lower() - # Check for self redirect - if wbresponse.status_headers.statusline.startswith('3'): - if self.is_self_redirect(wbrequest, wbresponse.status_headers): + #TODO: canonicalize before testing? + if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx)) - return wbresponse - - def is_self_redirect(self, wbrequest, status_headers): - request_url = wbrequest.wb_url.url.lower() - location_url = status_headers.get_header('Location').lower() - #return request_url == location_url - return (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)) - - diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 0558cf72..dc1141fb 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -8,6 +8,8 @@ import importlib import logging + +#================================================================= def create_wb_app(wb_router): # Top-level wsgi application @@ -29,13 +31,13 @@ def create_wb_app(wb_router): response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e: - response = handle_exception(env, wb_router.errorpage, e, False) + response = handle_exception(env, wb_router.error_view, e, False) 
except wbexceptions.WbException as wbe: - response = handle_exception(env, wb_router.errorpage, wbe, False) + response = handle_exception(env, wb_router.error_view, wbe, False) except Exception as e: - response = handle_exception(env, wb_router.errorpage, e, True) + response = handle_exception(env, wb_router.error_view, e, True) return response(env, start_response) @@ -43,7 +45,7 @@ def create_wb_app(wb_router): return application -def handle_exception(env, errorpage, exc, print_trace): +def handle_exception(env, error_view, exc, print_trace): if hasattr(exc, 'status'): status = exc.status() else: @@ -57,9 +59,9 @@ def handle_exception(env, errorpage, exc, print_trace): logging.info(str(exc)) err_details = None - if errorpage: + if error_view: import traceback - return errorpage.render_response(err_msg = str(exc), err_details = err_details, status = status) + return error_view.render_response(err_msg = str(exc), err_details = err_details, status = status) else: return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index efa5fafb..8449f588 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -1,4 +1,6 @@ from wburl import WbUrl +from url_rewriter import UrlRewriter + import utils import pprint @@ -61,7 +63,12 @@ class WbRequest: return rel_prefix - def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, use_abs_prefix = False, wburl_class = WbUrl): + def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, + use_abs_prefix = False, + wburl_class = WbUrl, + url_rewriter_class = UrlRewriter, + is_proxy = False): + self.env = env self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') @@ -72,10 +79,12 @@ class WbRequest: if wb_url_str != '/' and wb_url_str != '' and wburl_class: self.wb_url_str = wb_url_str self.wb_url = wburl_class(wb_url_str) + self.urlrewriter = url_rewriter_class(self.wb_url, self.wb_prefix) 
else: # no wb_url, just store blank self.wb_url_str = '/' self.wb_url = None + self.urlrewriter = None self.coll = coll @@ -85,6 +94,8 @@ class WbRequest: self.query_filter = [] + self.is_proxy = is_proxy + self.custom_params = {} # PERF diff --git a/run-tests.py b/run-tests.py index f8761266..3cfbc46d 100644 --- a/run-tests.py +++ b/run-tests.py @@ -5,8 +5,8 @@ from pywb.indexreader import CDXCaptureResult class TestWb: def setup(self): import pywb.wbapp - #self.testapp = webtest.TestApp(pywb.wbapp.application) - self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) + #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) + self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config_manual()) self.testapp = webtest.TestApp(self.app) def _assert_basic_html(self, resp): @@ -74,14 +74,14 @@ class TestWb: assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body def test_cdx_server_filters(self): - resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz') + resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz') self._assert_basic_text(resp) actual_len = len(resp.body.rstrip().split('\n')) assert actual_len == 1, actual_len def test_cdx_server_advanced(self): # combine collapsing, reversing and revisit resolving - resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true') + resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true') # convert back to CDXCaptureResult cdxs = map(CDXCaptureResult, resp.body.rstrip().split('\n')) diff --git a/ui/head_insert.html b/ui/head_insert.html index 3af55ad5..f049aacd 100644 --- a/ui/head_insert.html +++ b/ui/head_insert.html @@ -3,6 +3,6 @@ wbinfo = {} wbinfo.capture_str = 
"{{ cdx['timestamp'] | format_ts }}"; - - + + diff --git a/ui/query.html b/ui/query.html index 11712fda..f4c69e26 100644 --- a/ui/query.html +++ b/ui/query.html @@ -11,9 +11,9 @@ {% for cdx in cdx_lines %} {{ cdx['timestamp'] | format_ts}} - {{ cdx['filename'] }} {{ cdx['statuscode'] }} - {{ cdx['originalurl'] }} + {{ cdx['original'] }} + {{ cdx['filename'] }} {% endfor %}