diff --git a/CHANGES.rst b/CHANGES.rst index 2959ae20..ae02ded7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,9 @@ +pywb 0.7.0 changelist +~~~~~~~~~~~~~~~~~~~~~ + +Video Buffering Replay + + pywb 0.6.4 changelist ~~~~~~~~~~~~~~~~~~~~~ @@ -35,7 +41,7 @@ pywb 0.6.0 changelist * Revamped HTTP/S system: proxy collection and capture time switching via cookie! -* removed *hostnames* setting in config.yaml. pywb no longer needs to know the host(s) it is running on, +* removed *hostnames* setting in config.yaml. pywb no longer needs to know the host(s) it is running on, can infer the correct path from referrer on a fallback handling. * remove PAC config, just using direct proxy (HTTP and HTTPS) for simplicity. @@ -136,7 +142,7 @@ pywb 0.4.0 changelist * Improved test coverage throughout the project. -* live-rewrite-server: A new web server for checking rewriting rules against live content. A white-list of request headers is sent to +* live-rewrite-server: A new web server for checking rewriting rules against live content. A white-list of request headers is sent to the destination server. See `rewrite_live.py `_ for more details. * Cookie Rewriting in Archival Mode: HTTP Set-Cookie header rewritten to remove Expires, rewrite Path and Domain. If Domain is used, Path is set to / to ensure cookie is visible from all archival urls. @@ -155,7 +161,7 @@ pywb 0.4.0 changelist * New, experimental support for top-level 'frame mode', used by live-rewrite-server, to display rewritten content in a frame. The mp_ modifier is used to indicate the main page when top-level page is a frame. -* cdx-indexer: Support for creation of non-SURT, url-ordered as well SURT-ordered CDX files. +* cdx-indexer: Support for creation of non-SURT, url-ordered as well SURT-ordered CDX files. * Further rewrite of wombat.js: support for window.open, postMessage overrides, additional rewriting at Node creation time, better hash change detection. Use ``Object.defineProperty`` whenever possible to better override assignment to various JS properties. @@ -173,13 +179,13 @@ pywb 0.3.0 changelist * Generate cdx indexs via command-line `cdx-indexer` script. Optionally sorting, and output to either a single combined file or a file per-directory. Refer to ``cdx-indexer -h`` for more info. - + * Initial support for prefix url queries, eg: http://localhost:8080/pywb/\*/http://example.com\* to query all captures from http://example.com * Support for optional LXML html-based parser for fastest possible parsing. If lxml is installed on the system and via ``pip install lxml``, lxml parser is enabled by default. (This can be turned off by setting ``use_lxml_parser: false`` in the config) * Full support for `Memento Protocol RFC7089 `_ Memento, TimeGate and TimeMaps. Memento: TimeMaps in ``application/link-format`` provided via the ``/timemap/*/`` query.. eg: http://localhost:8080/pywb/timemap/\*/http://example.com - + * pywb now features new `domain-specific rules `_ which are applied to resolve and render certain difficult and dynamic content, in order to make accurate web replay work. This ruleset will be under further iteration to address further challenges as the web evoles. diff --git a/README.rst b/README.rst index 8e8fe103..0fbee5eb 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.6.4 +PyWb 0.7.0 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop @@ -7,7 +7,7 @@ PyWb 0.6.4 :target: https://coveralls.io/r/ikreymer/pywb?branch=develop .. image:: https://img.shields.io/gratipay/ikreymer.svg :target: https://www.gratipay.com/ikreymer/ - + pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC `_ and `WARC `_. @@ -44,7 +44,7 @@ This README contains a basic overview of using pywb. After reading this intro, c pywb Tools Overview ----------------------------- -In addition to the standard wayback machine (explained further below), pywb tool suite includes a +In addition to the standard wayback machine (explained further below), pywb tool suite includes a number of useful command-line and web server tools. The tools should be available to run after running ``python setup.py install``: @@ -151,7 +151,7 @@ If you would like to use non-SURT ordered .cdx files, simply add this field to t :: surt_ordered: false - + UI Customization """"""""""""""""""""" diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 7e72fea6..e648a957 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -88,7 +88,9 @@ class LiveRewriter(object): method = 'GET' data = None - if not proxies and self.default_proxy: + if proxies == False: + proxies = None + elif not proxies and self.default_proxy: proxies = {'http': self.default_proxy, 'https': self.default_proxy} diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 69e8f6b2..757d7da7 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -126,10 +126,20 @@ rules: - video_id - html5 + - url_prefix: 'com,youtube,s)/api/stats/qoe' + + fuzzy_lookup: + - docid + + - url_prefix: 'com,youtube,s)/api/stats/watch' + + fuzzy_lookup: + - docid - url_prefix: 'com,googlevideo,' - fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])' + #fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])' + fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&])' # testing rules -- not for valid domain diff --git a/pywb/static/vidrw.js b/pywb/static/vidrw.js index d5fd1006..248d5373 100644 --- a/pywb/static/vidrw.js +++ b/pywb/static/vidrw.js @@ -44,11 +44,19 @@ __wbvidrw = (function() { if (wbinfo.url.indexOf("://www.youtube.com/watch") > 0) { var ytvideo = document.getElementsByTagName("video"); +/* if (ytvideo.length == 1) { if (ytvideo[0].getAttribute("data-youtube-id") != "") { - check_replacement(ytvideo[0], wbinfo.url); + // Wait to see if video is playing, if so, don't replace it + window.setTimeout(function() { + if (ytvideo[0].readyState == 0) { + console.log("Replacing Broken Video"); + check_replacement(ytvideo[0], wbinfo.url); + } + }, 3000); } } +*/ } } diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 408b413b..2f599bc4 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -517,7 +517,21 @@ window._WBWombat = (function() { override_attr(image, "src"); return image; } - }(Image); + }(window.Image); + } + + //============================================ + function init_date_override(timestamp) { + window.Date = function (Date) { + return function (A, B, C, D, E, F, G) { + if (arguments.length == 0) { + timestamp = parseInt(timestamp) * 1000; + return new Date(timestamp); + } else { + return new Date(A, B, C, D, E, F, G); + } + } + }(window.Date); } //============================================ @@ -859,6 +873,9 @@ window._WBWombat = (function() { // Random init_seeded_random(timestamp); + // Date + init_date_override(timestamp); + // expose functions this.extract_orig = extract_orig; } diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index 4578edf9..39a7f565 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -13,6 +13,7 @@ from pywb.utils.wbexception import WbException import json import requests +import hashlib from rangecache import range_cache @@ -70,25 +71,36 @@ class RewriteHandler(SearchPageWbUrlHandler): if ref_wburl_str: wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - def do_req(): - result = self.rewriter.fetch_request(wbrequest.wb_url.url, - wbrequest.urlrewriter, - head_insert_func=head_insert_func, - req_headers=req_headers, - env=wbrequest.env) + proxies = None # default + ping_url = None + ping_cache_key = None - return self._make_response(wbrequest, *result) + if self.default_proxy and range_cache: + rangeres = range_cache.is_ranged(wbrequest) + if rangeres: + proxies = False - cdx = dict(url=wbrequest.wb_url.url) + hash_ = hashlib.md5() + hash_.update(rangeres[0]) + ping_cache_key = hash_.hexdigest() - range_status, range_iter = range_cache(wbrequest, cdx, do_req) + if ping_cache_key not in range_cache.cache: + ping_url = rangeres[0] - if not range_status or not range_iter: - return do_req() - else: - result = range_status, range_iter, False - return self._make_response(wbrequest, *result) + result = self.rewriter.fetch_request(wbrequest.wb_url.url, + wbrequest.urlrewriter, + head_insert_func=head_insert_func, + req_headers=req_headers, + env=wbrequest.env, + proxies=proxies) + wbresponse = self._make_response(wbrequest, *result) + + if ping_url: + self._proxy_ping(wbrequest, wbresponse, + ping_url, ping_cache_key) + + return wbresponse def _make_response(self, wbrequest, status_headers, gen, is_rewritten): # if cookie set, pass recorded timestamp info via cookie @@ -102,6 +114,37 @@ class RewriteHandler(SearchPageWbUrlHandler): return WbResponse(status_headers, gen) + def _proxy_ping(self, wbrequest, wbresponse, url, key): + def do_proxy_ping(): + proxies = {'http': self.default_proxy, + 'https': self.default_proxy} + + headers = self._live_request_headers(wbrequest) + print('PINGING PROXY: ' + url) + resp = requests.get(url=url, + headers=headers, + proxies=proxies, + verify=False, + stream=True) + + # don't actually read whole response, proxy response for writing it + resp.raw.close() + resp.close() + + # mark as pinged + range_cache.cache[key] = '1' + + return None + + def check_buff_gen(gen): + for x in gen: + yield x + + do_proxy_ping() + + wbresponse.body = check_buff_gen(wbresponse.body) + return wbresponse + def get_video_info(self, wbrequest): if not self.youtubedl: self.youtubedl = YoutubeDLWrapper() diff --git a/pywb/webapp/rangecache.py b/pywb/webapp/rangecache.py index dddb06e5..fe3fae39 100644 --- a/pywb/webapp/rangecache.py +++ b/pywb/webapp/rangecache.py @@ -4,7 +4,6 @@ from pywb.framework.cache import create_cache from tempfile import NamedTemporaryFile -import hashlib import yaml import os import re @@ -28,16 +27,14 @@ class RangeCache(object): new_url = RangeCache.YT_EXTRACT_RX.sub(repl_range, url) if range_h_res: - print('MATCHED') return range_h_res[0], new_url else: return None, url def __init__(self): self.cache = create_cache() - print(type(self.cache)) - def __call__(self, wbrequest, cdx, wbresponse_func): + def is_ranged(self, wbrequest): url = wbrequest.wb_url.url range_h = None use_206 = False @@ -45,37 +42,34 @@ class RangeCache(object): result = self.match_yt(url) if result: range_h, url = result - wbrequest.wb_url.url = url - print(range_h) # check for standard range header if not range_h: range_h = wbrequest.env.get('HTTP_RANGE') if not range_h: - return None, None - range_h = True + return None - return self.handle_range(wbrequest, cdx, url, - wbresponse_func, - range_h, use_206) + use_206 = True - def handle_range(self, wbrequest, cdx, url, wbresponse_func, - range_h, use_206): + return url, range_h, use_206 + + def __call__(self, wbrequest, digest, wbresponse_func): + result = self.is_ranged(wbrequest) + if not result: + return None, None + + return self.handle_range(wbrequest, digest, wbresponse_func, + *result) + + def handle_range(self, wbrequest, digest, wbresponse_func, + url, range_h, use_206): range_h = range_h.split('=')[-1] - key = cdx.get('digest') - if not key: - hash_ = hashlib.md5() - hash_.update(url) - #hash_.update(cdx['timestamp']) - key = hash_.hexdigest() - - print('KEY: ', key) - print('RANGE: ', range_h) - + key = digest if not key in self.cache: - print('MISS') response = wbresponse_func() + if not response: + return None, None with NamedTemporaryFile(delete=False) as fh: for obj in response.body: @@ -86,21 +80,19 @@ class RangeCache(object): spec = dict(name=fh.name, headers=response.status_headers.headers) - print('SET CACHE: ' + key) self.cache[key] = yaml.dump(spec) else: - print('HIT') spec = yaml.load(self.cache[key]) + if not spec: + return None, None + spec['headers'] = [tuple(x) for x in spec['headers']] - print(spec['headers']) - print('TEMP FILE: ' + spec['name']) filelen = os.path.getsize(spec['name']) range_h = range_h.rstrip() if range_h == '0-': - print('FIX RANGE') range_h = '0-120000' parts = range_h.rstrip().split('-') @@ -119,7 +111,6 @@ class RangeCache(object): fh = LimitReader.wrap_stream(fh, maxlen) while True: buf = fh.read() - print('READ: ', len(buf)) if not buf: break @@ -129,17 +120,16 @@ class RangeCache(object): content_range = 'bytes {0}-{1}/{2}'.format(start, start + maxlen - 1, filelen) - print('CONTENT_RANGE: ', content_range) status_headers = StatusAndHeaders('206 Partial Content', spec['headers']) status_headers.replace_header('Content-Range', content_range) else: status_headers = StatusAndHeaders('200 OK', spec['headers']) - status_headers.headers.append(('Accept-Ranges', 'bytes')) - status_headers.headers.append(('Access-Control-Allow-Credentials', 'true')) - status_headers.headers.append(('Access-Control-Allow-Origin', 'http://localhost:8080')) - status_headers.headers.append(('Timing-Allow-Origin', 'http://localhost:8080')) + #status_headers.headers.append(('Accept-Ranges', 'bytes')) + #status_headers.headers.append(('Access-Control-Allow-Credentials', 'true')) + #status_headers.headers.append(('Access-Control-Allow-Origin', 'http://localhost:8080')) + #status_headers.headers.append(('Timing-Allow-Origin', 'http://localhost:8080')) status_headers.replace_header('Content-Length', str(maxlen)) return status_headers, read_range() diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index b5002b0a..877d7ec5 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -107,7 +107,7 @@ class ReplayView(object): return self.replay_capture(wbrequest, cdx, cdx_loader, failed_files) range_status, range_iter = range_cache(wbrequest, - cdx, + cdx.get('digest'), get_capture) if range_status and range_iter: response = self.response_class(range_status, diff --git a/setup.py b/setup.py index 54e3c021..2fe33078 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.6.4', + version='0.7.0', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com',