From 5e9b13e2675abaf209750c5fffb3599dbda5e321 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 8 Jun 2020 09:40:59 -0700 Subject: [PATCH] proxy mode: don't rewrite xml for ajax requests. Support python 3.8 (#563) * rewrite: - don't rewrite xml in proxy mode / html-insert only mode - ajax: if sec-fetch-mode is set to non-navigate, also treat as 'ajax' * ci: build python 3.8, ignore 2.7 failures * reqs: use released ujson for extra_reqs * hmac: add digestmod, fix for py3.8 --- .travis.yml | 2 ++ extra_requirements.txt | 2 +- pywb/apps/rewriterapp.py | 6 ++++++ pywb/rewrite/html_insert_rewriter.py | 9 +++++++++ pywb/rewrite/test/test_html_insert_rewriter.py | 12 +++++++++++- pywb/utils/loaders.py | 3 ++- 6 files changed, 31 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9d7835da..65482bda 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - "3.5" - "3.6" - "3.7" + - "3.8" dist: xenial @@ -39,6 +40,7 @@ after_success: matrix: allow_failures: - env: WR_TEST=yes + - python: "2.7" exclude: - env: WR_TEST=yes diff --git a/extra_requirements.txt b/extra_requirements.txt index aa65d62a..4bea8f80 100644 --- a/extra_requirements.txt +++ b/extra_requirements.txt @@ -2,6 +2,6 @@ certauth youtube-dl boto3 uwsgi -git+https://github.com/esnme/ultrajson.git +ujson pysocks lxml diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 9032588a..c590825a 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -822,6 +822,12 @@ class RewriterApp(object): if value and value.lower() == 'xmlhttprequest': return True + # if Chrome Sec-Fetch-Mode is set and is not 'navigate', then this is likely + # a fetch / ajax request + sec_fetch_mode = environ.get('HTTP_SEC_FETCH_MODE') + if sec_fetch_mode and sec_fetch_mode != 'navigate': + return True + return False def is_preflight(self, environ): diff --git a/pywb/rewrite/html_insert_rewriter.py b/pywb/rewrite/html_insert_rewriter.py index 7a5dcf26..7ffbfac0 100644 --- a/pywb/rewrite/html_insert_rewriter.py +++ b/pywb/rewrite/html_insert_rewriter.py @@ -9,13 +9,22 @@ class HTMLInsertOnlyRewriter(StreamingRewriter): """ NOT_HEAD_REGEX = re.compile(r'(<\s*\b)(?!(html|head))', re.I) + XML_HEADER = re.compile(r'<\?xml.*\?>') + def __init__(self, url_rewriter, **kwargs): super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False) self.head_insert = kwargs['head_insert'] self.done = False + self.first = True def rewrite(self, string): + if self.first: + if self.url_rewriter.rewrite_opts.get('is_ajax') and self.XML_HEADER.search(string): + self.done = True + + self.first = False + if self.done: return string diff --git a/pywb/rewrite/test/test_html_insert_rewriter.py b/pywb/rewrite/test/test_html_insert_rewriter.py index ed3607a4..5d953b91 100644 --- a/pywb/rewrite/test/test_html_insert_rewriter.py +++ b/pywb/rewrite/test/test_html_insert_rewriter.py @@ -16,14 +16,24 @@ r''' >>> parse('text') 'text' + +>>> parse('\n') +'\n' + +# ajax leave unchanged? +>>> parse('\n', is_ajax=True) +'\n' ''' from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter -def parse(html_text): +def parse(html_text, is_ajax=False): urlrewriter = UrlRewriter('20131226101010/https://example.com/some/path.html', '/web/') + if is_ajax: + urlrewriter.rewrite_opts['is_ajax'] = True + rewriter = HTMLInsertOnlyRewriter(urlrewriter, head_insert='') return rewriter.rewrite(html_text) + rewriter.final_read() diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 06e88e38..c8623fa6 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -7,6 +7,7 @@ local and remote access import os import hmac +import hashlib import requests import yaml @@ -485,7 +486,7 @@ class HMACCookieMaker(object): else: msg = expire - hmacdigest = hmac.new(self.key.encode('utf-8'), msg.encode('utf-8')) + hmacdigest = hmac.new(self.key.encode('utf-8'), msg.encode('utf-8'), digestmod=hashlib.md5) hexdigest = hmacdigest.hexdigest() if extra_id: