1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-14 15:53:28 +01:00

proxy mode: don't rewrite xml for ajax requests. Support python 3.8 (#563)

* rewrite:
- don't rewrite xml in proxy mode / html-insert only mode
- ajax: if the Sec-Fetch-Mode request header is set to any value other than 'navigate', also treat the request as 'ajax'

* ci: build python 3.8, ignore 2.7 failures

* reqs: use released ujson for extra_reqs

* hmac: pass digestmod explicitly to hmac.new(), fix for py3.8
This commit is contained in:
Ilya Kreymer 2020-06-08 09:40:59 -07:00 committed by GitHub
parent ed89fcc6f8
commit 5e9b13e267
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 31 additions and 3 deletions

View File

@ -5,6 +5,7 @@ python:
- "3.5"
- "3.6"
- "3.7"
- "3.8"
dist: xenial
@ -39,6 +40,7 @@ after_success:
matrix:
allow_failures:
- env: WR_TEST=yes
- python: "2.7"
exclude:
- env: WR_TEST=yes

View File

@ -2,6 +2,6 @@ certauth
youtube-dl
boto3
uwsgi
git+https://github.com/esnme/ultrajson.git
ujson
pysocks
lxml

View File

@ -822,6 +822,12 @@ class RewriterApp(object):
if value and value.lower() == 'xmlhttprequest':
return True
# if Chrome Sec-Fetch-Mode is set and is not 'navigate', then this is likely
# a fetch / ajax request
sec_fetch_mode = environ.get('HTTP_SEC_FETCH_MODE')
if sec_fetch_mode and sec_fetch_mode != 'navigate':
return True
return False
def is_preflight(self, environ):

View File

@ -9,13 +9,22 @@ class HTMLInsertOnlyRewriter(StreamingRewriter):
"""
NOT_HEAD_REGEX = re.compile(r'(<\s*\b)(?!(html|head))', re.I)
XML_HEADER = re.compile(r'<\?xml.*\?>')
def __init__(self, url_rewriter, **kwargs):
    """
    Set up the rewriter with the markup that should be inserted.

    :param url_rewriter: url rewriter handed on to the parent constructor
    :param kwargs: must contain ``head_insert``; a missing key raises ``KeyError``
    """
    # The parent constructor receives False as its second positional argument.
    # NOTE(review): the meaning of that flag is defined by StreamingRewriter -- confirm there.
    super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False)

    head_insert = kwargs['head_insert']
    self.head_insert = head_insert

    # first: rewrite() has not yet processed any input
    # done: once set, rewrite() returns its input unchanged
    self.first, self.done = True, False
def rewrite(self, string):
if self.first:
if self.url_rewriter.rewrite_opts.get('is_ajax') and self.XML_HEADER.search(string):
self.done = True
self.first = False
if self.done:
return string

View File

@ -16,14 +16,24 @@ r'''
>>> parse('<head></head>text')
'<head></head>text<!--Insert-->'
>>> parse('<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>')
'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<html xmlns="http://www.w3.org/1999/xhtml"><!--Insert--><body></body></html>'
# ajax request with an XML header: content is left unchanged (no insert)
>>> parse('<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>', is_ajax=True)
'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>'
'''
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter
def parse(html_text):
def parse(html_text, is_ajax=False):
    """
    Doctest helper: run ``html_text`` through an HTMLInsertOnlyRewriter.

    :param html_text: the markup to rewrite
    :param is_ajax: when true, ``rewrite_opts['is_ajax']`` is set on the
        url rewriter before the insert rewriter is created
    :return: the output of ``rewrite()`` followed by the output of ``final_read()``
    """
    url_rewriter = UrlRewriter('20131226101010/https://example.com/some/path.html', '/web/')
    if is_ajax:
        url_rewriter.rewrite_opts['is_ajax'] = True

    insert_rewriter = HTMLInsertOnlyRewriter(url_rewriter, head_insert='<!--Insert-->')

    # rewrite() must run before final_read(), same as in the single-expression form
    rewritten = insert_rewriter.rewrite(html_text)
    remainder = insert_rewriter.final_read()
    return rewritten + remainder

View File

@ -7,6 +7,7 @@ local and remote access
import os
import hmac
import hashlib
import requests
import yaml
@ -485,7 +486,7 @@ class HMACCookieMaker(object):
else:
msg = expire
hmacdigest = hmac.new(self.key.encode('utf-8'), msg.encode('utf-8'))
hmacdigest = hmac.new(self.key.encode('utf-8'), msg.encode('utf-8'), digestmod=hashlib.md5)
hexdigest = hmacdigest.hexdigest()
if extra_id: