1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'main' into new-ui-work

This commit is contained in:
Ilya Kreymer 2022-01-25 23:20:24 -08:00
commit 08826f886b
9 changed files with 52 additions and 31 deletions

View File

@ -1,3 +1,15 @@
pywb 2.6.4 changelist
~~~~~~~~~~~~~~~~~~~~~
* wombat.js: actually update to 3.3.6, update built wombat.js
* Fix live mode when ``redirect_to_exact`` is enabled `#692 <https://github.com/webrecorder/pywb/pull/692>`_
* Rules: additional fuzzy ignore of facebook query param: `#691 <https://github.com/webrecorder/pywb/pull/691>`_
* Docs: typo fixes: `#669 <https://github.com/webrecorder/pywb/pull/669>`_, `#670 <https://github.com/webrecorder/pywb/pull/670>`_
pywb 2.6.3 changelist pywb 2.6.3 changelist
~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~

View File

@ -113,8 +113,8 @@ The available access types are as follows:
- ``exclude`` - when matched, results are excluded from the index, as if they do not exist. User will receive a 404. - ``exclude`` - when matched, results are excluded from the index, as if they do not exist. User will receive a 404.
- ``block`` - when matched, results are not excluded from the index, but access to the actual content is blocked. User will see a 451. - ``block`` - when matched, results are not excluded from the index, but access to the actual content is blocked. User will see a 451.
- ``allow`` - full access to the index and the resource, but may be overridden by embargo - ``allow`` - full access to the index and the resource, but may be overridden by embargo.
- ``allow_ignore_embargo`` - full access to the index and resource, overriding any embargo settings - ``allow_ignore_embargo`` - full access to the index and resource, overriding any embargo settings.
The difference between ``exclude`` and ``block`` is that when blocked, the user can be notified that access is blocked, while The difference between ``exclude`` and ``block`` is that when blocked, the user can be notified that access is blocked, while
with exclude, no trace of the resource is presented to the user. with exclude, no trace of the resource is presented to the user.

View File

@ -379,13 +379,11 @@ class RewriterApp(object):
response = self.handle_query(environ, wb_url, kwargs, full_prefix) response = self.handle_query(environ, wb_url, kwargs, full_prefix)
else: else:
# don't return top-frame response for timegate with exact redirects response = self.handle_custom_response(environ, wb_url,
if not (is_timegate and redirect_to_exact): full_prefix, host_prefix,
response = self.handle_custom_response(environ, wb_url, kwargs)
full_prefix, host_prefix,
kwargs)
keep_frame_response = not kwargs.get('no_timegate_check') and is_timegate and not redirect_to_exact and not is_proxy keep_frame_response = (not kwargs.get('no_timegate_check') and is_timegate and not is_proxy) or redirect_to_exact
if response and not keep_frame_response: if response and not keep_frame_response:
@ -465,8 +463,12 @@ class RewriterApp(object):
return self.send_redirect(new_path, url_parts, urlrewriter) return self.send_redirect(new_path, url_parts, urlrewriter)
# only redirect to exact if not live, otherwise set to false
redirect_to_exact = redirect_to_exact and not cdx.get('is_live')
# return top-frame timegate response, with timestamp from cdx # return top-frame timegate response, with timestamp from cdx
if response and keep_frame_response: if response and keep_frame_response and (not redirect_to_exact or not is_timegate):
no_except_close(r.raw) no_except_close(r.raw)
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy, cdx['timestamp']) return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy, cdx['timestamp'])
@ -487,8 +489,8 @@ class RewriterApp(object):
if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1': if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
set_content_loc = True set_content_loc = True
# if redirect to exact timestamp, bit only if not live # if redirect to exact timestamp (only set if not live)
if redirect_to_exact and not cdx.get('is_live'): if redirect_to_exact:
if set_content_loc or is_timegate or wb_url.timestamp != cdx.get('timestamp'): if set_content_loc or is_timegate or wb_url.timestamp != cdx.get('timestamp'):
new_url = urlrewriter.get_new_url(url=target_uri, new_url = urlrewriter.get_new_url(url=target_uri,
timestamp=cdx['timestamp'], timestamp=cdx['timestamp'],

View File

@ -50,6 +50,13 @@ default_filters:
- match: '[?&](\w*(bust|ts)\w*=1[\d]{12,15})(?=&|$)' - match: '[?&](\w*(bust|ts)\w*=1[\d]{12,15})(?=&|$)'
replace: '' replace: ''
# remove facebook link ID when pywb urls are shared on facebook
- match: '[?&](fbclid)=(.*)+(?=&|$)'
replace: ''
rules: rules:
# twitter rules # twitter rules

View File

@ -105,7 +105,7 @@ def test_rev_merge():
# check reverse merge: verify merging of lists, then reversing # check reverse merge: verify merging of lists, then reversing
# eqauls merging with reverse=True of reversed lists # equals merging with reverse=True of reversed lists
assert (list(reversed(list(merge(lines1, lines2)))) == assert (list(reversed(list(merge(lines1, lines2)))) ==
list(merge(reversed(lines1), reversed(lines2), reverse=True))) list(merge(reversed(lines1), reversed(lines2), reverse=True)))

View File

@ -26,12 +26,12 @@ class TestIndexSources(FakeRedisTests, BaseTestClass):
cls.all_sources = { cls.all_sources = {
'file': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'), 'file': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
'redis': RedisIndexSource('redis://localhost:6379/2/test:rediscdx'), 'redis': RedisIndexSource('redis://localhost:6379/2/test:rediscdx'),
'remote_cdx': RemoteIndexSource('https://webenact.rhizome.org/all/cdx?url={url}', 'remote_cdx': RemoteIndexSource('https://webenact.rhizome.org/excellences-and-perfections/cdx?url={url}',
'https://webenact.rhizome.org/all/{timestamp}id_/{url}'), 'https://webenact.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}'),
'memento': MementoIndexSource('https://webenact.rhizome.org/all/{url}', 'memento': MementoIndexSource('https://webenact.rhizome.org/excellences-and-perfections/{url}',
'https://webenact.rhizome.org/all/timemap/link/{url}', 'https://webenact.rhizome.org/excellences-and-perfections/timemap/link/{url}',
'https://webenact.rhizome.org/all/{timestamp}id_/{url}') 'https://webenact.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}')
} }
@pytest.fixture(params=local_sources) @pytest.fixture(params=local_sources)
@ -99,14 +99,10 @@ org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
res, errs = self.query_single_source(remote_source, dict(url=url)) res, errs = self.query_single_source(remote_source, dict(url=url))
expected = """\ expected = """\
com,instagram)/amaliaulman 20141014150552 https://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman com,instagram)/amaliaulman 20141014150552 https://webenact.rhizome.org/excellences-and-perfections/20141014150552id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014152101 https://webenact.rhizome.org/all/20141014152101id_/http://instagram.com/amaliaulman com,instagram)/amaliaulman 20141014155217 https://webenact.rhizome.org/excellences-and-perfections/20141014155217id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014155217 https://webenact.rhizome.org/all/20141014155217id_/http://instagram.com/amaliaulman com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014160238 https://webenact.rhizome.org/all/20141014160238id_/http://instagram.com/amaliaulman com,instagram)/amaliaulman 20141014171636 https://webenact.rhizome.org/excellences-and-perfections/20141014171636id_/http://instagram.com/amaliaulman"""
com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014163116 https://webenact.rhizome.org/all/20141014163116id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014171636 https://webenact.rhizome.org/all/20141014171636id_/http://instagram.com/amaliaulman
com,instagram)/amaliaulman 20141014171954 https://webenact.rhizome.org/all/20141014171954id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected) assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {}) assert(errs == {})
@ -117,7 +113,7 @@ com,instagram)/amaliaulman 20141014171954 https://webenact.rhizome.org/all/20141
res, errs = self.query_single_source(remote_source, dict(url=url, closest='20141014162332', limit=1, allowFuzzy='0')) res, errs = self.query_single_source(remote_source, dict(url=url, closest='20141014162332', limit=1, allowFuzzy='0'))
expected = """\ expected = """\
com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman""" com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected) assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {}) assert(errs == {})
@ -128,21 +124,21 @@ com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/all/20141
res, errs = self.query_single_source(remote_source, dict(url=url, closest='20141014162332', limit=1)) res, errs = self.query_single_source(remote_source, dict(url=url, closest='20141014162332', limit=1))
expected = """\ expected = """\
com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman""" com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected) assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {}) assert(errs == {})
# Url Match -- Wb Memento # Url Match -- Wb Memento
def test_remote_closest_wb_memento_loader(self): def test_remote_closest_wb_memento_loader(self):
replay = 'https://webenact.rhizome.org/all/{timestamp}id_/{url}' replay = 'https://webenact.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}'
source = WBMementoIndexSource(replay, '', replay) source = WBMementoIndexSource(replay, '', replay)
url = 'http://instagram.com/amaliaulman' url = 'http://instagram.com/amaliaulman'
res, errs = self.query_single_source(source, dict(url=url, closest='20141014162332', limit=1)) res, errs = self.query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
expected = """\ expected = """\
com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman""" com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected) assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {}) assert(errs == {})

View File

@ -230,7 +230,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
http_headers_buff = http_headers.to_bytes() http_headers_buff = http_headers.to_bytes()
# if new http_headers_buff is different length, # if new http_headers_buff is different length,
# attempt to adjust content-lenghth on the WARC record # attempt to adjust content-length on the WARC record
if orig_size and len(http_headers_buff) != orig_size: if orig_size and len(http_headers_buff) != orig_size:
orig_cl = payload.rec_headers.get_header('Content-Length') orig_cl = payload.rec_headers.get_header('Content-Length')
if orig_cl: if orig_cl:

View File

@ -161,7 +161,7 @@ class TestPathIndex(object):
res = DefaultResolverMixin.make_best_resolver(a_file) res = DefaultResolverMixin.make_best_resolver(a_file)
assert isinstance(res, PathIndexResolver) assert isinstance(res, PathIndexResolver)
# a dir -- asume prefix # a dir -- assume prefix
res = DefaultResolverMixin.make_best_resolver(a_dir) res = DefaultResolverMixin.make_best_resolver(a_dir)
assert isinstance(res, PrefixResolver) assert isinstance(res, PrefixResolver)

View File

@ -74,6 +74,10 @@ class TestRedirectClassic(BaseConfigTest):
resp = self.get('/live/{0}http://example.com/?test=test', fmod_slash) resp = self.get('/live/{0}http://example.com/?test=test', fmod_slash)
assert resp.status_int == 200 assert resp.status_int == 200
def test_live_top_frame(self):
resp = self.testapp.get('/live/http://example.com/?test=test')
assert 'top_url' not in resp.text
def test_replay_limit_cdx(self): def test_replay_limit_cdx(self):
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/*&output=json') resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/*&output=json')
assert resp.content_type == 'text/x-ndjson' assert resp.content_type == 'text/x-ndjson'