1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge branch 'main' into upgrade-dependencies

This commit is contained in:
Tessa Walsh 2024-03-27 16:31:05 -04:00 committed by GitHub
commit bfbb4ab09d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 47 additions and 40 deletions

View File

@ -60,9 +60,7 @@ Installation for Deployment
To install pywb for usage, you can use: To install pywb for usage, you can use:
```shell ``pip install pywb``
pip install pywb
```
Note: depending on your Python installation, you may have to use `pip3` instead of `pip`. Note: depending on your Python installation, you may have to use `pip3` instead of `pip`.
@ -70,9 +68,7 @@ Note: depending on your Python installation, you may have to use `pip3` instead
Installation from local copy Installation from local copy
---------------------------- ----------------------------
```shell ``git clone https://github.com/webrecorder/pywb``
git clone https://github.com/webrecorder/pywb
```
To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``. To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``.

View File

@ -667,10 +667,12 @@ class FrontEndApp(object):
# store original script_name (original prefix) before modifications are made # store original script_name (original prefix) before modifications are made
environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME') environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')
lang = args.pop('lang', self.default_locale) lang = args.pop('lang', '')
if lang: if lang:
shift_path_info(environ) shift_path_info(environ)
environ['pywb_lang'] = lang environ['pywb_lang'] = lang
elif self.default_locale:
environ['pywb_lang'] = self.default_locale
response = endpoint(environ, **args) response = endpoint(environ, **args)

View File

@ -12,7 +12,7 @@ from distutils.util import strtobool
from pkg_resources import resource_string, get_distribution from pkg_resources import resource_string, get_distribution
from argparse import ArgumentParser, RawTextHelpFormatter from argparse import ArgumentParser, RawTextHelpFormatter
from tempfile import mkdtemp from tempfile import mkdtemp, TemporaryDirectory
from zipfile import ZipFile from zipfile import ZipFile
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
@ -213,35 +213,35 @@ directory structure expected by pywb
# delete temporary files # delete temporary files
shutil.rmtree(temp_dir) shutil.rmtree(temp_dir)
@staticmethod def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping):
def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping):
from pywb.warcserver.index.cdxobject import CDXObject from pywb.warcserver.index.cdxobject import CDXObject
# copy collection index to temporary directory # rewrite wacz index to temporary index file
tempdir = mkdtemp() tempdir = TemporaryDirectory()
collection_index_name = os.path.basename(collection_index_path) wacz_index_name = os.path.basename(wacz_index_path)
collection_index_temp_path = os.path.join(tempdir, collection_index_name) rewritten_index_path = os.path.join(tempdir.name, wacz_index_name)
if os.path.exists(collection_index_path): with open(rewritten_index_path, 'w') as rewritten_index:
shutil.copy2(collection_index_path, collection_index_temp_path)
with open(collection_index_temp_path, 'a') as collection_index_temp_file:
if wacz_index_path.endswith('.gz'): if wacz_index_path.endswith('.gz'):
wacz_index_file = gzip.open(wacz_index_path, 'rb') wacz_index = gzip.open(wacz_index_path, 'rb')
else: else:
wacz_index_file = open(wacz_index_path, 'rb') wacz_index = open(wacz_index_path, 'rb')
collection_index_temp_file.write('\n')
for line in wacz_index_file.readlines(): for line in wacz_index:
cdx_object = CDXObject(cdxline=line) cdx_object = CDXObject(cdxline=line)
if cdx_object['filename'] in filename_mapping: if cdx_object['filename'] in filename_mapping:
cdx_object['filename'] = filename_mapping[cdx_object['filename']] cdx_object['filename'] = filename_mapping[cdx_object['filename']]
collection_index_temp_file.write(cdx_object.to_cdxj()) rewritten_index.write(cdx_object.to_cdxj())
wacz_index_file.close() if not os.path.isfile(collection_index_path):
shutil.move(rewritten_index_path, collection_index_path)
return
# copy temporary index back to original location and delete temporary directory temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now()
shutil.move(collection_index_temp_path, collection_index_path) self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path)
shutil.rmtree(tempdir) shutil.move(temp_coll_index_path, collection_index_path)
tempdir.cleanup()
def reindex(self): def reindex(self):
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE) cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
@ -294,20 +294,24 @@ directory structure expected by pywb
merged_file = temp_file + '.merged' merged_file = temp_file + '.merged'
last_line = None self._merge_indices(cdx_file, temp_file, merged_file)
with open(cdx_file, 'rb') as orig_index:
with open(temp_file, 'rb') as new_index:
with open(merged_file, 'w+b') as merged:
for line in heapq.merge(orig_index, new_index):
if last_line != line:
merged.write(line)
last_line = line
shutil.move(merged_file, cdx_file) shutil.move(merged_file, cdx_file)
#os.rename(merged_file, cdx_file) #os.rename(merged_file, cdx_file)
os.remove(temp_file) os.remove(temp_file)
@staticmethod
def _merge_indices(index1, index2, dest):
    """Merge two sorted index files into ``dest``, dropping repeated lines.

    Both inputs are read in binary mode and combined with ``heapq.merge``,
    so each input is expected to already be sorted line by line. A line
    that is identical to the line written just before it is skipped.

    :param index1: path of the first sorted index file
    :param index2: path of the second sorted index file
    :param dest: path of the merged file to write (created or truncated)
    """
    def _terminated(index_file):
        # Only the final line of a file can lack a newline. Without this
        # normalisation that line would be glued to the next merged line
        # in the output, and would not compare equal to an otherwise
        # identical, newline-terminated line from the other index.
        for line in index_file:
            yield line if line.endswith(b'\n') else line + b'\n'

    last_line = None
    with open(index1, 'rb') as index1_f, \
            open(index2, 'rb') as index2_f, \
            open(dest, 'wb') as dest_f:
        for line in heapq.merge(_terminated(index1_f), _terminated(index2_f)):
            if line != last_line:
                dest_f.write(line)
                last_line = line
def set_metadata(self, namevalue_pairs): def set_metadata(self, namevalue_pairs):
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml') metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
metadata = None metadata = None

View File

@ -178,7 +178,7 @@ class JinjaEnv(object):
request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO')) request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO'))
if curr_loc: if curr_loc and request_uri.startswith('/' + curr_loc + '/'):
return request_uri.replace(curr_loc, locale, 1) return request_uri.replace(curr_loc, locale, 1)
app_prefix = environ.get('pywb.app_prefix', '') app_prefix = environ.get('pywb.app_prefix', '')
@ -196,11 +196,11 @@ class JinjaEnv(object):
orig_prefix = environ.get('pywb.app_prefix', '') orig_prefix = environ.get('pywb.app_prefix', '')
coll = environ.get('SCRIPT_NAME', '') coll = environ.get('SCRIPT_NAME', '')
if orig_prefix: if orig_prefix and coll.startswith(orig_prefix):
coll = coll[len(orig_prefix):] coll = coll[len(orig_prefix):]
curr_loc = environ.get('pywb_lang', '') curr_loc = environ.get('pywb_lang', '')
if curr_loc: if curr_loc and coll.startswith('/' + curr_loc):
coll = coll[len(curr_loc) + 1:] coll = coll[len(curr_loc) + 1:]
for locale in loc_map.keys(): for locale in loc_map.keys():

View File

@ -3,7 +3,7 @@
{% block body %} {% block body %}
<div class="container text-danger error"> <div class="container text-danger error">
<div class="row justify-content-center"> <div class="row justify-content-center">
<h2 class="display-2">Pywb Error</h2> <h2 class="display-2">{{ _('Pywb Error') }}</h2>
</div> </div>
<div class="row"> <div class="row">
<div class="col-12 text-center"> <div class="col-12 text-center">

View File

@ -75,10 +75,15 @@ class TestManager:
{'example.warc.gz': 'rewritten.warc.gz'}) {'example.warc.gz': 'rewritten.warc.gz'})
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
index_content = f.read() index_content = f.read()
index_content = index_content.strip()
assert 'example.warc.gz' not in index_content assert 'example.warc.gz' not in index_content
assert 'rewritten.warc.gz' in index_content assert 'rewritten.warc.gz' in index_content
# check that collection index is sorted
index_lines = index_content.split('\n')
assert sorted(index_lines) == index_lines
def test_merge_wacz_index_gzip(self, tmp_path): def test_merge_wacz_index_gzip(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),