mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge branch 'main' into upgrade-dependencies
This commit is contained in:
commit
bfbb4ab09d
@ -60,9 +60,7 @@ Installation for Deployment
|
|||||||
|
|
||||||
To install pywb for usage, you can use:
|
To install pywb for usage, you can use:
|
||||||
|
|
||||||
```shell
|
``pip install pywb``
|
||||||
pip install pywb
|
|
||||||
```
|
|
||||||
|
|
||||||
Note: depending on your Python installation, you may have to use `pip3` instead of `pip`.
|
Note: depending on your Python installation, you may have to use `pip3` instead of `pip`.
|
||||||
|
|
||||||
@ -70,9 +68,7 @@ Note: depending on your Python installation, you may have to use `pip3` instead
|
|||||||
Installation from local copy
|
Installation from local copy
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
```shell
|
``git clone https://github.com/webrecorder/pywb``
|
||||||
git clone https://github.com/webrecorder/pywb
|
|
||||||
```
|
|
||||||
|
|
||||||
To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``.
|
To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``.
|
||||||
|
|
||||||
|
@ -667,10 +667,12 @@ class FrontEndApp(object):
|
|||||||
# store original script_name (original prefix) before modifications are made
|
# store original script_name (original prefix) before modifications are made
|
||||||
environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')
|
environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')
|
||||||
|
|
||||||
lang = args.pop('lang', self.default_locale)
|
lang = args.pop('lang', '')
|
||||||
if lang:
|
if lang:
|
||||||
shift_path_info(environ)
|
shift_path_info(environ)
|
||||||
environ['pywb_lang'] = lang
|
environ['pywb_lang'] = lang
|
||||||
|
elif self.default_locale:
|
||||||
|
environ['pywb_lang'] = self.default_locale
|
||||||
|
|
||||||
response = endpoint(environ, **args)
|
response = endpoint(environ, **args)
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ from distutils.util import strtobool
|
|||||||
from pkg_resources import resource_string, get_distribution
|
from pkg_resources import resource_string, get_distribution
|
||||||
|
|
||||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||||
from tempfile import mkdtemp
|
from tempfile import mkdtemp, TemporaryDirectory
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config
|
||||||
@ -213,35 +213,35 @@ directory structure expected by pywb
|
|||||||
# delete temporary files
|
# delete temporary files
|
||||||
shutil.rmtree(temp_dir)
|
shutil.rmtree(temp_dir)
|
||||||
|
|
||||||
@staticmethod
|
def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping):
|
||||||
def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping):
|
|
||||||
from pywb.warcserver.index.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
# copy collection index to temporary directory
|
# rewrite wacz index to temporary index file
|
||||||
tempdir = mkdtemp()
|
tempdir = TemporaryDirectory()
|
||||||
collection_index_name = os.path.basename(collection_index_path)
|
wacz_index_name = os.path.basename(wacz_index_path)
|
||||||
collection_index_temp_path = os.path.join(tempdir, collection_index_name)
|
rewritten_index_path = os.path.join(tempdir.name, wacz_index_name)
|
||||||
|
|
||||||
if os.path.exists(collection_index_path):
|
with open(rewritten_index_path, 'w') as rewritten_index:
|
||||||
shutil.copy2(collection_index_path, collection_index_temp_path)
|
|
||||||
|
|
||||||
with open(collection_index_temp_path, 'a') as collection_index_temp_file:
|
|
||||||
if wacz_index_path.endswith('.gz'):
|
if wacz_index_path.endswith('.gz'):
|
||||||
wacz_index_file = gzip.open(wacz_index_path, 'rb')
|
wacz_index = gzip.open(wacz_index_path, 'rb')
|
||||||
else:
|
else:
|
||||||
wacz_index_file = open(wacz_index_path, 'rb')
|
wacz_index = open(wacz_index_path, 'rb')
|
||||||
collection_index_temp_file.write('\n')
|
|
||||||
for line in wacz_index_file.readlines():
|
for line in wacz_index:
|
||||||
cdx_object = CDXObject(cdxline=line)
|
cdx_object = CDXObject(cdxline=line)
|
||||||
if cdx_object['filename'] in filename_mapping:
|
if cdx_object['filename'] in filename_mapping:
|
||||||
cdx_object['filename'] = filename_mapping[cdx_object['filename']]
|
cdx_object['filename'] = filename_mapping[cdx_object['filename']]
|
||||||
collection_index_temp_file.write(cdx_object.to_cdxj())
|
rewritten_index.write(cdx_object.to_cdxj())
|
||||||
|
|
||||||
wacz_index_file.close()
|
if not os.path.isfile(collection_index_path):
|
||||||
|
shutil.move(rewritten_index_path, collection_index_path)
|
||||||
|
return
|
||||||
|
|
||||||
# copy temporary index back to original location and delete temporary directory
|
temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now()
|
||||||
shutil.move(collection_index_temp_path, collection_index_path)
|
self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path)
|
||||||
shutil.rmtree(tempdir)
|
shutil.move(temp_coll_index_path, collection_index_path)
|
||||||
|
|
||||||
|
tempdir.cleanup()
|
||||||
|
|
||||||
def reindex(self):
|
def reindex(self):
|
||||||
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
||||||
@ -294,20 +294,24 @@ directory structure expected by pywb
|
|||||||
|
|
||||||
merged_file = temp_file + '.merged'
|
merged_file = temp_file + '.merged'
|
||||||
|
|
||||||
last_line = None
|
self._merge_indices(cdx_file, temp_file, merged_file)
|
||||||
|
|
||||||
with open(cdx_file, 'rb') as orig_index:
|
|
||||||
with open(temp_file, 'rb') as new_index:
|
|
||||||
with open(merged_file, 'w+b') as merged:
|
|
||||||
for line in heapq.merge(orig_index, new_index):
|
|
||||||
if last_line != line:
|
|
||||||
merged.write(line)
|
|
||||||
last_line = line
|
|
||||||
|
|
||||||
shutil.move(merged_file, cdx_file)
|
shutil.move(merged_file, cdx_file)
|
||||||
#os.rename(merged_file, cdx_file)
|
#os.rename(merged_file, cdx_file)
|
||||||
os.remove(temp_file)
|
os.remove(temp_file)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _merge_indices(index1, index2, dest):
|
||||||
|
last_line = None
|
||||||
|
|
||||||
|
with open(index1, 'rb') as index1_f:
|
||||||
|
with open(index2, 'rb') as index2_f:
|
||||||
|
with open(dest, 'wb') as dest_f:
|
||||||
|
for line in heapq.merge(index1_f, index2_f):
|
||||||
|
if last_line != line:
|
||||||
|
dest_f.write(line)
|
||||||
|
last_line = line
|
||||||
|
|
||||||
def set_metadata(self, namevalue_pairs):
|
def set_metadata(self, namevalue_pairs):
|
||||||
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
|
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
|
||||||
metadata = None
|
metadata = None
|
||||||
|
@ -178,7 +178,7 @@ class JinjaEnv(object):
|
|||||||
|
|
||||||
request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO'))
|
request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO'))
|
||||||
|
|
||||||
if curr_loc:
|
if curr_loc and request_uri.startswith('/' + curr_loc + '/'):
|
||||||
return request_uri.replace(curr_loc, locale, 1)
|
return request_uri.replace(curr_loc, locale, 1)
|
||||||
|
|
||||||
app_prefix = environ.get('pywb.app_prefix', '')
|
app_prefix = environ.get('pywb.app_prefix', '')
|
||||||
@ -196,11 +196,11 @@ class JinjaEnv(object):
|
|||||||
orig_prefix = environ.get('pywb.app_prefix', '')
|
orig_prefix = environ.get('pywb.app_prefix', '')
|
||||||
coll = environ.get('SCRIPT_NAME', '')
|
coll = environ.get('SCRIPT_NAME', '')
|
||||||
|
|
||||||
if orig_prefix:
|
if orig_prefix and coll.startswith(orig_prefix):
|
||||||
coll = coll[len(orig_prefix):]
|
coll = coll[len(orig_prefix):]
|
||||||
|
|
||||||
curr_loc = environ.get('pywb_lang', '')
|
curr_loc = environ.get('pywb_lang', '')
|
||||||
if curr_loc:
|
if curr_loc and coll.startswith('/' + curr_loc):
|
||||||
coll = coll[len(curr_loc) + 1:]
|
coll = coll[len(curr_loc) + 1:]
|
||||||
|
|
||||||
for locale in loc_map.keys():
|
for locale in loc_map.keys():
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
{% block body %}
|
{% block body %}
|
||||||
<div class="container text-danger error">
|
<div class="container text-danger error">
|
||||||
<div class="row justify-content-center">
|
<div class="row justify-content-center">
|
||||||
<h2 class="display-2">Pywb Error</h2>
|
<h2 class="display-2">{{ _('Pywb Error') }}</h2>
|
||||||
</div>
|
</div>
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col-12 text-center">
|
<div class="col-12 text-center">
|
||||||
|
@ -75,10 +75,15 @@ class TestManager:
|
|||||||
{'example.warc.gz': 'rewritten.warc.gz'})
|
{'example.warc.gz': 'rewritten.warc.gz'})
|
||||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
index_content = f.read()
|
index_content = f.read()
|
||||||
|
index_content = index_content.strip()
|
||||||
|
|
||||||
assert 'example.warc.gz' not in index_content
|
assert 'example.warc.gz' not in index_content
|
||||||
assert 'rewritten.warc.gz' in index_content
|
assert 'rewritten.warc.gz' in index_content
|
||||||
|
|
||||||
|
# check that collection index is sorted
|
||||||
|
index_lines = index_content.split('\n')
|
||||||
|
assert sorted(index_lines) == index_lines
|
||||||
|
|
||||||
def test_merge_wacz_index_gzip(self, tmp_path):
|
def test_merge_wacz_index_gzip(self, tmp_path):
|
||||||
manager = self.get_test_collections_manager(tmp_path)
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
|
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user