mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Compare commits
31 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
7b0f8b5860 | ||
|
b44c93bf6e | ||
|
97fffe3a34 | ||
|
6205646b9b | ||
|
23891be2f1 | ||
|
b190dddee9 | ||
|
b9f1609df9 | ||
|
e89924bd39 | ||
|
b4c91c6633 | ||
|
1e2665af13 | ||
|
fee14d7fe8 | ||
|
5712945991 | ||
|
2fd6190b72 | ||
|
791a8d1033 | ||
|
86ee3bd752 | ||
|
d1e1636ae3 | ||
|
b4955cca66 | ||
|
f40e7ef18c | ||
|
6b4f9b323e | ||
|
7879dd0222 | ||
|
013746c10a | ||
|
79140441df | ||
|
af92a9726e | ||
|
83b2113be2 | ||
|
ed36830dc5 | ||
|
81b6a57dfb | ||
|
5c427b9ff2 | ||
|
454486bf75 | ||
|
b8693307d1 | ||
|
98be48d6e4 | ||
|
c441d83435 |
2
.github/workflows/ci.yaml
vendored
2
.github/workflows/ci.yaml
vendored
@ -8,7 +8,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
max-parallel: 3
|
max-parallel: 3
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ['3.7', '3.8', '3.9', '3.10']
|
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: checkout
|
- name: checkout
|
||||||
|
4
.gitignore
vendored
4
.gitignore
vendored
@ -53,3 +53,7 @@ git_hash.py
|
|||||||
|
|
||||||
# Sphinx documentation
|
# Sphinx documentation
|
||||||
docs/_build/*
|
docs/_build/*
|
||||||
|
|
||||||
|
# virtualenvs
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
@ -1181,7 +1181,7 @@ pywb 0.9.6 changelist
|
|||||||
pywb 0.9.5 changelist
|
pywb 0.9.5 changelist
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
* s3 loading: support ``s3://`` scheme in block loader, allowing for loading index and archive files from s3. ``boto`` library must be installed seperately
|
* s3 loading: support ``s3://`` scheme in block loader, allowing for loading index and archive files from s3. ``boto`` library must be installed separately
|
||||||
via ``pip install boto``. Attempt default boto auth path, and if that fails, attempt anonymous s3 connection.
|
via ``pip install boto``. Attempt default boto auth path, and if that fails, attempt anonymous s3 connection.
|
||||||
|
|
||||||
* Wombat/Client-Side Rewrite Customizations: New ``rewrite_opts.client`` settings from ``config.yaml`` are passed directly to wombat as json.
|
* Wombat/Client-Side Rewrite Customizations: New ``rewrite_opts.client`` settings from ``config.yaml`` are passed directly to wombat as json.
|
||||||
@ -1277,7 +1277,7 @@ pywb 0.9.1 changelist
|
|||||||
|
|
||||||
* cdx server query: add support for ``url=*.host`` and ``url=host/*`` as shortcuts for ``matchType=domain`` and ``matchType=prefix``
|
* cdx server query: add support for ``url=*.host`` and ``url=host/*`` as shortcuts for ``matchType=domain`` and ``matchType=prefix``
|
||||||
|
|
||||||
* zipnum cdx cluster: support loading index shared from prefix path instead of seperate location file.
|
* zipnum cdx cluster: support loading index shared from prefix path instead of separate location file.
|
||||||
|
|
||||||
The ``shard_index_loc`` config property may contain match and replace properties.
|
The ``shard_index_loc`` config property may contain match and replace properties.
|
||||||
Regex replacement is then used to obtain path prefix from the shard prefix path.
|
Regex replacement is then used to obtain path prefix from the shard prefix path.
|
||||||
@ -1643,7 +1643,7 @@ pywb 0.4.7 changelist
|
|||||||
|
|
||||||
* Rewrite: Parsing of html as raw bytes instead of decode/encode, detection still needed for non-ascii compatible encoding.
|
* Rewrite: Parsing of html as raw bytes instead of decode/encode, detection still needed for non-ascii compatible encoding.
|
||||||
|
|
||||||
* Indexing: Refactoring of cdx-indexer using a seperate 'archive record iterator' and pluggable cdx writer classes. Groundwork for creating custom indexers.
|
* Indexing: Refactoring of cdx-indexer using a separate 'archive record iterator' and pluggable cdx writer classes. Groundwork for creating custom indexers.
|
||||||
|
|
||||||
* Indexing: Support for 9 field cdx formats with -9 flag.
|
* Indexing: Support for 9 field cdx formats with -9 flag.
|
||||||
|
|
||||||
|
12
README.rst
12
README.rst
@ -1,4 +1,4 @@
|
|||||||
Webrecorder pywb 2.7
|
Webrecorder pywb 2.8
|
||||||
====================
|
====================
|
||||||
|
|
||||||
.. image:: https://raw.githubusercontent.com/webrecorder/pywb/main/pywb/static/pywb-logo.png
|
.. image:: https://raw.githubusercontent.com/webrecorder/pywb/main/pywb/static/pywb-logo.png
|
||||||
@ -13,7 +13,7 @@ Web Archiving Tools for All
|
|||||||
|
|
||||||
`View the full pywb documentation <https://pywb.readthedocs.org>`_
|
`View the full pywb documentation <https://pywb.readthedocs.org>`_
|
||||||
|
|
||||||
**pywb** is a Python (2 and 3) web archiving toolkit for replaying web archives large and small as accurately as possible.
|
**pywb** is a Python 3 web archiving toolkit for replaying web archives large and small as accurately as possible.
|
||||||
The toolkit now also includes new features for creating high-fidelity web archives.
|
The toolkit now also includes new features for creating high-fidelity web archives.
|
||||||
|
|
||||||
This toolset forms the foundation of Webrecorder project, but also provides a generic web archiving toolkit
|
This toolset forms the foundation of Webrecorder project, but also provides a generic web archiving toolkit
|
||||||
@ -60,9 +60,7 @@ Installation for Deployment
|
|||||||
|
|
||||||
To install pywb for usage, you can use:
|
To install pywb for usage, you can use:
|
||||||
|
|
||||||
```shell
|
``pip install pywb``
|
||||||
pip install pywb
|
|
||||||
```
|
|
||||||
|
|
||||||
Note: depending on your Python installation, you may have to use `pip3` instead of `pip`.
|
Note: depending on your Python installation, you may have to use `pip3` instead of `pip`.
|
||||||
|
|
||||||
@ -70,9 +68,7 @@ Note: depending on your Python installation, you may have to use `pip3` instead
|
|||||||
Installation from local copy
|
Installation from local copy
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
```shell
|
``git clone https://github.com/webrecorder/pywb``
|
||||||
git clone https://github.com/webrecorder/pywb
|
|
||||||
```
|
|
||||||
|
|
||||||
To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``.
|
To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``.
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@ debug: true
|
|||||||
# navbar_background_hex: 0c49b0
|
# navbar_background_hex: 0c49b0
|
||||||
# navbar_color_hex: fff
|
# navbar_color_hex: fff
|
||||||
# navbar_light_buttons: true
|
# navbar_light_buttons: true
|
||||||
|
# disable_printing: true
|
||||||
|
|
||||||
collections:
|
collections:
|
||||||
all: $all
|
all: $all
|
||||||
|
@ -105,6 +105,12 @@ Given these rules, a user would:
|
|||||||
* but would receive an 'access blocked' error message when viewing ``http://httpbin.org/`` (block)
|
* but would receive an 'access blocked' error message when viewing ``http://httpbin.org/`` (block)
|
||||||
* would receive a 404 not found error when viewing ``http://httpbin.org/anything`` (exclude)
|
* would receive a 404 not found error when viewing ``http://httpbin.org/anything`` (exclude)
|
||||||
|
|
||||||
|
To match any possible URL in an .aclj file, set ``*,`` as the leading SURT, for example::
|
||||||
|
|
||||||
|
*, - {"access": "allow"}
|
||||||
|
|
||||||
|
Lines starting with ``*,`` should generally be at the end of the file, respecting the reverse alphabetical order.
|
||||||
|
|
||||||
|
|
||||||
Access Types: allow, block, exclude, allow_ignore_embargo
|
Access Types: allow, block, exclude, allow_ignore_embargo
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
@ -149,6 +155,10 @@ To make this work, pywb must be running behind an Apache or Nginx system that is
|
|||||||
|
|
||||||
For example, this header may be set based on IP range, or based on password authentication.
|
For example, this header may be set based on IP range, or based on password authentication.
|
||||||
|
|
||||||
|
To allow a user access to all URLs, overriding more specific rules and the ``default_access`` configuration setting, use the ``*,`` SURT::
|
||||||
|
|
||||||
|
*, - {"access": "allow", "user": "staff"}
|
||||||
|
|
||||||
Further examples of how to set this header will be provided in the deployments section.
|
Further examples of how to set this header will be provided in the deployments section.
|
||||||
|
|
||||||
**Note: Do not use the user-based rules without configuring proper authentication on an Apache or Nginx frontend to set or remove this header, otherwise the 'X-Pywb-ACL-User' can easily be faked.**
|
**Note: Do not use the user-based rules without configuring proper authentication on an Apache or Nginx frontend to set or remove this header, otherwise the 'X-Pywb-ACL-User' can easily be faked.**
|
||||||
|
@ -46,6 +46,7 @@ It can be used to:
|
|||||||
|
|
||||||
* Create a new collection -- ``wb-manager init <coll>``
|
* Create a new collection -- ``wb-manager init <coll>``
|
||||||
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
|
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
|
||||||
|
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
|
||||||
* Add override templates
|
* Add override templates
|
||||||
* Add and remove metadata to a collections ``metadata.yaml``
|
* Add and remove metadata to a collections ``metadata.yaml``
|
||||||
* List all collections
|
* List all collections
|
||||||
|
@ -95,8 +95,8 @@ add the WARC to a new collection and start pywb:
|
|||||||
|
|
||||||
docker pull webrecorder/pywb
|
docker pull webrecorder/pywb
|
||||||
docker run -e INIT_COLLECTION=my-web-archive -v /pywb-data:/webarchive \
|
docker run -e INIT_COLLECTION=my-web-archive -v /pywb-data:/webarchive \
|
||||||
-v /path/to:/source webrecorder/pywb wb-manager add default /path/to/my_warc.warc.gz
|
-v /path/to:/source webrecorder/pywb wb-manager add my-web-archive /source/my_warc.warc.gz
|
||||||
docker run -p 8080:8080 -v /pywb-data/:/webarchive wayback
|
docker run -p 8080:8080 -v /pywb-data/:/webarchive webrecorder/pywb wayback
|
||||||
|
|
||||||
This example is equivalent to the non-Docker example above.
|
This example is equivalent to the non-Docker example above.
|
||||||
|
|
||||||
@ -114,6 +114,8 @@ Using Existing Web Archive Collections
|
|||||||
Existing archives of WARCs/ARCs files can be used with pywb with minimal amount of setup. By using ``wb-manager add``,
|
Existing archives of WARCs/ARCs files can be used with pywb with minimal amount of setup. By using ``wb-manager add``,
|
||||||
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
|
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
|
||||||
|
|
||||||
|
In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
|
||||||
|
|
||||||
By default ``wb-manager``, places new collections in ``collections/<coll name>`` subdirectory in the current working directory. To specify a different root directory, the ``wb-manager -d <dir>``. Other options can be set in the config file.
|
By default ``wb-manager``, places new collections in ``collections/<coll name>`` subdirectory in the current working directory. To specify a different root directory, the ``wb-manager -d <dir>``. Other options can be set in the config file.
|
||||||
|
|
||||||
If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process.
|
If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process.
|
||||||
@ -154,20 +156,20 @@ To enable auto-indexing, run with ``wayback -a`` or ``wayback -a --auto-interval
|
|||||||
Creating a Web Archive
|
Creating a Web Archive
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
Using Webrecorder
|
Using ArchiveWeb.page
|
||||||
^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
If you do not have a web archive to test, one easy way to create one is to use `Webrecorder <https://webrecorder.io>`_
|
If you do not have a web archive to test, one easy way to create one is to use the `ArchiveWeb.page <https://archiveweb.page>`_ browser extension for Chrome and other Chromium-based browsers such as Brave Browser. ArchiveWeb.page records pages visited during an archiving session in the browser, and provides means of both replaying and downloading the archived items created.
|
||||||
|
|
||||||
After recording, you can click **Stop** and then click `Download Collection` to receive a WARC (`.warc.gz`) file.
|
Follow the instructions in `How To Create Web Archives with ArchiveWeb.page <https://archiveweb.page/en/usage/>`_. After recording, press **Stop** and then `download your collection <https://archiveweb.page/en/download/>`_ to receive a WARC (`.warc.gz`) file. If you choose to download your collection in the WACZ format, the WARC files can be found inside the zipped WACZ in the ``archive/`` directory.
|
||||||
|
|
||||||
You can then use this with work with pywb.
|
You can then use your WARCs to work with pywb.
|
||||||
|
|
||||||
|
|
||||||
Using pywb Recorder
|
Using pywb Recorder
|
||||||
^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
The core recording functionality in Webrecorder is also part of :mod:`pywb`. If you want to create a WARC locally, this can be
|
Recording functionality is also part of :mod:`pywb`. If you want to create a WARC locally, this can be
|
||||||
done by directly recording into your pywb collection:
|
done by directly recording into your pywb collection:
|
||||||
|
|
||||||
1. Create a collection: ``wb-manager init my-web-archive`` (if you haven't already created a web archive collection)
|
1. Create a collection: ``wb-manager init my-web-archive`` (if you haven't already created a web archive collection)
|
||||||
@ -180,6 +182,14 @@ In this configuration, the indexing happens every 10 seconds.. After 10 seconds,
|
|||||||
``http://localhost:8080/my-web-archive/http://example.com/``
|
``http://localhost:8080/my-web-archive/http://example.com/``
|
||||||
|
|
||||||
|
|
||||||
|
Using Browsertrix
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
For a more automated browser-based web archiving experience, `Browsertrix <https://browsertrix.com/>`_ provides a web interface for configuring, scheduling, running, reviewing, and curating crawls of web content. Crawl activity is shown in a live screencast of the browsers used for crawling and all web archives created in Browsertrix can be easily downloaded from the application in the WACZ format.
|
||||||
|
|
||||||
|
`Browsertrix Crawler <https://crawler.docs.browsertrix.com/>`_, which provides the underlying crawling functionality of Browsertrix, can also be run standalone in a Docker container on your local computer.
|
||||||
|
|
||||||
|
|
||||||
HTTP/S Proxy Mode Access
|
HTTP/S Proxy Mode Access
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
|
@ -68,6 +68,21 @@ For example, to have the logo redirect to ``https://example.com/web-archive-land
|
|||||||
logo_home_url: https://example.com/web-archive-landing-page
|
logo_home_url: https://example.com/web-archive-landing-page
|
||||||
|
|
||||||
|
|
||||||
|
Printing
|
||||||
|
^^^^^^^^
|
||||||
|
|
||||||
|
As of pywb 2.8, the replay header includes a print button that prints the contents of the replay iframe.
|
||||||
|
|
||||||
|
This button can be disabled by setting ``ui.disable_printing`` in ``config.yaml`` to any value.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
ui:
|
||||||
|
disable_printing: true
|
||||||
|
|
||||||
|
|
||||||
Banner Colors
|
Banner Colors
|
||||||
^^^^^^^^^^^^^
|
^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
from gevent.monkey import patch_all; patch_all()
|
from gevent.monkey import patch_all; patch_all()
|
||||||
|
|
||||||
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
|
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
|
||||||
from werkzeug.wsgi import pop_path_info
|
from wsgiref.util import shift_path_info
|
||||||
from six.moves.urllib.parse import urljoin, parse_qsl
|
from six.moves.urllib.parse import urljoin, parse_qsl
|
||||||
from six import iteritems
|
from six import iteritems
|
||||||
from warcio.utils import to_native_str
|
from warcio.utils import to_native_str
|
||||||
@ -434,7 +434,11 @@ class FrontEndApp(object):
|
|||||||
cdx_url += 'limit=' + str(self.query_limit)
|
cdx_url += 'limit=' + str(self.query_limit)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
res = requests.get(cdx_url, stream=True)
|
headers = {}
|
||||||
|
for key in environ.keys():
|
||||||
|
if key.startswith("HTTP_X_"):
|
||||||
|
headers[key[5:].replace("_", "-")] = environ[key]
|
||||||
|
res = requests.get(cdx_url, stream=True, headers=headers)
|
||||||
|
|
||||||
status_line = '{} {}'.format(res.status_code, res.reason)
|
status_line = '{} {}'.format(res.status_code, res.reason)
|
||||||
content_type = res.headers.get('Content-Type')
|
content_type = res.headers.get('Content-Type')
|
||||||
@ -554,9 +558,9 @@ class FrontEndApp(object):
|
|||||||
return
|
return
|
||||||
|
|
||||||
if coll != '$root':
|
if coll != '$root':
|
||||||
pop_path_info(environ)
|
shift_path_info(environ)
|
||||||
if record:
|
if record:
|
||||||
pop_path_info(environ)
|
shift_path_info(environ)
|
||||||
|
|
||||||
paths = [self.warcserver.root_dir]
|
paths = [self.warcserver.root_dir]
|
||||||
|
|
||||||
@ -599,7 +603,7 @@ class FrontEndApp(object):
|
|||||||
and message.
|
and message.
|
||||||
|
|
||||||
:param dict environ: The WSGI environment dictionary for the request
|
:param dict environ: The WSGI environment dictionary for the request
|
||||||
:param str err_type: The identifier for type of error that occured
|
:param str err_type: The identifier for type of error that occurred
|
||||||
:param str url: The url of the archived page that was requested
|
:param str url: The url of the archived page that was requested
|
||||||
"""
|
"""
|
||||||
raise AppPageNotFound(err_type, url)
|
raise AppPageNotFound(err_type, url)
|
||||||
@ -663,10 +667,14 @@ class FrontEndApp(object):
|
|||||||
# store original script_name (original prefix) before modifications are made
|
# store original script_name (original prefix) before modifications are made
|
||||||
environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')
|
environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')
|
||||||
|
|
||||||
lang = args.pop('lang', self.default_locale)
|
lang = args.pop('lang', '')
|
||||||
|
if lang:
|
||||||
|
shift_path_info(environ)
|
||||||
|
|
||||||
if lang:
|
if lang:
|
||||||
pop_path_info(environ)
|
|
||||||
environ['pywb_lang'] = lang
|
environ['pywb_lang'] = lang
|
||||||
|
elif self.default_locale:
|
||||||
|
environ['pywb_lang'] = self.default_locale
|
||||||
|
|
||||||
response = endpoint(environ, **args)
|
response = endpoint(environ, **args)
|
||||||
|
|
||||||
|
@ -64,7 +64,7 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
if not jinja_env:
|
if not jinja_env:
|
||||||
jinja_env = JinjaEnv(globals={'static_path': 'static'},
|
jinja_env = JinjaEnv(globals={'static_path': 'static'},
|
||||||
extensions=['jinja2.ext.i18n', 'jinja2.ext.with_'])
|
extensions=['jinja2.ext.i18n'])
|
||||||
jinja_env.jinja_env.install_null_translations()
|
jinja_env.jinja_env.install_null_translations()
|
||||||
|
|
||||||
self.jinja_env = jinja_env
|
self.jinja_env = jinja_env
|
||||||
|
@ -384,7 +384,7 @@ url timestamp { ... }
|
|||||||
|
|
||||||
output_help = """
|
output_help = """
|
||||||
Output file or directory.
|
Output file or directory.
|
||||||
- If directory, each input file is written to a seperate output file
|
- If directory, each input file is written to a separate output file
|
||||||
with a .cdx extension
|
with a .cdx extension
|
||||||
- If output is '-', output is written to stdout
|
- If output is '-', output is written to stdout
|
||||||
"""
|
"""
|
||||||
|
@ -102,11 +102,11 @@ class ACLManager(CollectionsManager):
|
|||||||
|
|
||||||
except IOError as io:
|
except IOError as io:
|
||||||
if must_exist:
|
if must_exist:
|
||||||
print('Error Occured: ' + str(io))
|
print('Error Occurred: ' + str(io))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print('Error Occured: ' + str(e))
|
print('Error Occurred: ' + str(e))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def save_acl(self, r=None):
|
def save_acl(self, r=None):
|
||||||
|
@ -5,12 +5,16 @@ import logging
|
|||||||
import heapq
|
import heapq
|
||||||
import yaml
|
import yaml
|
||||||
import re
|
import re
|
||||||
|
import gzip
|
||||||
import six
|
import six
|
||||||
|
import pathlib
|
||||||
|
|
||||||
from distutils.util import strtobool
|
from distutils.util import strtobool
|
||||||
from pkg_resources import resource_string, get_distribution
|
from pkg_resources import resource_string, get_distribution
|
||||||
|
|
||||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||||
|
from tempfile import mkdtemp, TemporaryDirectory
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config
|
||||||
from warcio.timeutils import timestamp20_now
|
from warcio.timeutils import timestamp20_now
|
||||||
@ -47,6 +51,9 @@ directory structure expected by pywb
|
|||||||
|
|
||||||
COLLS_DIR = 'collections'
|
COLLS_DIR = 'collections'
|
||||||
|
|
||||||
|
WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$')
|
||||||
|
WACZ_RX = re.compile(r'.*\.wacz$')
|
||||||
|
|
||||||
def __init__(self, coll_name, colls_dir=None, must_exist=True):
|
def __init__(self, coll_name, colls_dir=None, must_exist=True):
|
||||||
colls_dir = colls_dir or self.COLLS_DIR
|
colls_dir = colls_dir or self.COLLS_DIR
|
||||||
self.default_config = load_yaml_config(DEFAULT_CONFIG)
|
self.default_config = load_yaml_config(DEFAULT_CONFIG)
|
||||||
@ -115,29 +122,142 @@ directory structure expected by pywb
|
|||||||
'To create a new collection, run\n\n{1} init {0}')
|
'To create a new collection, run\n\n{1} init {0}')
|
||||||
raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
||||||
|
|
||||||
def add_warcs(self, warcs):
|
def add_archives(self, archives, unpack_wacz=False):
|
||||||
if not os.path.isdir(self.archive_dir):
|
if not os.path.isdir(self.archive_dir):
|
||||||
raise IOError('Directory {0} does not exist'.
|
raise IOError('Directory {0} does not exist'.
|
||||||
format(self.archive_dir))
|
format(self.archive_dir))
|
||||||
|
|
||||||
full_paths = []
|
invalid_archives = []
|
||||||
duplicate_warcs = []
|
warc_paths = []
|
||||||
for filename in warcs:
|
for archive in archives:
|
||||||
filename = os.path.abspath(filename)
|
if self.WARC_RX.match(archive):
|
||||||
|
full_path = self._add_warc(archive)
|
||||||
|
if full_path:
|
||||||
|
warc_paths.append(full_path)
|
||||||
|
elif self.WACZ_RX.match(archive):
|
||||||
|
if unpack_wacz:
|
||||||
|
self._add_wacz_unpacked(archive)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
|
||||||
|
'\'--unpack-wacz\' flag to add the wacz\'s content.')
|
||||||
|
else:
|
||||||
|
invalid_archives.append(archive)
|
||||||
|
|
||||||
|
self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE)
|
||||||
|
|
||||||
|
if invalid_archives:
|
||||||
|
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
|
||||||
|
|
||||||
|
def _rename_warc(self, warc_basename):
|
||||||
|
dupe_idx = 1
|
||||||
|
ext = ''.join(pathlib.Path(warc_basename).suffixes)
|
||||||
|
pre_ext_name = warc_basename.split(ext)[0]
|
||||||
|
|
||||||
|
while True:
|
||||||
|
new_basename = f'{pre_ext_name}-{dupe_idx}{ext}'
|
||||||
|
if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
|
||||||
|
break
|
||||||
|
dupe_idx += 1
|
||||||
|
|
||||||
|
return new_basename
|
||||||
|
|
||||||
|
def _add_warc(self, warc):
|
||||||
|
warc_source = os.path.abspath(warc)
|
||||||
|
source_dir, warc_basename = os.path.split(warc_source)
|
||||||
|
|
||||||
# don't overwrite existing warcs with duplicate names
|
# don't overwrite existing warcs with duplicate names
|
||||||
if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))):
|
if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
|
||||||
duplicate_warcs.append(filename)
|
warc_basename = self._rename_warc(warc_basename)
|
||||||
continue
|
logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')
|
||||||
|
|
||||||
shutil.copy2(filename, self.archive_dir)
|
warc_dest = os.path.join(self.archive_dir, warc_basename)
|
||||||
full_paths.append(os.path.join(self.archive_dir, filename))
|
shutil.copy2(warc_source, warc_dest)
|
||||||
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
|
||||||
|
return warc_dest
|
||||||
|
|
||||||
self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE)
|
def _add_wacz_unpacked(self, wacz):
|
||||||
|
wacz = os.path.abspath(wacz)
|
||||||
|
temp_dir = mkdtemp()
|
||||||
|
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
|
||||||
|
cdx_regex = re.compile(r'.+\.cdx(\.gz)?$')
|
||||||
|
with ZipFile(wacz, 'r') as wacz_zip_file:
|
||||||
|
archive_members = wacz_zip_file.namelist()
|
||||||
|
warc_files = [file for file in archive_members if warc_regex.match(file)]
|
||||||
|
if not warc_files:
|
||||||
|
logging.warning(f'WACZ {wacz} does not contain any warc files.')
|
||||||
|
return
|
||||||
|
|
||||||
if duplicate_warcs:
|
# extract warc files
|
||||||
logging.warning(f'Warcs {", ".join(duplicate_warcs)} weren\'t added because of duplicate names.')
|
for warc_file in warc_files:
|
||||||
|
wacz_zip_file.extract(warc_file, temp_dir)
|
||||||
|
|
||||||
|
cdx_files = [file for file in archive_members if cdx_regex.match(file)]
|
||||||
|
if not cdx_files:
|
||||||
|
logging.warning(f'WACZ {wacz} does not contain any indices.')
|
||||||
|
return
|
||||||
|
|
||||||
|
for cdx_file in cdx_files:
|
||||||
|
wacz_zip_file.extract(cdx_file, temp_dir)
|
||||||
|
|
||||||
|
# copy extracted warc files to collections archive dir, use wacz filename as filename with added index if
|
||||||
|
# multiple warc files exist
|
||||||
|
warc_filename_mapping = {}
|
||||||
|
full_paths = []
|
||||||
|
for idx, extracted_warc_file in enumerate(warc_files):
|
||||||
|
_, warc_ext = os.path.splitext(extracted_warc_file)
|
||||||
|
if warc_ext == '.gz':
|
||||||
|
warc_ext = '.warc.gz'
|
||||||
|
warc_filename = os.path.basename(wacz)
|
||||||
|
warc_filename, _ = os.path.splitext(warc_filename)
|
||||||
|
warc_filename = f'{warc_filename}-{idx}{warc_ext}'
|
||||||
|
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
|
||||||
|
|
||||||
|
if os.path.exists(warc_destination_path):
|
||||||
|
warc_filename = self._rename_warc(warc_filename)
|
||||||
|
logging.info(f'Warc {warc_destination_path} already exists - renamed to {warc_filename}.')
|
||||||
|
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
|
||||||
|
|
||||||
|
warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
|
||||||
|
shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)
|
||||||
|
full_paths.append(warc_destination_path)
|
||||||
|
|
||||||
|
# rewrite filenames in wacz indices and merge them with collection index file
|
||||||
|
for cdx_file in cdx_files:
|
||||||
|
self._add_wacz_index(os.path.join(self.indexes_dir, self.DEF_INDEX_FILE), os.path.join(temp_dir, cdx_file),
|
||||||
|
warc_filename_mapping)
|
||||||
|
|
||||||
|
# delete temporary files
|
||||||
|
shutil.rmtree(temp_dir)
|
||||||
|
|
||||||
|
def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping):
|
||||||
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
|
# rewrite wacz index to temporary index file
|
||||||
|
tempdir = TemporaryDirectory()
|
||||||
|
wacz_index_name = os.path.basename(wacz_index_path)
|
||||||
|
rewritten_index_path = os.path.join(tempdir.name, wacz_index_name)
|
||||||
|
|
||||||
|
with open(rewritten_index_path, 'w') as rewritten_index:
|
||||||
|
if wacz_index_path.endswith('.gz'):
|
||||||
|
wacz_index = gzip.open(wacz_index_path, 'rb')
|
||||||
|
else:
|
||||||
|
wacz_index = open(wacz_index_path, 'rb')
|
||||||
|
|
||||||
|
for line in wacz_index:
|
||||||
|
cdx_object = CDXObject(cdxline=line)
|
||||||
|
if cdx_object['filename'] in filename_mapping:
|
||||||
|
cdx_object['filename'] = filename_mapping[cdx_object['filename']]
|
||||||
|
rewritten_index.write(cdx_object.to_cdxj())
|
||||||
|
|
||||||
|
if not os.path.isfile(collection_index_path):
|
||||||
|
shutil.move(rewritten_index_path, collection_index_path)
|
||||||
|
return
|
||||||
|
|
||||||
|
temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now()
|
||||||
|
self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path)
|
||||||
|
shutil.move(temp_coll_index_path, collection_index_path)
|
||||||
|
|
||||||
|
tempdir.cleanup()
|
||||||
|
|
||||||
def reindex(self):
|
def reindex(self):
|
||||||
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
||||||
@ -190,20 +310,24 @@ directory structure expected by pywb
|
|||||||
|
|
||||||
merged_file = temp_file + '.merged'
|
merged_file = temp_file + '.merged'
|
||||||
|
|
||||||
last_line = None
|
self._merge_indices(cdx_file, temp_file, merged_file)
|
||||||
|
|
||||||
with open(cdx_file, 'rb') as orig_index:
|
|
||||||
with open(temp_file, 'rb') as new_index:
|
|
||||||
with open(merged_file, 'w+b') as merged:
|
|
||||||
for line in heapq.merge(orig_index, new_index):
|
|
||||||
if last_line != line:
|
|
||||||
merged.write(line)
|
|
||||||
last_line = line
|
|
||||||
|
|
||||||
shutil.move(merged_file, cdx_file)
|
shutil.move(merged_file, cdx_file)
|
||||||
#os.rename(merged_file, cdx_file)
|
#os.rename(merged_file, cdx_file)
|
||||||
os.remove(temp_file)
|
os.remove(temp_file)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _merge_indices(index1, index2, dest):
|
||||||
|
last_line = None
|
||||||
|
|
||||||
|
with open(index1, 'rb') as index1_f:
|
||||||
|
with open(index2, 'rb') as index2_f:
|
||||||
|
with open(dest, 'wb') as dest_f:
|
||||||
|
for line in heapq.merge(index1_f, index2_f):
|
||||||
|
if last_line != line:
|
||||||
|
dest_f.write(line)
|
||||||
|
last_line = line
|
||||||
|
|
||||||
def set_metadata(self, namevalue_pairs):
|
def set_metadata(self, namevalue_pairs):
|
||||||
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
|
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
|
||||||
metadata = None
|
metadata = None
|
||||||
@ -383,16 +507,23 @@ Create manage file based web archive collections
|
|||||||
listcmd = subparsers.add_parser('list', help=list_help)
|
listcmd = subparsers.add_parser('list', help=list_help)
|
||||||
listcmd.set_defaults(func=do_list)
|
listcmd.set_defaults(func=do_list)
|
||||||
|
|
||||||
# Add Warcs
|
# Add Warcs or Waczs
|
||||||
def do_add(r):
|
def do_add(r):
|
||||||
m = CollectionsManager(r.coll_name)
|
m = CollectionsManager(r.coll_name)
|
||||||
m.add_warcs(r.files)
|
m.add_archives(r.files, r.unpack_wacz)
|
||||||
|
|
||||||
addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
|
add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
|
||||||
addwarc = subparsers.add_parser('add', help=addwarc_help)
|
add_unpack_wacz_help = 'Copy WARCs from WACZ to collection directory and reindex'
|
||||||
addwarc.add_argument('coll_name')
|
add_archives = subparsers.add_parser('add', help=add_archives_help)
|
||||||
addwarc.add_argument('files', nargs='+')
|
add_archives.add_argument(
|
||||||
addwarc.set_defaults(func=do_add)
|
'--unpack-wacz',
|
||||||
|
dest='unpack_wacz',
|
||||||
|
action='store_true',
|
||||||
|
help=add_unpack_wacz_help
|
||||||
|
)
|
||||||
|
add_archives.add_argument('coll_name')
|
||||||
|
add_archives.add_argument('files', nargs='+')
|
||||||
|
add_archives.set_defaults(func=do_add)
|
||||||
|
|
||||||
# Reindex All
|
# Reindex All
|
||||||
def do_reindex(r):
|
def do_reindex(r):
|
||||||
|
@ -268,7 +268,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
unesc_value = self.try_unescape(value)
|
unesc_value = self.try_unescape(value)
|
||||||
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
|
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
|
||||||
|
|
||||||
# if no rewriting has occured, ensure we return original, not reencoded value
|
# if no rewriting has occurred, ensure we return original, not reencoded value
|
||||||
if rewritten_value == value:
|
if rewritten_value == value:
|
||||||
return orig_value
|
return orig_value
|
||||||
|
|
||||||
@ -668,7 +668,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
|||||||
if self.parse_comments:
|
if self.parse_comments:
|
||||||
#data = self._rewrite_script(data)
|
#data = self._rewrite_script(data)
|
||||||
|
|
||||||
# Rewrite with seperate HTMLRewriter
|
# Rewrite with separate HTMLRewriter
|
||||||
comment_rewriter = HTMLRewriter(self.url_rewriter,
|
comment_rewriter = HTMLRewriter(self.url_rewriter,
|
||||||
defmod=self.defmod)
|
defmod=self.defmod)
|
||||||
|
|
||||||
|
@ -124,9 +124,7 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ this.__WB_source = obj;
|
|||||||
(r'(?<![$.])\s*\blocation\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
|
(r'(?<![$.])\s*\blocation\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
|
||||||
# rewriting 'return this'
|
# rewriting 'return this'
|
||||||
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
|
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
|
||||||
# rewriting 'this.' special properties access on new line, with ; prepended
|
# rewriting 'this.' special properties access
|
||||||
(r'\n\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + this_rw), 0),
|
|
||||||
# rewriting 'this.' special properties access, not on new line (no ;)
|
|
||||||
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
|
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
|
||||||
# rewrite '= this' or ', this'
|
# rewrite '= this' or ', this'
|
||||||
(r'(?<=[=,])\s*this\b\s*(?![:.$])', self.replace_str(this_rw), 0),
|
(r'(?<=[=,])\s*this\b\s*(?![:.$])', self.replace_str(this_rw), 0),
|
||||||
|
@ -5,7 +5,7 @@ from pywb.utils.loaders import load
|
|||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit, quote
|
from six.moves.urllib.parse import urlsplit, quote
|
||||||
|
|
||||||
from jinja2 import Environment, TemplateNotFound, contextfunction, select_autoescape
|
from jinja2 import Environment, TemplateNotFound, pass_context, select_autoescape
|
||||||
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
|
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
|
||||||
|
|
||||||
from webassets.ext.jinja2 import AssetsExtension
|
from webassets.ext.jinja2 import AssetsExtension
|
||||||
@ -139,7 +139,7 @@ class JinjaEnv(object):
|
|||||||
return loc_map.get(loc)
|
return loc_map.get(loc)
|
||||||
|
|
||||||
def override_func(jinja_env, name):
|
def override_func(jinja_env, name):
|
||||||
@contextfunction
|
@pass_context
|
||||||
def get_override(context, text):
|
def get_override(context, text):
|
||||||
translate = get_translate(context)
|
translate = get_translate(context)
|
||||||
if not translate:
|
if not translate:
|
||||||
@ -158,7 +158,7 @@ class JinjaEnv(object):
|
|||||||
|
|
||||||
# Special _Q() function to return %-encoded text, necessary for use
|
# Special _Q() function to return %-encoded text, necessary for use
|
||||||
# with text in banner
|
# with text in banner
|
||||||
@contextfunction
|
@pass_context
|
||||||
def quote_gettext(context, text):
|
def quote_gettext(context, text):
|
||||||
translate = get_translate(context)
|
translate = get_translate(context)
|
||||||
if not translate:
|
if not translate:
|
||||||
@ -171,14 +171,14 @@ class JinjaEnv(object):
|
|||||||
self.jinja_env.globals['_Q'] = quote_gettext
|
self.jinja_env.globals['_Q'] = quote_gettext
|
||||||
self.jinja_env.globals['default_locale'] = default_locale
|
self.jinja_env.globals['default_locale'] = default_locale
|
||||||
|
|
||||||
@contextfunction
|
@pass_context
|
||||||
def switch_locale(context, locale):
|
def switch_locale(context, locale):
|
||||||
environ = context.get('env')
|
environ = context.get('env')
|
||||||
curr_loc = environ.get('pywb_lang', '')
|
curr_loc = environ.get('pywb_lang', '')
|
||||||
|
|
||||||
request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO'))
|
request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO'))
|
||||||
|
|
||||||
if curr_loc:
|
if curr_loc and request_uri.startswith('/' + curr_loc + '/'):
|
||||||
return request_uri.replace(curr_loc, locale, 1)
|
return request_uri.replace(curr_loc, locale, 1)
|
||||||
|
|
||||||
app_prefix = environ.get('pywb.app_prefix', '')
|
app_prefix = environ.get('pywb.app_prefix', '')
|
||||||
@ -188,7 +188,7 @@ class JinjaEnv(object):
|
|||||||
|
|
||||||
return app_prefix + '/' + locale + request_uri
|
return app_prefix + '/' + locale + request_uri
|
||||||
|
|
||||||
@contextfunction
|
@pass_context
|
||||||
def get_locale_prefixes(context):
|
def get_locale_prefixes(context):
|
||||||
environ = context.get('env')
|
environ = context.get('env')
|
||||||
locale_prefixes = {}
|
locale_prefixes = {}
|
||||||
@ -196,11 +196,11 @@ class JinjaEnv(object):
|
|||||||
orig_prefix = environ.get('pywb.app_prefix', '')
|
orig_prefix = environ.get('pywb.app_prefix', '')
|
||||||
coll = environ.get('SCRIPT_NAME', '')
|
coll = environ.get('SCRIPT_NAME', '')
|
||||||
|
|
||||||
if orig_prefix:
|
if orig_prefix and coll.startswith(orig_prefix):
|
||||||
coll = coll[len(orig_prefix):]
|
coll = coll[len(orig_prefix):]
|
||||||
|
|
||||||
curr_loc = environ.get('pywb_lang', '')
|
curr_loc = environ.get('pywb_lang', '')
|
||||||
if curr_loc:
|
if curr_loc and coll.startswith('/' + curr_loc):
|
||||||
coll = coll[len(curr_loc) + 1:]
|
coll = coll[len(curr_loc) + 1:]
|
||||||
|
|
||||||
for locale in loc_map.keys():
|
for locale in loc_map.keys():
|
||||||
|
@ -143,7 +143,7 @@ r"""
|
|||||||
'var foo = _____WB$wombat$check$this$function_____(this).location'
|
'var foo = _____WB$wombat$check$this$function_____(this).location'
|
||||||
|
|
||||||
>>> _test_js_obj_proxy('A = B\nthis.location = "foo"')
|
>>> _test_js_obj_proxy('A = B\nthis.location = "foo"')
|
||||||
'A = B\n;_____WB$wombat$check$this$function_____(this).location = "foo"'
|
'A = B\n_____WB$wombat$check$this$function_____(this).location = "foo"'
|
||||||
|
|
||||||
>>> _test_js_obj_proxy('var foo = this.location2')
|
>>> _test_js_obj_proxy('var foo = this.location2')
|
||||||
'var foo = this.location2'
|
'var foo = this.location2'
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 20 KiB |
Binary file not shown.
Before Width: | Height: | Size: 2.5 KiB |
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
Wombat.js client-side rewriting engine for web archive replay
|
Wombat.js client-side rewriting engine for web archive replay
|
||||||
Copyright (C) 2014-2023 Webrecorder Software, Rhizome, and Contributors. Released under the GNU Affero General Public License.
|
Copyright (C) 2014-2024 Webrecorder Software, Rhizome, and Contributors. Released under the GNU Affero General Public License.
|
||||||
|
|
||||||
This file is part of wombat.js, see https://github.com/webrecorder/wombat.js for the full source
|
This file is part of wombat.js, see https://github.com/webrecorder/wombat.js for the full source
|
||||||
Wombat.js is part of the Webrecorder project (https://github.com/webrecorder)
|
Wombat.js is part of the Webrecorder project (https://github.com/webrecorder)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
Wombat.js client-side rewriting engine for web archive replay
|
Wombat.js client-side rewriting engine for web archive replay
|
||||||
Copyright (C) 2014-2023 Webrecorder Software, Rhizome, and Contributors. Released under the GNU Affero General Public License.
|
Copyright (C) 2014-2024 Webrecorder Software, Rhizome, and Contributors. Released under the GNU Affero General Public License.
|
||||||
|
|
||||||
This file is part of wombat.js, see https://github.com/webrecorder/wombat.js for the full source
|
This file is part of wombat.js, see https://github.com/webrecorder/wombat.js for the full source
|
||||||
Wombat.js is part of the Webrecorder project (https://github.com/webrecorder)
|
Wombat.js is part of the Webrecorder project (https://github.com/webrecorder)
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 1.6 KiB |
@ -3,7 +3,7 @@
|
|||||||
{% block body %}
|
{% block body %}
|
||||||
<div class="container text-danger error">
|
<div class="container text-danger error">
|
||||||
<div class="row justify-content-center">
|
<div class="row justify-content-center">
|
||||||
<h2 class="display-2">Pywb Error</h2>
|
<h2 class="display-2">{{ _('Pywb Error') }}</h2>
|
||||||
</div>
|
</div>
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col-12 text-center">
|
<div class="col-12 text-center">
|
||||||
|
@ -25,8 +25,21 @@ html, body
|
|||||||
|
|
||||||
<div id="app" style="width: 100%; height: 200px"></div>
|
<div id="app" style="width: 100%; height: 200px"></div>
|
||||||
<script>
|
<script>
|
||||||
VueUI.main("{{ static_prefix }}", "{{ url }}", "{{ wb_prefix }}", "{{ timestamp }}", "{{ ui.logo }}", "{{ ui.navbar_background_hex | default('f8f9fa') }}", "{{ ui.navbar_color_hex | default('212529') }}", "{{ ui.navbar_light_buttons }}", "{{ env.pywb_lang | default('en') }}",
|
VueUI.main({
|
||||||
allLocales, i18nStrings, "{{ ui.logo_home_url }}");
|
staticPrefix: "{{ static_prefix }}",
|
||||||
|
url: "{{ url }}",
|
||||||
|
prefix: "{{ wb_prefix }}",
|
||||||
|
timestamp: "{{ timestamp }}",
|
||||||
|
logoUrl: "{{ ui.logo }}",
|
||||||
|
navbarBackground: "{{ ui.navbar_background_hex | default('f8f9fa') }}",
|
||||||
|
navbarColor: "{{ ui.navbar_color_hex | default('212529') }}",
|
||||||
|
navbarLightButtons: "{{ ui.navbar_light_buttons }}",
|
||||||
|
logoHomeUrl: "{{ ui.logo_home_url }}",
|
||||||
|
disablePrinting: "{{ ui.disable_printing }}",
|
||||||
|
allLocales: allLocales
|
||||||
|
},
|
||||||
|
"{{ env.pywb_lang | default('en') }}",
|
||||||
|
i18nStrings);
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<div id="wb_iframe_div">
|
<div id="wb_iframe_div">
|
||||||
|
@ -94,8 +94,21 @@
|
|||||||
<div id="app" style="width: 100%; height: 100%"></div>
|
<div id="app" style="width: 100%; height: 100%"></div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
VueUI.main("{{ static_prefix }}", "{{ url }}", "{{ prefix }}", undefined, "{{ ui.logo }}", "{{ ui.navbar_background_hex | default('f8f9fa') }}", "{{ ui.navbar_color_hex | default('212529') }}", "{{ ui.navbar_light_buttons }}", "{{ env.pywb_lang | default('en') }}",
|
VueUI.main({
|
||||||
allLocales, i18nStrings, "{{ ui.logo_home_url }}");
|
staticPrefix: "{{ static_prefix }}",
|
||||||
|
url: "{{ url }}",
|
||||||
|
prefix: "{{ prefix }}",
|
||||||
|
timestamp: undefined,
|
||||||
|
logoUrl: "{{ ui.logo }}",
|
||||||
|
navbarBackground: "{{ ui.navbar_background_hex | default('f8f9fa') }}",
|
||||||
|
navbarColor: "{{ ui.navbar_color_hex | default('212529') }}",
|
||||||
|
navbarLightButtons: "{{ ui.navbar_light_buttons }}",
|
||||||
|
logoHomeUrl: "{{ ui.logo_home_url }}",
|
||||||
|
disablePrinting: "{{ ui.disable_printing }}",
|
||||||
|
allLocales: allLocales
|
||||||
|
},
|
||||||
|
"{{ env.pywb_lang | default('en') }}",
|
||||||
|
i18nStrings);
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
@ -49,6 +49,7 @@
|
|||||||
"Hide calendar":"{{ _Q('Hide calendar') }}",
|
"Hide calendar":"{{ _Q('Hide calendar') }}",
|
||||||
"Previous capture":"{{ _Q('Previous capture') }}",
|
"Previous capture":"{{ _Q('Previous capture') }}",
|
||||||
"Next capture":"{{ _Q('Next capture') }}",
|
"Next capture":"{{ _Q('Next capture') }}",
|
||||||
|
"Print":"{{ _Q('Print') }}",
|
||||||
"Select language":"{{ _Q('Select language') }}",
|
"Select language":"{{ _Q('Select language') }}",
|
||||||
"View capture on {date}":"{{ _Q('View capture on {date}') }}",
|
"View capture on {date}":"{{ _Q('View capture on {date}') }}",
|
||||||
"{count} capture":"{{ _Q('{count} capture') }}",
|
"{count} capture":"{{ _Q('{count} capture') }}",
|
||||||
|
@ -150,7 +150,7 @@ def iter_exact(reader, key, token=b' '):
|
|||||||
"""
|
"""
|
||||||
Create an iterator which iterates over lines where the first field matches
|
Create an iterator which iterates over lines where the first field matches
|
||||||
the 'key', equivalent to token + sep prefix.
|
the 'key', equivalent to token + sep prefix.
|
||||||
Default field termin_ator/seperator is ' '
|
Default field termin_ator/separator is ' '
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return iter_prefix(reader, key + token)
|
return iter_prefix(reader, key + token)
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
__version__ = '2.7.3'
|
__version__ = '2.8.3'
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(__version__)
|
print(__version__)
|
||||||
|
@ -77,6 +77,17 @@
|
|||||||
<i class="far fa-chart-bar"></i>
|
<i class="far fa-chart-bar"></i>
|
||||||
</button>
|
</button>
|
||||||
</li>
|
</li>
|
||||||
|
<li class="nav-item">
|
||||||
|
<button
|
||||||
|
class="btn btn-sm"
|
||||||
|
:class="{'btn-outline-light': lightButtons, 'btn-outline-dark': !lightButtons}"
|
||||||
|
:aria-pressed="printReplayFrame"
|
||||||
|
@click="printReplayFrame"
|
||||||
|
v-if="printingEnabled && hasReplayFrame()"
|
||||||
|
:title="_('Print')">
|
||||||
|
<i class="fas fa-print"></i>
|
||||||
|
</button>
|
||||||
|
</li>
|
||||||
<li class="nav-item dropdown" v-if="localesAreSet">
|
<li class="nav-item dropdown" v-if="localesAreSet">
|
||||||
<button
|
<button
|
||||||
class="btn btn-sm dropdown-toggle"
|
class="btn btn-sm dropdown-toggle"
|
||||||
@ -216,6 +227,9 @@ export default {
|
|||||||
lightButtons() {
|
lightButtons() {
|
||||||
return !!this.config.navbarLightButtons;
|
return !!this.config.navbarLightButtons;
|
||||||
},
|
},
|
||||||
|
printingEnabled() {
|
||||||
|
return !this.config.disablePrinting;
|
||||||
|
},
|
||||||
previousSnapshot() {
|
previousSnapshot() {
|
||||||
if (!this.currentSnapshotIndex) {
|
if (!this.currentSnapshotIndex) {
|
||||||
return null;
|
return null;
|
||||||
@ -306,6 +320,14 @@ export default {
|
|||||||
this.showTimelineView = !this.showTimelineView;
|
this.showTimelineView = !this.showTimelineView;
|
||||||
window.localStorage.setItem("showTimelineView", this.showTimelineView ? "1" : "0");
|
window.localStorage.setItem("showTimelineView", this.showTimelineView ? "1" : "0");
|
||||||
},
|
},
|
||||||
|
hasReplayFrame() {
|
||||||
|
return !! window.frames.replay_iframe;
|
||||||
|
},
|
||||||
|
printReplayFrame() {
|
||||||
|
window.frames.replay_iframe.contentWindow.focus();
|
||||||
|
window.frames.replay_iframe.contentWindow.print();
|
||||||
|
return false;
|
||||||
|
},
|
||||||
setData(/** @type {PywbData} data */ data) {
|
setData(/** @type {PywbData} data */ data) {
|
||||||
|
|
||||||
// data-set will usually happen at App INIT (from parent caller)
|
// data-set will usually happen at App INIT (from parent caller)
|
||||||
|
@ -39,7 +39,7 @@
|
|||||||
@keyup.enter="changePeriod(histoPeriod, $event)"
|
@keyup.enter="changePeriod(histoPeriod, $event)"
|
||||||
@mouseover="setTooltipPeriod(histoPeriod, $event)"
|
@mouseover="setTooltipPeriod(histoPeriod, $event)"
|
||||||
@mouseout="setTooltipPeriod(null, $event)"
|
@mouseout="setTooltipPeriod(null, $event)"
|
||||||
tabindex="0"
|
:tabindex="histoPeriod.snapshotCount > 0 ? 0 : -1"
|
||||||
>
|
>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -49,7 +49,6 @@
|
|||||||
@keyup.enter="changePeriod(histoPeriod, $event)"
|
@keyup.enter="changePeriod(histoPeriod, $event)"
|
||||||
@mouseover="setTooltipPeriod(subPeriod, $event)"
|
@mouseover="setTooltipPeriod(subPeriod, $event)"
|
||||||
@mouseout="setTooltipPeriod(null, $event)"
|
@mouseout="setTooltipPeriod(null, $event)"
|
||||||
tabindex="0"
|
|
||||||
>
|
>
|
||||||
<div class="label">
|
<div class="label">
|
||||||
{{subPeriod.getReadableId()}}
|
{{subPeriod.getReadableId()}}
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
@keyup.enter="changePeriod(parents[0])"
|
@keyup.enter="changePeriod(parents[0])"
|
||||||
:title="getPeriodZoomOutText(parents[0])"
|
:title="getPeriodZoomOutText(parents[0])"
|
||||||
tabindex="1">
|
tabindex="1">
|
||||||
<img src="/static/zoom-out-icon-333316.png" /> {{parents[0].getReadableId(true)}}
|
<i class="fa fa-search-minus"></i> {{parents[0].getReadableId(true)}}
|
||||||
</span>
|
</span>
|
||||||
</span>
|
</span>
|
||||||
>
|
>
|
||||||
|
@ -32,7 +32,7 @@ export class PywbI18N {
|
|||||||
getMonth(id, type='long') {
|
getMonth(id, type='long') {
|
||||||
return decodeURIComponent(this.config[PywbI18N.monthIdPrefix[id]+'_'+type]);
|
return decodeURIComponent(this.config[PywbI18N.monthIdPrefix[id]+'_'+type]);
|
||||||
}
|
}
|
||||||
// can get long (default) or short day string or intial
|
// can get long (default) or short day string or initial
|
||||||
// PywbI18N expects to receive day's initials like:
|
// PywbI18N expects to receive day's initials like:
|
||||||
// config.mon_short, config.tue_long, ...., config.<mmm>_short, config.<mmm>_long
|
// config.mon_short, config.tue_long, ...., config.<mmm>_short, config.<mmm>_long
|
||||||
getWeekDay(id, type='long') {
|
getWeekDay(id, type='long') {
|
||||||
|
@ -7,40 +7,44 @@ import Vue from "vue/dist/vue.esm.browser";
|
|||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export function main(staticPrefix, url, prefix, timestamp, logoUrl, navbarBackground, navbarColor, navbarLightButtons, locale, allLocales, i18nStrings, logoHomeUrl) {
|
export function main(config, locale, i18nStrings) {
|
||||||
PywbI18N.init(locale, i18nStrings);
|
PywbI18N.init(locale, i18nStrings);
|
||||||
new CDXLoader(staticPrefix, url, prefix, timestamp, logoUrl, navbarBackground, navbarColor, navbarLightButtons, allLocales, logoHomeUrl);
|
new CDXLoader(config);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class CDXLoader {
|
class CDXLoader {
|
||||||
constructor(staticPrefix, url, prefix, timestamp, logoUrl, navbarBackground, navbarColor, navbarLightButtons, allLocales, logoHomeUrl) {
|
constructor(config) {
|
||||||
this.loadingSpinner = null;
|
this.loadingSpinner = null;
|
||||||
this.loaded = false;
|
this.loaded = false;
|
||||||
this.opts = {};
|
this.opts = {};
|
||||||
this.prefix = prefix;
|
this.url = config.url;
|
||||||
this.staticPrefix = staticPrefix;
|
this.prefix = config.prefix;
|
||||||
this.logoUrl = logoUrl;
|
this.staticPrefix = config.staticPrefix;
|
||||||
this.logoHomeUrl = logoHomeUrl;
|
this.logoUrl = config.logoUrl;
|
||||||
this.navbarBackground = navbarBackground;
|
this.logoHomeUrl = config.logoHomeUrl;
|
||||||
this.navbarColor = navbarColor;
|
this.navbarBackground = config.navbarBackground;
|
||||||
this.navbarLightButtons = navbarLightButtons;
|
this.navbarColor = config.navbarColor;
|
||||||
this.timestamp = timestamp;
|
this.navbarLightButtons = config.navbarLightButtons;
|
||||||
|
this.disablePrinting = config.disablePrinting;
|
||||||
|
|
||||||
this.isReplay = (timestamp !== undefined);
|
this.timestamp = config.timestamp;
|
||||||
|
|
||||||
|
this.isReplay = (config.timestamp !== undefined);
|
||||||
|
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
if (!this.loaded) {
|
if (!this.loaded) {
|
||||||
this.loadingSpinner = new LoadingSpinner({text: PywbI18N.instance?.getText('Loading...'), isSmall: !!timestamp}); // bootstrap loading-spinner EARLY ON
|
this.loadingSpinner = new LoadingSpinner({text: PywbI18N.instance?.getText('Loading...'), isSmall: !!this.timestamp}); // bootstrap loading-spinner EARLY ON
|
||||||
this.loadingSpinner.setOn();
|
this.loadingSpinner.setOn();
|
||||||
}
|
}
|
||||||
}, 500);
|
}, 500);
|
||||||
|
|
||||||
if (this.isReplay) {
|
if (this.isReplay) {
|
||||||
window.WBBanner = new VueBannerWrapper(this, url, timestamp);
|
window.WBBanner = new VueBannerWrapper(this, this.url, this.timestamp);
|
||||||
}
|
}
|
||||||
|
|
||||||
let queryURL;
|
let queryURL;
|
||||||
|
let url;
|
||||||
|
|
||||||
// query form *?=url...
|
// query form *?=url...
|
||||||
if (window.location.href.indexOf("*?") > 0) {
|
if (window.location.href.indexOf("*?") > 0) {
|
||||||
@ -48,23 +52,24 @@ class CDXLoader {
|
|||||||
url = new URL(queryURL).searchParams.get("url");
|
url = new URL(queryURL).searchParams.get("url");
|
||||||
|
|
||||||
// otherwise, traditional calendar form /*/<url>
|
// otherwise, traditional calendar form /*/<url>
|
||||||
} else if (url) {
|
} else if (this.url) {
|
||||||
|
url = this.url
|
||||||
const params = new URLSearchParams();
|
const params = new URLSearchParams();
|
||||||
params.set("url", url);
|
params.set("url", url);
|
||||||
params.set("output", "json");
|
params.set("output", "json");
|
||||||
queryURL = prefix + "cdx?" + params.toString();
|
queryURL = this.prefix + "cdx?" + params.toString();
|
||||||
|
|
||||||
// otherwise, an error since no URL
|
// otherwise, an error since no URL
|
||||||
} else {
|
} else {
|
||||||
throw new Error("No query URL specified");
|
throw new Error("No query URL specified");
|
||||||
}
|
}
|
||||||
|
|
||||||
const logoImg = this.staticPrefix + "/" + (this.logoUrl ? this.logoUrl : "pywb-logo-sm.png");
|
config.logoImg = this.staticPrefix + "/" + (!!this.logoUrl ? this.logoUrl : "pywb-logo-sm.png");
|
||||||
|
|
||||||
this.app = this.initApp({logoImg, logoHomeUrl, navbarBackground, navbarColor, navbarLightButtons, url, allLocales, timestamp});
|
this.app = this.initApp(config);
|
||||||
|
|
||||||
this.loadCDX(queryURL).then((cdxList) => {
|
this.loadCDX(queryURL).then((cdxList) => {
|
||||||
this.setAppData(cdxList, url, this.timestamp);
|
this.setAppData(cdxList, url, config.timestamp);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -260,6 +260,10 @@ class AccessChecker(object):
|
|||||||
if key.startswith(acl_key):
|
if key.startswith(acl_key):
|
||||||
acl_obj = CDXObject(acl)
|
acl_obj = CDXObject(acl)
|
||||||
|
|
||||||
|
# Check for "*," in ACL, which matches any URL
|
||||||
|
if acl_key == b"*,":
|
||||||
|
acl_obj = CDXObject(acl)
|
||||||
|
|
||||||
if acl_obj:
|
if acl_obj:
|
||||||
user = acl_obj.get('user')
|
user = acl_obj.get('user')
|
||||||
if user == acl_user:
|
if user == acl_user:
|
||||||
|
@ -11,6 +11,7 @@ from io import BytesIO
|
|||||||
import base64
|
import base64
|
||||||
import cgi
|
import cgi
|
||||||
import json
|
import json
|
||||||
|
import math
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
@ -328,6 +329,21 @@ class MethodQueryCanonicalizer(object):
|
|||||||
_parser(v, name)
|
_parser(v, name)
|
||||||
|
|
||||||
elif name:
|
elif name:
|
||||||
|
if isinstance(json_obj, bool) and json_obj:
|
||||||
|
data[get_key(name)] = "true"
|
||||||
|
elif isinstance(json_obj, bool):
|
||||||
|
data[get_key(name)] = "false"
|
||||||
|
elif json_obj is None:
|
||||||
|
data[get_key(name)] = "null"
|
||||||
|
elif isinstance(json_obj, float):
|
||||||
|
# Treat floats like JavaScript's Number.prototype.toString(),
|
||||||
|
# drop decimal if float represents a whole number.
|
||||||
|
fraction, _ = math.modf(json_obj)
|
||||||
|
if fraction == 0.0:
|
||||||
|
data[get_key(name)] = str(int(json_obj))
|
||||||
|
else:
|
||||||
|
data[get_key(name)] = str(json_obj)
|
||||||
|
else:
|
||||||
data[get_key(name)] = str(json_obj)
|
data[get_key(name)] = str(json_obj)
|
||||||
|
|
||||||
_parser(json.loads(string))
|
_parser(json.loads(string))
|
||||||
|
@ -39,7 +39,7 @@ class InputReqApp(object):
|
|||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class TestInputReq(object):
|
class TestInputReq(object):
|
||||||
def setup(self):
|
def setup_method(self):
|
||||||
self.app = InputReqApp()
|
self.app = InputReqApp()
|
||||||
self.testapp = webtest.TestApp(self.app)
|
self.testapp = webtest.TestApp(self.app)
|
||||||
|
|
||||||
@ -82,44 +82,49 @@ Foo: Bar\r\n\
|
|||||||
class TestPostQueryExtract(object):
|
class TestPostQueryExtract(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
cls.post_data = b'foo=bar&dir=%2Fbaz'
|
cls.post_data = b'foo=bar&dir=%2Fbaz&do=true&re=false&re=null'
|
||||||
cls.binary_post_data = b'\x816l`L\xa04P\x0e\xe0r\x02\xb5\x89\x19\x00fP\xdb\x0e\xb0\x02,'
|
cls.binary_post_data = b'\x816l`L\xa04P\x0e\xe0r\x02\xb5\x89\x19\x00fP\xdb\x0e\xb0\x02,'
|
||||||
|
|
||||||
def test_post_extract_1(self):
|
def test_post_extract_1(self):
|
||||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz&do=true&re=false&re=null'
|
||||||
|
|
||||||
assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&__wb_method=POST&foo=bar&dir=/baz'
|
assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&__wb_method=POST&foo=bar&dir=/baz&do=true&re=false&re=null'
|
||||||
|
|
||||||
def test_post_extract_json(self):
|
def test_post_extract_json(self):
|
||||||
post_data = b'{"a": "b", "c": {"a": 2}, "d": "e"}'
|
post_data = b'{"a": "b", "c": {"a": 2}, "d": "e", "f": true, "g": [false, null]}'
|
||||||
mq = MethodQueryCanonicalizer('POST', 'application/json',
|
mq = MethodQueryCanonicalizer('POST', 'application/json',
|
||||||
len(post_data), BytesIO(post_data))
|
len(post_data), BytesIO(post_data))
|
||||||
|
|
||||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e&f=true&g=false&g.2_=null'
|
||||||
|
|
||||||
|
post_data = b'{"type": "event", "id": 44.0, "float": 35.7, "values": [true, false, null], "source": {"type": "component", "id": "a+b&c= d", "values": [3, 4]}}'
|
||||||
|
mq = MethodQueryCanonicalizer('POST', 'application/json',
|
||||||
|
len(post_data), BytesIO(post_data))
|
||||||
|
|
||||||
|
assert mq.append_query('http://example.com/events') == 'http://example.com/events?__wb_method=POST&type=event&id=44&float=35.7&values=true&values.2_=false&values.3_=null&type.2_=component&id.2_=a%2Bb%26c%3D+d&values.4_=3&values.5_=4'
|
||||||
|
|
||||||
def test_put_extract_method(self):
|
def test_put_extract_method(self):
|
||||||
mq = MethodQueryCanonicalizer('PUT', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('PUT', 'application/x-www-form-urlencoded',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=PUT&foo=bar&dir=/baz'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=PUT&foo=bar&dir=/baz&do=true&re=false&re=null'
|
||||||
|
|
||||||
def test_post_extract_non_form_data_1(self):
|
def test_post_extract_non_form_data_1(self):
|
||||||
mq = MethodQueryCanonicalizer('POST', 'application/octet-stream',
|
mq = MethodQueryCanonicalizer('POST', 'application/octet-stream',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
#base64 encoded data
|
#base64 encoded data
|
||||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6JmRvPXRydWUmcmU9ZmFsc2UmcmU9bnVsbA=='
|
||||||
|
|
||||||
def test_post_extract_non_form_data_2(self):
|
def test_post_extract_non_form_data_2(self):
|
||||||
mq = MethodQueryCanonicalizer('POST', 'text/plain',
|
mq = MethodQueryCanonicalizer('POST', 'text/plain',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
#base64 encoded data
|
#base64 encoded data
|
||||||
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6JmRvPXRydWUmcmU9ZmFsc2UmcmU9bnVsbA=='
|
||||||
|
|
||||||
def test_post_extract_length_invalid_ignore(self):
|
def test_post_extract_length_invalid_ignore(self):
|
||||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
@ -136,13 +141,13 @@ class TestPostQueryExtract(object):
|
|||||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
len(self.post_data) - 4, BytesIO(self.post_data))
|
len(self.post_data) - 4, BytesIO(self.post_data))
|
||||||
|
|
||||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=%2'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz&do=true&re=false&re='
|
||||||
|
|
||||||
def test_post_extract_length_too_long(self):
|
def test_post_extract_length_too_long(self):
|
||||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
len(self.post_data) + 4, BytesIO(self.post_data))
|
len(self.post_data) + 4, BytesIO(self.post_data))
|
||||||
|
|
||||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz&do=true&re=false&re=null'
|
||||||
|
|
||||||
def test_post_extract_malformed_form_data(self):
|
def test_post_extract_malformed_form_data(self):
|
||||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||||
@ -155,7 +160,7 @@ class TestPostQueryExtract(object):
|
|||||||
mq = MethodQueryCanonicalizer('POST', 'multipart/form-data',
|
mq = MethodQueryCanonicalizer('POST', 'multipart/form-data',
|
||||||
len(self.post_data), BytesIO(self.post_data))
|
len(self.post_data), BytesIO(self.post_data))
|
||||||
|
|
||||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6JmRvPXRydWUmcmU9ZmFsc2UmcmU9bnVsbA=='
|
||||||
|
|
||||||
|
|
||||||
def test_options(self):
|
def test_options(self):
|
||||||
|
@ -18,7 +18,7 @@ from .testutils import LiveServerTests, HttpBinLiveTests, BaseTestClass
|
|||||||
|
|
||||||
|
|
||||||
class TestUpstream(LiveServerTests, HttpBinLiveTests, BaseTestClass):
|
class TestUpstream(LiveServerTests, HttpBinLiveTests, BaseTestClass):
|
||||||
def setup(self):
|
def setup_method(self):
|
||||||
app = BaseWarcServer()
|
app = BaseWarcServer()
|
||||||
|
|
||||||
base_url = 'http://localhost:{0}'.format(self.server.port)
|
base_url = 'http://localhost:{0}'.format(self.server.port)
|
||||||
|
@ -1,19 +1,21 @@
|
|||||||
six
|
six
|
||||||
warcio>=1.7.1
|
warcio>=1.7.1
|
||||||
requests
|
requests
|
||||||
redis<3.0
|
redis==2.10.6
|
||||||
jinja2<3.0.0
|
jinja2>=3.1.2
|
||||||
surt>=0.3.1
|
surt>=0.3.1
|
||||||
brotlipy
|
brotlipy
|
||||||
pyyaml
|
pyyaml
|
||||||
werkzeug
|
werkzeug==2.2.3
|
||||||
webencodings
|
webencodings
|
||||||
gevent==21.12.0
|
gevent==22.10.2
|
||||||
|
greenlet>=2.0.2,<3.0
|
||||||
webassets==2.0
|
webassets==2.0
|
||||||
portalocker
|
portalocker
|
||||||
wsgiprox>=1.5.1
|
wsgiprox>=1.5.1
|
||||||
fakeredis<1.0
|
fakeredis<1.0
|
||||||
tldextract
|
tldextract
|
||||||
python-dateutil
|
python-dateutil
|
||||||
markupsafe<2.1.0
|
markupsafe>=2.1.1
|
||||||
ua_parser
|
ua_parser
|
||||||
|
py3AMF
|
||||||
|
1
sample_archive/access/allow_all.aclj
Normal file
1
sample_archive/access/allow_all.aclj
Normal file
@ -0,0 +1 @@
|
|||||||
|
*, - {"access": "allow", "user": "staff"}
|
@ -5,6 +5,8 @@ org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"}
|
|||||||
org,iana)/_css - {"access": "exclude"}
|
org,iana)/_css - {"access": "exclude"}
|
||||||
org,iana)/### - {"access": "allow"}
|
org,iana)/### - {"access": "allow"}
|
||||||
org,iana)/ - {"access": "exclude"}
|
org,iana)/ - {"access": "exclude"}
|
||||||
|
com,example)/?example=3 - {"access": "block", "user": "staff"}
|
||||||
|
com,example)/?example=3 - {"access": "exclude", "user": "staff2"}
|
||||||
org,example)/?example=1 - {"access": "block"}
|
org,example)/?example=1 - {"access": "block"}
|
||||||
com,example)/?example=2 - {"access": "allow_ignore_embargo"}
|
com,example)/?example=2 - {"access": "allow_ignore_embargo"}
|
||||||
com,example)/?example=1 - {"access": "allow_ignore_embargo", "user": "staff2"}
|
com,example)/?example=1 - {"access": "allow_ignore_embargo", "user": "staff2"}
|
||||||
|
BIN
sample_archive/cdxj/example.cdx.gz
Normal file
BIN
sample_archive/cdxj/example.cdx.gz
Normal file
Binary file not shown.
BIN
sample_archive/waczs/invalid_example_1.wacz
Normal file
BIN
sample_archive/waczs/invalid_example_1.wacz
Normal file
Binary file not shown.
BIN
sample_archive/waczs/valid_example_1.wacz
Normal file
BIN
sample_archive/waczs/valid_example_1.wacz
Normal file
Binary file not shown.
13
setup.py
13
setup.py
@ -62,10 +62,6 @@ def generate_git_hash_py(pkg, filename='git_hash.py'):
|
|||||||
def load_requirements(filename):
|
def load_requirements(filename):
|
||||||
with open(filename, 'rt') as fh:
|
with open(filename, 'rt') as fh:
|
||||||
requirements = fh.read().rstrip().split('\n')
|
requirements = fh.read().rstrip().split('\n')
|
||||||
if sys.version_info > (3, 0):
|
|
||||||
requirements.append("py3AMF")
|
|
||||||
else:
|
|
||||||
requirements.append("pyAMF")
|
|
||||||
return requirements
|
return requirements
|
||||||
|
|
||||||
|
|
||||||
@ -113,6 +109,7 @@ setup(
|
|||||||
"translate_toolkit"
|
"translate_toolkit"
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
|
python_requires='>=3.7,<3.12',
|
||||||
tests_require=load_requirements("test_requirements.txt"),
|
tests_require=load_requirements("test_requirements.txt"),
|
||||||
cmdclass={'test': PyTest},
|
cmdclass={'test': PyTest},
|
||||||
test_suite='',
|
test_suite='',
|
||||||
@ -131,16 +128,12 @@ setup(
|
|||||||
'Environment :: Web Environment',
|
'Environment :: Web Environment',
|
||||||
'License :: OSI Approved :: GNU General Public License (GPL)',
|
'License :: OSI Approved :: GNU General Public License (GPL)',
|
||||||
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
|
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
|
||||||
'Programming Language :: Python :: 2',
|
|
||||||
'Programming Language :: Python :: 2.7',
|
|
||||||
'Programming Language :: Python :: 3',
|
'Programming Language :: Python :: 3',
|
||||||
'Programming Language :: Python :: 3.3',
|
|
||||||
'Programming Language :: Python :: 3.4',
|
|
||||||
'Programming Language :: Python :: 3.5',
|
|
||||||
'Programming Language :: Python :: 3.6',
|
|
||||||
'Programming Language :: Python :: 3.7',
|
'Programming Language :: Python :: 3.7',
|
||||||
'Programming Language :: Python :: 3.8',
|
'Programming Language :: Python :: 3.8',
|
||||||
'Programming Language :: Python :: 3.9',
|
'Programming Language :: Python :: 3.9',
|
||||||
|
'Programming Language :: Python :: 3.10',
|
||||||
|
'Programming Language :: Python :: 3.11',
|
||||||
'Topic :: Internet :: Proxy Servers',
|
'Topic :: Internet :: Proxy Servers',
|
||||||
'Topic :: Internet :: WWW/HTTP',
|
'Topic :: Internet :: WWW/HTTP',
|
||||||
'Topic :: Internet :: WWW/HTTP :: WSGI',
|
'Topic :: Internet :: WWW/HTTP :: WSGI',
|
||||||
|
@ -3,7 +3,6 @@ WebTest
|
|||||||
pytest-cov
|
pytest-cov
|
||||||
mock
|
mock
|
||||||
urllib3
|
urllib3
|
||||||
httpbin==0.5.0
|
|
||||||
flask<2.0
|
|
||||||
ujson
|
ujson
|
||||||
lxml
|
lxml
|
||||||
|
httpbin>=0.10.2
|
||||||
|
@ -62,6 +62,13 @@ collections:
|
|||||||
acl_paths:
|
acl_paths:
|
||||||
- ./sample_archive/access/pywb.aclj
|
- ./sample_archive/access/pywb.aclj
|
||||||
|
|
||||||
|
pywb-wildcard-surt:
|
||||||
|
index_paths: ./sample_archive/cdx/
|
||||||
|
archive_paths: ./sample_archive/warcs/
|
||||||
|
default_access: block
|
||||||
|
acl_paths:
|
||||||
|
- ./sample_archive/access/allow_all.aclj
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -41,12 +41,23 @@ class TestACLApp(BaseConfigTest):
|
|||||||
assert 'Access Blocked' in resp.text
|
assert 'Access Blocked' in resp.text
|
||||||
|
|
||||||
def test_allow_via_acl_header(self):
|
def test_allow_via_acl_header(self):
|
||||||
resp = self.query('http://www.iana.org/about/')
|
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/about/', headers={"X-Pywb-Acl-User": "staff"})
|
||||||
|
|
||||||
assert len(resp.text.splitlines()) == 1
|
assert len(resp.text.splitlines()) == 1
|
||||||
|
|
||||||
resp = self.testapp.get('/pywb/mp_/http://www.iana.org/about/', headers={"X-Pywb-Acl-User": "staff"}, status=200)
|
resp = self.testapp.get('/pywb/mp_/http://www.iana.org/about/', headers={"X-Pywb-Acl-User": "staff"}, status=200)
|
||||||
|
|
||||||
|
def test_block_via_acl_header(self):
|
||||||
|
resp = self.testapp.get('/pywb/cdx?url=http://example.com/?example=3', headers={"X-Pywb-Acl-User": "staff"})
|
||||||
|
assert len(resp.text.splitlines()) > 0
|
||||||
|
|
||||||
|
resp = self.testapp.get('/pywb/mp_/http://example.com/?example=3', headers={"X-Pywb-Acl-User": "staff"}, status=451)
|
||||||
|
|
||||||
|
def test_exclude_via_acl_header(self):
|
||||||
|
resp = self.testapp.get('/pywb/cdx?url=http://example.com/?example=3', headers={"X-Pywb-Acl-User": "staff2"})
|
||||||
|
assert len(resp.text.splitlines()) == 0
|
||||||
|
|
||||||
|
resp = self.testapp.get('/pywb/mp_/http://example.com/?example=3', headers={"X-Pywb-Acl-User": "staff2"}, status=404)
|
||||||
|
|
||||||
def test_allowed_more_specific(self):
|
def test_allowed_more_specific(self):
|
||||||
resp = self.query('http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf')
|
resp = self.query('http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf')
|
||||||
|
|
||||||
@ -85,5 +96,9 @@ class TestACLApp(BaseConfigTest):
|
|||||||
|
|
||||||
assert '"http://httpbin.org/anything/resource.json"' in resp.text
|
assert '"http://httpbin.org/anything/resource.json"' in resp.text
|
||||||
|
|
||||||
|
def test_allow_all_acl_user_specific(self):
|
||||||
|
resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', status=451)
|
||||||
|
|
||||||
|
assert 'Access Blocked' in resp.text
|
||||||
|
|
||||||
|
resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', headers={"X-Pywb-Acl-User": "staff"}, status=200)
|
||||||
|
@ -537,7 +537,7 @@ class TestManagedColls(CollsDirMixin, BaseConfigTest):
|
|||||||
main(['template', 'foo', '--remove', 'query_html'])
|
main(['template', 'foo', '--remove', 'query_html'])
|
||||||
|
|
||||||
def test_err_no_such_coll(self):
|
def test_err_no_such_coll(self):
|
||||||
""" Test error adding warc to non-existant collection
|
""" Test error adding warc to non-existent collection
|
||||||
"""
|
"""
|
||||||
warc1 = self._get_sample_warc('example.warc.gz')
|
warc1 = self._get_sample_warc('example.warc.gz')
|
||||||
|
|
||||||
|
@ -46,8 +46,12 @@ class TestEmbargoApp(BaseConfigTest):
|
|||||||
def test_embargo_ignore_acl_with_header_only(self):
|
def test_embargo_ignore_acl_with_header_only(self):
|
||||||
# ignore embargo with custom header only
|
# ignore embargo with custom header only
|
||||||
headers = {"X-Pywb-ACL-User": "staff2"}
|
headers = {"X-Pywb-ACL-User": "staff2"}
|
||||||
resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=200, headers=headers)
|
|
||||||
|
|
||||||
|
resp = self.testapp.get('/pywb-embargo-acl/cdx?url=http://example.com/?example=1', headers=headers)
|
||||||
|
assert len(resp.text.splitlines()) > 0
|
||||||
|
resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=200, headers=headers)
|
||||||
|
resp = self.testapp.get('/pywb-embargo-acl/cdx?url=http://example.com/?example=1')
|
||||||
|
assert len(resp.text.splitlines()) == 0
|
||||||
resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=404)
|
resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=404)
|
||||||
|
|
||||||
|
|
||||||
|
@ -56,6 +56,6 @@ class TestForceHttpsRoot(BaseConfigTest):
|
|||||||
resp = self.get('/20140128051539{0}/http://www.iana.org/domains/example', fmod,
|
resp = self.get('/20140128051539{0}/http://www.iana.org/domains/example', fmod,
|
||||||
headers={'X-Forwarded-Proto': 'https'})
|
headers={'X-Forwarded-Proto': 'https'})
|
||||||
|
|
||||||
assert resp.headers['Location'] == 'https://localhost:80/20140128051539{0}/http://www.iana.org/domains/reserved'.format(fmod)
|
assert resp.headers['Location'] == 'https://localhost:80/20140128051539{0}/http://www.iana.org/help/example-domains'.format(fmod)
|
||||||
|
|
||||||
|
|
||||||
|
@ -400,7 +400,7 @@ class TestWbIntegration(BaseConfigTest):
|
|||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
assert resp.headers['Content-Location'].endswith('/pywb/20140126200928{0}/http://www.iana.org/domains/root/db'.format(fmod))
|
assert resp.headers['Content-Location'].endswith('/pywb/20140126200928{0}/http://www.iana.org/domains/root/db'.format(fmod))
|
||||||
|
|
||||||
def test_not_existant_warc_other_capture(self, fmod):
|
def test_not_existent_warc_other_capture(self, fmod):
|
||||||
resp = self.get('/pywb/20140703030321{0}/http://example.com/?example=2', fmod)
|
resp = self.get('/pywb/20140703030321{0}/http://example.com/?example=2', fmod)
|
||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341{0}/http://example.com?example=2'.format(fmod))
|
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341{0}/http://example.com?example=2'.format(fmod))
|
||||||
@ -410,7 +410,7 @@ class TestWbIntegration(BaseConfigTest):
|
|||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341{0}/http://example.com?example=2'.format(fmod))
|
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341{0}/http://example.com?example=2'.format(fmod))
|
||||||
|
|
||||||
def test_not_existant_warc_no_other(self, fmod):
|
def test_not_existent_warc_no_other(self, fmod):
|
||||||
resp = self.get('/pywb/20140703030321{0}/http://example.com/?example=3', fmod, status=503)
|
resp = self.get('/pywb/20140703030321{0}/http://example.com/?example=3', fmod, status=503)
|
||||||
assert resp.status_int == 503
|
assert resp.status_int == 503
|
||||||
|
|
||||||
|
@ -91,25 +91,28 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
|
|||||||
resp = self.head('/live/{0}httpbin.org/get?foo=bar', fmod_sl)
|
resp = self.head('/live/{0}httpbin.org/get?foo=bar', fmod_sl)
|
||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
|
|
||||||
@pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
|
# Following tests are temporarily commented out because latest version of PSF httpbin
|
||||||
def test_live_bad_content_length(self, fmod_sl):
|
# now returns 400 if content-length header isn't parsable as an int
|
||||||
resp = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl, status=200)
|
|
||||||
assert resp.headers['Content-Length'] == '149'
|
|
||||||
|
|
||||||
resp = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl, status=200)
|
# @pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
|
||||||
assert resp.headers['Content-Length'] == '90'
|
# def test_live_bad_content_length(self, fmod_sl):
|
||||||
|
# resp = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl, status=200)
|
||||||
|
# assert resp.headers['Content-Length'] == '149'
|
||||||
|
|
||||||
@pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
|
# resp = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl, status=200)
|
||||||
def test_live_bad_content_length_with_range(self, fmod_sl):
|
# assert resp.headers['Content-Length'] == '90'
|
||||||
resp = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl,
|
|
||||||
headers={'Range': 'bytes=0-'}, status=206)
|
|
||||||
assert resp.headers['Content-Length'] == '149'
|
|
||||||
assert resp.headers['Content-Range'] == 'bytes 0-148/149'
|
|
||||||
|
|
||||||
resp = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl,
|
# @pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
|
||||||
headers={'Range': 'bytes=0-'}, status=206)
|
# def test_live_bad_content_length_with_range(self, fmod_sl):
|
||||||
assert resp.headers['Content-Length'] == '90'
|
# resp = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl,
|
||||||
assert resp.headers['Content-Range'] == 'bytes 0-89/90'
|
# headers={'Range': 'bytes=0-'}, status=206)
|
||||||
|
# assert resp.headers['Content-Length'] == '149'
|
||||||
|
# assert resp.headers['Content-Range'] == 'bytes 0-148/149'
|
||||||
|
|
||||||
|
# resp = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl,
|
||||||
|
# headers={'Range': 'bytes=0-'}, status=206)
|
||||||
|
# assert resp.headers['Content-Length'] == '90'
|
||||||
|
# assert resp.headers['Content-Range'] == 'bytes 0-89/90'
|
||||||
|
|
||||||
def test_custom_unicode_header(self, fmod_sl):
|
def test_custom_unicode_header(self, fmod_sl):
|
||||||
value = u'⛄'
|
value = u'⛄'
|
||||||
|
135
tests/test_manager.py
Normal file
135
tests/test_manager.py
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from pywb.manager.manager import CollectionsManager
|
||||||
|
|
||||||
|
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
|
||||||
|
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
|
||||||
|
|
||||||
|
TEST_COLLECTION_NAME = 'test-col'
|
||||||
|
|
||||||
|
|
||||||
|
class TestManager:
|
||||||
|
def test_add_valid_wacz_unpacked(self, tmp_path):
|
||||||
|
"""Test if adding a valid wacz file to a collection succeeds"""
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
manager._add_wacz_unpacked(VALID_WACZ_PATH)
|
||||||
|
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
||||||
|
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
||||||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
|
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
||||||
|
|
||||||
|
def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
|
||||||
|
"""Test if warc that already exists is renamed with -index suffix"""
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
manager._add_wacz_unpacked(VALID_WACZ_PATH)
|
||||||
|
# Add it again to see if there are name conflicts
|
||||||
|
manager._add_wacz_unpacked(VALID_WACZ_PATH)
|
||||||
|
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
||||||
|
assert 'valid_example_1-0-1.warc' in os.listdir(manager.archive_dir)
|
||||||
|
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
||||||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
|
data = f.read()
|
||||||
|
assert '"filename": "valid_example_1-0.warc"' in data
|
||||||
|
assert '"filename": "valid_example_1-0-1.warc"' in data
|
||||||
|
|
||||||
|
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
|
||||||
|
"""Test if adding an invalid wacz file to a collection fails"""
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
manager._add_wacz_unpacked(INVALID_WACZ_PATH)
|
||||||
|
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
|
||||||
|
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
|
||||||
|
|
||||||
|
index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
|
||||||
|
if os.path.exists(index_path):
|
||||||
|
with open(index_path, 'r') as f:
|
||||||
|
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
|
||||||
|
|
||||||
|
def test_add_valid_archives_unpack_wacz(self, tmp_path):
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||||
|
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||||
|
'sample_archive/waczs/valid_example_1.wacz']
|
||||||
|
manager.add_archives(archives, unpack_wacz=True)
|
||||||
|
|
||||||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
|
index_text = f.read()
|
||||||
|
|
||||||
|
for archive in archives:
|
||||||
|
archive = os.path.basename(archive)
|
||||||
|
|
||||||
|
if archive.endswith('wacz'):
|
||||||
|
archive = 'valid_example_1-0.warc'
|
||||||
|
|
||||||
|
assert archive in os.listdir(manager.archive_dir)
|
||||||
|
assert archive in index_text
|
||||||
|
|
||||||
|
def test_add_valid_archives_dupe_name(self, tmp_path):
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
warc_filename = 'sample_archive/warcs/example.warc.gz'
|
||||||
|
manager.add_archives([warc_filename, warc_filename])
|
||||||
|
|
||||||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
|
index_text = f.read()
|
||||||
|
|
||||||
|
expected_archives = ('example.warc.gz', 'example-1.warc.gz')
|
||||||
|
|
||||||
|
for archive in expected_archives:
|
||||||
|
assert archive in os.listdir(manager.archive_dir)
|
||||||
|
assert archive in index_text
|
||||||
|
|
||||||
|
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||||
|
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||||
|
'sample_archive/waczs/valid_example_1.wacz']
|
||||||
|
|
||||||
|
with pytest.raises(NotImplementedError):
|
||||||
|
manager.add_archives(archives, unpack_wacz=False)
|
||||||
|
|
||||||
|
def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
|
||||||
|
unpack_wacz=True)
|
||||||
|
assert 'sample.html' not in os.listdir(manager.archive_dir)
|
||||||
|
assert 'example.warc' in os.listdir(manager.archive_dir)
|
||||||
|
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
|
||||||
|
|
||||||
|
def test_merge_wacz_index(self, tmp_path):
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
|
||||||
|
'sample_archive/cdxj/example.cdxj',
|
||||||
|
{'example.warc.gz': 'rewritten.warc.gz'})
|
||||||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
|
index_content = f.read()
|
||||||
|
index_content = index_content.strip()
|
||||||
|
|
||||||
|
assert 'example.warc.gz' not in index_content
|
||||||
|
assert 'rewritten.warc.gz' in index_content
|
||||||
|
|
||||||
|
# check that collection index is sorted
|
||||||
|
index_lines = index_content.split('\n')
|
||||||
|
assert sorted(index_lines) == index_lines
|
||||||
|
|
||||||
|
def test_merge_wacz_index_gzip(self, tmp_path):
|
||||||
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
|
||||||
|
'sample_archive/cdxj/example.cdx.gz',
|
||||||
|
{'example-collection.warc': 'rewritten.warc'})
|
||||||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
|
index_content = f.read()
|
||||||
|
index_content = index_content.strip()
|
||||||
|
|
||||||
|
assert 'example-collection.warc' not in index_content
|
||||||
|
assert 'rewritten.warc' in index_content
|
||||||
|
|
||||||
|
# check that collection index is sorted
|
||||||
|
index_lines = index_content.split('\n')
|
||||||
|
assert sorted(index_lines) == index_lines
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_test_collections_manager(collections_path):
|
||||||
|
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
|
||||||
|
manager.add_collection()
|
||||||
|
return manager
|
7
tox.ini
7
tox.ini
@ -4,23 +4,24 @@ testpaths =
|
|||||||
tests
|
tests
|
||||||
|
|
||||||
[tox]
|
[tox]
|
||||||
envlist = py36, py37, py38, py39, py310
|
envlist = py37, py38, py39, py310, py311
|
||||||
|
|
||||||
[gh-actions]
|
[gh-actions]
|
||||||
python =
|
python =
|
||||||
3.6: py36
|
|
||||||
3.7: py37
|
3.7: py37
|
||||||
3.8: py38
|
3.8: py38
|
||||||
3.9: py39
|
3.9: py39
|
||||||
3.10: py310
|
3.10: py310
|
||||||
|
3.11: py311
|
||||||
|
|
||||||
[testenv]
|
[testenv]
|
||||||
setenv = PYWB_NO_VERIFY_SSL = 1
|
setenv = PYWB_NO_VERIFY_SSL = 1
|
||||||
|
passenv = *
|
||||||
deps =
|
deps =
|
||||||
-rtest_requirements.txt
|
-rtest_requirements.txt
|
||||||
-rrequirements.txt
|
-rrequirements.txt
|
||||||
-rextra_requirements.txt
|
-rextra_requirements.txt
|
||||||
commands =
|
commands =
|
||||||
py.test --cov-config .coveragerc --cov pywb -v --doctest-modules ./pywb/ tests/
|
pytest --cov-config .coveragerc --cov pywb -v --doctest-modules ./pywb/ tests/
|
||||||
|
|
||||||
|
|
||||||
|
2
wombat
2
wombat
@ -1 +1 @@
|
|||||||
Subproject commit 1c428cd1f4bdf6531f944f153154714468803f1a
|
Subproject commit 20596ca1e66928cae6f309af781f961aa112ca7f
|
Loading…
x
Reference in New Issue
Block a user