mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Compare commits
No commits in common. "main" and "v-2.7.2" have entirely different histories.
2
.github/workflows/ci.yaml
vendored
2
.github/workflows/ci.yaml
vendored
@ -8,7 +8,7 @@ jobs:
|
||||
strategy:
|
||||
max-parallel: 3
|
||||
matrix:
|
||||
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
|
||||
python-version: ['3.7', '3.8', '3.9', '3.10']
|
||||
|
||||
steps:
|
||||
- name: checkout
|
||||
|
4
.gitignore
vendored
4
.gitignore
vendored
@ -53,7 +53,3 @@ git_hash.py
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/*
|
||||
|
||||
# virtualenvs
|
||||
env/
|
||||
venv/
|
||||
|
22
CHANGES.rst
22
CHANGES.rst
@ -1,19 +1,3 @@
|
||||
pywb 2.7.3 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* issue_792 catch warcio exception by @oskarhek in https://github.com/webrecorder/pywb/pull/793
|
||||
* Add ui.logo_home_url as config.yaml option by @tw4l in https://github.com/webrecorder/pywb/pull/791
|
||||
* [#795] Show error when adding duplicate warc file by @kuechensofa in https://github.com/webrecorder/pywb/pull/797
|
||||
* Make search page more intuitive by @krakan in https://github.com/webrecorder/pywb/pull/794
|
||||
* Modify search template buttons by @tw4l in https://github.com/webrecorder/pywb/pull/801
|
||||
* [#804] Use default_locale when lang not set in the request by @krakan in https://github.com/webrecorder/pywb/pull/805
|
||||
* feat: regex substitution on surt rules match by @mijho in https://github.com/webrecorder/pywb/pull/780
|
||||
* Bump minimatch from 3.0.4 to 3.1.2 in /pywb/vueui by @dependabot in https://github.com/webrecorder/pywb/pull/777
|
||||
* Bump decode-uri-component from 0.2.0 to 0.2.2 in /pywb/vueui by @dependabot in https://github.com/webrecorder/pywb/pull/786
|
||||
* rules: add 'debugNoBatch' rewrite for fb and insta by @ikreymer in https://github.com/webrecorder/pywb/pull/806
|
||||
* Vue main order by @tw4l in https://github.com/webrecorder/pywb/pull/809
|
||||
* wombat: bump to 3.4.4 https://github.com/webrecorder/pywb/pull/808
|
||||
|
||||
pywb 2.7.2 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -1181,7 +1165,7 @@ pywb 0.9.6 changelist
|
||||
pywb 0.9.5 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* s3 loading: support ``s3://`` scheme in block loader, allowing for loading index and archive files from s3. ``boto`` library must be installed separately
|
||||
* s3 loading: support ``s3://`` scheme in block loader, allowing for loading index and archive files from s3. ``boto`` library must be installed seperately
|
||||
via ``pip install boto``. Attempt default boto auth path, and if that fails, attempt anonymous s3 connection.
|
||||
|
||||
* Wombat/Client-Side Rewrite Customizations: New ``rewrite_opts.client`` settings from ``config.yaml`` are passed directly to wombat as json.
|
||||
@ -1277,7 +1261,7 @@ pywb 0.9.1 changelist
|
||||
|
||||
* cdx server query: add support for ``url=*.host`` and ``url=host/*`` as shortcuts for ``matchType=domain`` and ``matchType=prefix``
|
||||
|
||||
* zipnum cdx cluster: support loading index shared from prefix path instead of separate location file.
|
||||
* zipnum cdx cluster: support loading index shared from prefix path instead of seperate location file.
|
||||
|
||||
The ``shard_index_loc`` config property may contain match and replace properties.
|
||||
Regex replacement is then used to obtain path prefix from the shard prefix path.
|
||||
@ -1643,7 +1627,7 @@ pywb 0.4.7 changelist
|
||||
|
||||
* Rewrite: Parsing of html as raw bytes instead of decode/encode, detection still needed for non-ascii compatible encoding.
|
||||
|
||||
* Indexing: Refactoring of cdx-indexer using a separate 'archive record iterator' and pluggable cdx writer classes. Groundwork for creating custom indexers.
|
||||
* Indexing: Refactoring of cdx-indexer using a seperate 'archive record iterator' and pluggable cdx writer classes. Groundwork for creating custom indexers.
|
||||
|
||||
* Indexing: Support for 9 field cdx formats with -9 flag.
|
||||
|
||||
|
12
README.rst
12
README.rst
@ -1,4 +1,4 @@
|
||||
Webrecorder pywb 2.8
|
||||
Webrecorder pywb 2.7
|
||||
====================
|
||||
|
||||
.. image:: https://raw.githubusercontent.com/webrecorder/pywb/main/pywb/static/pywb-logo.png
|
||||
@ -13,7 +13,7 @@ Web Archiving Tools for All
|
||||
|
||||
`View the full pywb documentation <https://pywb.readthedocs.org>`_
|
||||
|
||||
**pywb** is a Python 3 web archiving toolkit for replaying web archives large and small as accurately as possible.
|
||||
**pywb** is a Python (2 and 3) web archiving toolkit for replaying web archives large and small as accurately as possible.
|
||||
The toolkit now also includes new features for creating high-fidelity web archives.
|
||||
|
||||
This toolset forms the foundation of the Webrecorder project, but also provides a generic web archiving toolkit
|
||||
@ -60,7 +60,9 @@ Installation for Deployment
|
||||
|
||||
To install pywb for usage, you can use:
|
||||
|
||||
``pip install pywb``
|
||||
```shell
|
||||
pip install pywb
|
||||
```
|
||||
|
||||
Note: depending on your Python installation, you may have to use `pip3` instead of `pip`.
|
||||
|
||||
@ -68,7 +70,9 @@ Note: depending on your Python installation, you may have to use `pip3` instead
|
||||
Installation from local copy
|
||||
----------------------------
|
||||
|
||||
``git clone https://github.com/webrecorder/pywb``
|
||||
```shell
|
||||
git clone https://github.com/webrecorder/pywb
|
||||
```
|
||||
|
||||
To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``.
|
||||
|
||||
|
@ -3,5 +3,4 @@
|
||||
CURR_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
|
||||
|
||||
cd $CURR_DIR/pywb/vueui/
|
||||
yarn install
|
||||
yarn run build
|
||||
|
@ -6,11 +6,9 @@ debug: true
|
||||
# Uncomment to set banner colors and logo
|
||||
# ui:
|
||||
# logo: path/relative/from/static/logo.png
|
||||
# logo_home_url: https://example.com
|
||||
# navbar_background_hex: 0c49b0
|
||||
# navbar_color_hex: fff
|
||||
# navbar_light_buttons: true
|
||||
# disable_printing: true
|
||||
|
||||
collections:
|
||||
all: $all
|
||||
|
@ -105,12 +105,6 @@ Given these rules, a user would:
|
||||
* but would receive an 'access blocked' error message when viewing ``http://httpbin.org/`` (block)
|
||||
* would receive a 404 not found error when viewing ``http://httpbin.org/anything`` (exclude)
|
||||
|
||||
To match any possible URL in an .aclj file, set ``*,`` as the leading SURT, for example::
|
||||
|
||||
*, - {"access": "allow"}
|
||||
|
||||
Lines starting with ``*,`` should generally be at the end of the file, respecting the reverse alphabetical order.
|
||||
|
||||
|
||||
Access Types: allow, block, exclude, allow_ignore_embargo
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
@ -155,10 +149,6 @@ To make this work, pywb must be running behind an Apache or Nginx system that is
|
||||
|
||||
For example, this header may be set based on IP range, or based on password authentication.
|
||||
|
||||
To allow a user access to all URLs, overriding more specific rules and the ``default_access`` configuration setting, use the ``*,`` SURT::
|
||||
|
||||
*, - {"access": "allow", "user": "staff"}
|
||||
|
||||
Further examples of how to set this header will be provided in the deployments section.
|
||||
|
||||
**Note: Do not use the user-based rules without configuring proper authentication on an Apache or Nginx frontend to set or remove this header, otherwise the 'X-Pywb-ACL-User' can easily be faked.**
|
||||
|
@ -46,7 +46,6 @@ It can be used to:
|
||||
|
||||
* Create a new collection -- ``wb-manager init <coll>``
|
||||
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
|
||||
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
|
||||
* Add override templates
|
||||
* Add and remove metadata in a collection's ``metadata.yaml``
|
||||
* List all collections
|
||||
|
@ -95,8 +95,8 @@ add the WARC to a new collection and start pywb:
|
||||
|
||||
docker pull webrecorder/pywb
|
||||
docker run -e INIT_COLLECTION=my-web-archive -v /pywb-data:/webarchive \
|
||||
-v /path/to:/source webrecorder/pywb wb-manager add my-web-archive /source/my_warc.warc.gz
|
||||
docker run -p 8080:8080 -v /pywb-data/:/webarchive webrecorder/pywb wayback
|
||||
-v /path/to:/source webrecorder/pywb wb-manager add default /path/to/my_warc.warc.gz
|
||||
docker run -p 8080:8080 -v /pywb-data/:/webarchive wayback
|
||||
|
||||
This example is equivalent to the non-Docker example above.
|
||||
|
||||
@ -114,8 +114,6 @@ Using Existing Web Archive Collections
|
||||
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
|
||||
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
|
||||
|
||||
In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
|
||||
|
||||
By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.
|
||||
|
||||
If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process.
|
||||
@ -156,20 +154,20 @@ To enable auto-indexing, run with ``wayback -a`` or ``wayback -a --auto-interval
|
||||
Creating a Web Archive
|
||||
----------------------
|
||||
|
||||
Using ArchiveWeb.page
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
Using Webrecorder
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
If you do not have a web archive to test, one easy way to create one is to use the `ArchiveWeb.page <https://archiveweb.page>`_ browser extension for Chrome and other Chromium-based browsers such as Brave Browser. ArchiveWeb.page records pages visited during an archiving session in the browser, and provides means of both replaying and downloading the archived items created.
|
||||
If you do not have a web archive to test, one easy way to create one is to use `Webrecorder <https://webrecorder.io>`_
|
||||
|
||||
Follow the instructions in `How To Create Web Archives with ArchiveWeb.page <https://archiveweb.page/en/usage/>`_. After recording, press **Stop** and then `download your collection <https://archiveweb.page/en/download/>`_ to receive a WARC (`.warc.gz`) file. If you choose to download your collection in the WACZ format, the WARC files can be found inside the zipped WACZ in the ``archive/`` directory.
|
||||
After recording, you can click **Stop** and then click `Download Collection` to receive a WARC (`.warc.gz`) file.
|
||||
|
||||
You can then use your WARCs to work with pywb.
|
||||
You can then use this with work with pywb.
|
||||
|
||||
|
||||
Using pywb Recorder
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Recording functionality is also part of :mod:`pywb`. If you want to create a WARC locally, this can be
|
||||
The core recording functionality in Webrecorder is also part of :mod:`pywb`. If you want to create a WARC locally, this can be
|
||||
done by directly recording into your pywb collection:
|
||||
|
||||
1. Create a collection: ``wb-manager init my-web-archive`` (if you haven't already created a web archive collection)
|
||||
@ -182,14 +180,6 @@ In this configuration, the indexing happens every 10 seconds.. After 10 seconds,
|
||||
``http://localhost:8080/my-web-archive/http://example.com/``
|
||||
|
||||
|
||||
Using Browsertrix
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
For a more automated browser-based web archiving experience, `Browsertrix <https://browsertrix.com/>`_ provides a web interface for configuring, scheduling, running, reviewing, and curating crawls of web content. Crawl activity is shown in a live screencast of the browsers used for crawling and all web archives created in Browsertrix can be easily downloaded from the application in the WACZ format.
|
||||
|
||||
`Browsertrix Crawler <https://crawler.docs.browsertrix.com/>`_, which provides the underlying crawling functionality of Browsertrix, can also be run standalone in a Docker container on your local computer.
|
||||
|
||||
|
||||
HTTP/S Proxy Mode Access
|
||||
------------------------
|
||||
|
||||
|
@ -53,36 +53,6 @@ For example, to use the file ``./static/my-logo.png`` as the logo, set:
|
||||
logo: my-logo.png
|
||||
|
||||
|
||||
Logo URL
|
||||
^^^^^^^^
|
||||
|
||||
It is possible to configure the logo to link to any URL by setting ``ui.logo_home_url`` in ``config.yaml`` to the URL of your choice.
|
||||
|
||||
If omitted, the logo will not link to any page.
|
||||
|
||||
For example, to have the logo redirect to ``https://example.com/web-archive-landing-page``, set:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
ui:
|
||||
logo_home_url: https://example.com/web-archive-landing-page
|
||||
|
||||
|
||||
Printing
|
||||
^^^^^^^^
|
||||
|
||||
As of pywb 2.8, the replay header includes a print button that prints the contents of the replay iframe.
|
||||
|
||||
This button can be disabled by setting ``ui.disable_printing`` in ``config.yaml`` to any value.
|
||||
|
||||
For example:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
ui:
|
||||
disable_printing: true
|
||||
|
||||
|
||||
Banner Colors
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
|
||||
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
|
||||
from wsgiref.util import shift_path_info
|
||||
from werkzeug.wsgi import pop_path_info
|
||||
from six.moves.urllib.parse import urljoin, parse_qsl
|
||||
from six import iteritems
|
||||
from warcio.utils import to_native_str
|
||||
@ -108,7 +108,6 @@ class FrontEndApp(object):
|
||||
self.templates_dir = config.get('templates_dir', 'templates')
|
||||
self.static_dir = config.get('static_dir', 'static')
|
||||
self.static_prefix = config.get('static_prefix', 'static')
|
||||
self.default_locale = config.get('default_locale', '')
|
||||
|
||||
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
||||
self.metadata_cache = MetadataCache(metadata_templ)
|
||||
@ -434,11 +433,7 @@ class FrontEndApp(object):
|
||||
cdx_url += 'limit=' + str(self.query_limit)
|
||||
|
||||
try:
|
||||
headers = {}
|
||||
for key in environ.keys():
|
||||
if key.startswith("HTTP_X_"):
|
||||
headers[key[5:].replace("_", "-")] = environ[key]
|
||||
res = requests.get(cdx_url, stream=True, headers=headers)
|
||||
res = requests.get(cdx_url, stream=True)
|
||||
|
||||
status_line = '{} {}'.format(res.status_code, res.reason)
|
||||
content_type = res.headers.get('Content-Type')
|
||||
@ -558,9 +553,9 @@ class FrontEndApp(object):
|
||||
return
|
||||
|
||||
if coll != '$root':
|
||||
shift_path_info(environ)
|
||||
pop_path_info(environ)
|
||||
if record:
|
||||
shift_path_info(environ)
|
||||
pop_path_info(environ)
|
||||
|
||||
paths = [self.warcserver.root_dir]
|
||||
|
||||
@ -603,7 +598,7 @@ class FrontEndApp(object):
|
||||
and message.
|
||||
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:param str err_type: The identifier for type of error that occurred
|
||||
:param str err_type: The identifier for type of error that occured
|
||||
:param str url: The url of the archived page that was requested
|
||||
"""
|
||||
raise AppPageNotFound(err_type, url)
|
||||
@ -669,12 +664,8 @@ class FrontEndApp(object):
|
||||
|
||||
lang = args.pop('lang', '')
|
||||
if lang:
|
||||
shift_path_info(environ)
|
||||
|
||||
if lang:
|
||||
pop_path_info(environ)
|
||||
environ['pywb_lang'] = lang
|
||||
elif self.default_locale:
|
||||
environ['pywb_lang'] = self.default_locale
|
||||
|
||||
response = endpoint(environ, **args)
|
||||
|
||||
|
@ -64,7 +64,7 @@ class RewriterApp(object):
|
||||
|
||||
if not jinja_env:
|
||||
jinja_env = JinjaEnv(globals={'static_path': 'static'},
|
||||
extensions=['jinja2.ext.i18n'])
|
||||
extensions=['jinja2.ext.i18n', 'jinja2.ext.with_'])
|
||||
jinja_env.jinja_env.install_null_translations()
|
||||
|
||||
self.jinja_env = jinja_env
|
||||
|
@ -1,9 +1,5 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import warcio
|
||||
|
||||
# Use ujson if available
|
||||
try:
|
||||
@ -302,11 +298,8 @@ def write_multi_cdx_index(output, inputs, **options):
|
||||
with open(fullpath, 'rb') as infile:
|
||||
entry_iter = record_iter(infile)
|
||||
|
||||
try:
|
||||
for entry in entry_iter:
|
||||
writer.write(entry, filename)
|
||||
except warcio.exceptions.ArchiveLoadFailed:
|
||||
logging.error('Error while indexing file %s, %s',filename,traceback.format_exc())
|
||||
for entry in entry_iter:
|
||||
writer.write(entry, filename)
|
||||
|
||||
return writer
|
||||
|
||||
@ -384,7 +377,7 @@ url timestamp { ... }
|
||||
|
||||
output_help = """
|
||||
Output file or directory.
|
||||
- If directory, each input file is written to a separate output file
|
||||
- If directory, each input file is written to a seperate output file
|
||||
with a .cdx extension
|
||||
- If output is '-', output is written to stdout
|
||||
"""
|
||||
|
@ -102,11 +102,11 @@ class ACLManager(CollectionsManager):
|
||||
|
||||
except IOError as io:
|
||||
if must_exist:
|
||||
print('Error Occurred: ' + str(io))
|
||||
print('Error Occured: ' + str(io))
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
print('Error Occurred: ' + str(e))
|
||||
print('Error Occured: ' + str(e))
|
||||
return False
|
||||
|
||||
def save_acl(self, r=None):
|
||||
|
@ -5,16 +5,12 @@ import logging
|
||||
import heapq
|
||||
import yaml
|
||||
import re
|
||||
import gzip
|
||||
import six
|
||||
import pathlib
|
||||
|
||||
from distutils.util import strtobool
|
||||
from pkg_resources import resource_string, get_distribution
|
||||
|
||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||
from tempfile import mkdtemp, TemporaryDirectory
|
||||
from zipfile import ZipFile
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from warcio.timeutils import timestamp20_now
|
||||
@ -51,9 +47,6 @@ directory structure expected by pywb
|
||||
|
||||
COLLS_DIR = 'collections'
|
||||
|
||||
WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$')
|
||||
WACZ_RX = re.compile(r'.*\.wacz$')
|
||||
|
||||
def __init__(self, coll_name, colls_dir=None, must_exist=True):
|
||||
colls_dir = colls_dir or self.COLLS_DIR
|
||||
self.default_config = load_yaml_config(DEFAULT_CONFIG)
|
||||
@ -122,142 +115,19 @@ directory structure expected by pywb
|
||||
'To create a new collection, run\n\n{1} init {0}')
|
||||
raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
||||
|
||||
def add_archives(self, archives, unpack_wacz=False):
|
||||
def add_warcs(self, warcs):
|
||||
if not os.path.isdir(self.archive_dir):
|
||||
raise IOError('Directory {0} does not exist'.
|
||||
format(self.archive_dir))
|
||||
|
||||
invalid_archives = []
|
||||
warc_paths = []
|
||||
for archive in archives:
|
||||
if self.WARC_RX.match(archive):
|
||||
full_path = self._add_warc(archive)
|
||||
if full_path:
|
||||
warc_paths.append(full_path)
|
||||
elif self.WACZ_RX.match(archive):
|
||||
if unpack_wacz:
|
||||
self._add_wacz_unpacked(archive)
|
||||
else:
|
||||
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
|
||||
'\'--unpack-wacz\' flag to add the wacz\'s content.')
|
||||
else:
|
||||
invalid_archives.append(archive)
|
||||
|
||||
self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE)
|
||||
|
||||
if invalid_archives:
|
||||
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
|
||||
|
||||
def _rename_warc(self, warc_basename):
|
||||
dupe_idx = 1
|
||||
ext = ''.join(pathlib.Path(warc_basename).suffixes)
|
||||
pre_ext_name = warc_basename.split(ext)[0]
|
||||
|
||||
while True:
|
||||
new_basename = f'{pre_ext_name}-{dupe_idx}{ext}'
|
||||
if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
|
||||
break
|
||||
dupe_idx += 1
|
||||
|
||||
return new_basename
|
||||
|
||||
def _add_warc(self, warc):
|
||||
warc_source = os.path.abspath(warc)
|
||||
source_dir, warc_basename = os.path.split(warc_source)
|
||||
|
||||
# don't overwrite existing warcs with duplicate names
|
||||
if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
|
||||
warc_basename = self._rename_warc(warc_basename)
|
||||
logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')
|
||||
|
||||
warc_dest = os.path.join(self.archive_dir, warc_basename)
|
||||
shutil.copy2(warc_source, warc_dest)
|
||||
logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
|
||||
return warc_dest
|
||||
|
||||
def _add_wacz_unpacked(self, wacz):
|
||||
wacz = os.path.abspath(wacz)
|
||||
temp_dir = mkdtemp()
|
||||
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
|
||||
cdx_regex = re.compile(r'.+\.cdx(\.gz)?$')
|
||||
with ZipFile(wacz, 'r') as wacz_zip_file:
|
||||
archive_members = wacz_zip_file.namelist()
|
||||
warc_files = [file for file in archive_members if warc_regex.match(file)]
|
||||
if not warc_files:
|
||||
logging.warning(f'WACZ {wacz} does not contain any warc files.')
|
||||
return
|
||||
|
||||
# extract warc files
|
||||
for warc_file in warc_files:
|
||||
wacz_zip_file.extract(warc_file, temp_dir)
|
||||
|
||||
cdx_files = [file for file in archive_members if cdx_regex.match(file)]
|
||||
if not cdx_files:
|
||||
logging.warning(f'WACZ {wacz} does not contain any indices.')
|
||||
return
|
||||
|
||||
for cdx_file in cdx_files:
|
||||
wacz_zip_file.extract(cdx_file, temp_dir)
|
||||
|
||||
# copy extracted warc files to collections archive dir, use wacz filename as filename with added index if
|
||||
# multiple warc files exist
|
||||
warc_filename_mapping = {}
|
||||
full_paths = []
|
||||
for idx, extracted_warc_file in enumerate(warc_files):
|
||||
_, warc_ext = os.path.splitext(extracted_warc_file)
|
||||
if warc_ext == '.gz':
|
||||
warc_ext = '.warc.gz'
|
||||
warc_filename = os.path.basename(wacz)
|
||||
warc_filename, _ = os.path.splitext(warc_filename)
|
||||
warc_filename = f'{warc_filename}-{idx}{warc_ext}'
|
||||
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
|
||||
for filename in warcs:
|
||||
filename = os.path.abspath(filename)
|
||||
shutil.copy2(filename, self.archive_dir)
|
||||
full_paths.append(os.path.join(self.archive_dir, filename))
|
||||
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
||||
|
||||
if os.path.exists(warc_destination_path):
|
||||
warc_filename = self._rename_warc(warc_filename)
|
||||
logging.info(f'Warc {warc_destination_path} already exists - renamed to {warc_filename}.')
|
||||
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
|
||||
|
||||
warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
|
||||
shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)
|
||||
full_paths.append(warc_destination_path)
|
||||
|
||||
# rewrite filenames in wacz indices and merge them with collection index file
|
||||
for cdx_file in cdx_files:
|
||||
self._add_wacz_index(os.path.join(self.indexes_dir, self.DEF_INDEX_FILE), os.path.join(temp_dir, cdx_file),
|
||||
warc_filename_mapping)
|
||||
|
||||
# delete temporary files
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping):
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
# rewrite wacz index to temporary index file
|
||||
tempdir = TemporaryDirectory()
|
||||
wacz_index_name = os.path.basename(wacz_index_path)
|
||||
rewritten_index_path = os.path.join(tempdir.name, wacz_index_name)
|
||||
|
||||
with open(rewritten_index_path, 'w') as rewritten_index:
|
||||
if wacz_index_path.endswith('.gz'):
|
||||
wacz_index = gzip.open(wacz_index_path, 'rb')
|
||||
else:
|
||||
wacz_index = open(wacz_index_path, 'rb')
|
||||
|
||||
for line in wacz_index:
|
||||
cdx_object = CDXObject(cdxline=line)
|
||||
if cdx_object['filename'] in filename_mapping:
|
||||
cdx_object['filename'] = filename_mapping[cdx_object['filename']]
|
||||
rewritten_index.write(cdx_object.to_cdxj())
|
||||
|
||||
if not os.path.isfile(collection_index_path):
|
||||
shutil.move(rewritten_index_path, collection_index_path)
|
||||
return
|
||||
|
||||
temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now()
|
||||
self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path)
|
||||
shutil.move(temp_coll_index_path, collection_index_path)
|
||||
|
||||
tempdir.cleanup()
|
||||
self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE)
|
||||
|
||||
def reindex(self):
|
||||
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
||||
@ -310,24 +180,20 @@ directory structure expected by pywb
|
||||
|
||||
merged_file = temp_file + '.merged'
|
||||
|
||||
self._merge_indices(cdx_file, temp_file, merged_file)
|
||||
last_line = None
|
||||
|
||||
with open(cdx_file, 'rb') as orig_index:
|
||||
with open(temp_file, 'rb') as new_index:
|
||||
with open(merged_file, 'w+b') as merged:
|
||||
for line in heapq.merge(orig_index, new_index):
|
||||
if last_line != line:
|
||||
merged.write(line)
|
||||
last_line = line
|
||||
|
||||
shutil.move(merged_file, cdx_file)
|
||||
#os.rename(merged_file, cdx_file)
|
||||
os.remove(temp_file)
|
||||
|
||||
@staticmethod
|
||||
def _merge_indices(index1, index2, dest):
|
||||
last_line = None
|
||||
|
||||
with open(index1, 'rb') as index1_f:
|
||||
with open(index2, 'rb') as index2_f:
|
||||
with open(dest, 'wb') as dest_f:
|
||||
for line in heapq.merge(index1_f, index2_f):
|
||||
if last_line != line:
|
||||
dest_f.write(line)
|
||||
last_line = line
|
||||
|
||||
def set_metadata(self, namevalue_pairs):
|
||||
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
|
||||
metadata = None
|
||||
@ -507,23 +373,16 @@ Create manage file based web archive collections
|
||||
listcmd = subparsers.add_parser('list', help=list_help)
|
||||
listcmd.set_defaults(func=do_list)
|
||||
|
||||
# Add Warcs or Waczs
|
||||
# Add Warcs
|
||||
def do_add(r):
|
||||
m = CollectionsManager(r.coll_name)
|
||||
m.add_archives(r.files, r.unpack_wacz)
|
||||
m.add_warcs(r.files)
|
||||
|
||||
add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
|
||||
add_unpack_wacz_help = 'Copy WARCs from WACZ to collection directory and reindex'
|
||||
add_archives = subparsers.add_parser('add', help=add_archives_help)
|
||||
add_archives.add_argument(
|
||||
'--unpack-wacz',
|
||||
dest='unpack_wacz',
|
||||
action='store_true',
|
||||
help=add_unpack_wacz_help
|
||||
)
|
||||
add_archives.add_argument('coll_name')
|
||||
add_archives.add_argument('files', nargs='+')
|
||||
add_archives.set_defaults(func=do_add)
|
||||
addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
|
||||
addwarc = subparsers.add_parser('add', help=addwarc_help)
|
||||
addwarc.add_argument('coll_name')
|
||||
addwarc.add_argument('files', nargs='+')
|
||||
addwarc.set_defaults(func=do_add)
|
||||
|
||||
# Reindex All
|
||||
def do_reindex(r):
|
||||
|
@ -268,7 +268,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
unesc_value = self.try_unescape(value)
|
||||
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
|
||||
|
||||
# if no rewriting has occurred, ensure we return original, not reencoded value
|
||||
# if no rewriting has occured, ensure we return original, not reencoded value
|
||||
if rewritten_value == value:
|
||||
return orig_value
|
||||
|
||||
@ -668,7 +668,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
if self.parse_comments:
|
||||
#data = self._rewrite_script(data)
|
||||
|
||||
# Rewrite with separate HTMLRewriter
|
||||
# Rewrite with seperate HTMLRewriter
|
||||
comment_rewriter = HTMLRewriter(self.url_rewriter,
|
||||
defmod=self.defmod)
|
||||
|
||||
|
@ -124,7 +124,9 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ this.__WB_source = obj;
|
||||
(r'(?<![$.])\s*\blocation\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
|
||||
# rewriting 'return this'
|
||||
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
|
||||
# rewriting 'this.' special properties access
|
||||
# rewriting 'this.' special properties access on new line, with ; prepended
|
||||
(r'\n\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + this_rw), 0),
|
||||
# rewriting 'this.' special properties access, not on new line (no ;)
|
||||
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
|
||||
# rewrite '= this' or ', this'
|
||||
(r'(?<=[=,])\s*this\b\s*(?![:.$])', self.replace_str(this_rw), 0),
|
||||
|
@ -5,7 +5,7 @@ from pywb.utils.loaders import load
|
||||
|
||||
from six.moves.urllib.parse import urlsplit, quote
|
||||
|
||||
from jinja2 import Environment, TemplateNotFound, pass_context, select_autoescape
|
||||
from jinja2 import Environment, TemplateNotFound, contextfunction, select_autoescape
|
||||
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
|
||||
|
||||
from webassets.ext.jinja2 import AssetsExtension
|
||||
@ -139,7 +139,7 @@ class JinjaEnv(object):
|
||||
return loc_map.get(loc)
|
||||
|
||||
def override_func(jinja_env, name):
|
||||
@pass_context
|
||||
@contextfunction
|
||||
def get_override(context, text):
|
||||
translate = get_translate(context)
|
||||
if not translate:
|
||||
@ -158,7 +158,7 @@ class JinjaEnv(object):
|
||||
|
||||
# Special _Q() function to return %-encoded text, necessary for use
|
||||
# with text in banner
|
||||
@pass_context
|
||||
@contextfunction
|
||||
def quote_gettext(context, text):
|
||||
translate = get_translate(context)
|
||||
if not translate:
|
||||
@ -171,14 +171,14 @@ class JinjaEnv(object):
|
||||
self.jinja_env.globals['_Q'] = quote_gettext
|
||||
self.jinja_env.globals['default_locale'] = default_locale
|
||||
|
||||
@pass_context
|
||||
@contextfunction
|
||||
def switch_locale(context, locale):
|
||||
environ = context.get('env')
|
||||
curr_loc = environ.get('pywb_lang', '')
|
||||
|
||||
request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO'))
|
||||
|
||||
if curr_loc and request_uri.startswith('/' + curr_loc + '/'):
|
||||
if curr_loc:
|
||||
return request_uri.replace(curr_loc, locale, 1)
|
||||
|
||||
app_prefix = environ.get('pywb.app_prefix', '')
|
||||
@ -188,7 +188,7 @@ class JinjaEnv(object):
|
||||
|
||||
return app_prefix + '/' + locale + request_uri
|
||||
|
||||
@pass_context
|
||||
@contextfunction
|
||||
def get_locale_prefixes(context):
|
||||
environ = context.get('env')
|
||||
locale_prefixes = {}
|
||||
@ -196,11 +196,11 @@ class JinjaEnv(object):
|
||||
orig_prefix = environ.get('pywb.app_prefix', '')
|
||||
coll = environ.get('SCRIPT_NAME', '')
|
||||
|
||||
if orig_prefix and coll.startswith(orig_prefix):
|
||||
if orig_prefix:
|
||||
coll = coll[len(orig_prefix):]
|
||||
|
||||
curr_loc = environ.get('pywb_lang', '')
|
||||
if curr_loc and coll.startswith('/' + curr_loc):
|
||||
if curr_loc:
|
||||
coll = coll[len(curr_loc) + 1:]
|
||||
|
||||
for locale in loc_map.keys():
|
||||
|
@ -143,7 +143,7 @@ r"""
|
||||
'var foo = _____WB$wombat$check$this$function_____(this).location'
|
||||
|
||||
>>> _test_js_obj_proxy('A = B\nthis.location = "foo"')
|
||||
'A = B\n_____WB$wombat$check$this$function_____(this).location = "foo"'
|
||||
'A = B\n;_____WB$wombat$check$this$function_____(this).location = "foo"'
|
||||
|
||||
>>> _test_js_obj_proxy('var foo = this.location2')
|
||||
'var foo = this.location2'
|
||||
|
@ -110,7 +110,7 @@ rules:
|
||||
|
||||
fuzzy_lookup:
|
||||
match: '("(?:cursor|cursorindex)":["\d\w]+)'
|
||||
re_type: findall
|
||||
find_all: true
|
||||
|
||||
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline'
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))'
|
||||
@ -175,7 +175,7 @@ rules:
|
||||
|
||||
fuzzy_lookup:
|
||||
match: '("q[\d]+":|after:\\"[^"]+)'
|
||||
re_type: findall
|
||||
find_all: true
|
||||
|
||||
- url_prefix: 'com,facebook)/pages_reaction_units/more'
|
||||
|
||||
@ -196,9 +196,6 @@ rules:
|
||||
group: 1
|
||||
function: 'pywb.rewrite.rewrite_dash:rewrite_fb_dash'
|
||||
|
||||
- match: '"debugNoBatching\s?":(?:false|0)'
|
||||
replace: '"debugNoBatching":true'
|
||||
|
||||
parse_comments: true
|
||||
|
||||
- url_prefix: 'com,facebook'
|
||||
@ -230,9 +227,6 @@ rules:
|
||||
- match: '"is_dash_eligible":true'
|
||||
replace: '"is_dash_eligible":false'
|
||||
|
||||
- match: '"debugNoBatching\s?":(?:false|0)'
|
||||
replace: '"debugNoBatching":true'
|
||||
|
||||
fuzzy_lookup: '()'
|
||||
|
||||
|
||||
@ -544,12 +538,6 @@ rules:
|
||||
rewrite:
|
||||
js_rewrite_location: urls
|
||||
|
||||
- url_prefix: 'com,example)/matched'
|
||||
fuzzy_lookup:
|
||||
re_type: sub
|
||||
match: 'matched'
|
||||
replace: 'replaced'
|
||||
|
||||
# all domain rules -- fallback to this dataset
|
||||
#=================================================================
|
||||
# Applies to all urls -- should be last
|
||||
|
BIN
pywb/static/calendar-icon.png
Normal file
BIN
pywb/static/calendar-icon.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 20 KiB |
@ -956,11 +956,11 @@ RenderCalendar.prototype.niceDateRange = function() {
|
||||
var from = this.queryInfo.searchParams.from;
|
||||
var to = this.queryInfo.searchParams.to;
|
||||
if (from && to) {
|
||||
return [text.from, from, text.until, to].join(' ');
|
||||
return 'From ' + from + ' to ' + to;
|
||||
} else if (from) {
|
||||
return [text.from, from, text.until, text.present].join(' ');
|
||||
return 'From ' + from + ' until ' + 'present';
|
||||
}
|
||||
return [text.from, text.earliest, text.until, to].join(' ');
|
||||
return 'From earliest until ' + to;
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -14,34 +14,17 @@ var elemIds = {
|
||||
},
|
||||
dateTime: {
|
||||
from: 'dt-from',
|
||||
fromTime: 'ts-from',
|
||||
fromBad: 'dt-from-bad',
|
||||
to: 'dt-to',
|
||||
toTime: 'ts-to',
|
||||
toBad: 'dt-to-bad'
|
||||
},
|
||||
match: 'match-type-select',
|
||||
url: 'search-url',
|
||||
form: 'search-form',
|
||||
resultsNewWindow: 'open-results-new-window',
|
||||
advancedOptions: 'advanced-options',
|
||||
resetSearchForm: 'reset-search-form',
|
||||
advancedOptions: 'advanced-options'
|
||||
};
|
||||
|
||||
function resetSearchForm(event) {
|
||||
for (const field of [
|
||||
elemIds.url,
|
||||
elemIds.match,
|
||||
elemIds.dateTime.from,
|
||||
elemIds.dateTime.fromTime,
|
||||
elemIds.dateTime.to,
|
||||
elemIds.dateTime.toTime,
|
||||
]) {
|
||||
document.getElementById(field).value = '';
|
||||
}
|
||||
clearFilters(event);
|
||||
}
|
||||
|
||||
function makeCheckDateRangeChecker(dtInputId, dtBadNotice) {
|
||||
var dtInput = document.getElementById(dtInputId);
|
||||
dtInput.onblur = function() {
|
||||
@ -155,13 +138,11 @@ function performQuery(url) {
|
||||
}
|
||||
var fromT = document.getElementById(elemIds.dateTime.from).value;
|
||||
if (fromT) {
|
||||
fromT += document.getElementById(elemIds.dateTime.fromTime).value;
|
||||
query.push('from=' + fromT.replace(/[^0-9]/g, ''));
|
||||
query.push('from=' + fromT.trim());
|
||||
}
|
||||
var toT = document.getElementById(elemIds.dateTime.to).value;
|
||||
if (toT) {
|
||||
toT += document.getElementById(elemIds.dateTime.toTime).value;
|
||||
query.push('to=' + toT.replace(/[^0-9]/g, ''));
|
||||
query.push('to=' + toT.trim());
|
||||
}
|
||||
var builtQuery = query.join('&');
|
||||
if (document.getElementById(elemIds.resultsNewWindow).checked) {
|
||||
@ -207,7 +188,6 @@ $(document).ready(function() {
|
||||
elemIds.dateTime.to,
|
||||
document.getElementById(elemIds.dateTime.toBad)
|
||||
);
|
||||
document.getElementById(elemIds.resetSearchForm).onclick = resetSearchForm;
|
||||
document.getElementById(elemIds.filtering.add).onclick = addFilter;
|
||||
document.getElementById(elemIds.filtering.clear).onclick = clearFilters;
|
||||
var searchURLInput = document.getElementById(elemIds.url);
|
||||
@ -215,6 +195,9 @@ $(document).ready(function() {
|
||||
form.addEventListener('submit', function(event) {
|
||||
submitForm(event, form, searchURLInput);
|
||||
});
|
||||
document.getElementById(elemIds.advancedOptions).onclick = function() {
|
||||
validateFields(form);
|
||||
}
|
||||
var filteringExpression = document.getElementById(elemIds.filtering.expression);
|
||||
filteringExpression.addEventListener("keypress", function(event) {
|
||||
if (event.key === "Enter") {
|
||||
|
BIN
pywb/static/timeline-icon.png
Normal file
BIN
pywb/static/timeline-icon.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.5 KiB |
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,6 +1,6 @@
|
||||
/*
|
||||
Wombat.js client-side rewriting engine for web archive replay
|
||||
Copyright (C) 2014-2024 Webrecorder Software, Rhizome, and Contributors. Released under the GNU Affero General Public License.
|
||||
Copyright (C) 2014-2020 Webrecorder Software, Rhizome, and Contributors. Released under the GNU Affero General Public License.
|
||||
|
||||
This file is part of wombat.js, see https://github.com/webrecorder/wombat.js for the full source
|
||||
Wombat.js is part of the Webrecorder project (https://github.com/webrecorder)
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
Wombat.js client-side rewriting engine for web archive replay
|
||||
Copyright (C) 2014-2024 Webrecorder Software, Rhizome, and Contributors. Released under the GNU Affero General Public License.
|
||||
Copyright (C) 2014-2020 Webrecorder Software, Rhizome, and Contributors. Released under the GNU Affero General Public License.
|
||||
|
||||
This file is part of wombat.js, see https://github.com/webrecorder/wombat.js for the full source
|
||||
Wombat.js is part of the Webrecorder project (https://github.com/webrecorder)
|
||||
|
BIN
pywb/static/zoom-out-icon-333316.png
Normal file
BIN
pywb/static/zoom-out-icon-333316.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.6 KiB |
@ -3,7 +3,7 @@
|
||||
{% block body %}
|
||||
<div class="container text-danger error">
|
||||
<div class="row justify-content-center">
|
||||
<h2 class="display-2">{{ _('Pywb Error') }}</h2>
|
||||
<h2 class="display-2">Pywb Error</h2>
|
||||
</div>
|
||||
<div class="row">
|
||||
<div class="col-12 text-center">
|
||||
|
@ -25,21 +25,8 @@ html, body
|
||||
|
||||
<div id="app" style="width: 100%; height: 200px"></div>
|
||||
<script>
|
||||
VueUI.main({
|
||||
staticPrefix: "{{ static_prefix }}",
|
||||
url: "{{ url }}",
|
||||
prefix: "{{ wb_prefix }}",
|
||||
timestamp: "{{ timestamp }}",
|
||||
logoUrl: "{{ ui.logo }}",
|
||||
navbarBackground: "{{ ui.navbar_background_hex | default('f8f9fa') }}",
|
||||
navbarColor: "{{ ui.navbar_color_hex | default('212529') }}",
|
||||
navbarLightButtons: "{{ ui.navbar_light_buttons }}",
|
||||
logoHomeUrl: "{{ ui.logo_home_url }}",
|
||||
disablePrinting: "{{ ui.disable_printing }}",
|
||||
allLocales: allLocales
|
||||
},
|
||||
"{{ env.pywb_lang | default('en') }}",
|
||||
i18nStrings);
|
||||
VueUI.main("{{ static_prefix }}", "{{ url }}", "{{ wb_prefix }}", "{{ timestamp }}", "{{ ui.logo }}", "{{ ui.navbar_background_hex | default('f8f9fa') }}", "{{ ui.navbar_color_hex | default('212529') }}", "{{ ui.navbar_light_buttons }}", "{{ env.pywb_lang | default('en') }}",
|
||||
allLocales, i18nStrings);
|
||||
</script>
|
||||
|
||||
<div id="wb_iframe_div">
|
||||
|
@ -1,216 +0,0 @@
|
||||
<div class="modal fade" id="searchInstructions" tabindex="-1" role="dialog" aria-labelledby="searchInstructionsTitle" aria-hidden="true">
|
||||
<div class="modal-dialog modal-lg" role="document">
|
||||
<div class="modal-content">
|
||||
<div class="modal-header">
|
||||
<h6 class="modal-title text-muted" id="searchInstructionsTitle">{{ _("Search instructions") }}</h6>
|
||||
<button type="button" class="close" data-dismiss="modal" aria-label="{{ _('Close') }}">
|
||||
<span aria-hidden="true">×</span>
|
||||
</button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
<h5>{{ _("URL") }}</h5>
|
||||
<table class="table table-hover table-condensed">
|
||||
<tr>
|
||||
<td>
|
||||
<p>
|
||||
{%trans%}A URL consists of several parts:{%endtrans%}
|
||||
{%trans%}<code>protocol</code>://<code>host</code>:<code>port</code>/<code>path</code>?<code>query</code>{%endtrans%}
|
||||
</p>
|
||||
|
||||
<p>
|
||||
{%trans%}The <code>protocol://</code> prefix is ignored when searching as it's not part of the searchable data.{%endtrans%}
|
||||
</p>
|
||||
<p>
|
||||
{%trans%}A leading <kbd>www.</kbd> in the <code>host</code> will also be ignored for the same reason.{%endtrans%}
|
||||
</p>
|
||||
|
||||
<p>
|
||||
{%trans%}The <code>host</code> contains one or more parts separated by periods (<kbd>.</kbd>).{%endtrans%}
|
||||
{%trans%}The part before the first period is called the <code>hostname</code>.{%endtrans%}
|
||||
{%trans%}The part after the last period is the <code>top level domain</code>.{%endtrans%}
|
||||
{%trans%}Every part added to the left of the top level domain is a <code>sub-domain</code>.{%endtrans%}
|
||||
{%trans%}I.e. <code>x.y.z</code> is a <code>sub-domain</code> of <code>y.z</code>{%endtrans%}
|
||||
{%trans%}which in turn is a <code>sub-domain</code> of the <code>top level domain</code> <code>z</code>{%endtrans%}
|
||||
</p>
|
||||
|
||||
<p>
|
||||
{%trans%}See <em>Match Type</em> below for interpretations of the search string.{%endtrans%}
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h5>{{ _("Results Display") }}</h5>
|
||||
<table class="table table-hover table-condensed">
|
||||
<tr>
|
||||
<td>
|
||||
<p>
|
||||
{%trans%}For the <em>Default</em> search mode, the results are shown in a calendar view unless a filter is also added.{%endtrans%}
|
||||
{%trans%}For all other cases the results will be displayed in a list.{%endtrans%}
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h5>{{ _("Search Options") }}</h5>
|
||||
<h6>{{ _("Match Type") }}</h6>
|
||||
<p> {{ _("There are four different search modes:") }}</p>
|
||||
|
||||
<table class="table table-hover table-condensed">
|
||||
<tr>
|
||||
<td><em>{{ _("Default") }}</em></td>
|
||||
<td>
|
||||
<p>
|
||||
{%trans%}In the default mode the exact URL (minus the ignored prefixes mentioned above) is searched for.{%endtrans%}
|
||||
{%trans%}If one leading or trailing wildcard asterisk (<kbd>*</kbd>) is added, see <em>Prefix</em> and <em>Domain</em> below.{%endtrans%}
|
||||
</p>
|
||||
<p class="text-muted">
|
||||
{%trans%}Any other asterisks will be considered literal parts of the search string.{%endtrans%}
|
||||
{%trans%}Hence, adding both a leading and a trailing wildcard asterisk is not possible.{%endtrans%}
|
||||
</p>
|
||||
|
||||
{%trans%}Example:{%endtrans%}
|
||||
<p class="ml-5 text-lowercase">
|
||||
<em>{{ _("URL") }}: <strong>https://http.cat/206</strong></em> & <em>{{ _("Match Type") }}: <strong>{{ _("Default") }}</strong></em>
|
||||
<span class="float-right">
|
||||
<button onclick="fillForm('search-url=https://http.cat/206&match-type-select=');" class="btn btn-outline-info" role="button" aria-label="{{ _('Fill') }}">{{ _('Fill') }}</button>
|
||||
<button onclick="fillForm('search-url=https://http.cat/206&match-type-select=', true);" class="btn btn-outline-primary" role="button" aria-label="{{ _('Search') }}">{{ _('Search') }}</button>
|
||||
</span>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><em>{{ _("Prefix") }}</em></td>
|
||||
<td>
|
||||
<p>
|
||||
{%trans%}This will return all URLs that begin with the given string.{%endtrans%}
|
||||
{%trans%}It returns the same results as <em>Default</em> with a trailing wildcard asterisk.{%endtrans%}
|
||||
</p>
|
||||
|
||||
{%trans%}Examples:{%endtrans%}
|
||||
<p class="ml-5 text-lowercase">
|
||||
<em>{{ _("URL") }}: <strong>https://http.cat/2</strong></em> & <em>{{ _("Match Type") }}: <strong>{{ _("Prefix") }}</strong></em>
|
||||
<span class="float-right">
|
||||
<button onclick="fillForm('search-url=https://http.cat/2&match-type-select=prefix');" class="btn btn-outline-info" role="button" aria-label="{{ _('Fill') }}">{{ _('Fill') }}</button>
|
||||
<button onclick="fillForm('search-url=https://http.cat/2&match-type-select=prefix', true);" class="btn btn-outline-primary" role="button" aria-label="{{ _('Search') }}">{{ _('Search') }}</button>
|
||||
</span>
|
||||
</p>
|
||||
<p class="ml-5 text-lowercase">
|
||||
<em>{{ _("URL") }}: <strong>https://http.cat/2*</strong></em> & <em>{{ _("Match Type") }}: <strong>{{ _("Default") }}</strong></em>
|
||||
<span class="float-right">
|
||||
<button onclick="fillForm('search-url=https://http.cat/2*&match-type-select=');" class="btn btn-outline-info" role="button" aria-label="{{ _('Fill') }}">{{ _('Fill') }}</button>
|
||||
<button onclick="fillForm('search-url=https://http.cat/2*&match-type-select=', true);" class="btn btn-outline-primary" role="button" aria-label="{{ _('Search') }}">{{ _('Search') }}</button>
|
||||
</span>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><em>{{ _("Host") }}</em></td>
|
||||
<td>
|
||||
<p>
|
||||
{%trans%}This will ignore any path and query parts of the URL and return all URLs with the specified <code>host</code> part.{%endtrans%}
|
||||
</p>
|
||||
|
||||
{%trans%}Example:{%endtrans%}
|
||||
<p class="ml-5 text-lowercase">
|
||||
<em>{{ _("URL") }}: <strong>https://http.cat/</strong></em> & <em>{{ _("Match Type") }}: <strong>{{ _("Host") }}</strong></em>
|
||||
<span class="float-right">
|
||||
<button onclick="fillForm('search-url=https://http.cat/&match-type-select=host');" class="btn btn-outline-info" role="button" aria-label="{{ _('Fill') }}">{{ _('Fill') }}</button>
|
||||
<button onclick="fillForm('search-url=https://http.cat/&match-type-select=host', true);" class="btn btn-outline-primary" role="button" aria-label="{{ _('Search') }}">{{ _('Search') }}</button>
|
||||
</span>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><em>{{ _("Domain") }}</em></td>
|
||||
<td>
|
||||
<p>
|
||||
{%trans%}This is similar to the previous but doesn't require the whole <code>host</code>.{%endtrans%}
|
||||
{%trans%}It returns the same results as <em>Default</em> with a leading wildcard asterisk and a period (i.e. <kbd>*.</kbd>).{%endtrans%}
|
||||
{%trans%}The leading wildcard matches zero or more <code>sub-domains</code> as well as zero or one <code>hostname</code>.{%endtrans%}
|
||||
</p>
|
||||
|
||||
{%trans%}Examples:{%endtrans%}
|
||||
<p class="ml-5 text-lowercase">
|
||||
<em>{{ _("URL") }}: <strong>cat/</strong></em> & <em>{{ _("Match Type") }}: <strong>{{ _("Domain") }}</strong></em>
|
||||
<span class="float-right">
|
||||
<button onclick="fillForm('search-url=cat/&match-type-select=domain');" class="btn btn-outline-info" role="button" aria-label="{{ _('Fill') }}">{{ _('Fill') }}</button>
|
||||
<button onclick="fillForm('search-url=cat/&match-type-select=domain', true);" class="btn btn-outline-primary" role="button" aria-label="{{ _('Search') }}">{{ _('Search') }}</button>
|
||||
</span>
|
||||
</p>
|
||||
<p class="ml-5 text-lowercase">
|
||||
<em>{{ _("URL") }}: <strong>*.cat/</strong></em> & <em>{{ _("Match Type") }}: <strong>{{ _("Default") }}</strong></em>
|
||||
<span class="float-right">
|
||||
<button onclick="fillForm('search-url=*.cat/&match-type-select=');" class="btn btn-outline-info" role="button" aria-label="{{ _('Fill') }}">{{ _('Fill') }}</button>
|
||||
<button onclick="fillForm('search-url=*.cat/&match-type-select=', true);" class="btn btn-outline-primary" role="button" aria-label="{{ _('Search') }}">{{ _('Search') }}</button>
|
||||
</span>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h6>{{ _("Date/Time Range") }}</h6>
|
||||
<table class="table table-hover table-condensed">
|
||||
<tr>
|
||||
<td>
|
||||
<p>
|
||||
{%trans%}One may specify a start and/or an end timestamp to further restrict the search - both are inclusive.{%endtrans%}
|
||||
{%trans%}The timestamps consist of a date and an optional time of day.{%endtrans%}
|
||||
{%trans%}The layout of these input fields depends on which browser is used.{%endtrans%}
|
||||
</p>
|
||||
|
||||
{%trans%}Example:{%endtrans%}
|
||||
<p class="ml-5 text-lowercase">
|
||||
<em>{{ _("URL") }}: <strong>https://http.cat/2</strong></em> & <em>{{ _("Match Type") }}: <strong>{{ _("Prefix") }}</strong></em> & <em>{{ _("From") }}: <strong>2022-02-02 09:00</strong></em>
|
||||
<span class="float-right">
|
||||
<button onclick="fillForm('search-url=https://http.cat/2&match-type-select=prefix&dt-from=2022-02-02&ts-from=09:00');" class="btn btn-outline-info" role="button" aria-label="{{ _('Fill') }}">{{ _('Fill') }}</button>
|
||||
<button onclick="fillForm('search-url=https://http.cat/2&match-type-select=prefix&dt-from=2022-02-02&ts-from=09:00', true);" class="btn btn-outline-primary" role="button" aria-label="{{ _('Search') }}">{{ _('Search') }}</button>
|
||||
</span>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h6>{{ _("Filtering") }}</h6>
|
||||
<table class="table table-hover table-condensed">
|
||||
<tr>
|
||||
<td>
|
||||
<p>
|
||||
{%trans%}Finally one may add extra filters for Mime Type, Status and URL.{%endtrans%}
|
||||
{%trans%}For each filter one needs to specify one of the three attributes, one of a set of relations and a string.{%endtrans%}
|
||||
{%trans%}If more than one filter is added, they will all be applied to the list of results.{%endtrans%}
|
||||
</p>
|
||||
<p class="text-muted">{%trans%}Remember to actually add the filter before submitting the search.{%endtrans%}</p>
|
||||
|
||||
{%trans%}Example:{%endtrans%}
|
||||
<p class="ml-5 text-lowercase">
|
||||
<em>{{ _("URL") }}: <strong>https://http.cat/2/</strong></em> & <em>{{ _("Match Type") }}: <strong>{{ _("Prefix") }}</strong></em> & <em>{{ _("Filtering") }}: <strong>{{ _("HTTP Status") }} {{ _("Is Not") }} "301"</strong></em>
|
||||
<span class="float-right">
|
||||
<button onclick="fillForm('search-url=https://http.cat/2&match-type-select=prefix&filter-by=status&filter-modifier==!=&filter-expression=301');" class="btn btn-outline-info" role="button" aria-label="{{ _('Fill') }}">{{ _('Fill') }}</button>
|
||||
<button onclick="fillForm('search-url=https://http.cat/2&match-type-select=prefix&filter-by=status&filter-modifier==!=&filter-expression=301', true);" class="btn btn-outline-primary" role="button" aria-label="{{ _('Search') }}">{{ _('Search') }}</button>
|
||||
</span>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
function fillForm(query, search = false) {
|
||||
$('#searchInstructions').modal('hide');
|
||||
$('#advancedOptions').collapse('show');
|
||||
for (const item of query.split('&')) {
|
||||
var pair = item.split('=');
|
||||
var field = document.getElementById(pair[0]);
|
||||
if (field) field.value = pair.slice(1).join('=');
|
||||
if (pair[0] == "filter-expression") addFilter(event);
|
||||
}
|
||||
if (search) $('#search-button').click();
|
||||
}
|
||||
</script>
|
@ -69,10 +69,6 @@
|
||||
'host': "{{ _('host') }}",
|
||||
'domain': "{{ _('domain') }}",
|
||||
},
|
||||
from: "{{ _('From') }}",
|
||||
until: "{{ _('until') }}",
|
||||
present: "{{ _('present') }}",
|
||||
earliest: "{{ _('earliest') }}",
|
||||
};
|
||||
|
||||
var filterMods = {
|
||||
@ -94,21 +90,8 @@
|
||||
<div id="app" style="width: 100%; height: 100%"></div>
|
||||
|
||||
<script>
|
||||
VueUI.main({
|
||||
staticPrefix: "{{ static_prefix }}",
|
||||
url: "{{ url }}",
|
||||
prefix: "{{ prefix }}",
|
||||
timestamp: undefined,
|
||||
logoUrl: "{{ ui.logo }}",
|
||||
navbarBackground: "{{ ui.navbar_background_hex | default('f8f9fa') }}",
|
||||
navbarColor: "{{ ui.navbar_color_hex | default('212529') }}",
|
||||
navbarLightButtons: "{{ ui.navbar_light_buttons }}",
|
||||
logoHomeUrl: "{{ ui.logo_home_url }}",
|
||||
disablePrinting: "{{ ui.disable_printing }}",
|
||||
allLocales: allLocales
|
||||
},
|
||||
"{{ env.pywb_lang | default('en') }}",
|
||||
i18nStrings);
|
||||
VueUI.main("{{ static_prefix }}", "{{ url }}", "{{ prefix }}", undefined, "{{ ui.logo }}", "{{ ui.navbar_background_hex | default('f8f9fa') }}", "{{ ui.navbar_color_hex | default('212529') }}", "{{ ui.navbar_light_buttons }}", "{{ env.pywb_lang | default('en') }}",
|
||||
allLocales, i18nStrings);
|
||||
</script>
|
||||
|
||||
{% endif %}
|
||||
|
@ -31,20 +31,15 @@
|
||||
<form class="needs-validation" id="search-form" novalidate>
|
||||
<div class="form-row">
|
||||
<div class="col-12">
|
||||
<label for="search-url" class="lead" aria-label="{{ _('Search Collection') }}">
|
||||
<label for="search-url" class="lead" aria-label="Search For Col">
|
||||
{% set coll_title = metadata.title if metadata and metadata.title else coll %}
|
||||
{% autoescape false %}
|
||||
{% trans %}Search the {{ coll_title }} collection by url:{% endtrans %}
|
||||
{% endautoescape %}
|
||||
</label>
|
||||
<a tabindex="0" class="btn btn-sm float-right btn-light" role="button" data-toggle="modal" data-target="#searchInstructions">{{ _('Help') }}</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<div class="col-12">
|
||||
<input aria-label="{{ _('URL') }}" aria-required="true" class="form-control form-control-lg" id="search-url"
|
||||
<input aria-label="url" aria-required="true" class="form-control form-control-lg" id="search-url"
|
||||
name="search" placeholder="{{ _('Enter a URL to search for') }}"
|
||||
title="{{ _('Enter a URL to search for') }}" type="search" required autofocus />
|
||||
title="{{ _('Enter a URL to search for') }}" type="search" required/>
|
||||
<div class="invalid-feedback">
|
||||
{% trans %}Please enter a URL{% endtrans %}
|
||||
</div>
|
||||
@ -58,26 +53,23 @@
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-7">
|
||||
<button type="submit" id="search-button" class="btn btn-primary float-right" role="button" aria-label="{{ _('Search') }}">
|
||||
<button type="submit" class="btn btn-outline-primary float-right" role="button" aria-label="Search">
|
||||
{% trans %}Search{% endtrans %}
|
||||
</button>
|
||||
<button class="btn btn-outline-secondary float-right mr-3" type="button" role="button"
|
||||
<button class="btn btn-outline-info float-right mr-3" type="button" role="button"
|
||||
data-toggle="collapse" data-target="#advancedOptions" id="advanced-options"
|
||||
aria-expanded="false" aria-controls="advancedOptions" aria-label="{{ _('Search Options') }}">
|
||||
{{ _('Search Options') }}
|
||||
</button>
|
||||
<button id="reset-search-form" class="btn btn-outline-danger float-right mr-3" type="button" role="button" aria-label="{{ _('Reset Options') }}">
|
||||
{{ _('Reset') }}
|
||||
aria-expanded="false" aria-controls="advancedOptions" aria-label="Advanced Search Options">
|
||||
{{ _('Advanced Search Options') }}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="collapse mt-3" id="advancedOptions">
|
||||
<div class="form-group form-row">
|
||||
<label for="match-type-select" class="col-sm-2 col-form-label" aria-label="{{ _('Match Type') }}">
|
||||
<label for="match-type-select" class="col-sm-2 col-form-label" aria-label="Match Type">
|
||||
{{ _('Match Type:') }}
|
||||
</label>
|
||||
<select id="match-type-select" class="form-control form-control col-sm-6">
|
||||
<option value="">{% trans %}Default{% endtrans %}</option>
|
||||
<option value=""></option>
|
||||
<option value="prefix">{% trans %}Prefix{% endtrans %}</option>
|
||||
<option value="host">{% trans %}Host{% endtrans %}</option>
|
||||
<option value="domain">{% trans %}Domain{% endtrans %}</option>
|
||||
@ -85,43 +77,57 @@
|
||||
</div>
|
||||
<p style="cursor: help;">
|
||||
<span data-toggle="tooltip" data-placement="right"
|
||||
title="{{ _('Restricts the results to the given date/time range (inclusive)') }}">
|
||||
title="Restricts the results to the given date/time range (inclusive)">
|
||||
{{ _('Date/Time Range') }}
|
||||
</span>
|
||||
</p>
|
||||
<div class="form-row">
|
||||
<div class="col-6">
|
||||
<label class="sr-only" for="dt-from" aria-label="{{ _('Date/Time Range From') }}">{% trans %}From:{% endtrans %}</label>
|
||||
<label class="sr-only" for="dt-from" aria-label="Date/Time Range From">{% trans %}From:{% endtrans %}</label>
|
||||
<div class="input-group">
|
||||
<div class="input-group-prepend">
|
||||
<div class="input-group-text">{% trans %}From:{% endtrans %}</div>
|
||||
</div>
|
||||
<input id="dt-from" type="date" placeholder="yyyy-mm-dd" name="date-range-from" class="form-control">
|
||||
<input id="ts-from" type="time" placeholder="hh:mm:ss" name="date-range-from-ts" class="form-control">
|
||||
<input id="dt-from" type="number" name="date-range-from" class="form-control"
|
||||
pattern="^\d{4,14}$">
|
||||
<div class="invalid-feedback" id="dt-from-bad">
|
||||
{% trans %}Please enter a valid <b>From</b> timestamp. Timestamps may be 4 <= ts <=14 digits{% endtrans %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-6">
|
||||
<label class="sr-only" for="dt-to" aria-label="{{ _('Date/Time Range To') }}">{% trans %}To:{% endtrans %}</label>
|
||||
<label class="sr-only" for="dt-to" aria-label="Date/Time Range To">{% trans %}To:{% endtrans %}</label>
|
||||
<div class="input-group">
|
||||
<div class="input-group-prepend">
|
||||
<div class="input-group-text">{% trans %}To:{% endtrans %}</div>
|
||||
</div>
|
||||
<input id="dt-to" type="date" placeholder="yyyy-mm-dd" name="date-range-to" class="form-control">
|
||||
<input id="ts-to" type="time" placeholder="hh:mm:ss" name="date-range-to-ts" class="form-control">
|
||||
<input id="dt-to" type="number" name="date-range-to" class="form-control" pattern="^\d{4,14}$">
|
||||
<div class="invalid-feedback" id="dt-to-bad">
|
||||
{% trans %}Please enter a valid <b>To</b> timestamp. Timestamps may be 4 <= ts <=14 digits{% endtrans %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group mt-3">
|
||||
<div class="form-row">
|
||||
<div class="col-12">
|
||||
<div class="col-6">
|
||||
<p>{% trans %}Filtering{% endtrans %}</p>
|
||||
</div>
|
||||
<div class="col-6">
|
||||
<button id="clear-filters" class="btn btn-outline-warning float-right" type="button">
|
||||
{% trans %}Clear Filters{% endtrans %}
|
||||
</button>
|
||||
<button id="add-filter" class="btn btn-outline-secondary float-right mr-2" type="button">
|
||||
{% trans %}Add Filter{% endtrans %}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<div class="col-6">
|
||||
<div class="row pb-1">
|
||||
<label for="filter-by" class="col-form-label col-3">{% trans %}By:{% endtrans %}</label>
|
||||
<select id="filter-by" class="form-control col-7">
|
||||
<option value="" selected></option>
|
||||
<option value="mime">{% trans %}Mime Type{% endtrans %}</option>
|
||||
<option value="status">{% trans %}Status{% endtrans %}</option>
|
||||
<option value="url">{% trans %}URL{% endtrans %}</option>
|
||||
@ -138,24 +144,17 @@
|
||||
<option value="=!~">{% trans %}Does Not Begin With{% endtrans %}</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="row pb-1">
|
||||
<div class="row">
|
||||
<label for="filter-expression" class="col-form-label col-3">{% trans %}Expr:{% endtrans %}</label>
|
||||
<input type="text" id="filter-expression" class="form-control col-7"
|
||||
placeholder="{% trans %}Enter an expression to filter by{% endtrans %}"
|
||||
>
|
||||
</div>
|
||||
<button id="add-filter" class="btn btn-outline-secondary mt-2" type="button">
|
||||
{% trans %}Add Filter{% endtrans %}
|
||||
</button>
|
||||
|
||||
</div>
|
||||
<div class="col-6">
|
||||
<ul id="filter-list" class="filter-list">
|
||||
<li id="filtering-nothing">{% trans %}No Filter{% endtrans %}</li>
|
||||
</ul>
|
||||
<button id="clear-filters" class="btn btn-outline-danger float-right mr-2" type="button">
|
||||
{% trans %}Clear Filters{% endtrans %}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@ -193,5 +192,4 @@
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% include "instructions.html" %}
|
||||
{% endblock %}
|
||||
|
@ -49,7 +49,6 @@
|
||||
"Hide calendar":"{{ _Q('Hide calendar') }}",
|
||||
"Previous capture":"{{ _Q('Previous capture') }}",
|
||||
"Next capture":"{{ _Q('Next capture') }}",
|
||||
"Print":"{{ _Q('Print') }}",
|
||||
"Select language":"{{ _Q('Select language') }}",
|
||||
"View capture on {date}":"{{ _Q('View capture on {date}') }}",
|
||||
"{count} capture":"{{ _Q('{count} capture') }}",
|
||||
|
@ -150,7 +150,7 @@ def iter_exact(reader, key, token=b' '):
|
||||
"""
|
||||
Create an iterator which iterates over lines where the first field matches
|
||||
the 'key', equivalent to token + sep prefix.
|
||||
Default field terminator/separator is ' '
|
||||
Default field termin_ator/seperator is ' '
|
||||
"""
|
||||
|
||||
return iter_prefix(reader, key + token)
|
||||
|
@ -1,4 +1,4 @@
|
||||
__version__ = '2.8.3'
|
||||
__version__ = '2.7.2'
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(__version__)
|
||||
|
@ -4,12 +4,9 @@
|
||||
<nav
|
||||
class="navbar navbar-light navbar-expand-lg fixed-top top-navbar justify-content-center"
|
||||
:style="navbarStyle">
|
||||
<a class="navbar-brand flex-grow-1 my-1" :href="config.logoHomeUrl" v-if="config.logoHomeUrl">
|
||||
<a class="navbar-brand flex-grow-1 my-1" href="/">
|
||||
<img :src="config.logoImg" id="logo-img" alt="_('pywb logo')">
|
||||
</a>
|
||||
<div class="navbar-brand flex-grow-1 my-1" v-else>
|
||||
<img :src="config.logoImg" id="logo-img" alt="_('pywb logo')">
|
||||
</div>
|
||||
<div class="flex-grow-1 d-flex" id="searchdiv">
|
||||
<form
|
||||
class="form-inline my-2 my-md-0 mx-lg-auto"
|
||||
@ -77,17 +74,6 @@
|
||||
<i class="far fa-chart-bar"></i>
|
||||
</button>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<button
|
||||
class="btn btn-sm"
|
||||
:class="{'btn-outline-light': lightButtons, 'btn-outline-dark': !lightButtons}"
|
||||
:aria-pressed="printReplayFrame"
|
||||
@click="printReplayFrame"
|
||||
v-if="printingEnabled && hasReplayFrame()"
|
||||
:title="_('Print')">
|
||||
<i class="fas fa-print"></i>
|
||||
</button>
|
||||
</li>
|
||||
<li class="nav-item dropdown" v-if="localesAreSet">
|
||||
<button
|
||||
class="btn btn-sm dropdown-toggle"
|
||||
@ -227,9 +213,6 @@ export default {
|
||||
lightButtons() {
|
||||
return !!this.config.navbarLightButtons;
|
||||
},
|
||||
printingEnabled() {
|
||||
return !this.config.disablePrinting;
|
||||
},
|
||||
previousSnapshot() {
|
||||
if (!this.currentSnapshotIndex) {
|
||||
return null;
|
||||
@ -320,14 +303,6 @@ export default {
|
||||
this.showTimelineView = !this.showTimelineView;
|
||||
window.localStorage.setItem("showTimelineView", this.showTimelineView ? "1" : "0");
|
||||
},
|
||||
hasReplayFrame() {
|
||||
return !! window.frames.replay_iframe;
|
||||
},
|
||||
printReplayFrame() {
|
||||
window.frames.replay_iframe.contentWindow.focus();
|
||||
window.frames.replay_iframe.contentWindow.print();
|
||||
return false;
|
||||
},
|
||||
setData(/** @type {PywbData} data */ data) {
|
||||
|
||||
// data-set will usually happen at App INIT (from parent caller)
|
||||
|
@ -39,7 +39,7 @@
|
||||
@keyup.enter="changePeriod(histoPeriod, $event)"
|
||||
@mouseover="setTooltipPeriod(histoPeriod, $event)"
|
||||
@mouseout="setTooltipPeriod(null, $event)"
|
||||
:tabindex="histoPeriod.snapshotCount > 0 ? 0 : -1"
|
||||
tabindex="0"
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
@ -49,6 +49,7 @@
|
||||
@keyup.enter="changePeriod(histoPeriod, $event)"
|
||||
@mouseover="setTooltipPeriod(subPeriod, $event)"
|
||||
@mouseout="setTooltipPeriod(null, $event)"
|
||||
tabindex="0"
|
||||
>
|
||||
<div class="label">
|
||||
{{subPeriod.getReadableId()}}
|
||||
|
@ -8,7 +8,7 @@
|
||||
@keyup.enter="changePeriod(parents[0])"
|
||||
:title="getPeriodZoomOutText(parents[0])"
|
||||
tabindex="1">
|
||||
<i class="fa fa-search-minus"></i> {{parents[0].getReadableId(true)}}
|
||||
<img src="/static/zoom-out-icon-333316.png" /> {{parents[0].getReadableId(true)}}
|
||||
</span>
|
||||
</span>
|
||||
>
|
||||
|
@ -32,7 +32,7 @@ export class PywbI18N {
|
||||
getMonth(id, type='long') {
|
||||
return decodeURIComponent(this.config[PywbI18N.monthIdPrefix[id]+'_'+type]);
|
||||
}
|
||||
// can get long (default) or short day string or initial
|
||||
// can get long (default) or short day string or intial
|
||||
// PywbI18N expects to receive day's initials like:
|
||||
// config.mon_short, config.tue_long, ...., config.<mmm>_short, config.<mmm>_long
|
||||
getWeekDay(id, type='long') {
|
||||
|
@ -7,44 +7,39 @@ import Vue from "vue/dist/vue.esm.browser";
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
export function main(config, locale, i18nStrings) {
|
||||
export function main(staticPrefix, url, prefix, timestamp, logoUrl, navbarBackground, navbarColor, navbarLightButtons, locale, allLocales, i18nStrings) {
|
||||
PywbI18N.init(locale, i18nStrings);
|
||||
new CDXLoader(config);
|
||||
new CDXLoader(staticPrefix, url, prefix, timestamp, logoUrl, navbarBackground, navbarColor, navbarLightButtons, allLocales);
|
||||
}
|
||||
|
||||
// ===========================================================================
|
||||
class CDXLoader {
|
||||
constructor(config) {
|
||||
constructor(staticPrefix, url, prefix, timestamp, logoUrl, navbarBackground, navbarColor, navbarLightButtons, allLocales) {
|
||||
this.loadingSpinner = null;
|
||||
this.loaded = false;
|
||||
this.opts = {};
|
||||
this.url = config.url;
|
||||
this.prefix = config.prefix;
|
||||
this.staticPrefix = config.staticPrefix;
|
||||
this.logoUrl = config.logoUrl;
|
||||
this.logoHomeUrl = config.logoHomeUrl;
|
||||
this.navbarBackground = config.navbarBackground;
|
||||
this.navbarColor = config.navbarColor;
|
||||
this.navbarLightButtons = config.navbarLightButtons;
|
||||
this.disablePrinting = config.disablePrinting;
|
||||
this.prefix = prefix;
|
||||
this.staticPrefix = staticPrefix;
|
||||
this.logoUrl = logoUrl;
|
||||
this.navbarBackground = navbarBackground;
|
||||
this.navbarColor = navbarColor;
|
||||
this.navbarLightButtons = navbarLightButtons;
|
||||
this.timestamp = timestamp;
|
||||
|
||||
this.timestamp = config.timestamp;
|
||||
|
||||
this.isReplay = (config.timestamp !== undefined);
|
||||
this.isReplay = (timestamp !== undefined);
|
||||
|
||||
setTimeout(() => {
|
||||
if (!this.loaded) {
|
||||
this.loadingSpinner = new LoadingSpinner({text: PywbI18N.instance?.getText('Loading...'), isSmall: !!this.timestamp}); // bootstrap loading-spinner EARLY ON
|
||||
this.loadingSpinner = new LoadingSpinner({text: PywbI18N.instance?.getText('Loading...'), isSmall: !!timestamp}); // bootstrap loading-spinner EARLY ON
|
||||
this.loadingSpinner.setOn();
|
||||
}
|
||||
}, 500);
|
||||
|
||||
if (this.isReplay) {
|
||||
window.WBBanner = new VueBannerWrapper(this, this.url, this.timestamp);
|
||||
window.WBBanner = new VueBannerWrapper(this, url, timestamp);
|
||||
}
|
||||
|
||||
let queryURL;
|
||||
let url;
|
||||
|
||||
// query form *?=url...
|
||||
if (window.location.href.indexOf("*?") > 0) {
|
||||
@ -52,24 +47,23 @@ class CDXLoader {
|
||||
url = new URL(queryURL).searchParams.get("url");
|
||||
|
||||
// otherwise, traditional calendar form /*/<url>
|
||||
} else if (this.url) {
|
||||
url = this.url
|
||||
} else if (url) {
|
||||
const params = new URLSearchParams();
|
||||
params.set("url", url);
|
||||
params.set("output", "json");
|
||||
queryURL = this.prefix + "cdx?" + params.toString();
|
||||
queryURL = prefix + "cdx?" + params.toString();
|
||||
|
||||
// otherwise, an error since no URL
|
||||
} else {
|
||||
throw new Error("No query URL specified");
|
||||
}
|
||||
|
||||
config.logoImg = this.staticPrefix + "/" + (!!this.logoUrl ? this.logoUrl : "pywb-logo-sm.png");
|
||||
const logoImg = this.staticPrefix + "/" + (this.logoUrl ? this.logoUrl : "pywb-logo-sm.png");
|
||||
|
||||
this.app = this.initApp(config);
|
||||
this.app = this.initApp({logoImg, navbarBackground, navbarColor, navbarLightButtons, url, allLocales, timestamp});
|
||||
|
||||
this.loadCDX(queryURL).then((cdxList) => {
|
||||
this.setAppData(cdxList, url, config.timestamp);
|
||||
this.setAppData(cdxList, url, this.timestamp);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -386,7 +386,7 @@ color-name@~1.1.4:
|
||||
concat-map@0.0.1:
|
||||
version "0.0.1"
|
||||
resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b"
|
||||
integrity sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==
|
||||
integrity sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=
|
||||
|
||||
consolidate@^0.15.1:
|
||||
version "0.15.1"
|
||||
@ -469,9 +469,9 @@ debug@~3.1.0:
|
||||
ms "2.0.0"
|
||||
|
||||
decode-uri-component@^0.2.0:
|
||||
version "0.2.2"
|
||||
resolved "https://registry.yarnpkg.com/decode-uri-component/-/decode-uri-component-0.2.2.tgz#e69dbe25d37941171dd540e024c444cd5188e1e9"
|
||||
integrity sha512-FqUYQ+8o158GyGTrMFJms9qh3CqTKvAqgqsTnkLI8sKu0028orqBhxNMFkFen0zGyg6epACD32pjVk58ngIErQ==
|
||||
version "0.2.0"
|
||||
resolved "https://registry.yarnpkg.com/decode-uri-component/-/decode-uri-component-0.2.0.tgz#eb3913333458775cb84cd1a1fae062106bb87545"
|
||||
integrity sha1-6zkTMzRYd1y4TNGh+uBiEGu4dUU=
|
||||
|
||||
deep-is@^0.1.3:
|
||||
version "0.1.3"
|
||||
@ -1103,9 +1103,9 @@ mime@^1.4.1:
|
||||
integrity sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==
|
||||
|
||||
minimatch@^3.0.4:
|
||||
version "3.1.2"
|
||||
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b"
|
||||
integrity sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==
|
||||
version "3.0.4"
|
||||
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
|
||||
integrity sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==
|
||||
dependencies:
|
||||
brace-expansion "^1.1.7"
|
||||
|
||||
|
@ -260,10 +260,6 @@ class AccessChecker(object):
|
||||
if key.startswith(acl_key):
|
||||
acl_obj = CDXObject(acl)
|
||||
|
||||
# Check for "*," in ACL, which matches any URL
|
||||
if acl_key == b"*,":
|
||||
acl_obj = CDXObject(acl)
|
||||
|
||||
if acl_obj:
|
||||
user = acl_obj.get('user')
|
||||
if user == acl_user:
|
||||
|
@ -15,7 +15,7 @@ from collections import namedtuple
|
||||
# ============================================================================
|
||||
FuzzyRule = namedtuple('FuzzyRule',
|
||||
'url_prefix, regex, replace_after, filter_str, ' +
|
||||
'match_type, re_type')
|
||||
'match_type, find_all')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -23,7 +23,6 @@ class FuzzyMatcher(object):
|
||||
DEFAULT_FILTER = ['urlkey:{0}']
|
||||
DEFAULT_MATCH_TYPE = 'prefix'
|
||||
DEFAULT_REPLACE_AFTER = '?'
|
||||
DEFAULT_RE_TYPE = 'search'
|
||||
|
||||
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
|
||||
'url', 'matchType', 'filter')
|
||||
@ -59,16 +58,16 @@ class FuzzyMatcher(object):
|
||||
replace_after = self.DEFAULT_REPLACE_AFTER
|
||||
filter_str = self.DEFAULT_FILTER
|
||||
match_type = self.DEFAULT_MATCH_TYPE
|
||||
re_type = self.DEFAULT_RE_TYPE
|
||||
find_all = False
|
||||
|
||||
else:
|
||||
regex = self.make_regex(config.get('match'))
|
||||
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
|
||||
filter_str = config.get('filter', self.DEFAULT_FILTER)
|
||||
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
||||
re_type = config.get('re_type', self.DEFAULT_RE_TYPE)
|
||||
find_all = config.get('find_all', False)
|
||||
|
||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, re_type)
|
||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
|
||||
|
||||
def get_fuzzy_match(self, urlkey, url, params):
|
||||
filters = set()
|
||||
@ -79,12 +78,9 @@ class FuzzyMatcher(object):
|
||||
continue
|
||||
|
||||
groups = None
|
||||
if rule.re_type == 'findall':
|
||||
if rule.find_all:
|
||||
groups = rule.regex.findall(urlkey)
|
||||
if rule.re_type == 'sub':
|
||||
matched_rule = rule
|
||||
break
|
||||
elif rule.re_type == 'search':
|
||||
else:
|
||||
m = rule.regex.search(urlkey)
|
||||
groups = m and m.groups()
|
||||
|
||||
@ -106,7 +102,7 @@ class FuzzyMatcher(object):
|
||||
no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
|
||||
|
||||
inx = url.find(matched_rule.replace_after)
|
||||
if inx > 0 and matched_rule.re_type != 'sub':
|
||||
if inx > 0:
|
||||
length = inx + len(matched_rule.replace_after)
|
||||
# don't include trailing '?' for default filter
|
||||
if no_filters:
|
||||
@ -115,17 +111,13 @@ class FuzzyMatcher(object):
|
||||
if url[length - 1] == '/':
|
||||
length -= 1
|
||||
url = url[:length]
|
||||
elif not no_filters and matched_rule.re_type != 'sub':
|
||||
elif not no_filters:
|
||||
url += matched_rule.replace_after[0]
|
||||
|
||||
if matched_rule.match_type == 'domain':
|
||||
host = urlsplit(url).netloc
|
||||
url = host.split('.', 1)[1]
|
||||
|
||||
if matched_rule.re_type == 'sub':
|
||||
filters = {'urlkey:'}
|
||||
url = re.sub(rule.regex, rule.replace_after, url)
|
||||
|
||||
fuzzy_params = {'url': url,
|
||||
'matchType': matched_rule.match_type,
|
||||
'filter': filters,
|
||||
|
@ -234,10 +234,3 @@ class TestFuzzy(object):
|
||||
params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == []
|
||||
|
||||
def test_fuzzy_sub_replacement(self):
|
||||
url = 'https://example.com/matched'
|
||||
actual_url = 'https://example.com/replaced'
|
||||
params = self.get_params(url, actual_url)
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||
|
@ -11,7 +11,6 @@ from io import BytesIO
|
||||
import base64
|
||||
import cgi
|
||||
import json
|
||||
import math
|
||||
import sys
|
||||
|
||||
|
||||
@ -329,22 +328,7 @@ class MethodQueryCanonicalizer(object):
|
||||
_parser(v, name)
|
||||
|
||||
elif name:
|
||||
if isinstance(json_obj, bool) and json_obj:
|
||||
data[get_key(name)] = "true"
|
||||
elif isinstance(json_obj, bool):
|
||||
data[get_key(name)] = "false"
|
||||
elif json_obj is None:
|
||||
data[get_key(name)] = "null"
|
||||
elif isinstance(json_obj, float):
|
||||
# Treat floats like JavaScript's Number.prototype.toString(),
|
||||
# drop decimal if float represents a whole number.
|
||||
fraction, _ = math.modf(json_obj)
|
||||
if fraction == 0.0:
|
||||
data[get_key(name)] = str(int(json_obj))
|
||||
else:
|
||||
data[get_key(name)] = str(json_obj)
|
||||
else:
|
||||
data[get_key(name)] = str(json_obj)
|
||||
data[get_key(name)] = str(json_obj)
|
||||
|
||||
_parser(json.loads(string))
|
||||
return urlencode(data)
|
||||
|
@ -39,7 +39,7 @@ class InputReqApp(object):
|
||||
|
||||
#=============================================================================
|
||||
class TestInputReq(object):
|
||||
def setup_method(self):
|
||||
def setup(self):
|
||||
self.app = InputReqApp()
|
||||
self.testapp = webtest.TestApp(self.app)
|
||||
|
||||
@ -82,49 +82,44 @@ Foo: Bar\r\n\
|
||||
class TestPostQueryExtract(object):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
cls.post_data = b'foo=bar&dir=%2Fbaz&do=true&re=false&re=null'
|
||||
cls.post_data = b'foo=bar&dir=%2Fbaz'
|
||||
cls.binary_post_data = b'\x816l`L\xa04P\x0e\xe0r\x02\xb5\x89\x19\x00fP\xdb\x0e\xb0\x02,'
|
||||
|
||||
def test_post_extract_1(self):
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz&do=true&re=false&re=null'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz'
|
||||
|
||||
assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&__wb_method=POST&foo=bar&dir=/baz&do=true&re=false&re=null'
|
||||
assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&__wb_method=POST&foo=bar&dir=/baz'
|
||||
|
||||
def test_post_extract_json(self):
|
||||
post_data = b'{"a": "b", "c": {"a": 2}, "d": "e", "f": true, "g": [false, null]}'
|
||||
post_data = b'{"a": "b", "c": {"a": 2}, "d": "e"}'
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/json',
|
||||
len(post_data), BytesIO(post_data))
|
||||
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e&f=true&g=false&g.2_=null'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e'
|
||||
|
||||
post_data = b'{"type": "event", "id": 44.0, "float": 35.7, "values": [true, false, null], "source": {"type": "component", "id": "a+b&c= d", "values": [3, 4]}}'
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/json',
|
||||
len(post_data), BytesIO(post_data))
|
||||
|
||||
assert mq.append_query('http://example.com/events') == 'http://example.com/events?__wb_method=POST&type=event&id=44&float=35.7&values=true&values.2_=false&values.3_=null&type.2_=component&id.2_=a%2Bb%26c%3D+d&values.4_=3&values.5_=4'
|
||||
|
||||
def test_put_extract_method(self):
|
||||
mq = MethodQueryCanonicalizer('PUT', 'application/x-www-form-urlencoded',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=PUT&foo=bar&dir=/baz&do=true&re=false&re=null'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=PUT&foo=bar&dir=/baz'
|
||||
|
||||
def test_post_extract_non_form_data_1(self):
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/octet-stream',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
#base64 encoded data
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6JmRvPXRydWUmcmU9ZmFsc2UmcmU9bnVsbA=='
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
|
||||
def test_post_extract_non_form_data_2(self):
|
||||
mq = MethodQueryCanonicalizer('POST', 'text/plain',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
#base64 encoded data
|
||||
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6JmRvPXRydWUmcmU9ZmFsc2UmcmU9bnVsbA=='
|
||||
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
|
||||
def test_post_extract_length_invalid_ignore(self):
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
@ -141,13 +136,13 @@ class TestPostQueryExtract(object):
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
len(self.post_data) - 4, BytesIO(self.post_data))
|
||||
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz&do=true&re=false&re='
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=%2'
|
||||
|
||||
def test_post_extract_length_too_long(self):
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
len(self.post_data) + 4, BytesIO(self.post_data))
|
||||
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz&do=true&re=false&re=null'
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz'
|
||||
|
||||
def test_post_extract_malformed_form_data(self):
|
||||
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
|
||||
@ -160,7 +155,7 @@ class TestPostQueryExtract(object):
|
||||
mq = MethodQueryCanonicalizer('POST', 'multipart/form-data',
|
||||
len(self.post_data), BytesIO(self.post_data))
|
||||
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6JmRvPXRydWUmcmU9ZmFsc2UmcmU9bnVsbA=='
|
||||
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
|
||||
|
||||
def test_options(self):
|
||||
|
@ -18,7 +18,7 @@ from .testutils import LiveServerTests, HttpBinLiveTests, BaseTestClass
|
||||
|
||||
|
||||
class TestUpstream(LiveServerTests, HttpBinLiveTests, BaseTestClass):
|
||||
def setup_method(self):
|
||||
def setup(self):
|
||||
app = BaseWarcServer()
|
||||
|
||||
base_url = 'http://localhost:{0}'.format(self.server.port)
|
||||
|
@ -1,21 +1,19 @@
|
||||
six
|
||||
warcio>=1.7.1
|
||||
requests
|
||||
redis==2.10.6
|
||||
jinja2>=3.1.2
|
||||
redis<3.0
|
||||
jinja2<3.0.0
|
||||
surt>=0.3.1
|
||||
brotlipy
|
||||
pyyaml
|
||||
werkzeug==2.2.3
|
||||
werkzeug
|
||||
webencodings
|
||||
gevent==22.10.2
|
||||
greenlet>=2.0.2,<3.0
|
||||
gevent==21.12.0
|
||||
webassets==2.0
|
||||
portalocker
|
||||
wsgiprox>=1.5.1
|
||||
fakeredis<1.0
|
||||
tldextract
|
||||
python-dateutil
|
||||
markupsafe>=2.1.1
|
||||
markupsafe<2.1.0
|
||||
ua_parser
|
||||
py3AMF
|
||||
|
@ -1 +0,0 @@
|
||||
*, - {"access": "allow", "user": "staff"}
|
@ -5,8 +5,6 @@ org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"}
|
||||
org,iana)/_css - {"access": "exclude"}
|
||||
org,iana)/### - {"access": "allow"}
|
||||
org,iana)/ - {"access": "exclude"}
|
||||
com,example)/?example=3 - {"access": "block", "user": "staff"}
|
||||
com,example)/?example=3 - {"access": "exclude", "user": "staff2"}
|
||||
org,example)/?example=1 - {"access": "block"}
|
||||
com,example)/?example=2 - {"access": "allow_ignore_embargo"}
|
||||
com,example)/?example=1 - {"access": "allow_ignore_embargo", "user": "staff2"}
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
13
setup.py
13
setup.py
@ -62,6 +62,10 @@ def generate_git_hash_py(pkg, filename='git_hash.py'):
|
||||
def load_requirements(filename):
|
||||
with open(filename, 'rt') as fh:
|
||||
requirements = fh.read().rstrip().split('\n')
|
||||
if sys.version_info > (3, 0):
|
||||
requirements.append("py3AMF")
|
||||
else:
|
||||
requirements.append("pyAMF")
|
||||
return requirements
|
||||
|
||||
|
||||
@ -109,7 +113,6 @@ setup(
|
||||
"translate_toolkit"
|
||||
],
|
||||
},
|
||||
python_requires='>=3.7,<3.12',
|
||||
tests_require=load_requirements("test_requirements.txt"),
|
||||
cmdclass={'test': PyTest},
|
||||
test_suite='',
|
||||
@ -128,12 +131,16 @@ setup(
|
||||
'Environment :: Web Environment',
|
||||
'License :: OSI Approved :: GNU General Public License (GPL)',
|
||||
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
|
||||
'Programming Language :: Python :: 2',
|
||||
'Programming Language :: Python :: 2.7',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.3',
|
||||
'Programming Language :: Python :: 3.4',
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
'Programming Language :: Python :: 3.10',
|
||||
'Programming Language :: Python :: 3.11',
|
||||
'Topic :: Internet :: Proxy Servers',
|
||||
'Topic :: Internet :: WWW/HTTP',
|
||||
'Topic :: Internet :: WWW/HTTP :: WSGI',
|
||||
|
@ -3,6 +3,7 @@ WebTest
|
||||
pytest-cov
|
||||
mock
|
||||
urllib3
|
||||
httpbin==0.5.0
|
||||
flask<2.0
|
||||
ujson
|
||||
lxml
|
||||
httpbin>=0.10.2
|
||||
|
@ -62,13 +62,6 @@ collections:
|
||||
acl_paths:
|
||||
- ./sample_archive/access/pywb.aclj
|
||||
|
||||
pywb-wildcard-surt:
|
||||
index_paths: ./sample_archive/cdx/
|
||||
archive_paths: ./sample_archive/warcs/
|
||||
default_access: block
|
||||
acl_paths:
|
||||
- ./sample_archive/access/allow_all.aclj
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -41,23 +41,12 @@ class TestACLApp(BaseConfigTest):
|
||||
assert 'Access Blocked' in resp.text
|
||||
|
||||
def test_allow_via_acl_header(self):
|
||||
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/about/', headers={"X-Pywb-Acl-User": "staff"})
|
||||
resp = self.query('http://www.iana.org/about/')
|
||||
|
||||
assert len(resp.text.splitlines()) == 1
|
||||
|
||||
resp = self.testapp.get('/pywb/mp_/http://www.iana.org/about/', headers={"X-Pywb-Acl-User": "staff"}, status=200)
|
||||
|
||||
def test_block_via_acl_header(self):
|
||||
resp = self.testapp.get('/pywb/cdx?url=http://example.com/?example=3', headers={"X-Pywb-Acl-User": "staff"})
|
||||
assert len(resp.text.splitlines()) > 0
|
||||
|
||||
resp = self.testapp.get('/pywb/mp_/http://example.com/?example=3', headers={"X-Pywb-Acl-User": "staff"}, status=451)
|
||||
|
||||
def test_exclude_via_acl_header(self):
|
||||
resp = self.testapp.get('/pywb/cdx?url=http://example.com/?example=3', headers={"X-Pywb-Acl-User": "staff2"})
|
||||
assert len(resp.text.splitlines()) == 0
|
||||
|
||||
resp = self.testapp.get('/pywb/mp_/http://example.com/?example=3', headers={"X-Pywb-Acl-User": "staff2"}, status=404)
|
||||
|
||||
def test_allowed_more_specific(self):
|
||||
resp = self.query('http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf')
|
||||
|
||||
@ -96,9 +85,5 @@ class TestACLApp(BaseConfigTest):
|
||||
|
||||
assert '"http://httpbin.org/anything/resource.json"' in resp.text
|
||||
|
||||
def test_allow_all_acl_user_specific(self):
|
||||
resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', status=451)
|
||||
|
||||
assert 'Access Blocked' in resp.text
|
||||
|
||||
resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', headers={"X-Pywb-Acl-User": "staff"}, status=200)
|
||||
|
@ -537,7 +537,7 @@ class TestManagedColls(CollsDirMixin, BaseConfigTest):
|
||||
main(['template', 'foo', '--remove', 'query_html'])
|
||||
|
||||
def test_err_no_such_coll(self):
|
||||
""" Test error adding warc to non-existent collection
|
||||
""" Test error adding warc to non-existant collection
|
||||
"""
|
||||
warc1 = self._get_sample_warc('example.warc.gz')
|
||||
|
||||
|
@ -46,12 +46,8 @@ class TestEmbargoApp(BaseConfigTest):
|
||||
def test_embargo_ignore_acl_with_header_only(self):
|
||||
# ignore embargo with custom header only
|
||||
headers = {"X-Pywb-ACL-User": "staff2"}
|
||||
|
||||
resp = self.testapp.get('/pywb-embargo-acl/cdx?url=http://example.com/?example=1', headers=headers)
|
||||
assert len(resp.text.splitlines()) > 0
|
||||
resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=200, headers=headers)
|
||||
resp = self.testapp.get('/pywb-embargo-acl/cdx?url=http://example.com/?example=1')
|
||||
assert len(resp.text.splitlines()) == 0
|
||||
|
||||
resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=404)
|
||||
|
||||
|
||||
|
@ -56,6 +56,6 @@ class TestForceHttpsRoot(BaseConfigTest):
|
||||
resp = self.get('/20140128051539{0}/http://www.iana.org/domains/example', fmod,
|
||||
headers={'X-Forwarded-Proto': 'https'})
|
||||
|
||||
assert resp.headers['Location'] == 'https://localhost:80/20140128051539{0}/http://www.iana.org/help/example-domains'.format(fmod)
|
||||
assert resp.headers['Location'] == 'https://localhost:80/20140128051539{0}/http://www.iana.org/domains/reserved'.format(fmod)
|
||||
|
||||
|
||||
|
@ -400,7 +400,7 @@ class TestWbIntegration(BaseConfigTest):
|
||||
assert resp.status_int == 200
|
||||
assert resp.headers['Content-Location'].endswith('/pywb/20140126200928{0}/http://www.iana.org/domains/root/db'.format(fmod))
|
||||
|
||||
def test_not_existent_warc_other_capture(self, fmod):
|
||||
def test_not_existant_warc_other_capture(self, fmod):
|
||||
resp = self.get('/pywb/20140703030321{0}/http://example.com/?example=2', fmod)
|
||||
assert resp.status_int == 200
|
||||
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341{0}/http://example.com?example=2'.format(fmod))
|
||||
@ -410,7 +410,7 @@ class TestWbIntegration(BaseConfigTest):
|
||||
assert resp.status_int == 200
|
||||
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341{0}/http://example.com?example=2'.format(fmod))
|
||||
|
||||
def test_not_existent_warc_no_other(self, fmod):
|
||||
def test_not_existant_warc_no_other(self, fmod):
|
||||
resp = self.get('/pywb/20140703030321{0}/http://example.com/?example=3', fmod, status=503)
|
||||
assert resp.status_int == 503
|
||||
|
||||
|
@ -91,28 +91,25 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
|
||||
resp = self.head('/live/{0}httpbin.org/get?foo=bar', fmod_sl)
|
||||
assert resp.status_int == 200
|
||||
|
||||
# Following tests are temporarily commented out because latest version of PSF httpbin
|
||||
# now returns 400 if content-length header isn't parsable as an int
|
||||
@pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
|
||||
def test_live_bad_content_length(self, fmod_sl):
|
||||
resp = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl, status=200)
|
||||
assert resp.headers['Content-Length'] == '149'
|
||||
|
||||
# @pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
|
||||
# def test_live_bad_content_length(self, fmod_sl):
|
||||
# resp = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl, status=200)
|
||||
# assert resp.headers['Content-Length'] == '149'
|
||||
resp = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl, status=200)
|
||||
assert resp.headers['Content-Length'] == '90'
|
||||
|
||||
# resp = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl, status=200)
|
||||
# assert resp.headers['Content-Length'] == '90'
|
||||
@pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
|
||||
def test_live_bad_content_length_with_range(self, fmod_sl):
|
||||
resp = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl,
|
||||
headers={'Range': 'bytes=0-'}, status=206)
|
||||
assert resp.headers['Content-Length'] == '149'
|
||||
assert resp.headers['Content-Range'] == 'bytes 0-148/149'
|
||||
|
||||
# @pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
|
||||
# def test_live_bad_content_length_with_range(self, fmod_sl):
|
||||
# resp = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl,
|
||||
# headers={'Range': 'bytes=0-'}, status=206)
|
||||
# assert resp.headers['Content-Length'] == '149'
|
||||
# assert resp.headers['Content-Range'] == 'bytes 0-148/149'
|
||||
|
||||
# resp = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl,
|
||||
# headers={'Range': 'bytes=0-'}, status=206)
|
||||
# assert resp.headers['Content-Length'] == '90'
|
||||
# assert resp.headers['Content-Range'] == 'bytes 0-89/90'
|
||||
resp = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl,
|
||||
headers={'Range': 'bytes=0-'}, status=206)
|
||||
assert resp.headers['Content-Length'] == '90'
|
||||
assert resp.headers['Content-Range'] == 'bytes 0-89/90'
|
||||
|
||||
def test_custom_unicode_header(self, fmod_sl):
|
||||
value = u'⛄'
|
||||
|
@ -1,135 +0,0 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from pywb.manager.manager import CollectionsManager
|
||||
|
||||
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
|
||||
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
|
||||
|
||||
TEST_COLLECTION_NAME = 'test-col'
|
||||
|
||||
|
||||
class TestManager:
|
||||
def test_add_valid_wacz_unpacked(self, tmp_path):
|
||||
"""Test if adding a valid wacz file to a collection succeeds"""
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_unpacked(VALID_WACZ_PATH)
|
||||
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
||||
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
||||
|
||||
def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
|
||||
"""Test if warc that already exists is renamed with -index suffix"""
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_unpacked(VALID_WACZ_PATH)
|
||||
# Add it again to see if there are name conflicts
|
||||
manager._add_wacz_unpacked(VALID_WACZ_PATH)
|
||||
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
||||
assert 'valid_example_1-0-1.warc' in os.listdir(manager.archive_dir)
|
||||
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
data = f.read()
|
||||
assert '"filename": "valid_example_1-0.warc"' in data
|
||||
assert '"filename": "valid_example_1-0-1.warc"' in data
|
||||
|
||||
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
|
||||
"""Test if adding an invalid wacz file to a collection fails"""
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_unpacked(INVALID_WACZ_PATH)
|
||||
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
|
||||
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
|
||||
|
||||
index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
|
||||
if os.path.exists(index_path):
|
||||
with open(index_path, 'r') as f:
|
||||
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
|
||||
|
||||
def test_add_valid_archives_unpack_wacz(self, tmp_path):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||
'sample_archive/waczs/valid_example_1.wacz']
|
||||
manager.add_archives(archives, unpack_wacz=True)
|
||||
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
index_text = f.read()
|
||||
|
||||
for archive in archives:
|
||||
archive = os.path.basename(archive)
|
||||
|
||||
if archive.endswith('wacz'):
|
||||
archive = 'valid_example_1-0.warc'
|
||||
|
||||
assert archive in os.listdir(manager.archive_dir)
|
||||
assert archive in index_text
|
||||
|
||||
def test_add_valid_archives_dupe_name(self, tmp_path):
    """Add the same WARC twice in one call.

    The second copy is expected to be stored and indexed under a
    ``-1`` suffixed name next to the original.
    """
    coll = self.get_test_collections_manager(tmp_path)
    source = 'sample_archive/warcs/example.warc.gz'
    coll.add_archives([source] * 2)

    index_path = os.path.join(coll.indexes_dir, coll.DEF_INDEX_FILE)
    with open(index_path, 'r') as fh:
        index_text = fh.read()

    stored_files = os.listdir(coll.archive_dir)
    for name in ('example.warc.gz', 'example-1.warc.gz'):
        assert name in stored_files
        assert name in index_text
|
||||
|
||||
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
    """Adding a WACZ with ``unpack_wacz=False`` must raise NotImplementedError."""
    coll = self.get_test_collections_manager(tmp_path)
    sources = [
        'sample_archive/warcs/example.arc',
        'sample_archive/warcs/example.arc.gz',
        'sample_archive/warcs/example.warc',
        'sample_archive/warcs/example.warc.gz',
        'sample_archive/waczs/valid_example_1.wacz',
    ]

    with pytest.raises(NotImplementedError):
        coll.add_archives(sources, unpack_wacz=False)
|
||||
|
||||
def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
    """Mix one valid WARC with a non-archive file.

    The WARC is expected to be stored, the HTML file rejected, and the
    rejection reported in the log.
    """
    coll = self.get_test_collections_manager(tmp_path)
    sources = [
        'sample_archive/warcs/example.warc',
        'sample_archive/text_content/sample.html',
    ]
    coll.add_archives(sources, unpack_wacz=True)

    stored_files = os.listdir(coll.archive_dir)
    assert 'sample.html' not in stored_files
    assert 'example.warc' in stored_files
    assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
|
||||
|
||||
def test_merge_wacz_index(self, tmp_path):
    """Merge a plain-text CDXJ into the collection index with a filename rewrite.

    After the merge the original WARC name must be gone, the rewritten
    name present, and the index lines in sorted order.
    """
    coll = self.get_test_collections_manager(tmp_path)
    index_path = os.path.join(coll.indexes_dir, coll.DEF_INDEX_FILE)

    coll._add_wacz_index(
        index_path,
        'sample_archive/cdxj/example.cdxj',
        {'example.warc.gz': 'rewritten.warc.gz'},
    )

    with open(index_path, 'r') as fh:
        merged = fh.read().strip()

    assert 'example.warc.gz' not in merged
    assert 'rewritten.warc.gz' in merged

    # check that collection index is sorted
    lines = merged.split('\n')
    assert sorted(lines) == lines
|
||||
|
||||
def test_merge_wacz_index_gzip(self, tmp_path):
    """Merge a gzip-compressed CDX into the collection index with a filename rewrite.

    After the merge the original WARC name must be gone, the rewritten
    name present, and the index lines in sorted order.
    """
    coll = self.get_test_collections_manager(tmp_path)
    index_path = os.path.join(coll.indexes_dir, coll.DEF_INDEX_FILE)

    coll._add_wacz_index(
        index_path,
        'sample_archive/cdxj/example.cdx.gz',
        {'example-collection.warc': 'rewritten.warc'},
    )

    with open(index_path, 'r') as fh:
        merged = fh.read().strip()

    assert 'example-collection.warc' not in merged
    assert 'rewritten.warc' in merged

    # check that collection index is sorted
    lines = merged.split('\n')
    assert sorted(lines) == lines
|
||||
|
||||
@staticmethod
def get_test_collections_manager(collections_path):
    """Build a CollectionsManager for the test collection under *collections_path*.

    The manager is created with ``must_exist=False`` and ``add_collection()``
    is called on it before it is returned.
    """
    test_manager = CollectionsManager(
        TEST_COLLECTION_NAME,
        colls_dir=collections_path,
        must_exist=False,
    )
    test_manager.add_collection()
    return test_manager
|
7
tox.ini
7
tox.ini
@ -4,24 +4,23 @@ testpaths =
|
||||
tests
|
||||
|
||||
[tox]
|
||||
envlist = py37, py38, py39, py310, py311
|
||||
envlist = py36, py37, py38, py39, py310
|
||||
|
||||
[gh-actions]
|
||||
python =
|
||||
3.6: py36
|
||||
3.7: py37
|
||||
3.8: py38
|
||||
3.9: py39
|
||||
3.10: py310
|
||||
3.11: py311
|
||||
|
||||
[testenv]
|
||||
setenv = PYWB_NO_VERIFY_SSL = 1
|
||||
passenv = *
|
||||
deps =
|
||||
-rtest_requirements.txt
|
||||
-rrequirements.txt
|
||||
-rextra_requirements.txt
|
||||
commands =
|
||||
pytest --cov-config .coveragerc --cov pywb -v --doctest-modules ./pywb/ tests/
|
||||
py.test --cov-config .coveragerc --cov pywb -v --doctest-modules ./pywb/ tests/
|
||||
|
||||
|
||||
|
2
wombat
2
wombat
@ -1 +1 @@
|
||||
Subproject commit 20596ca1e66928cae6f309af781f961aa112ca7f
|
||||
Subproject commit 04ca325f3a59e7efc8ad0fa5abe25ec1bc9d9620
|
Loading…
x
Reference in New Issue
Block a user