mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Compare commits
306 Commits
v-2.2.2019
...
main
Author | SHA1 | Date | |
---|---|---|---|
|
7b0f8b5860 | ||
|
b44c93bf6e | ||
|
97fffe3a34 | ||
|
6205646b9b | ||
|
23891be2f1 | ||
|
b190dddee9 | ||
|
b9f1609df9 | ||
|
e89924bd39 | ||
|
b4c91c6633 | ||
|
1e2665af13 | ||
|
fee14d7fe8 | ||
|
5712945991 | ||
|
2fd6190b72 | ||
|
791a8d1033 | ||
|
86ee3bd752 | ||
|
d1e1636ae3 | ||
|
b4955cca66 | ||
|
f40e7ef18c | ||
|
6b4f9b323e | ||
|
7879dd0222 | ||
|
013746c10a | ||
|
79140441df | ||
|
af92a9726e | ||
|
83b2113be2 | ||
|
ed36830dc5 | ||
|
81b6a57dfb | ||
|
5c427b9ff2 | ||
|
454486bf75 | ||
|
b8693307d1 | ||
|
98be48d6e4 | ||
|
c441d83435 | ||
|
4a3e7ddff7 | ||
|
02288db81c | ||
|
4fc2b451d7 | ||
|
c8e78fd7c1 | ||
|
d44d640b93 | ||
|
03f9708d8d | ||
|
406fad95c2 | ||
|
d207c76bae | ||
|
131732d238 | ||
|
59d9beac05 | ||
|
0758e81b62 | ||
|
d392a8d908 | ||
|
9bc8a2e1ef | ||
|
43e5c8bac0 | ||
|
cdab280669 | ||
|
e6ec8b4aeb | ||
|
1790fd006a | ||
|
3d0673e32a | ||
|
3050fd2b2b | ||
|
3c94da04a2 | ||
|
2d19b6b18d | ||
|
6cc9cdc3ad | ||
|
138e2b284d | ||
|
3b49c2229e | ||
|
fec9cef818 | ||
|
d81c2f0303 | ||
|
3d8015c444 | ||
|
91cf74a2a9 | ||
|
373eca641c | ||
|
2ad7eaee4b | ||
|
ca6587caac | ||
|
e20fac2c75 | ||
|
c28941a0b6 | ||
|
ff7783aa74 | ||
|
5e2f47a049 | ||
|
19032e4512 | ||
|
72cb588936 | ||
|
14e464bd1c | ||
|
dc81e78393 | ||
|
6260b226ce | ||
|
29860bcb24 | ||
|
790487ca15 | ||
|
028e7102c0 | ||
|
1dedc46dce | ||
|
f96707d039 | ||
|
ca68cf0da1 | ||
|
815ea92fc2 | ||
|
98378a8845 | ||
|
6e7a8b1e59 | ||
|
1fddec216d | ||
|
8ef4ff102d | ||
|
16135d956a | ||
|
1249b41dba | ||
|
2ccd8eb2c3 | ||
|
f0340c6898 | ||
|
c121198183 | ||
|
0cc912da95 | ||
|
f190190128 | ||
|
49393ce16a | ||
|
a97ad7ebbe | ||
|
4f1a6303fa | ||
|
7432299079 | ||
|
7b00d0627e | ||
|
510c9dc9f1 | ||
|
fbed87aa46 | ||
|
4ac580e401 | ||
|
8e06c2f351 | ||
|
12a9e32129 | ||
|
32e9020fd2 | ||
|
62633a48c4 | ||
|
4f44c2ec98 | ||
|
09f7084aa1 | ||
|
403167fbe0 | ||
|
63ac82ee6f | ||
|
0c3eb4ce94 | ||
|
42445562da | ||
|
0f05dbde55 | ||
|
825e4e54ab | ||
|
38b1952d34 | ||
|
c42833d4ad | ||
|
ddcbde573c | ||
|
6bde8fd8c4 | ||
|
7ff789f1a8 | ||
|
c0519a53c3 | ||
|
de9b9310d4 | ||
|
0c4e406876 | ||
|
c97a66703b | ||
|
5c35a43dac | ||
|
e64e58f040 | ||
|
a6be76642a | ||
|
96de80f83e | ||
|
b28c8f1748 | ||
|
b2a460c33c | ||
|
342007244b | ||
|
98c6fba44d | ||
|
a0faf904ef | ||
|
3e5d97f70b | ||
|
843fe28ed8 | ||
|
096850b41d | ||
|
81308780ec | ||
|
cff2a9efc5 | ||
|
3ca765f847 | ||
|
f9f5d2dc33 | ||
|
f7bd84cdac | ||
|
9587954856 | ||
|
12fcc87962 | ||
|
0eedd1502f | ||
|
d95b79a8ab | ||
|
f07d35709a | ||
|
818b518765 | ||
|
551b8fe026 | ||
|
abb76911f5 | ||
|
626da99899 | ||
|
106a9e9200 | ||
|
5d34018b9f | ||
|
ad9b431eaf | ||
|
c5c4a54e7d | ||
|
73d6735bed | ||
|
cdb17c4000 | ||
|
7ce4573c70 | ||
|
212691bd38 | ||
|
13ea5baee5 | ||
|
c62b1bc987 | ||
|
4224cdd7e5 | ||
|
ca14bdd8b2 | ||
|
084be82550 | ||
|
662fc747bf | ||
|
b475d85c4f | ||
|
78a9888b46 | ||
|
aee458b7f5 | ||
|
94f6273a91 | ||
|
087ef2f261 | ||
|
69654fd013 | ||
|
e1cad621b9 | ||
|
ddf3207e40 | ||
|
04d0586244 | ||
|
4683d95580 | ||
|
841c02c123 | ||
|
07fb6bbf1d | ||
|
a0aaa7558d | ||
|
f628b40e02 | ||
|
b66608c5f3 | ||
|
de81efac78 | ||
|
9e09bcd2a7 | ||
|
7b51101b04 | ||
|
195e85ea9d | ||
|
54d8bccf4a | ||
|
bb1c2a3ec9 | ||
|
3f3f8caef1 | ||
|
9b8c187b3a | ||
|
2e35c3e1ed | ||
|
94b7fdcf97 | ||
|
c7373ba785 | ||
|
47e87ef387 | ||
|
af76ce9fa5 | ||
|
d7d83b0728 | ||
|
8a6475a9c2 | ||
|
3c53c2731b | ||
|
5e9b13e267 | ||
|
ed89fcc6f8 | ||
|
7e56ca8ca2 | ||
|
871a05a76a | ||
|
be90e06742 | ||
|
8f0ce45b27 | ||
|
8d8cf7eb58 | ||
|
6b014d05bf | ||
|
92e459bda5 | ||
|
fa021eebab | ||
|
93ce4f6f7a | ||
|
fb8aa7cbc1 | ||
|
f0b9d5b8e8 | ||
|
523e35d973 | ||
|
0be84520ed | ||
|
30680803e8 | ||
|
c7fdfe72a7 | ||
|
0d819aadeb | ||
|
66ac3ca114 | ||
|
fe09d9991e | ||
|
44dcd39c02 | ||
|
02cc7035e8 | ||
|
8baa8cbdb7 | ||
|
fed3263ac6 | ||
|
6f79840b79 | ||
|
35004c1675 | ||
|
59b735ee99 | ||
|
9ce324212a | ||
|
dc30c890a6 | ||
|
2f6fb74ea1 | ||
|
a3294c8b25 | ||
|
802b9fa4f5 | ||
|
379f7de1ba | ||
|
d6ab31d529 | ||
|
5ab97a41c2 | ||
|
69f7f02006 | ||
|
ae78a955de | ||
|
e34606cecb | ||
|
61b6ff21e1 | ||
|
8d98b9111e | ||
|
9a40d29ac3 | ||
|
41c37129c0 | ||
|
1a7fdd0d70 | ||
|
ce10d9af7c | ||
|
e04adea7a8 | ||
|
7ac9a37bb4 | ||
|
3589240431 | ||
|
1b0c9c6895 | ||
|
42b8c3a22b | ||
|
e92b1969e8 | ||
|
cb3d1196f2 | ||
|
2a30731a0c | ||
|
c00f30e897 | ||
|
54a4e38531 | ||
|
0a9ad5c8dc | ||
|
3868f5b915 | ||
|
5da6122d83 | ||
|
c65f66e03a | ||
|
9b2ae35b93 | ||
|
ce0ed610bd | ||
|
0c08b9b5d5 | ||
|
60ad1739b7 | ||
|
b8124e3931 | ||
|
8bf2f9debb | ||
|
465195f203 | ||
|
af3e9c6293 | ||
|
43537fead3 | ||
|
f30b280437 | ||
|
871cef26a8 | ||
|
a301dda0fb | ||
|
5364275ef5 | ||
|
0d68f67049 | ||
|
3020606608 | ||
|
ef9051ad6e | ||
|
0c1dfba1da | ||
|
bfa3aa7264 | ||
|
a3f81dcc0f | ||
|
77eefcdce6 | ||
|
5b7ca18e0f | ||
|
b38cfb8d67 | ||
|
959481fd48 | ||
|
ec88e962b3 | ||
|
94eb4ad206 | ||
|
c1f0f7517a | ||
|
56e7c78ea3 | ||
|
295f67e675 | ||
|
cf5aceb4f5 | ||
|
bdf4a26807 | ||
|
1e9d8f44af | ||
|
e79c657255 | ||
|
bf9284fec5 | ||
|
42089e237b | ||
|
af1a34cb58 | ||
|
05cc593da6 | ||
|
511c6f7985 | ||
|
ffca45c855 | ||
|
837894a07f | ||
|
d4518ae557 | ||
|
a72d938f15 | ||
|
a4027c7904 | ||
|
11610f6e04 | ||
|
96a7a4bbb0 | ||
|
d2467d5fad | ||
|
db50efc558 | ||
|
06513c2592 | ||
|
193607eed8 | ||
|
56fc26333e | ||
|
178413fe0c | ||
|
d74d4f92a3 | ||
|
c55518640f | ||
|
361ac0081b | ||
|
6794f6d79d | ||
|
cef557eb40 | ||
|
a907b2b511 | ||
|
22b4297fc5 | ||
|
77f8bb6476 | ||
|
32962be7c4 |
@ -1,11 +1,11 @@
|
||||
build/
|
||||
dist/
|
||||
karma-tests/
|
||||
sample_archive/
|
||||
tests/
|
||||
tests_disabled/
|
||||
venv/
|
||||
collections/
|
||||
wombat/
|
||||
docs/
|
||||
|
||||
.cache/
|
||||
.eggs/
|
||||
|
32
.github/workflows/ci.yaml
vendored
Normal file
32
.github/workflows/ci.yaml
vendored
Normal file
@ -0,0 +1,32 @@
|
||||
name: CI
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
max-parallel: 3
|
||||
matrix:
|
||||
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
|
||||
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v1
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install tox tox-gh-actions
|
||||
|
||||
- name: Test with tox
|
||||
run: tox
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v1
|
||||
|
33
.github/workflows/publish_pypi.yaml
vendored
Normal file
33
.github/workflows/publish_pypi.yaml
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
name: Publish to PYPI
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
pypi-release:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: [3.9]
|
||||
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v1
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: python -m pip install --upgrade pip wheel twine
|
||||
|
||||
- name: Build Dist
|
||||
run: python setup.py sdist bdist_wheel --universal
|
||||
|
||||
- name: Publish package to TestPyPI
|
||||
uses: pypa/gh-action-pypi-publish@master
|
||||
with:
|
||||
user: __token__
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
|
43
.github/workflows/release.yaml
vendored
Normal file
43
.github/workflows/release.yaml
vendored
Normal file
@ -0,0 +1,43 @@
|
||||
name: Publish Docker image
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
push_to_registries:
|
||||
name: Build pywb Docker image for release and push to Dockerhub
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
-
|
||||
name: Check out the repo
|
||||
uses: actions/checkout@v2
|
||||
-
|
||||
name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v3
|
||||
with:
|
||||
images: webrecorder/pywb
|
||||
tags: |
|
||||
type=match,pattern=v-(.*),group=1
|
||||
-
|
||||
name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
-
|
||||
name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
-
|
||||
name: Build and push
|
||||
id: docker_build
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
|
4
.gitignore
vendored
4
.gitignore
vendored
@ -53,3 +53,7 @@ git_hash.py
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/*
|
||||
|
||||
# virtualenvs
|
||||
env/
|
||||
venv/
|
||||
|
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
[submodule "wombat"]
|
||||
path = wombat
|
||||
url = https://github.com/webrecorder/wombat
|
@ -5,6 +5,7 @@ python:
|
||||
- "3.5"
|
||||
- "3.6"
|
||||
- "3.7"
|
||||
- "3.8"
|
||||
|
||||
dist: xenial
|
||||
|
||||
@ -39,6 +40,7 @@ after_success:
|
||||
matrix:
|
||||
allow_failures:
|
||||
- env: WR_TEST=yes
|
||||
- python: "2.7"
|
||||
|
||||
exclude:
|
||||
- env: WR_TEST=yes
|
||||
|
@ -2,11 +2,11 @@
|
||||
set -e
|
||||
|
||||
pip install --upgrade pip setuptools
|
||||
pip install 'Markupsafe<2.0.0'
|
||||
python setup.py -q install
|
||||
pip install -r extra_requirements.txt
|
||||
pip install coverage pytest-cov coveralls
|
||||
pip install codecov
|
||||
npm install
|
||||
|
||||
if [ "$WR_TEST" = "yes" ]; then
|
||||
git clone https://github.com/webrecorder/webrecorder-tests.git
|
||||
|
@ -3,7 +3,6 @@ set -e
|
||||
|
||||
if [ "$WR_TEST" = "no" ]; then
|
||||
python setup.py test
|
||||
cd karma-tests && make test && cd ..
|
||||
else
|
||||
cd webrecorder-tests
|
||||
INTRAVIS=1 pytest -m "pywbtest and chrometest"
|
||||
|
451
CHANGES.rst
451
CHANGES.rst
@ -1,3 +1,448 @@
|
||||
pywb 2.7.3 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* issue_792 catch warcio exception by @oskarhek in https://github.com/webrecorder/pywb/pull/793
|
||||
* Add ui.logo_home_url as config.yaml option by @tw4l in https://github.com/webrecorder/pywb/pull/791
|
||||
* [#795] Show error when adding duplicate warc file by @kuechensofa in https://github.com/webrecorder/pywb/pull/797
|
||||
* Make search page more intuitive by @krakan in https://github.com/webrecorder/pywb/pull/794
|
||||
* Modify search template buttons by @tw4l in https://github.com/webrecorder/pywb/pull/801
|
||||
* [#804] Use default_locale when lang not set in the request by @krakan in https://github.com/webrecorder/pywb/pull/805
|
||||
* feat: regex substitution on surt rules match by @mijho in https://github.com/webrecorder/pywb/pull/780
|
||||
* Bump minimatch from 3.0.4 to 3.1.2 in /pywb/vueui by @dependabot in https://github.com/webrecorder/pywb/pull/777
|
||||
* Bump decode-uri-component from 0.2.0 to 0.2.2 in /pywb/vueui by @dependabot in https://github.com/webrecorder/pywb/pull/786
|
||||
* rules: add 'debugNoBatch' rewrite for fb and insta by @ikreymer in https://github.com/webrecorder/pywb/pull/806
|
||||
* Vue main order by @tw4l in https://github.com/webrecorder/pywb/pull/809
|
||||
* wombat: bump to 3.4.4 https://github.com/webrecorder/pywb/pull/808
|
||||
|
||||
pywb 2.7.2 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Fix regression introduced by improper wombat update in 2.7.1
|
||||
* Fix `redirect_to_exact: false` functionality: if not set, UI will stay on current timestamp, but will display info on actual capture.
|
||||
* Location bar nav now keeps current timestamp instead of defaulting to calendar view.
|
||||
* 'Live' mode fixes, no longer cache live cdx entry, don't add timestamp when navigating in live mode without timestamp
|
||||
* Calendar dropdown on replay now scrollable.
|
||||
* Timeline toggle on replay is 'sticky', will stay on if toggled on replay.
|
||||
* Capture text: use '|' as in 'Current Capture: [title] | [capture date]'
|
||||
* Document title: Add 'Archived Page: ' prefix to avoid confusion with live pages.
|
||||
|
||||
pywb 2.7.1 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Add locale-dependent handling of first day of week by @krakan in https://github.com/webrecorder/pywb/pull/781
|
||||
* Make filter expressions translatable by @krakan in https://github.com/webrecorder/pywb/pull/783
|
||||
* Add title to top frame in framed replay
|
||||
* Add missing tooltip translation strings
|
||||
* Fix calendar and timeline rendering for replay URLs without a timestamp
|
||||
* Update template documentation
|
||||
|
||||
pywb 2.7.0 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* New banner and calendar implementation in Vue.js, which supports localization/internationalization and easier local theming by @vanecat @ikreymer @tw4l with helpful feedback from @ldko
|
||||
* New interactive timeline to assist in navigating between captures
|
||||
* Add basic development Docker Compose configuration file
|
||||
* Update documentation
|
||||
* Add contributing guide
|
||||
|
||||
pywb 2.6.9 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* eval rewrite update + latest wombat by @ikreymer in https://github.com/webrecorder/pywb/pull/763
|
||||
* Rewrite: Support target rewriting, open new windows in top-frame instead by @tw4l in https://github.com/webrecorder/pywb/pull/767
|
||||
* Add arm64 platform support by @luandro in https://github.com/webrecorder/pywb/pull/775
|
||||
* Add uwsgi virtualenv information by @tw4l in https://github.com/webrecorder/pywb/pull/770
|
||||
* update to wombat 3.3.11 to support additional replay improvements
|
||||
* automated pypi publish on release https://github.com/webrecorder/pywb/pull/776
|
||||
|
||||
pywb 2.6.8 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Upgrade webassets to v2.0 by @m4rk3r in https://github.com/webrecorder/pywb/pull/730
|
||||
* Encoding image 'srcset' value including the intrinsic width by @yasarkunduz in https://github.com/webrecorder/pywb/pull/712
|
||||
* Prevent jinja2 from escaping HTML markup in collection metadata by @tw4l in https://github.com/webrecorder/pywb/pull/747
|
||||
* Increase uwsgi_buffer_size for nginx config by @edsu in https://github.com/webrecorder/pywb/pull/716
|
||||
* Add missing translation for the filter-expression field placeholder by @krakan in https://github.com/webrecorder/pywb/pull/721
|
||||
* Activate field validation when expanding the advanced options by @krakan in https://github.com/webrecorder/pywb/pull/722
|
||||
* S3 loader to use boto3 built-in credential configuration by @sebastian-nagel in https://github.com/webrecorder/pywb/pull/723
|
||||
* describing installation using pip by @sepastian in https://github.com/webrecorder/pywb/pull/726
|
||||
* Add missing org/image to docker run commands by @heyvito in https://github.com/webrecorder/pywb/pull/733
|
||||
* Format error messages by @edsu in https://github.com/webrecorder/pywb/pull/737
|
||||
* Ensure CDX status is a string by @edsu in https://github.com/webrecorder/pywb/pull/739
|
||||
* Improve replay banner's accessibility by @lwrubel in https://github.com/webrecorder/pywb/pull/742
|
||||
* Revisit headers load fix by @ikreymer in https://github.com/webrecorder/pywb/pull/751
|
||||
* Enable translation for the remaining strings on the search results page by @krakan in https://github.com/webrecorder/pywb/pull/752
|
||||
* revisit of redirect optimization: by @ikreymer in https://github.com/webrecorder/pywb/pull/753
|
||||
* proxy: add COEP header for proxy mode to avoid errors by @ikreymer in https://github.com/webrecorder/pywb/pull/755
|
||||
* tests run improvements: update from python setup.py test -> tox by @ikreymer in https://github.com/webrecorder/pywb/pull/754
|
||||
* rewrite: detect edge-case where html starts with bom followed by @ikreymer in https://github.com/webrecorder/pywb/pull/758
|
||||
* tests options: add PYWB_NO_VERIFY_SSL env var for tests to avoid fail… by @ikreymer in https://github.com/webrecorder/pywb/pull/760
|
||||
* rewriting fix: twitter video in embedded tweets by @ikreymer in https://github.com/webrecorder/pywb/pull/761
|
||||
* Add ir_ modifier by @ikreymer in https://github.com/webrecorder/pywb/pull/759
|
||||
* Remove unused Appveyor badge
|
||||
|
||||
pywb 2.6.7 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* dependency: bump gevent to latest (21.12.0)
|
||||
* rewrite: fix eval rewriting where '._eval' was accidentally being rewritten
|
||||
* post-to-get conversion: properly handle json with top-level lists, to match cdxj-indexer, print parse errors, fixes `#709 <https://github.com/webrecorder/pywb/pull/709>`_
|
||||
|
||||
pywb 2.6.6 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* dependency: don't use obsolete werkzeug useragent package `#704 <https://github.com/webrecorder/pywb/pull/704>`_
|
||||
* fix user-agent detection: use ua-parser module, default to new js-proxy mode, unless older browser detected `#707 <https://github.com/webrecorder/pywb/pull/707>`_
|
||||
* fix tests: disable broken s3 tests for now
|
||||
* Dockerfile: use python 3.8 by default
|
||||
|
||||
pywb 2.6.5 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* fix build: add 'markupsafe<2.1.0' to requirements
|
||||
|
||||
|
||||
pywb 2.6.4 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* wombat.js: actually update to 3.3.6, update built wombat.js
|
||||
|
||||
* Fix live mode when ``redirect_to_exact`` is enabled `#692 <https://github.com/webrecorder/pywb/pull/692>`_
|
||||
|
||||
* Rules: additional fuzzy ignore of facebook query param: `#691 <https://github.com/webrecorder/pywb/pull/691>`_
|
||||
|
||||
* Docs: typo fixes: `#669 <https://github.com/webrecorder/pywb/pull/669>`_, `#670 <https://github.com/webrecorder/pywb/pull/670>`_
|
||||
|
||||
|
||||
pywb 2.6.3 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Fix false-positive rewriting of ``location`` through additional check if local var is used, fixes `#684 <https://github.com/webrecorder/pywb/pull/684>`_
|
||||
|
||||
* Fix missing localization of placeholder, fixes `#685 <https://github.com/webrecorder/pywb/pull/685>`_
|
||||
|
||||
* Fix regression caused by 2.6.2, ensure pywb.app_prefix, pywb.host_prefix and pywb.static_prefix paths set correctly for all pages `#688 <https://github.com/webrecorder/pywb/pull/688>`_, fixes `#686 <https://github.com/webrecorder/pywb/pull/686>`_
|
||||
|
||||
* Documentation: Fixes to ``cdx-indexer`` help (from @ldko) `#683 <https://github.com/webrecorder/pywb/pull/683>`_
|
||||
|
||||
* Update wombat.js to 3.3.6
|
||||
|
||||
* Add automatic Docker push on new GitHub release
|
||||
|
||||
|
||||
pywb 2.6.2 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Fix regression caused by 2.6.1, with static files not being loaded correctly. `#678 <https://github.com/webrecorder/pywb/pull/678>`_
|
||||
|
||||
|
||||
pywb 2.6.1 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Domain-Specific Rewriting Rules: Rewrite twitter video to capture full videos.
|
||||
|
||||
* Disable rewriting ``data-`` attributes, better fidelity without rewriting, fixes `#676 <https://github.com/webrecorder/pywb/pull/676>`_
|
||||
|
||||
* Fix regression in autoescaping URL in frame_insert.html
|
||||
|
||||
* Feature: ability to set path used to serve static assets (default ``static``) via ``static_prefix`` config option.
|
||||
|
||||
* Update wombat.js 3.3.4 (includes various rewriting fixes)
|
||||
|
||||
|
||||
pywb 2.6.0 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Improvements for eval() rewriting + extra unnamed scope to avoid variable collision `#668 <https://github.com/webrecorder/pywb/pull/668>`_
|
||||
|
||||
* fix for documentation links `#666 <https://github.com/webrecorder/pywb/pull/666>`_
|
||||
|
||||
* Update to latest wombat.js (3.3.0)
|
||||
|
||||
|
||||
pywb 2.6.0b4 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Update rules for IG rewriting to disable Dash `#662 <https://github.com/webrecorder/pywb/pull/662>`_
|
||||
|
||||
* Support for adding custom resource records via PUT ``/<coll>/record`` `#661 <https://github.com/webrecorder/pywb/pull/661>`_
|
||||
|
||||
* Fixes for URL encoding for query and remote index `#657 <https://github.com/webrecorder/pywb/pull/657>`_ and `#658 <https://github.com/webrecorder/pywb/pull/658>`_
|
||||
|
||||
* Doc fixes for incorrect param name `#651 <https://github.com/webrecorder/pywb/pull/651>`_
|
||||
|
||||
* Update to latest wombat.js (3.2.2)
|
||||
|
||||
|
||||
pywb 2.6.0b3 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Display 'ignoring locales' warning only if locales specified (don't specify any by default)
|
||||
|
||||
* Add -V flag to wb-manager and pywb/wayback commands to display current version and exit
|
||||
|
||||
|
||||
pywb 2.6.0b2 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Update documentation for CDX Server API (by @sebastian-nagel) `#651 <https://github.com/webrecorder/pywb/pull/651>`_
|
||||
|
||||
Localization fixes: `#653 <https://github.com/webrecorder/pywb/pull/653>`_
|
||||
|
||||
* Ensure banner template is not autoescaped
|
||||
|
||||
* Don't show locale switch on not found pages (redundant with banner)
|
||||
|
||||
* Ensure wb-manager works when optional i18n dependencies are not installed
|
||||
|
||||
|
||||
pywb 2.6.0b1 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Additional documentation / localization fixes `#650 <https://github.com/webrecorder/pywb/pull/650>`_
|
||||
|
||||
* Ensure home page and error page keeps locale, language switching is working.
|
||||
|
||||
* Add autoescaping to Jinja2 to avoid XSS issues (suggested by @sebastian-nagel)
|
||||
|
||||
* Add support for 'pywb[i18n]' extra to install localization dependencies
|
||||
|
||||
Documentation typo fixes (by @ldko, `#649 <https://github.com/webrecorder/pywb/pull/649>`_)
|
||||
|
||||
|
||||
pywb 2.6.0b0 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Documentation Updates:
|
||||
|
||||
* `Embargo + ACL system updates <https://pywb.readthedocs.io/en/latest/manual/access-control.html>`_
|
||||
|
||||
* `New ACL header configuration <https://pywb.readthedocs.io/en/latest/manual/usage.html#config-acl-header>`_
|
||||
|
||||
* `Localization / Multi-lingual Support Guide <https://pywb.readthedocs.io/en/latest/manual/localization.html>`_
|
||||
|
||||
|
||||
Localization Improvements: (`#647 <https://github.com/webrecorder/pywb/pull/647>`_)
|
||||
|
||||
* Support for extracting, updating, listing and removing localizable commands via ``wb-manager i18n`` command.
|
||||
|
||||
* UI: Add language switch header to all UI templates.
|
||||
|
||||
* Mark localizable strings as translatable in existing templates.
|
||||
|
||||
|
||||
Access Control Improvements:
|
||||
|
||||
* Support for Embargo System for date-based embargo, overridable via ACL ``allow_ignore_embargo`` `#642 <https://github.com/webrecorder/pywb/pull/642>`_
|
||||
|
||||
* Support for custom ACL 'user' specified via ``X-pywb-ACL-User`` header passed from frontend proxies.
|
||||
|
||||
* Fixes for exact rule matching `#629 <https://github.com/webrecorder/pywb/pull/629>`_
|
||||
|
||||
* Fixes for ACL for auto-collections `#620 <https://github.com/webrecorder/pywb/pull/620>`_
|
||||
|
||||
|
||||
Rewriting Improvements:
|
||||
|
||||
* Updated YT rewriting rules `#635 <https://github.com/webrecorder/pywb/pull/635>`_
|
||||
|
||||
* POST-to-get rewriting consistent with cdxj-indexer, wabac.js/replayweb.page `#636 <https://github.com/webrecorder/pywb/pull/636>`_
|
||||
|
||||
* Improved fuzzy matching to ensure non-POST requests handled via fuzzy matching.
|
||||
|
||||
* Live web: never truncate when reading POST request to avoid hung requests! (Apply limit only on indexing)
|
||||
|
||||
|
||||
CDX Server / API Compatibility Fixes:
|
||||
|
||||
* XmlQuery: set WARC record length field, if available `#633 <https://github.com/webrecorder/pywb/pull/633>`_
|
||||
|
||||
* ZipNum: Don't count pages with filter `#631 <https://github.com/webrecorder/pywb/pull/631>`_
|
||||
|
||||
* Better handling of CDX Server HTTP status `#624 <https://github.com/webrecorder/pywb/pull/624>`_
|
||||
|
||||
* Better handling of errors from CDX Server API with 400 `#623 <https://github.com/webrecorder/pywb/pull/623>`_, `#625 <https://github.com/webrecorder/pywb/pull/625>`_, `#626 <https://github.com/webrecorder/pywb/pull/626>`_, `#630 <https://github.com/webrecorder/pywb/pull/630>`_
|
||||
|
||||
* Backwards compatibility of ``fl`` param `#621 <https://github.com/webrecorder/pywb/pull/621>`_
|
||||
|
||||
|
||||
Recording Redis Dedup mode:
|
||||
|
||||
* Fix dedup index config loading `#617 <https://github.com/webrecorder/pywb/pull/617>`_
|
||||
|
||||
* Add recording size counter to track any in-flight requests `#637 <https://github.com/webrecorder/pywb/pull/637>`_
|
||||
|
||||
|
||||
pywb 2.5.0 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Update to latest wombat.js (3.0.3)
|
||||
|
||||
* Dedup Mode: Support for Redis-based dedup index to skip or write revisit records for duplicates, replay from Redis-based index `#597 <https://github.com/webrecorder/pywb/pull/597>`_, `#611 <https://github.com/webrecorder/pywb/pull/611>`_
|
||||
|
||||
* Rewriting: Updated Rules for youtube and vimeo replay `#610 <https://github.com/webrecorder/pywb/pull/610>`_
|
||||
|
||||
* CDX Indexing: More efficient cdx sorting `#609 <https://github.com/webrecorder/pywb/pull/609>`_
|
||||
|
||||
* Set default CDX closest lookup limit to 100 instead of 10 `#606 <https://github.com/webrecorder/pywb/pull/606>`_
|
||||
|
||||
* UI: Try to avoid css class conflicts in injected banner `#604 <https://github.com/webrecorder/pywb/pull/604>`_
|
||||
|
||||
* Catch invalid headers in uWSGI `#603 <https://github.com/webrecorder/pywb/pull/603>`_
|
||||
|
||||
* Config option to support certificate validation when capturing `#596 <https://github.com/webrecorder/pywb/pull/596>`_
|
||||
|
||||
* Fix indexing POST requests with multipart/form-data without boundary `#599 <https://github.com/webrecorder/pywb/pull/599>`_
|
||||
|
||||
* New OpenWayback->pywb Transition Guide: `https://pywb.readthedocs.io/en/latest/manual/owb-transition.html <https://pywb.readthedocs.io/en/latest/manual/owb-transition.html>`_
|
||||
|
||||
* Sample deployments with Docker Compose for running with Apache, Nginx and OutbackCDX in ``sample-deploy`` directory.
|
||||
|
||||
* Update to latest gevent to fix issues with latest python `#583 <https://github.com/webrecorder/pywb/pull/583>`_
|
||||
|
||||
|
||||
pywb 2.4.2 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* ensure RemoteCDXIndexSource also passes ``matchType`` to upstream
|
||||
|
||||
* cdx-indexer: use ``-o`` flag to specify output, not first param (output to stdout by default)
|
||||
|
||||
* static paths cleanup, move ``url-polyfill.min.js`` to correct dir (fixes `#571 <https://github.com/webrecorder/pywb/issues/571>`_)
|
||||
|
||||
* minor fixes to docs
|
||||
|
||||
* logo: resize new logo to actual size, add logo via absolute link to ensure it works on pypi also
|
||||
|
||||
|
||||
pywb 2.4.1 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Minor fix: allow timegate content check in `#564 <https://github.com/webrecorder/pywb/pull/564>`_ to be ignored (for use with derived classes)
|
||||
|
||||
|
||||
pywb 2.4.0 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This release includes a significant update, specifically merging of https://github.com/ukwa/pywb branch into this release.
|
||||
A few selected improvements:
|
||||
|
||||
* New Access Control System: https://pywb.readthedocs.io/en/latest/manual/access-control.html
|
||||
|
||||
* Support for Localization, configuring multiple languages (not enabled by default): https://github.com/ukwa/ukwa-pywb/blob/master/docs/localization.md
|
||||
|
||||
* Support for OpenWayback-style XML-based index source (xmlquery)
|
||||
|
||||
* Support for loading from WebHDFS via `webhdfs://` scheme.
|
||||
|
||||
* Initial support for a new embeds/transclusions replay system, in combination with warcit: https://github.com/webrecorder/warcit/wiki/Warcit-Video-Audio-Conversion
|
||||
|
||||
* Proxy mode improvements: handle OPTIONS requests and CORS `#520 <https://github.com/webrecorder/pywb/pull/520>`_
|
||||
|
||||
* Memento Prefer header: support for experimental `Prefer` header to select 'raw' or 'rewritten' memento
|
||||
|
||||
* Other memento fixes: fix timemap including invalid mementos, correct timegate behavior on top frame `#564 <https://github.com/webrecorder/pywb/pull/564>`_
|
||||
|
||||
* Fixes for collection metadata display: `#509 <https://github.com/webrecorder/pywb/pull/520>`_
|
||||
|
||||
* Fix for incorrect WARC record length due to re-serialized headers: `#561 <https://github.com/webrecorder/pywb/pull/561>`_
|
||||
|
||||
* Filter invalid WARC records `#536 <https://github.com/webrecorder/pywb/pull/536>`_
|
||||
|
||||
* Updated fuzzy matching rules and wombat client-side rewriting.
|
||||
|
||||
|
||||
For the full changelist, see this PR: `#565 <https://github.com/webrecorder/pywb/pull/565>`_
|
||||
|
||||
* Access Control System
|
||||
|
||||
|
||||
pywb 2.3.5 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* General auto-fetch fixes (#503)
|
||||
- Fixed issue that caused HTTP 404 errors to happen when parsing <link> stylesheet hrefs as sheets (webrecorder/wombat #11)
|
||||
- Ensured that requests made are cached by the browser (webrecorder/wombat #13 & #15)
|
||||
- Ensured that the request made by the backing web worker when in proxy mode are not blocked by CORS (webrecorder/wombat #13 & #15)
|
||||
|
||||
* SOCKS proxy fixes (#504)
|
||||
- simplify SOCKS config (avoiding global socket monkey patch), default to no cert verify to match non-proxy behavior
|
||||
- SOCKS proxy can be disabled dynamically by setting SOCKS_DISABLE
|
||||
|
||||
|
||||
pywb 2.3.4 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Improvements to auto-fetch to support page fetch (webrecorder/wombat#5, #497)
|
||||
- Support fetching page with ``X-Wombat-History-Page`` and title ``X-Wombat-History-Title`` headers present.
|
||||
- Attempt to extract title and pass along with cdx to ``_add_history_page()`` callback in RewriterApp, to indicate a url is a page. (#498)
|
||||
- General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.
|
||||
|
||||
* Cookie Rewriting Fix: don't update cookie cache on service worker (``sw_`` modifier) responses (#499)
|
||||
* Rewriting: HTML Unescape Fix: Attempt to HTML-entity-decode urls and inline styles that contain ``&#`` to get correct rewriting of encoded urls (#500)
|
||||
|
||||
|
||||
pywb 2.3.3 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Proxy Mode: Ensure head insert added even if no ``<head>`` tag, insert after first tag that is not ``<html>`` or ``<head>`` (#496)
|
||||
|
||||
|
||||
pywb 2.3.2 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Eval rewriting fix: don't rewrite ``$eval``, only ``eval`` identifier (#493)
|
||||
|
||||
* Cookie rewriting improvements: (#491)
|
||||
- Enable domain cookie cache for live index and recording modes using fakeredis, previously only available in Webrecorder
|
||||
- Don't add duplicate cookies to Set-Cookie or Cookie headers
|
||||
- Don't include cached Set-Cookie headers to serviceworkers for non-200 responses.
|
||||
- Add cookies for ``sw_/`` and ``wkrf_`` modifiers
|
||||
- Testing: add initial testing for domain cookie rewriting
|
||||
|
||||
* Misc fixes: (#490)
|
||||
- Ensure SCRIPT_NAME never empty (#490)
|
||||
- Static Paths: load ``/index.html`` for paths ending in ``/``, ensure static_prefix always inited correctly
|
||||
- Docker: switch to designated $VOLUME_DIR before initializing
|
||||
- Rules: update rules for soundcloud
|
||||
|
||||
|
||||
pywb 2.3.1 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Fix regression in wombat, new window.parent override from (webrecorder/wombat#2) was throwing exception if top-frame was cross-origin (webrecorder/wombat#3)
|
||||
* Update to latest wombat, v3.0.0
|
||||
|
||||
|
||||
pywb 2.3.0 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Wombat Improvements and modularization:
|
||||
- Client-side rewriting and auto-fetch systems moved to https://github.com/webrecorder/wombat
|
||||
- Module-based setup and full testing for wombat
|
||||
- Continuous auto-fetch up to 20 requests (#484)
|
||||
|
||||
* Replay / Fidelity Improvements (#451):
|
||||
- Introduced a new server-side rewriter, JSWorkerRewriter, that handles rewriting JS workers and service-workers
|
||||
- Improvements to JSOP Rewriter to handle empty query (#475)
|
||||
- Improvements to postMessage rewriting, override `eval(` while preserving scope (#475)
|
||||
- Fixes to ``this`` proxy rewrite to include ``, this``
|
||||
|
||||
* Misc Changes:
|
||||
- Versioning: switched back to semver to more easily keep track of versions (#488)
|
||||
- Improved handling of open http connections and file handles (#463)
|
||||
- Fixes for latest urllib3, not verifying SSL certs (#467), (#469)
|
||||
- Better logging for invalid cdxlines and cookies (#477), (#478)
|
||||
- Fix warning in yaml.load (#472)
|
||||
- Index invalid form-data as binary (#471)
|
||||
|
||||
|
||||
pywb 2.2.20190410 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Improved rewriting of JSONP, support matching JSONP with ``//`` comments (fixes #459)
|
||||
|
||||
|
||||
pywb 2.2.20190311 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -736,7 +1181,7 @@ pywb 0.9.6 changelist
|
||||
pywb 0.9.5 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* s3 loading: support ``s3://`` scheme in block loader, allowing for loading index and archive files from s3. ``boto`` library must be installed seperately
|
||||
* s3 loading: support ``s3://`` scheme in block loader, allowing for loading index and archive files from s3. ``boto`` library must be installed separately
|
||||
via ``pip install boto``. Attempt default boto auth path, and if that fails, attempt anonymous s3 connection.
|
||||
|
||||
* Wombat/Client-Side Rewrite Customizations: New ``rewrite_opts.client`` settings from ``config.yaml`` are passed directly to wombat as json.
|
||||
@ -832,7 +1277,7 @@ pywb 0.9.1 changelist
|
||||
|
||||
* cdx server query: add support for ``url=*.host`` and ``url=host/*`` as shortcuts for ``matchType=domain`` and ``matchType=prefix``
|
||||
|
||||
* zipnum cdx cluster: support loading index shared from prefix path instead of seperate location file.
|
||||
* zipnum cdx cluster: support loading index shared from prefix path instead of separate location file.
|
||||
|
||||
The ``shard_index_loc`` config property may contain match and replace properties.
|
||||
Regex replacement is then used to obtain path prefix from the shard prefix path.
|
||||
@ -1198,7 +1643,7 @@ pywb 0.4.7 changelist
|
||||
|
||||
* Rewrite: Parsing of html as raw bytes instead of decode/encode, detection still needed for non-ascii compatible encoding.
|
||||
|
||||
* Indexing: Refactoring of cdx-indexer using a seperate 'archive record iterator' and pluggable cdx writer classes. Groundwork for creating custom indexers.
|
||||
* Indexing: Refactoring of cdx-indexer using a separate 'archive record iterator' and pluggable cdx writer classes. Groundwork for creating custom indexers.
|
||||
|
||||
* Indexing: Support for 9 field cdx formats with -9 flag.
|
||||
|
||||
|
68
CONTRIBUTING.md
Normal file
68
CONTRIBUTING.md
Normal file
@ -0,0 +1,68 @@
|
||||
# pywb contributing guide
|
||||
|
||||
Thank you for your interest in contributing to pywb and open source web archiving tools!
|
||||
|
||||
If you have a question not covered below or are interested in collaborating, please feel free to reach out via any of our [contact points](https://webrecorder.net/contact).
|
||||
|
||||
## How to contribute to pywb
|
||||
|
||||
### I found a bug
|
||||
|
||||
Please take a look at the [open issues](https://github.com/webrecorder/pywb/issues) to see if someone else has already described the same issue and if so, leave any comments or suggestions there.
|
||||
|
||||
If no such issue already exists, feel free to [open a new issue](https://github.com/webrecorder/pywb/issues/new/choose) using the Bug Report template. If the bug is specifically related to replay of a particular site, instead use the Replay Issue template.
|
||||
|
||||
When opening an issue or commenting on an open issue, please describe the problem you are having, any steps required to reproduce the bug (including the pywb version affected), and include any contextual information or screenshots that may be helpful.
|
||||
|
||||
### I wrote a patch to fix a bug
|
||||
|
||||
Please open a new pull request with a description of the changes and a link to the related issue (if no issue yet exists, please create one first).
|
||||
|
||||
Create a new branch with a short descriptive name including the issue number, based on the latest `main` branch.
|
||||
|
||||
All changes should be submitted with test coverage for the change as well as updates to the project documentation if appropriate.
|
||||
|
||||
Avoid making unnecessary changes such as reformatting code or otherwise touching parts of the codebase that are not directly relevant to the issue at hand.
|
||||
|
||||
We do our best to review pull requests in a timely manner but as we are a small team with many projects we cannot guarantee a response or merging timeline. Webrecorder reserves the right to reject pull requests that do not fit the direction of the project or ethics of the Webrecorder project.
|
||||
|
||||
The Development section below has information on how to get started with working on pywb in a local development environment.
|
||||
|
||||
### I want to propose a new feature
|
||||
|
||||
Please take a look at the [open issues](https://github.com/webrecorder/pywb/issues) to see if someone else has already proposed a similar feature and if so, leave any comments or suggestions there.
|
||||
|
||||
If no such issue already exists, feel free to [open a new issue](https://github.com/webrecorder/pywb/issues/new/choose) using the Feature Request template.
|
||||
|
||||
## Development
|
||||
|
||||
The [pywb documentation](https://pywb.readthedocs.io/en/latest/) contains information on pywb's architecture, configuration file, and how to get started with the software locally or in a Docker container.
|
||||
|
||||
The project root directory contains a basic [Docker Compose](https://docs.docker.com/compose/) configuration file, which can be used to easily start a development environment. After installing Docker Desktop and Docker Compose (if not installed with Desktop), to run pywb in detached mode on `localhost:8080`, run:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
(Note: this example assumes a newer version of Docker Desktop. For older versions that did not bundle Compose, you may need to replace `docker compose` with `docker-compose`)
|
||||
|
||||
The first time you run this command, it may take some time to build.
|
||||
|
||||
Changes to the [Vue](https://vuejs.org/) frontend components require rebuilding the Vue bundle (`pywb/static/vue/vueui.js`) to take effect. After making changes to one or more Vue components, you can rebuild the static bundle and view the changes in your development environment like so:
|
||||
|
||||
```bash
|
||||
./build-vue-ui.sh
|
||||
docker compose up -d --build --force-recreate
|
||||
```
|
||||
|
||||
Changes that modify pywb's Python dependencies or the operating system also require rebuilding the container:
|
||||
|
||||
```bash
|
||||
docker compose up -d --build --force-recreate
|
||||
```
|
||||
|
||||
To stop the container:
|
||||
|
||||
```bash
|
||||
docker compose down
|
||||
```
|
@ -1,4 +1,4 @@
|
||||
ARG PYTHON=python:3.7.2
|
||||
ARG PYTHON=python:3.8
|
||||
|
||||
FROM $PYTHON
|
||||
|
||||
|
26
MANIFEST.in
Normal file
26
MANIFEST.in
Normal file
@ -0,0 +1,26 @@
|
||||
include LICENSE
|
||||
include *.rst
|
||||
include *requirements.txt
|
||||
include *.yaml
|
||||
include *.yml
|
||||
include .gitmodules
|
||||
include .dockerignore
|
||||
include Dockerfile
|
||||
include Vagrantfile
|
||||
include uwsgi.ini
|
||||
include run-tests.py
|
||||
include *.sh
|
||||
recursive-include static *.js
|
||||
recursive-include pywb *.ini
|
||||
recursive-include pywb *.md
|
||||
recursive-include pywb *.py
|
||||
recursive-include pywb *.yaml
|
||||
recursive-include sample_archive *.aclj
|
||||
recursive-include tests *.po
|
||||
recursive-include tests *.yaml
|
||||
recursive-include tests_disabled *.py
|
||||
recursive-include tests_disabled *.yaml
|
||||
recursive-include docs *.bat
|
||||
recursive-include docs *.py
|
||||
recursive-include docs *.rst
|
||||
recursive-include docs Makefile
|
5
NOTICE
Normal file
5
NOTICE
Normal file
@ -0,0 +1,5 @@
|
||||
pywb
|
||||
Copyright 2014-2020 Webrecorder Software, Rhizome, and Contributors.
|
||||
|
||||
Distributed under the GNU General Public License v3.
|
||||
See LICENSE for details.
|
87
README.rst
87
README.rst
@ -1,11 +1,11 @@
|
||||
Webrecorder pywb 2.2
|
||||
Webrecorder pywb 2.8
|
||||
====================
|
||||
|
||||
.. image:: https://travis-ci.org/webrecorder/pywb.svg?branch=master
|
||||
:target: https://travis-ci.org/webrecorder/pywb
|
||||
.. image:: https://ci.appveyor.com/api/projects/status/qxnbunw65o929599/branch/master?svg=true
|
||||
:target: https://ci.appveyor.com/project/webrecorder/pywb/branch/master
|
||||
.. image:: https://codecov.io/gh/webrecorder/pywb/branch/master/graph/badge.svg
|
||||
.. image:: https://raw.githubusercontent.com/webrecorder/pywb/main/pywb/static/pywb-logo.png
|
||||
|
||||
.. image:: https://github.com/webrecorder/pywb/workflows/CI/badge.svg
|
||||
:target: https://github.com/webrecorder/pywb/actions
|
||||
.. image:: https://codecov.io/gh/webrecorder/pywb/branch/main/graph/badge.svg
|
||||
:target: https://codecov.io/gh/webrecorder/pywb
|
||||
|
||||
Web Archiving Tools for All
|
||||
@ -13,7 +13,7 @@ Web Archiving Tools for All
|
||||
|
||||
`View the full pywb documentation <https://pywb.readthedocs.org>`_
|
||||
|
||||
**pywb** is a Python (2 and 3) web archiving toolkit for replaying web archives large and small as accurately as possible.
|
||||
**pywb** is a Python 3 web archiving toolkit for replaying web archives large and small as accurately as possible.
|
||||
The toolkit now also includes new features for creating high-fidelity web archives.
|
||||
|
||||
This toolset forms the foundation of the Webrecorder project, but also provides a generic web archiving toolkit
|
||||
@ -23,7 +23,7 @@ that is used by other web archives, including the traditional "Wayback Machine"
|
||||
New Features
|
||||
^^^^^^^^^^^^
|
||||
|
||||
The 2.0 release included a major overhaul of pywb and introduces many new features, including the following:
|
||||
The 2.x release included a major overhaul of pywb and introduced many new features, including the following:
|
||||
|
||||
* Dynamic multi-collection configuration system with no-restart updates.
|
||||
|
||||
@ -37,44 +37,71 @@ The 2.0 release included a major overhaul of pywb and introduces many new featur
|
||||
|
||||
* Flexible rewriting system with pluggable rewriters for different content-types.
|
||||
|
||||
* Significantly improved client-side rewriting to handle most modern web sites.
|
||||
* Standalone, modular `client-side rewriting system (wombat.js) <https://github.com/webrecorder/wombat>`_ to handle most modern web sites.
|
||||
|
||||
* Improved 'calendar' query UI, grouping results by year and month, and updated replay banner.
|
||||
* Improved 'calendar' query UI with incremental loading, grouping results by year and month, and updated replay banner.
|
||||
|
||||
* Extensible UI customizations system for modifying all aspects of the UI.
|
||||
|
||||
* Robust access control system for blocking or excluding URLs, by prefix or by exact match.
|
||||
|
||||
* New in 2.6: Access Control embargo and http-header control access settings.
|
||||
|
||||
* New in 2.6: Support for localization and multi-language deployment.
|
||||
|
||||
* New in 2.7: New banner/calendar UI written in `Vue <https://vuejs.org/>`_, with interactive timeline and easier theming of colors and logo via ``config.yaml``.
|
||||
|
||||
|
||||
Please see the `full documentation <https://pywb.readthedocs.org>`_ for more detailed info on all these features.
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
Installation for Deployment
|
||||
---------------------------
|
||||
|
||||
To run and install locally you can:
|
||||
To install pywb for usage, you can use:
|
||||
|
||||
* Install with ``python setup.py install``
|
||||
``pip install pywb``
|
||||
|
||||
* Run tests with ``python setup.py test``
|
||||
Note: depending on your Python installation, you may have to use ``pip3`` instead of ``pip``.
|
||||
|
||||
* Run Wayback with ``wayback`` (see docs for info on how to setup collections)
|
||||
|
||||
* Build docs locally with: ``cd docs; make html``. (The docs will be built in ``./_build/html/index.html``)
|
||||
Installation from local copy
|
||||
----------------------------
|
||||
|
||||
``git clone https://github.com/webrecorder/pywb``
|
||||
|
||||
To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``.
|
||||
|
||||
To run tests, we recommend installing ``pip install tox tox-current-env`` and then running ``tox --current-env`` to test in your current Python environment.
|
||||
|
||||
To build docs locally, run: ``cd docs; make html``. (The docs will be built in ``./_build/html/index.html``)
|
||||
|
||||
|
||||
Running
|
||||
-------
|
||||
|
||||
After installation, you can run ``pywb`` or ``wayback``.
|
||||
|
||||
Consult the local or `online docs <https://pywb.readthedocs.org>`_ for latest usage and configuration details.
|
||||
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
The pywb documentation is extensive. Some links to a few key guides:
|
||||
|
||||
* `Getting Started Guide <https://pywb.readthedocs.io/en/latest/manual/usage.html#getting-started>`_
|
||||
|
||||
* `Embargo and Access Control Guide <https://pywb.readthedocs.io/en/latest/manual/access-control.html>`_
|
||||
|
||||
* `Localization and Multi-Language Guide <https://pywb.readthedocs.io/en/latest/manual/localization.html>`_
|
||||
|
||||
* `Deployment Guide <https://pywb.readthedocs.io/en/latest/manual/usage.html#deployment>`_
|
||||
|
||||
* `OpenWayback Transition Guide <https://pywb.readthedocs.io/en/latest/manual/owb-transition.html>`_
|
||||
|
||||
|
||||
Contributions & Bug Reports
|
||||
---------------------------
|
||||
|
||||
Users are encouraged to fork and contribute to this project to keep improving web archiving tools.
|
||||
|
||||
A few key features are high on list of priorities, but have not yet been implemented, including:
|
||||
|
||||
* Url Exclusion System
|
||||
|
||||
* UI Improvements
|
||||
|
||||
If you are interested in contributing, especially to any of these areas, please let us know!
|
||||
|
||||
Otherwise, please take a look at `list of current issues <https://github.com/webrecorder/pywb/issues>`_ and feel free to open new ones about any aspect of pywb, including the new documentation.
|
||||
|
||||
|
||||
Users are encouraged to fork and contribute to this project to keep improving web archiving tools. Please consult the `contributing guide <CONTRIBUTING.md>`_ for information on how to contribute to pywb.
|
||||
|
@ -3,26 +3,29 @@ environment:
|
||||
CMD_IN_ENV: "cmd /E:ON /V:ON /C obvci_appveyor_python_build_env.cmd"
|
||||
|
||||
matrix:
|
||||
- PYTHON: "C:\\Python27"
|
||||
- PYTHON: "C:\\Python27-x64"
|
||||
- PYTHON: "C:\\Python35"
|
||||
- PYTHON: "C:\\Python35-x64"
|
||||
- PYTHON: "C:\\Python36"
|
||||
- PYTHON: "C:\\Python36-x64"
|
||||
- PYTHON: "C:\\Python37"
|
||||
- PYTHON: "C:\\Python37-x64"
|
||||
- PYTHON: "C:\\Python38"
|
||||
- PYTHON: "C:\\Python38-x64"
|
||||
|
||||
|
||||
|
||||
install:
|
||||
- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
|
||||
- "python -m pip install --upgrade pip"
|
||||
- "pip install -U setuptools"
|
||||
- "pip install MarkupSafe==1.1.1"
|
||||
- "pip install coverage pytest-cov"
|
||||
- "pip install cffi"
|
||||
- "pip install pyopenssl"
|
||||
- "pip install pypiwin32"
|
||||
- "pip install certauth boto3 youtube-dl pysocks"
|
||||
- "pip install codecov"
|
||||
- "pip install wheel"
|
||||
|
||||
build_script:
|
||||
- "python setup.py install"
|
2
babel.ini
Normal file
2
babel.ini
Normal file
@ -0,0 +1,2 @@
|
||||
[jinja2: pywb/templates/**.html]
|
||||
extensions=jinja2.ext.i18n,jinja2.ext.autoescape,jinja2.ext.with_
|
7
build-vue-ui.sh
Executable file
7
build-vue-ui.sh
Executable file
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
CURR_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
|
||||
|
||||
cd $CURR_DIR/pywb/vueui/
|
||||
yarn install
|
||||
yarn run build
|
9
build-wombat.sh
Executable file
9
build-wombat.sh
Executable file
@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
|
||||
CURR_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
|
||||
|
||||
cd $CURR_DIR/wombat
|
||||
export OUTPUT_DIR=../pywb/static/
|
||||
yarn install
|
||||
yarn run build-prod
|
||||
#cp ./dist/*.js ../pywb/static/
|
21
config.yaml
21
config.yaml
@ -1,6 +1,16 @@
|
||||
# pywb config file
|
||||
# ========================================
|
||||
#
|
||||
debug: true
|
||||
|
||||
# Uncomment to set banner colors and logo
|
||||
# ui:
|
||||
# logo: path/relative/from/static/logo.png
|
||||
# logo_home_url: https://example.com
|
||||
# navbar_background_hex: 0c49b0
|
||||
# navbar_color_hex: fff
|
||||
# navbar_light_buttons: true
|
||||
# disable_printing: true
|
||||
|
||||
collections:
|
||||
all: $all
|
||||
@ -11,9 +21,18 @@ collections:
|
||||
# Settings for each collection
|
||||
use_js_obj_proxy: true
|
||||
|
||||
# Memento support, enable
|
||||
# Enable Memento support
|
||||
enable_memento: true
|
||||
|
||||
# Replay content in an iframe
|
||||
framed_replay: true
|
||||
|
||||
redirect_to_exact: true
|
||||
|
||||
# Uncomment and change to set default locale
|
||||
# default_locale: en
|
||||
|
||||
# Uncomment to set available locales
|
||||
# locales:
|
||||
# - en
|
||||
# - ru
|
||||
|
10
docker-compose.yaml
Normal file
10
docker-compose.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
pywb:
|
||||
build: .
|
||||
ports:
|
||||
- 8080:8080
|
||||
volumes:
|
||||
- ./config.yaml:/webarchive/config.yaml
|
||||
- ./sample_archive/:/webarchive/sample_archive/
|
@ -29,6 +29,7 @@ if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then
|
||||
else
|
||||
# initialize a collection if defined and not present
|
||||
if [ -n "$INIT_COLLECTION" ] && [ ! -d $VOLUME_DIR/collections/$INIT_COLLECTION ]; then
|
||||
cd $VOLUME_DIR
|
||||
wb-manager init $INIT_COLLECTION
|
||||
fi
|
||||
|
||||
|
@ -1,78 +1,77 @@
|
||||
pywb\.apps package
|
||||
==================
|
||||
pywb.apps package
|
||||
=================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.apps\.cli module
|
||||
----------------------
|
||||
pywb.apps.cli module
|
||||
--------------------
|
||||
|
||||
.. automodule:: pywb.apps.cli
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.apps\.frontendapp module
|
||||
------------------------------
|
||||
pywb.apps.frontendapp module
|
||||
----------------------------
|
||||
|
||||
.. automodule:: pywb.apps.frontendapp
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.apps\.live module
|
||||
-----------------------
|
||||
pywb.apps.live module
|
||||
---------------------
|
||||
|
||||
.. automodule:: pywb.apps.live
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.apps\.rewriterapp module
|
||||
------------------------------
|
||||
pywb.apps.rewriterapp module
|
||||
----------------------------
|
||||
|
||||
.. automodule:: pywb.apps.rewriterapp
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.apps\.static\_handler module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: pywb.apps.static_handler
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.apps\.warcserverapp module
|
||||
pywb.apps.static\_handler module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: pywb.apps.warcserverapp
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.apps.static_handler
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.apps\.wayback module
|
||||
--------------------------
|
||||
pywb.apps.warcserverapp module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: pywb.apps.warcserverapp
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.apps.wayback module
|
||||
------------------------
|
||||
|
||||
.. automodule:: pywb.apps.wayback
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.apps\.wbrequestresponse module
|
||||
------------------------------------
|
||||
pywb.apps.wbrequestresponse module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: pywb.apps.wbrequestresponse
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.apps
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -1,30 +1,29 @@
|
||||
pywb\.indexer package
|
||||
=====================
|
||||
pywb.indexer package
|
||||
====================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.indexer\.archiveindexer module
|
||||
------------------------------------
|
||||
pywb.indexer.archiveindexer module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: pywb.indexer.archiveindexer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.indexer\.cdxindexer module
|
||||
--------------------------------
|
||||
pywb.indexer.cdxindexer module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: pywb.indexer.cdxindexer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.indexer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -1,38 +1,53 @@
|
||||
pywb\.manager package
|
||||
=====================
|
||||
pywb.manager package
|
||||
====================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.manager\.autoindex module
|
||||
-------------------------------
|
||||
pywb.manager.aclmanager module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: pywb.manager.aclmanager
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.manager.autoindex module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: pywb.manager.autoindex
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.manager\.manager module
|
||||
-----------------------------
|
||||
pywb.manager.locmanager module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: pywb.manager.locmanager
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.manager.manager module
|
||||
---------------------------
|
||||
|
||||
.. automodule:: pywb.manager.manager
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.manager\.migrate module
|
||||
-----------------------------
|
||||
pywb.manager.migrate module
|
||||
---------------------------
|
||||
|
||||
.. automodule:: pywb.manager.migrate
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.manager
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -1,46 +1,45 @@
|
||||
pywb\.recorder package
|
||||
======================
|
||||
pywb.recorder package
|
||||
=====================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.recorder\.filters module
|
||||
------------------------------
|
||||
pywb.recorder.filters module
|
||||
----------------------------
|
||||
|
||||
.. automodule:: pywb.recorder.filters
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.recorder\.multifilewarcwriter module
|
||||
------------------------------------------
|
||||
pywb.recorder.multifilewarcwriter module
|
||||
----------------------------------------
|
||||
|
||||
.. automodule:: pywb.recorder.multifilewarcwriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.recorder\.recorderapp module
|
||||
----------------------------------
|
||||
pywb.recorder.recorderapp module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: pywb.recorder.recorderapp
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.recorder\.redisindexer module
|
||||
-----------------------------------
|
||||
pywb.recorder.redisindexer module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: pywb.recorder.redisindexer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.recorder
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -1,142 +1,149 @@
|
||||
pywb\.rewrite package
|
||||
=====================
|
||||
pywb.rewrite package
|
||||
====================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.rewrite\.content\_rewriter module
|
||||
---------------------------------------
|
||||
pywb.rewrite.content\_rewriter module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.content_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.cookie\_rewriter module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.cookie_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.cookies module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.cookies
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.default\_rewriter module
|
||||
---------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.default_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.header\_rewriter module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.header_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.html\_insert\_rewriter module
|
||||
--------------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.html_insert_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.html\_rewriter module
|
||||
pywb.rewrite.cookie\_rewriter module
|
||||
------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.html_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.rewrite.cookie_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.jsonp\_rewriter module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.jsonp_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.regex\_rewriters module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.regex_rewriters
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.rewrite\_amf module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewrite_amf
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.rewrite\_dash module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewrite_dash
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.rewrite\_hls module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewrite_hls
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.rewriteinputreq module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewriteinputreq
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.templateview module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.templateview
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.url\_rewriter module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.url_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.rewrite\.wburl module
|
||||
pywb.rewrite.cookies module
|
||||
---------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.wburl
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.rewrite.cookies
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.default\_rewriter module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.default_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.header\_rewriter module
|
||||
------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.header_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.html\_insert\_rewriter module
|
||||
------------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.html_insert_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.html\_rewriter module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.html_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.jsonp\_rewriter module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.jsonp_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.regex\_rewriters module
|
||||
------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.regex_rewriters
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.rewrite\_amf module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewrite_amf
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.rewrite\_dash module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewrite_dash
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.rewrite\_hls module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewrite_hls
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.rewrite\_js\_workers module
|
||||
----------------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewrite_js_workers
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.rewriteinputreq module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.rewriteinputreq
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.templateview module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.templateview
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.url\_rewriter module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.url_rewriter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.rewrite.wburl module
|
||||
-------------------------
|
||||
|
||||
.. automodule:: pywb.rewrite.wburl
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.rewrite
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -5,19 +5,31 @@ Subpackages
|
||||
-----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
pywb.apps
|
||||
pywb.indexer
|
||||
pywb.manager
|
||||
pywb.recorder
|
||||
pywb.rewrite
|
||||
pywb.utils
|
||||
pywb.warcserver
|
||||
pywb.apps
|
||||
pywb.indexer
|
||||
pywb.manager
|
||||
pywb.recorder
|
||||
pywb.rewrite
|
||||
pywb.utils
|
||||
pywb.warcserver
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb.version module
|
||||
-------------------
|
||||
|
||||
.. automodule:: pywb.version
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -1,78 +1,85 @@
|
||||
pywb\.utils package
|
||||
===================
|
||||
pywb.utils package
|
||||
==================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.utils\.binsearch module
|
||||
-----------------------------
|
||||
pywb.utils.binsearch module
|
||||
---------------------------
|
||||
|
||||
.. automodule:: pywb.utils.binsearch
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.utils\.canonicalize module
|
||||
--------------------------------
|
||||
pywb.utils.canonicalize module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: pywb.utils.canonicalize
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.utils\.format module
|
||||
--------------------------
|
||||
pywb.utils.format module
|
||||
------------------------
|
||||
|
||||
.. automodule:: pywb.utils.format
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.utils\.geventserver module
|
||||
--------------------------------
|
||||
pywb.utils.geventserver module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: pywb.utils.geventserver
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.utils\.io module
|
||||
----------------------
|
||||
pywb.utils.io module
|
||||
--------------------
|
||||
|
||||
.. automodule:: pywb.utils.io
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.utils\.loaders module
|
||||
---------------------------
|
||||
pywb.utils.loaders module
|
||||
-------------------------
|
||||
|
||||
.. automodule:: pywb.utils.loaders
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.utils\.memento module
|
||||
---------------------------
|
||||
pywb.utils.memento module
|
||||
-------------------------
|
||||
|
||||
.. automodule:: pywb.utils.memento
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.utils\.wbexception module
|
||||
-------------------------------
|
||||
pywb.utils.merge module
|
||||
-----------------------
|
||||
|
||||
.. automodule:: pywb.utils.merge
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.utils.wbexception module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: pywb.utils.wbexception
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -1,70 +1,69 @@
|
||||
pywb\.warcserver\.index package
|
||||
===============================
|
||||
pywb.warcserver.index package
|
||||
=============================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.warcserver\.index\.aggregator module
|
||||
------------------------------------------
|
||||
pywb.warcserver.index.aggregator module
|
||||
---------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.index.aggregator
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.index\.cdxobject module
|
||||
-----------------------------------------
|
||||
pywb.warcserver.index.cdxobject module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.index.cdxobject
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.index\.cdxops module
|
||||
--------------------------------------
|
||||
pywb.warcserver.index.cdxops module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.index.cdxops
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.index\.fuzzymatcher module
|
||||
--------------------------------------------
|
||||
pywb.warcserver.index.fuzzymatcher module
|
||||
-----------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.index.fuzzymatcher
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.index\.indexsource module
|
||||
-------------------------------------------
|
||||
pywb.warcserver.index.indexsource module
|
||||
----------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.index.indexsource
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.index\.query module
|
||||
-------------------------------------
|
||||
pywb.warcserver.index.query module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.index.query
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.index\.zipnum module
|
||||
--------------------------------------
|
||||
pywb.warcserver.index.zipnum module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.index.zipnum
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.warcserver.index
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -1,46 +1,45 @@
|
||||
pywb\.warcserver\.resource package
|
||||
==================================
|
||||
pywb.warcserver.resource package
|
||||
================================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.warcserver\.resource\.blockrecordloader module
|
||||
----------------------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.resource.blockrecordloader
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.resource\.pathresolvers module
|
||||
------------------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.resource.pathresolvers
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.resource\.resolvingloader module
|
||||
--------------------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.resource.resolvingloader
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.resource\.responseloader module
|
||||
pywb.warcserver.resource.blockrecordloader module
|
||||
-------------------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.resource.responseloader
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.warcserver.resource.blockrecordloader
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.warcserver.resource.pathresolvers module
|
||||
---------------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.resource.pathresolvers
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.warcserver.resource.resolvingloader module
|
||||
-----------------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.resource.resolvingloader
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.warcserver.resource.responseloader module
|
||||
----------------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.resource.responseloader
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.warcserver.resource
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -1,70 +1,86 @@
|
||||
pywb\.warcserver package
|
||||
========================
|
||||
pywb.warcserver package
|
||||
=======================
|
||||
|
||||
Subpackages
|
||||
-----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
pywb.warcserver.index
|
||||
pywb.warcserver.resource
|
||||
pywb.warcserver.index
|
||||
pywb.warcserver.resource
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
pywb\.warcserver\.basewarcserver module
|
||||
---------------------------------------
|
||||
pywb.warcserver.access\_checker module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.basewarcserver
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.warcserver.access_checker
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.handlers module
|
||||
---------------------------------
|
||||
pywb.warcserver.amf module
|
||||
--------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.handlers
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.warcserver.amf
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.http module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.http
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.inputrequest module
|
||||
pywb.warcserver.basewarcserver module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.inputrequest
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.warcserver.basewarcserver
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.upstreamindexsource module
|
||||
--------------------------------------------
|
||||
pywb.warcserver.handlers module
|
||||
-------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.upstreamindexsource
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.warcserver.handlers
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb\.warcserver\.warcserver module
|
||||
pywb.warcserver.http module
|
||||
---------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.http
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.warcserver.inputrequest module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.warcserver
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
.. automodule:: pywb.warcserver.inputrequest
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.warcserver.upstreamindexsource module
|
||||
------------------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.upstreamindexsource
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
pywb.warcserver.warcserver module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: pywb.warcserver.warcserver
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: pywb.warcserver
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
@ -53,7 +53,7 @@ master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = 'pywb'
|
||||
copyright = 'A Webrecorder Project, Ilya Kreymer, Rhizome'
|
||||
copyright = '2014-2021, Webrecorder Software, Rhizome, and Contributors'
|
||||
author = 'Ilya Kreymer'
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
@ -61,9 +61,9 @@ author = 'Ilya Kreymer'
|
||||
# built documents.
|
||||
#
|
||||
# The short X.Y version.
|
||||
version = '2.0'
|
||||
version = '2.7'
|
||||
# The full version, including alpha/beta/rc tags.
|
||||
release = '2.0'
|
||||
release = '2.7'
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
|
@ -16,8 +16,12 @@ A subset of features provides the basic functionality of a "Wayback Machine".
|
||||
|
||||
manual/usage
|
||||
manual/configuring
|
||||
manual/access-control
|
||||
manual/ui-customization
|
||||
manual/localization
|
||||
manual/architecture
|
||||
manual/apis
|
||||
manual/owb-transition
|
||||
code/pywb
|
||||
|
||||
|
||||
|
289
docs/manual/access-control.rst
Normal file
289
docs/manual/access-control.rst
Normal file
@ -0,0 +1,289 @@
|
||||
.. _access-control:
|
||||
|
||||
Embargo and Access Control
|
||||
--------------------------
|
||||
|
||||
The embargo system allows for date-based rules to block access to captures based on their capture dates.
|
||||
|
||||
The access controls system provides additional URL-based rules to allow, block or exclude access to specific URL prefixes or exact URLs.
|
||||
|
||||
The embargo and access control rules are configured per collection.
|
||||
|
||||
Embargo Settings
|
||||
================
|
||||
|
||||
The embargo system allows restricting access to all URLs within a collection based on the timestamp of each URL.
|
||||
Access to these resources is 'embargoed' until the date range is adjusted or the time interval passes.
|
||||
|
||||
The embargo can be used to disallow access to captures based on the following criteria:
|
||||
|
||||
- Captures before an exact date
|
||||
- Captures after an exact date
|
||||
- Captures newer than a time interval
|
||||
- Captures older than a time interval
|
||||
|
||||
Embargo Before/After Exact Date
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To block access to all captures before or after a specific date, use the ``before`` or ``after`` embargo blocks
|
||||
with a specific timestamp.
|
||||
|
||||
For example, the following blocks access to all URLs captured before 2020-12-26 in the collection ``embargo-before``::
|
||||
|
||||
embargo-before:
|
||||
index_paths: ...
|
||||
archive_paths: ...
|
||||
embargo:
|
||||
before: '20201226'
|
||||
|
||||
|
||||
The following blocks access to all URLs captured on or after 2020-12-26 in collection ``embargo-after``::
|
||||
|
||||
embargo-after:
|
||||
index_paths: ...
|
||||
archive_paths: ...
|
||||
embargo:
|
||||
after: '20201226'
|
||||
|
||||
Embargo By Time Interval
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The embargo can also be set for a relative time interval, consisting of years, months, weeks and/or days.
|
||||
|
||||
|
||||
For example, the following blocks access to all URLs newer than 1 year::
|
||||
|
||||
embargo-newer:
|
||||
...
|
||||
embargo:
|
||||
newer:
|
||||
years: 1
|
||||
|
||||
|
||||
|
||||
The following blocks access to all URLs older than 1 year, 2 months, 3 weeks and 4 days::
|
||||
|
||||
embargo-older:
|
||||
...
|
||||
embargo:
|
||||
older:
|
||||
years: 1
|
||||
months: 2
|
||||
weeks: 3
|
||||
days: 4
|
||||
|
||||
|
||||
Any combination of years, months, weeks and days can be used (as long as at least one is provided) for the ``newer`` or ``older`` embargo settings.
|
||||
|
||||
|
||||
Access Control Settings
|
||||
=======================
|
||||
|
||||
Access Control Files (.aclj)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
URL-based access controls are set in one or more access control JSON files (.aclj), sorted in reverse alphabetical order.
|
||||
To determine the best match, a binary search is used (similar to CDXJ lookup) and then the best match is found forward.
|
||||
|
||||
An .aclj file may look as follows::
|
||||
|
||||
org,httpbin)/anything/something - {"access": "allow", "url": "http://httpbin.org/anything/something"}
|
||||
org,httpbin)/anything - {"access": "exclude", "url": "http://httpbin.org/anything"}
|
||||
org,httpbin)/ - {"access": "block", "url": "httpbin.org/"}
|
||||
com, - {"access": "allow", "url": "com,"}
|
||||
|
||||
|
||||
Each JSON entry contains an ``access`` field and the original ``url`` field that was used to convert to the SURT (if any).
|
||||
|
||||
The JSON entry may also contain a ``user`` field, as explained below.
|
||||
|
||||
The prefix consists of a SURT key and a ``-`` (currently reserved for a timestamp/date range field to be added later).
|
||||
|
||||
Given these rules, a user would:
|
||||
|
||||
* be allowed to visit ``http://httpbin.org/anything/something`` (allow)
|
||||
* but would receive an 'access blocked' error message when viewing ``http://httpbin.org/`` (block)
|
||||
* would receive a 404 not found error when viewing ``http://httpbin.org/anything`` (exclude)
|
||||
|
||||
To match any possible URL in an .aclj file, set ``*,`` as the leading SURT, for example::
|
||||
|
||||
*, - {"access": "allow"}
|
||||
|
||||
Lines starting with ``*,`` should generally be at the end of the file, respecting the reverse alphabetical order.
|
||||
|
||||
|
||||
Access Types: allow, block, exclude, allow_ignore_embargo
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The available access types are as follows:
|
||||
|
||||
- ``exclude`` - when matched, results are excluded from the index, as if they do not exist. User will receive a 404.
|
||||
- ``block`` - when matched, results are not excluded from the index, but access to the actual content is blocked. User will see a 451.
|
||||
- ``allow`` - full access to the index and the resource, but may be overridden by embargo.
|
||||
- ``allow_ignore_embargo`` - full access to the index and resource, overriding any embargo settings.
|
||||
|
||||
The difference between ``exclude`` and ``block`` is that when blocked, the user can be notified that access is blocked, while
|
||||
with exclude, no trace of the resource is presented to the user.
|
||||
|
||||
The use of ``allow`` is useful to provide access to more specific resources within a broader block/exclude rule, while ``allow_ignore_embargo``
|
||||
can be used to override any embargo settings.
|
||||
|
||||
If both are present, the embargo restrictions are checked first and take precedence, unless the ``allow_ignore_embargo`` option is used
|
||||
to override the embargo.
|
||||
|
||||
|
||||
User-Based Access Controls
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The access control rules can further be customized by specifying different permissions for different 'users'. Since pywb does not have a user system,
|
||||
a special header, ``X-Pywb-ACL-User`` can be used to indicate a specific user.
|
||||
|
||||
This setting is designed to allow a more privileged user to access additional content or override an embargo.
|
||||
|
||||
For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access for the ``staff`` user::
|
||||
|
||||
com,example)/restricted - {"access": "allow", "user": "staff"}
|
||||
com,example)/restricted - {"access": "block"}
|
||||
|
||||
|
||||
Combined with the embargo settings, this can also be used to override the embargo for internal organizational users, while keeping the embargo for general access::
|
||||
|
||||
com,example)/restricted - {"access": "allow_ignore_embargo", "user": "staff"}
|
||||
com,example)/restricted - {"access": "allow"}
|
||||
|
||||
To make this work, pywb must be running behind an Apache or Nginx system that is configured to set ``X-Pywb-ACL-User: staff`` based on certain settings.
|
||||
|
||||
For example, this header may be set based on IP range, or based on password authentication.
|
||||
|
||||
To allow a user access to all URLs, overriding more specific rules and the ``default_access`` configuration setting, use the ``*,`` SURT::
|
||||
|
||||
*, - {"access": "allow", "user": "staff"}
|
||||
|
||||
Further examples of how to set this header will be provided in the deployments section.
|
||||
|
||||
**Note: Do not use the user-based rules without configuring proper authentication on an Apache or Nginx frontend to set or remove this header, otherwise the 'X-Pywb-ACL-User' can easily be faked.**
|
||||
|
||||
See the :ref:`config-acl-header` section in Usage for examples on how to configure this header.
|
||||
|
||||
|
||||
Access Error Messages
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The special error code 451 is used to indicate that a resource has been blocked (access setting ``block``).
|
||||
|
||||
The `error.html <https://github.com/webrecorder/pywb/blob/master/pywb/templates/error.html>`_ template contains a special message for this access and can be customized further.
|
||||
|
||||
By design, resources that are ``exclude``-ed simply appear as 404 not found and no special error is provided.
|
||||
|
||||
|
||||
Managing Access Lists via Command-Line
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The .aclj files need not ever be added or edited manually.
|
||||
|
||||
The pywb ``wb-manager`` utility has been extended to provide tools for adding, removing and checking access control rules.
|
||||
|
||||
The access rules are written to ``<collection>/acl/access-rules.aclj`` for a given collection ``<collection>`` for automatic collections.
|
||||
|
||||
For example, to add the first line to an ACL file ``access.aclj``, one could run::
|
||||
|
||||
wb-manager acl add <collection> http://httpbin.org/anything/something exclude
|
||||
|
||||
|
||||
The URL supplied can be a URL or a SURT prefix. If a SURT is supplied, it is used as is::
|
||||
|
||||
wb-manager acl add <collection> com, allow
|
||||
|
||||
|
||||
A specific user for user-based rules can also be specified, for example to add ``allow_ignore_embargo`` for user ``staff`` only, run::
|
||||
|
||||
wb-manager acl add <collection> http://httpbin.org/anything/something allow_ignore_embargo -u staff
|
||||
|
||||
|
||||
By default, access control rules apply to a prefix of a given URL or SURT.
|
||||
|
||||
To have the rule apply only to the exact match, use::
|
||||
|
||||
wb-manager acl add <collection> http://httpbin.org/anything/something allow --exact-match
|
||||
|
||||
Rules added with and without the ``--exact-match`` flag are considered distinct rules, and can be added
|
||||
and removed separately.
|
||||
|
||||
With the above rules, ``http://httpbin.org/anything/something`` would be allowed, but
|
||||
``http://httpbin.org/anything/something/subpath`` would be excluded for any ``subpath``.
|
||||
|
||||
To remove a rule, one can run::
|
||||
|
||||
wb-manager acl remove <collection> http://httpbin.org/anything/something
|
||||
|
||||
To import rules in bulk, such as from an OpenWayback-style excludes.txt and mark them as ``exclude``::
|
||||
|
||||
wb-manager acl importtxt <collection> ./excludes.txt exclude
|
||||
|
||||
|
||||
See ``wb-manager acl -h`` for a list of additional commands such as for validating rules files and running a match against
|
||||
an existing rule set.
|
||||
|
||||
|
||||
|
||||
Access Controls for Custom Collections
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
For manually configured collections, there are additional options for configuring access controls.
|
||||
The access control files can be specified explicitly using the ``acl_paths`` key and allow specifying multiple ACL files,
|
||||
and allow sharing access control files between different collections.
|
||||
|
||||
Single ACLJ::
|
||||
|
||||
collections:
|
||||
test:
|
||||
acl_paths: ./path/to/file.aclj
|
||||
default_access: block
|
||||
|
||||
|
||||
|
||||
Multiple ACLJ::
|
||||
|
||||
collections:
|
||||
test:
|
||||
acl_paths:
|
||||
- ./path/to/allows.aclj
|
||||
- ./path/to/blocks.aclj
|
||||
- ./path/to/other.aclj
|
||||
- ./path/to/directory
|
||||
|
||||
default_access: block
|
||||
|
||||
The ``acl_paths`` can be a single entry or a list, and can also include directories. If a directory is specified, all ``.aclj`` files
|
||||
in the directory are checked.
|
||||
|
||||
When finding the best rule from multiple ``.aclj`` files, each file is binary searched and the result
|
||||
set merge-sorted to find the best match (very similar to the CDXJ index lookup).
|
||||
|
||||
Note: It might make sense to separate ``allows.aclj`` and ``blocks.aclj`` into individual files for organizational reasons,
|
||||
but there is no specific need to keep more than one access control file.
|
||||
|
||||
Finally, ACLJ and embargo settings combined for the same collection might look as follows::
|
||||
|
||||
collections:
|
||||
test:
|
||||
...
|
||||
embargo:
|
||||
newer:
|
||||
days: 366
|
||||
|
||||
acl_paths:
|
||||
- ./path/to/allows.aclj
|
||||
- ./path/to/blocks.aclj
|
||||
|
||||
|
||||
Default Access
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
An additional ``default_access`` setting can be added to specify the default rule if no other rules match for custom collections.
|
||||
If omitted, this setting is ``default_access: allow``, which is usually the desired default.
|
||||
|
||||
Setting ``default_access: block`` and providing a list of ``allow`` rules provides a flexible way to allow access
|
||||
to only a limited set of resources, and block access to anything out of scope by default.
|
||||
|
||||
|
@ -46,6 +46,7 @@ It can be used to:
|
||||
|
||||
* Create a new collection -- ``wb-manager init <coll>``
|
||||
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
|
||||
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
|
||||
* Add override templates
|
||||
* Add and remove metadata to a collection's ``metadata.yaml``
|
||||
* List all collections
|
||||
|
@ -19,7 +19,7 @@ For example, the following query might return the first 10 results from host ``h
|
||||
http://localhost:8080/coll/cdx?url=http://example.com/*&page=1&filter=mime:text/html&limit=10
|
||||
|
||||
|
||||
By default, the api endpoint is available at ``/<coll>/cdx`` for every collection.
|
||||
By default, the api endpoint is available at ``/<coll>/cdx`` for a collection named ``<coll>``.
|
||||
|
||||
The setting can be changed by setting ``cdx_api_endpoint`` in ``config.yaml``.
|
||||
|
||||
@ -36,9 +36,10 @@ API Reference
|
||||
^^^^^^^
|
||||
|
||||
| The only required parameter to the cdx server api is the url, ex:
|
||||
| ``http://localhost:8080/coll-cdx?url=example.com``
|
||||
| ``http://localhost:8080/coll/cdx?url=example.com``
|
||||
|
||||
will return a list of captures for ‘example.com’
|
||||
will return a list of captures for ‘example.com’ in the collection
|
||||
``coll`` (see above regarding per-collection api endpoints).
|
||||
|
||||
|
||||
``from, to``
|
||||
@ -50,7 +51,7 @@ given date/time range (inclusive).
|
||||
Timestamps may be <=14 digits and will be padded to either lower or
|
||||
upper bound.
|
||||
|
||||
| For example, ``...coll-cdx?url=example.com&from=2014&to=2014`` will
|
||||
| For example, ``...?url=example.com&from=2014&to=2014`` will
|
||||
return results of ``example.com`` that
|
||||
| have a timestamp between ``20140101000000`` and ``20141231235959``
|
||||
|
||||
@ -75,11 +76,11 @@ The cdx server supports the following ``matchType``
|
||||
As a shortcut, instead of specifying a separate ``matchType`` parameter,
|
||||
wildcards may be used in the url:
|
||||
|
||||
- ``...coll-cdx?url=http://example.com/path/*`` is equivalent to
|
||||
``...coll-cdx?url=http://example.com/path/&matchType=prefix``
|
||||
- ``...?url=http://example.com/path/*`` is equivalent to
|
||||
``...?url=http://example.com/path/&matchType=prefix``
|
||||
|
||||
- ``...coll-cdx?url=*.example.com`` is equivalent to
|
||||
``...coll-cdx?url=example.com&matchType=domain``
|
||||
- ``...?url=*.example.com`` is equivalent to
|
||||
``...?url=example.com&matchType=domain``
|
||||
|
||||
*Note: if you are using legacy cdx index files which are not
|
||||
SURT-ordered, the ``domain`` option will not be available. If this is
|
||||
@ -141,10 +142,10 @@ The ``filter`` param can be specified multiple times to filter by
|
||||
specific fields in the cdx index. Field names correspond to the fields
|
||||
returned in the JSON output. Filters can be specified as follows:
|
||||
|
||||
- ``...coll-cdx?url=example.com/*&filter==mime:text/html&filter=!=status:200``
|
||||
- ``...?url=example.com/*&filter==mime:text/html&filter=!=status:200``
|
||||
Return captures from example.com/\* where mime is text/html and http
|
||||
status is not 200.
|
||||
- ``...coll-cdx?url=example.com&matchType=domain&filter=~url:.*\.php$``
|
||||
- ``...?url=example.com&matchType=domain&filter=~url:.*\.php$``
|
||||
Return captures from the domain example.com whose URL ends in
|
||||
``.php``.
|
||||
|
||||
@ -181,10 +182,10 @@ the following modifiers:
|
||||
+---------------+-----------------------------+------------------------------------+
|
||||
|
||||
|
||||
``fl``
|
||||
^^^^^^
|
||||
``fields``
|
||||
^^^^^^^^^^
|
||||
|
||||
The ``fl`` param can be used to specify which fields to include in the
|
||||
The ``fields`` param can be used to specify which fields to include in the
|
||||
output. The standard available fields are usually: ``urlkey``,
|
||||
``timestamp``, ``url``, ``mime``, ``status``, ``digest``, ``length``,
|
||||
``offset``, ``filename``
|
||||
@ -193,7 +194,7 @@ If a minimal cdx index is used, the ``mime`` and ``status`` fields may
|
||||
not be available. Additional fields may be introduced in the future,
|
||||
especially in the CDX JSON format.
|
||||
|
||||
Fields can be comma delimited, for example ``fl=urlkey,timestamp`` will
|
||||
Fields can be comma delimited, for example ``fields=urlkey,timestamp`` will
|
||||
only include the ``urlkey`` and ``timestamp`` fields in the
|
||||
output.
|
||||
|
||||
|
@ -16,8 +16,19 @@ With **framed replay**, the archived content is loaded into an iframe, and a top
|
||||
In this mode, the top frame url is for example, ``http://my-archive.example.com/<coll name>/http://example.com/`` while
|
||||
the actual content is served at ``http://my-archive.example.com/<coll name>/mp_/http://example.com/``
|
||||
|
||||
With **frameless replay**, the archived content is loaded directly. As of pywb 2.7, frameless replay is bannerless
|
||||
unless a custom banner is added via the ``custom_banner.html`` template.
|
||||
|
||||
|
||||
.. warning::
|
||||
pywb 2.7 introduces a breaking change around frameless replay and banners.
|
||||
Any custom banner intended to be used with frameless replay in pywb 2.7 and
|
||||
higher must be specified in the ``custom_banner.html`` template. This may
|
||||
require moving custom content from ``banner.html`` to the new
|
||||
``custom_banner.html``.
|
||||
|
||||
The default banner will no longer be served in frameless replay.
|
||||
|
||||
With **frameless replay**, the archived content is loaded directly, and a banner UI is injected into the page.
|
||||
|
||||
In this mode, the content is served directly at ``http://my-archive.example.com/<coll name>/http://example.com/``
|
||||
|
||||
@ -34,6 +45,8 @@ To disable framed replay add:
|
||||
Note: pywb also supports HTTP/S **proxy mode** which requires additional setup. See :ref:`https-proxy` for more details.
|
||||
|
||||
|
||||
.. _dir_structure:
|
||||
|
||||
Directory Structure
|
||||
-------------------
|
||||
|
||||
@ -53,13 +66,18 @@ The default directory structure for a web archive is as follows::
|
||||
|
|
||||
+-- <coll name>
|
||||
|
|
||||
+-- archives
|
||||
+-- archive
|
||||
| |
|
||||
| +-- (WARC or ARC files here)
|
||||
|
|
||||
+-- indexes
|
||||
| |
|
||||
| +-- (CDXJ index files here)
|
||||
|
|
||||
|
|
||||
+-- acl
|
||||
| |
|
||||
| +-- (.aclj access control files)
|
||||
|
|
||||
+-- templates
|
||||
| |
|
||||
@ -105,94 +123,18 @@ When resolving a ``example.warc.gz``, pywb will then check (in order):
|
||||
* Then, ``http://remote-backup.example.com/collections/<coll name>/example.warc.gz`` (if first lookup unsuccessful)
|
||||
|
||||
|
||||
Access Controls
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
With pywb 2.4, pywb includes an extensible :ref:`access-control` system.
|
||||
By default, the access control files are stored in ``acl`` directory of each collection.
|
||||
|
||||
|
||||
UI Customizations
|
||||
-----------------
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
pywb supports UI customizations, either for an entire archive,
|
||||
or per-collection.
|
||||
|
||||
Static Files
|
||||
^^^^^^^^^^^^
|
||||
|
||||
The replay server will automatically support static files placed under the following directories:
|
||||
|
||||
* Files under the root ``static`` directory can be accessed via ``http://my-archive.example.com/static/<filename>``
|
||||
|
||||
* Files under the per-collection ``./collections/<coll name>/static`` directory can be accessed via ``http://my-archive.example.com/static/_/<coll name>/<filename>``
|
||||
|
||||
Templates
|
||||
^^^^^^^^^
|
||||
|
||||
pywb uses Jinja2 templates to render the HTML for all aspects of the application.
|
||||
A version placed in the ``templates`` directory, either in the root or per collection, will override that template.
|
||||
|
||||
To copy the default pywb template to the template directory run:
|
||||
|
||||
``wb-manager template --add search_html``
|
||||
|
||||
The following templates are available:
|
||||
|
||||
* ``home.html`` -- Home Page Template, used for ``http://my-archive.example.com/``
|
||||
|
||||
* ``search.html`` -- Collection Template, used for each collection page ``http://my-archive.example.com/<coll name>/``
|
||||
|
||||
* ``query.html`` -- Capture Query Page for a given url, used for ``http://my-archive.example.com/<coll name>/*/<url>``
|
||||
|
||||
Error Pages:
|
||||
|
||||
* ``not_found.html`` -- Page to show when a url is not found in the archive
|
||||
|
||||
* ``error.html`` -- Generic Error Page for any error (except not found)
|
||||
|
||||
Replay and Banner templates:
|
||||
|
||||
* ``frame_insert.html`` -- Top-frame for framed replay mode (not used with frameless mode)
|
||||
|
||||
* ``head_insert.html`` -- Rewriting code injected into ``<head>`` of each replayed page.
|
||||
This template includes the banner template and itself should generally not need to be modified.
|
||||
|
||||
* ``banner.html`` -- The banner used for frameless replay. Can be set to blank to disable the banner.
|
||||
|
||||
|
||||
Custom Outer Replay Frame
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The top-frame used for framed replay can be replaced or augmented
|
||||
by modifying the ``frame_insert.html``.
|
||||
|
||||
To start with modifying the default outer page, you can add it to the current
|
||||
templates directory by running ``wb-manager template --add frame_insert_html``
|
||||
|
||||
To initialize the replay, the outer page should include ``wb_frame.js``,
|
||||
create an ``<iframe>`` element and pass the id (or element itself) to the ``ContentFrame`` constructor:
|
||||
|
||||
.. code-block:: html
|
||||
|
||||
<script src='{{ host_prefix }}/{{ static_path }}/wb_frame.js'> </script>
|
||||
<script>
|
||||
var cframe = new ContentFrame({"url": "{{ url }}" + window.location.hash,
|
||||
"prefix": "{{ wb_prefix }}",
|
||||
"request_ts": "{{ wb_url.timestamp }}",
|
||||
"iframe": "#replay_iframe"});
|
||||
</script>
|
||||
|
||||
|
||||
The outer frame can receive notifications of changes to the replay via ``postMessage``
|
||||
|
||||
For example, to detect when the content frame changed and log the new url and timestamp,
|
||||
add the following script to the outer frame html:
|
||||
|
||||
.. code-block:: javascript
|
||||
|
||||
window.addEventListener("message", function(event) {
|
||||
if (event.data.wb_type == "load" || event.data.wb_type == "replace-url") {
|
||||
console.log("New Url: " + event.data.url);
|
||||
console.log("New Timestamp: " + event.data.ts);
|
||||
}
|
||||
});
|
||||
|
||||
The ``load`` message is sent when a new page is first loaded, while ``replace-url`` is used
|
||||
for url changes caused by content frame History navigation.
|
||||
The ``templates`` directory supports custom Jinja templates to allow customizing the UI.
|
||||
See :ref:`ui-customizations` for more details on available options.
|
||||
|
||||
|
||||
Special and Custom Collections
|
||||
@ -335,7 +277,7 @@ The full set of configurable options (with their default settings) is as follows
|
||||
rollover_idle_secs: 600
|
||||
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
|
||||
source_filter: live
|
||||
|
||||
enable_put_custom_record: false
|
||||
|
||||
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
|
||||
Most likely this will be the :ref:`live-web` collection, which should also be defined.
|
||||
@ -365,6 +307,70 @@ If running with auto indexing, the WARC will also get automatically indexed and
|
||||
As a shortcut, ``recorder: live`` can also be used to specify only the ``source_coll`` option.
|
||||
|
||||
|
||||
Dedup Options for Recording
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
By default, recording mode will record every URL.
|
||||
|
||||
Starting with pywb 2.5.0, it is possible to configure pywb to either write revisit records or skip duplicate URLs altogether using the ``dedup_policy`` key.
|
||||
|
||||
Using deduplication requires a Redis instance, which will keep track of the index for deduplication in a sorted-set key.
|
||||
The default Redis key used is ``redis://localhost:6379/0/pywb:{coll}:cdxj`` where ``{coll}`` is replaced with current collection id.
|
||||
|
||||
The field can be customized using the ``dedup_index_url`` field in the recorder config. The URL must start with ``redis://``, as that is the only
|
||||
supported dedup index at this time.
|
||||
|
||||
- To skip duplicate URLs, set ``dedup_policy: skip``. With this setting, only one instance of any URL will be recorded.
|
||||
|
||||
- To write revisit records, set ``dedup_policy: revisit``. With this setting, WARC ``revisit`` records will be written when a duplicate URL is detected
|
||||
and has the same digest as a previous response.
|
||||
|
||||
- To keep all duplicates, use ``dedup_policy: keep``. All WARC records are written to disk normally as with no policy, however, the Redis dedup index is still populated,
|
||||
which allows for instant replay (see below).
|
||||
|
||||
- To disable the dedup system, set to ``dedup_policy: none`` or omit the field. This is the default, and no Redis is required.
|
||||
|
||||
As another option, pywb can add an aggressive Cache-Control header to force the browser to cache all responses on a page.
|
||||
This feature is still experimental, but can be enabled via ``cache: always`` setting.
|
||||
|
||||
|
||||
For example, the following will enable ``revisit`` records to be written using the given Redis URL, and also enable aggressive caching when recording::
|
||||
|
||||
recorder:
|
||||
...
|
||||
cache: always
|
||||
dedup_policy: revisit
|
||||
dedup_index_url: 'redis://localhost:6379/0/pywb:{coll}:cdxj' # default when omitted
|
||||
|
||||
|
||||
Instant Replay (experimental)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Starting with pywb 2.5.0, when the ``dedup_policy`` is set, pywb can do 'instant replay' after recording, without having to regenerate the CDX or waiting for it to be updated with auto-indexing.
|
||||
|
||||
When any ``dedup_policy`` is set, pywb can also access the dedup Redis index, along with any on-disk CDX, when replaying the collection.
|
||||
|
||||
This feature is still experimental but should generally work. Additional options for working with the Redis Dedup index will be added in the future.
|
||||
|
||||
|
||||
.. _put-custom-record:
|
||||
|
||||
Adding Custom Resource Records
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
pywb now also supports adding custom data to a WARC ``resource`` record. This can be used to add custom resources, such as screenshots, logs, error messages,
|
||||
etc. that are not normally captured as part of recording, but are still useful to store in WARCs.
|
||||
|
||||
To add a custom resource, simply call ``PUT /<coll>/record`` with the data to be added as the request body and the type of the data specified as the content-type. The ``url`` can be specified as a query param.
|
||||
|
||||
For example, adding a custom record ``file:///my-custom-resource`` containing ``Some Custom Data`` can be done using ``curl`` as follows::
|
||||
|
||||
curl -XPUT "localhost:8080/my-web-archive/record?url=file:///my-custom-resource" --data "Some Custom Data"
|
||||
|
||||
|
||||
This feature is only available if ``enable_put_custom_record: true`` is set in the recorder config.
|
||||
|
||||
|
||||
.. _auto-fetch:
|
||||
|
||||
Auto-Fetch Responsive Recording
|
||||
@ -382,6 +388,8 @@ The detected urls are loaded in the background using a web worker while the user
|
||||
|
||||
To enable this functionality, add ``--enable-auto-fetch`` to the command-line or ``enable_auto_fetch: true`` to the root of the ``config.yaml``
|
||||
|
||||
The auto-fetch system is provided as part of the :ref:`wombat`
|
||||
|
||||
|
||||
Auto-Indexing Mode
|
||||
------------------
|
||||
@ -408,6 +416,23 @@ The auto-indexing mode can also be enabled via command-line by running ``wayback
|
||||
(If running pywb with uWSGI in multi-process mode, the auto-indexing is only run in a single worker to avoid race conditions and duplicate indexing)
|
||||
|
||||
|
||||
.. _wombat:
|
||||
|
||||
Client-Side Rewriting System (wombat.js)
|
||||
----------------------------------------
|
||||
|
||||
In addition to server-side rewriting, pywb includes a Javascript client-rewriting system.
|
||||
|
||||
This system intercepts network traffic and emulates the correct JS environment expected by a replayed page.
|
||||
|
||||
The auto-fetch system is also implemented as part of wombat.
|
||||
|
||||
Wombat was integrated into pywb up to 2.2.x. Starting with 2.3, wombat has been spun off into its own
|
||||
standalone JS module.
|
||||
|
||||
For more information on wombat.js and client-side rewriting, see the `wombat README <https://github.com/webrecorder/wombat/blob/master/README.md>`_
|
||||
|
||||
|
||||
.. _https-proxy:
|
||||
|
||||
HTTP/S Proxy Mode
|
||||
@ -440,7 +465,7 @@ The timestamp can also be optionally specified by running: ``wayback --proxy my-
|
||||
|
||||
proxy:
|
||||
coll: my-coll
|
||||
default-timestamp: 20181226010203
|
||||
default_timestamp: "20181226010203"
|
||||
|
||||
The ISO date format, eg. ``2018-12-26T01:02:03`` is also accepted.
|
||||
|
||||
@ -605,3 +630,15 @@ To enable the previous behavior, add to config::
|
||||
enable_flash_video_rewrite: true
|
||||
|
||||
The system may be revamped in the future and enabled by default, but for now, it is provided "as-is" for compatibility reasons.
|
||||
|
||||
Verify SSL-Certificates
|
||||
-----------------------
|
||||
|
||||
By default, SSL-Certificates of websites are not verified. To enable verification, add the following to the config::
|
||||
|
||||
certificates:
|
||||
cert_reqs: 'CERT_REQUIRED'
|
||||
ca_cert_dir: '/etc/ssl/certs'
|
||||
|
||||
``ca_cert_dir`` can optionally point to a directory containing the CA certificates that you trust. Most linux distributions provide CA certificates via a package called ``ca-certificates``.
|
||||
If omitted, the default system CA used by Python is used.
|
||||
|
BIN
docs/manual/images/vue-banner.png
Normal file
BIN
docs/manual/images/vue-banner.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.3 MiB |
BIN
docs/manual/images/vue-cal.png
Normal file
BIN
docs/manual/images/vue-cal.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 330 KiB |
152
docs/manual/localization.rst
Normal file
152
docs/manual/localization.rst
Normal file
@ -0,0 +1,152 @@
|
||||
.. _localization:
|
||||
|
||||
Localization / Multi-lingual Support
|
||||
------------------------------------
|
||||
|
||||
pywb supports configuring different language locales and loading different language translations, and dynamically switching languages.
|
||||
|
||||
pywb can extract all text from templates and generate CSV files for translation and convert them back into a binary format used for localization/internationalization.
|
||||
|
||||
(pywb uses the `Babel library <http://babel.pocoo.org/en/latest/>`_ which extends the `standard Python i18n system <https://docs.python.org/3/library/gettext.html>`_)
|
||||
|
||||
To ensure all localization related dependencies are installed, first run::
|
||||
|
||||
pip install pywb[i18n]
|
||||
|
||||
Locales to use are configured in the ``config.yaml``.
|
||||
|
||||
The command-line ``wb-manager`` utility provides a way to manage locales for translation, including generating extracted text, and to update translated text.
|
||||
|
||||
|
||||
Adding a Locale and Extracting Text
|
||||
===================================
|
||||
|
||||
To add a new locale for translation and automatically extract all text that needs to be translated, run::
|
||||
|
||||
wb-manager i18n extract <loc>
|
||||
|
||||
The ``<loc>`` can be one or more supported two-letter locales or CLDR language codes. To list available codes, you can run ``pybabel --list-locales``.
|
||||
|
||||
Localization data is placed in the ``i18n`` directory, and translatable strings can be found in ``i18n/translations/<locale>/LC_MESSAGES/messages.csv``
|
||||
|
||||
Each CSV file looks as follows, listing each source string and an empty string for the translated version::
|
||||
|
||||
"location","source","target"
|
||||
"pywb/templates/banner.html:6","Live on",""
|
||||
"pywb/templates/banner.html:8","Calendar icon",""
|
||||
"pywb/templates/banner.html:9 pywb/templates/query.html:45","View All Captures",""
|
||||
"pywb/templates/banner.html:10 pywb/templates/header.html:4","Language:",""
|
||||
"pywb/templates/banner.html:11","Loading...",""
|
||||
...
|
||||
|
||||
|
||||
This CSV can then be passed to translators to translate the text.
|
||||
|
||||
(The extraction parameters are configured to load data from ``pywb/templates/*.html`` in ``babel.ini``)
|
||||
|
||||
|
||||
For example, the following will generate translation strings for ``es`` and ``pt`` locales::
|
||||
|
||||
wb-manager i18n extract es pt
|
||||
|
||||
|
||||
The translatable text can then be found in ``i18n/translations/es/LC_MESSAGES/messages.csv`` and ``i18n/translations/pt/LC_MESSAGES/messages.csv``.
|
||||
|
||||
|
||||
The CSV files should be updated with a translation for each string in the ``target`` column.
|
||||
|
||||
The extract command adds any new strings without overwriting existing translations, so after running the update command to compile translated strings (described below), it is safe to run the extract command again.
|
||||
|
||||
|
||||
Updating Locale Catalog
|
||||
=======================
|
||||
|
||||
Once the text has been translated, and the CSV files updated, simply run::
|
||||
|
||||
wb-manager i18n update <loc>
|
||||
|
||||
This will parse the CSVs and compile the translated string tables for use with pywb.
|
||||
|
||||
|
||||
Specifying locales in pywb
|
||||
==========================
|
||||
|
||||
To enable the locales in pywb, one or more locales can be added to the ``locales`` key in ``config.yaml``, ex::
|
||||
|
||||
locales:
|
||||
- en
|
||||
- es
|
||||
|
||||
Single Language Default Locale
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
pywb can be configured with a default, single-language locale, by setting the ``default_locale`` property in ``config.yaml``::
|
||||
|
||||
|
||||
default_locale: es
|
||||
locales:
|
||||
- es
|
||||
|
||||
|
||||
With this configuration, pywb will automatically use the ``es`` locale for all text strings in pywb pages.
|
||||
|
||||
pywb will also set the ``<html lang="es">`` so that the browser will recognize the correct locale.
|
||||
|
||||
|
||||
Multi-language Translations
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If more than one locale is specified, pywb will automatically show a language switching UI at the top of collection and search pages, with an option
|
||||
for each locale listed. To include English as an option, it should also be added as a locale (and no strings translated). For example::
|
||||
|
||||
locales:
|
||||
- en
|
||||
- es
|
||||
- pt
|
||||
|
||||
will configure pywb to show a language switch option on all pages.
|
||||
|
||||
|
||||
Localized Collection Paths
|
||||
==========================
|
||||
|
||||
When localization is enabled, pywb supports the locale prefix for accessing each collection with a localized language:
|
||||
If pywb has a collection ``my-web-archive``, then:
|
||||
|
||||
* ``/my-web-archive/`` - loads UI with default language (set via ``default_locale``)
|
||||
* ``/en/my-web-archive/`` - loads UI with ``en`` locale
|
||||
* ``/es/my-web-archive/`` - loads UI with ``es`` locale
|
||||
* ``/pt/my-web-archive/`` - loads UI with ``pt`` locale
|
||||
|
||||
The language switch options work by changing the locale prefix for the same page.
|
||||
|
||||
Listing and Removing Locales
|
||||
============================
|
||||
|
||||
To list the locales that have previously been added, you can also run ``wb-manager i18n list``.
|
||||
|
||||
To disable a locale from being used in pywb, simply remove it from the ``locales`` key in ``config.yaml``.
|
||||
|
||||
To remove data for a locale permanently, you can run: ``wb-manager i18n remove <loc>``. This will remove the locale directory on disk.
|
||||
|
||||
To remove all localization data, you can manually delete the ``i18n`` directory.
|
||||
|
||||
|
||||
UI Templates: Adding Localizable Text
|
||||
=====================================
|
||||
|
||||
Text that can be translated, localizable text, can be marked as such directly in the UI templates:
|
||||
|
||||
1. By wrapping the text in ``{% trans %}``/``{% endtrans %}`` tags. For example::
|
||||
|
||||
{% trans %}Collection {{ coll }} Search Page{% endtrans %}
|
||||
|
||||
2. Short-hand by calling a special ``_()`` function, which can be used in attributes or more dynamically. For example::
|
||||
|
||||
... title="{{ _('Enter a URL to search for') }}">
|
||||
|
||||
|
||||
These methods can be used in all UI templates and are supported by the Jinja2 templating system.
|
||||
|
||||
See :ref:`ui-customizations` for a list of all available UI templates.
|
||||
|
31
docs/manual/migrating-cdx.rst
Normal file
31
docs/manual/migrating-cdx.rst
Normal file
@ -0,0 +1,31 @@
|
||||
.. _migrating-cdx:
|
||||
|
||||
Migrating CDX
|
||||
=============
|
||||
|
||||
If you are not using OutbackCDX, you may need to check on the format of the CDX files that you are using.
|
||||
|
||||
Over the years, there have been many variations on the CDX (capture index) format which is used by OpenWayback and pywb to look up captures in WARC/ARC files.
|
||||
|
||||
When migrating CDX from OpenWayback, there are a few options.
|
||||
|
||||
pywb currently supports:
|
||||
|
||||
- 9 field CDX (surt-ordered)
|
||||
- 11 field CDX (surt-ordered)
|
||||
- CDXJ (surt-ordered)
|
||||
|
||||
pywb will support the 11-field and 9-field `CDX format <http://iipc.github.io/warc-specifications/specifications/cdx-format/cdx-2015/>`_ that is also used in OpenWayback.
|
||||
|
||||
Non-SURT ordered CDXs are not currently supported, though they may be supported in the future (see this `pending pull request <https://github.com/webrecorder/pywb/pull/586>`_).
|
||||
|
||||
CDXJ Conversion
|
||||
---------------
|
||||
|
||||
The native format used by pywb is the :ref:`cdxj-index` with SURT-ordering, which uses JSON to encode the fields, allowing for more flexibility by storing most of the index in a JSON, allowing support for optional fields as needed.
|
||||
|
||||
If your CDX are not SURT-ordered, 11 or 9 field CDX, or if there is a mix, pywb also offers a conversion utility which will convert all CDX to the pywb native CDXJ: ::
|
||||
|
||||
wb-manager cdx-convert <dir-of-cdx-files>
|
||||
|
||||
The converter will read the CDX files and create a corresponding .cdxj file for every cdx file. Since the conversion happens on the .cdx itself, it does not require reindexing the source WARC/ARC files and can happen fairly quickly. The converted CDXJ are guaranteed to be in the right format to work with pywb.
|
74
docs/manual/outbackcdx.rst
Normal file
74
docs/manual/outbackcdx.rst
Normal file
@ -0,0 +1,74 @@
|
||||
.. _using-outback:
|
||||
|
||||
|
||||
Using OutbackCDX with pywb
|
||||
==========================
|
||||
|
||||
The recommended setup is to run `OutbackCDX <https://github.com/nla/outbackcdx>`_ alongside pywb.
|
||||
OutbackCDX provides an index (CDX) server and can efficiently store and look up web archive data by URL.
|
||||
|
||||
|
||||
Adding CDX to OutbackCDX
|
||||
------------------------
|
||||
|
||||
To set up OutbackCDX, please follow the instructions on the `OutbackCDX README <https://github.com/nla/outbackcdx>`_.
|
||||
|
||||
Since pywb also uses the default port 8080, be sure to use a different port for OutbackCDX, eg. ``java -jar outbackcdx*.jar -p 8084``.
|
||||
|
||||
OutbackCDX can generally ingest existing CDX used in OpenWayback simply by POSTing to OutbackCDX at a new index endpoint.
|
||||
|
||||
For example, assuming OutbackCDX is running on port 8084, to add CDX for ``index1.cdx``, ``index2.cdx``, run:
|
||||
|
||||
.. code:: console
|
||||
|
||||
curl -X POST --data-binary @index1.cdx http://localhost:8084/mycoll
|
||||
curl -X POST --data-binary @index2.cdx http://localhost:8084/mycoll
|
||||
|
||||
The contents of each CDX file are added to the ``mycoll`` OutbackCDX index, which can correspond to the web archive collection ``mycoll``.
|
||||
The index is created automatically if it does not exist.
|
||||
|
||||
See the `OutbackCDX Docs <https://github.com/nla/outbackcdx#loading-records>`_ for more info on ingesting CDX.
|
||||
|
||||
|
||||
(Re)generating CDX from WARCs
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
There are some exceptions where it may be useful to re-generate the CDX with pywb for existing WARCs:
|
||||
|
||||
- If your CDX is 9-field and does not include the compressed length, regenerating the CDX will result in more efficient HTTP range requests
|
||||
- If you want to replay pages with POST requests, pywb generated CDX will soon be supported in OutbackCDX (see: `Issue #585 <https://github.com/webrecorder/pywb/issues/585>`_, `Issue #91 <https://github.com/nla/outbackcdx/pull/91>`_ )
|
||||
|
||||
|
||||
To generate the CDX, run the ``cdx-indexer`` command (with ``-p`` flag for POST request handling) for each WARC or set of WARCs you wish to index:
|
||||
|
||||
.. code:: console
|
||||
|
||||
cdx-indexer /path/to/mywarcs/my.warc.gz > ./index1.cdx
|
||||
cdx-indexer /path/to/all_warcs/*warc.gz > ./index2.cdx
|
||||
|
||||
|
||||
Then, run the POST command as shown above to ingest to OutbackCDX.
|
||||
|
||||
The above can be repeated for each WARC file, or for a set of WARCs using the ``*.warc.gz`` wildcard.
|
||||
|
||||
If a CDX index is too big, OutbackCDX may fail and ingesting an index per-WARC may be needed.
|
||||
|
||||
|
||||
Configure pywb with OutbackCDX
|
||||
------------------------------
|
||||
|
||||
The ``config.yaml`` should be configured to point to OutbackCDX.
|
||||
|
||||
Assuming a collection named ``mycoll``, the ``config.yaml`` can be configured as follows to use OutbackCDX
|
||||
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
mycoll:
|
||||
index_paths: cdx+http://localhost:8084/mycoll
|
||||
archive_paths: /path/to/mywarcs/
|
||||
|
||||
|
||||
The ``archive_paths`` can be configured to point to a directory of WARCs or a path index.
|
||||
|
42
docs/manual/owb-pywb-terms.rst
Normal file
42
docs/manual/owb-pywb-terms.rst
Normal file
@ -0,0 +1,42 @@
|
||||
OpenWayback vs pywb Terms
|
||||
=========================
|
||||
|
||||
pywb and OpenWayback use slightly different terms to describe the configuration options, as explained below.
|
||||
|
||||
Some differences are:
|
||||
- The ``wayback.xml`` config file in OpenWayback is replaced with the ``config.yaml`` YAML file
|
||||
- The terms ``Access Point`` and ``Wayback Collection`` are replaced with ``Collection`` in pywb. The collection configuration represents a unique path (access point) and the data that is accessed at that path.
|
||||
- The ``Resource Store`` in OpenWayback is known in pywb as the archive paths, configured under ``archive_paths``
|
||||
- The ``Resource Index`` in OpenWayback is known in pywb as the index paths, configurable under ``index_paths``
|
||||
- The ``Exclusions`` in OpenWayback are replaced with general :ref:`access-control`
|
||||
|
||||
|
||||
|
||||
Pywb Collection Basics
|
||||
----------------------
|
||||
|
||||
A pywb collection must consist of a minimum of three parts: the collection name, the ``index_paths`` (where to read the index), and the ``archive_paths`` (where to read the WARC files).
|
||||
|
||||
The collection is accessed by name, so there is no distinct access point.
|
||||
|
||||
The collections are configured in the ``config.yaml`` under the ``collections`` key:
|
||||
|
||||
For example, a basic collection definition can be specified via:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_paths: /archive/cdx/
|
||||
archive_paths: /archive/storage/warcs/
|
||||
|
||||
|
||||
Pywb also supports a convention-based directory structure. Collections created in this structure can be detected automatically
|
||||
and need not be specified in the ``config.yaml``. This structure is designed for smaller collections that are all stored locally in a subdirectory.
|
||||
|
||||
See the :ref:`dir_structure` for the default pywb directory structure.
|
||||
|
||||
However, for importing existing collections from OpenWayback, it is probably easier to specify the existing paths as shown above.
|
||||
|
||||
|
||||
|
308
docs/manual/owb-to-pywb-config.rst
Normal file
308
docs/manual/owb-to-pywb-config.rst
Normal file
@ -0,0 +1,308 @@
|
||||
Converting OpenWayback Config to pywb Config
|
||||
============================================
|
||||
|
||||
OpenWayback includes many different types of configurations.
|
||||
|
||||
For most use cases, using OutbackCDX with pywb is the recommended approach, as explained in :ref:`using-outback`.
|
||||
|
||||
The following are a few specific examples of WaybackCollections gathered from active OpenWayback configurations
|
||||
and how they can be configured for use with pywb.
|
||||
|
||||
|
||||
Remote Collection / Access Point
|
||||
--------------------------------
|
||||
|
||||
A collection configured with a remote index and WARC access can be converted to use OutbackCDX
|
||||
for the remote index, while pywb can load WARCs directly from an HTTP endpoint.
|
||||
|
||||
For example, a configuration similar to:
|
||||
|
||||
.. code:: xml
|
||||
|
||||
<bean name="standardaccesspoint" class="org.archive.wayback.webapp.AccessPoint">
|
||||
<property name="accessPointPath" value="/wayback/"/>
|
||||
<property name="collection" ref="remotecollection" />
|
||||
...
|
||||
</bean>
|
||||
|
||||
<bean id="remotecollection" class="org.archive.wayback.webapp.WaybackCollection">
|
||||
<property name="resourceStore">
|
||||
<bean class="org.archive.wayback.resourcestore.SimpleResourceStore">
|
||||
<property name="prefix" value="http://myarchive.example.com/RemoteStore/" />
|
||||
</bean>
|
||||
</property>
|
||||
<property name="resourceIndex">
|
||||
<bean class="org.archive.wayback.resourceindex.RemoteResourceIndex">
|
||||
<property name="searchUrlBase" value="http://myarchive.example.com/RemoteIndex" />
|
||||
</bean>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
can be converted to the following config, with OutbackCDX assumed to be running
|
||||
at: ``http://myarchive.example.com/RemoteIndex``
|
||||
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_paths: cdx+http://myarchive.example.com/RemoteIndex
|
||||
archive_paths: http://myarchive.example.com/RemoteStore/
|
||||
|
||||
Local Collection / Access Point
|
||||
-------------------------------
|
||||
|
||||
An OpenWayback configuration with a local collection and local CDX, for example:
|
||||
|
||||
.. code:: xml
|
||||
|
||||
<bean id="collection" class="org.archive.wayback.webapp.WaybackCollection">
|
||||
<property name="resourceIndex">
|
||||
<bean class="org.archive.wayback.resourceindex.cdxserver.EmbeddedCDXServerIndex">
|
||||
...
|
||||
<property name="cdxServer">
|
||||
<bean class="org.archive.cdxserver.CDXServer">
|
||||
<property name="cdxSource">
|
||||
<bean class="org.archive.format.cdx.MultiCDXInputSource">
|
||||
<property name="cdxUris">
|
||||
<list>
|
||||
<value>/wayback/cdx/mycdx1.cdx</value>
|
||||
<value>/wayback/cdx/mycdx2.cdx</value>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
</property>
|
||||
<property name="cdxFormat" value="cdx11"/>
|
||||
<property name="surtMode" value="true"/>
|
||||
</bean>
|
||||
</property>
|
||||
...
|
||||
</bean>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
|
||||
can be configured in pywb using the ``index_paths`` key.
|
||||
|
||||
Note that the CDX files should all be in the same format. See :ref:`migrating-cdx` for more info on converting
|
||||
CDX to pywb native CDXJ format.
|
||||
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_paths: /wayback/cdx/
|
||||
archive_paths: ...
|
||||
|
||||
|
||||
It's also possible to combine directories, individual CDX files, and even a remote index from OutbackCDX in a single collection
|
||||
(as long as all CDX are in the same format).
|
||||
|
||||
pywb will query all the sources simultaneously to find the best match.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_group:
|
||||
cdx1: /wayback/cdx1/
|
||||
cdx2: /wayback/cdx2/mycdx.cdx
|
||||
remote: cdx+https://myarchive.example.com/outbackcdx
|
||||
|
||||
archive_paths: ...
|
||||
|
||||
However, OutbackCDX is still recommended to avoid more complex CDX configurations.
|
||||
|
||||
|
||||
WatchedCDXSource
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
OpenWayback includes a 'Watched CDX Source' option which watches a directory for new CDX indexes.
|
||||
This functionality is the default in pywb when specifying a directory for the index path.
|
||||
|
||||
For example, the config:
|
||||
|
||||
.. code:: xml
|
||||
|
||||
<property name="source">
|
||||
<bean class="org.archive.wayback.resourceindex.WatchedCDXSource">
|
||||
<property name="recursive" value="false" />
|
||||
<property name="filters">
|
||||
<list>
|
||||
<value>^.+\.cdx$</value>
|
||||
</list>
|
||||
</property>
|
||||
<property name="path" value="/wayback/cdx-index/" />
|
||||
</bean>
|
||||
</property>
|
||||
|
||||
can be replaced with:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_paths: /wayback/cdx-index/
|
||||
archive_paths: ...
|
||||
|
||||
|
||||
pywb will load all CDX from that directory.
|
||||
|
||||
|
||||
ZipNum Cluster Index
|
||||
--------------------
|
||||
|
||||
pywb also supports using a compressed :ref:`zipnum` instead of a plain text CDX. For example, the following OpenWayback configuration:
|
||||
|
||||
.. code:: xml
|
||||
|
||||
<bean id="collection" class="org.archive.wayback.webapp.WaybackCollection">
|
||||
<property name="resourceIndex">
|
||||
<bean class="org.archive.wayback.resourceindex.LocalResourceIndex">
|
||||
...
|
||||
<property name="source">
|
||||
<bean class="org.archive.wayback.resourceindex.ZipNumClusterSearchResultSource">
|
||||
<property name="cluster">
|
||||
<bean class="org.archive.format.gzip.zipnum.ZipNumCluster">
|
||||
<property name="summaryFile" value="/webarchive/zipnum-cdx/all.summary"></property>
|
||||
<property name="locFile" value="/webarchive/zipnum-cdx/all.loc"></property>
|
||||
</bean>
|
||||
</property>
|
||||
...
|
||||
</bean>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
can simply be converted to the pywb config:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_paths: /webarchive/zipnum-cdx
|
||||
|
||||
# if the index is not surt ordered
|
||||
surt_ordered: false
|
||||
|
||||
|
||||
pywb will automatically determine the ``.summary`` and use the ``.loc`` files for the ZipNum Cluster if they are present in the directory.
|
||||
|
||||
Note that if the ZipNum index is **not** SURT ordered, the ``surt_ordered: false`` flag must be added to support this format.
|
||||
|
||||
|
||||
|
||||
Path Index Configuration
|
||||
------------------------
|
||||
|
||||
OpenWayback supports a 'path index' that can be used to look up a WARC by filename and map to an exact path.
|
||||
For compatibility, pywb supports the same path index lookup, as well as loading WARC files by path or URL prefix.
|
||||
|
||||
|
||||
For example, an OpenWayback configuration that includes a path index:
|
||||
|
||||
.. code:: xml
|
||||
|
||||
<bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.FlatFileResourceFileLocationDB">
|
||||
<property name="path" value="/archive/warc-paths.txt"/>
|
||||
</bean>
|
||||
|
||||
<bean id="resourceStore" class="org.archive.wayback.resourcestore.LocationDBResourceStore">
|
||||
<property name="db" ref="resourcefilelocationdb" />
|
||||
</bean>
|
||||
|
||||
|
||||
can be configured in the ``archive_paths`` field of pywb collection configuration:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_paths: ...
|
||||
archive_paths: /archive/warc-paths.txt
|
||||
|
||||
|
||||
The path index is a tab-delimited text file for mapping WARC filenames to full file paths or URLs, eg:
|
||||
|
||||
.. code::
|
||||
|
||||
example.warc.gz<tab>/some/path/to/example.warc.gz
|
||||
another.warc.gz<tab>/some-other/path/another.warc.gz
|
||||
remote.warc.gz<tab>http://warcstore.example.com/serve/remote.warc.gz
|
||||
|
||||
|
||||
However, if all WARC files are stored in the same directory, or in a few directories, a path index is not needed and pywb will try loading the WARC by prefix.
|
||||
|
||||
The ``archive_paths`` can accept a list of entries. For example, given the config:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_paths: ...
|
||||
archive_paths:
|
||||
- /archive/warcs1/
|
||||
- /archive/warcs2/
|
||||
- https://myarchive.example.com/warcs/
|
||||
- /archive/warc-paths.txt
|
||||
|
||||
|
||||
And the WARC file: ``example.warc.gz``, pywb will try to find the WARC in order from:
|
||||
|
||||
.. code::
|
||||
|
||||
1. /archive/warcs1/example.warc.gz
|
||||
2. /archive/warcs2/example.warc.gz
|
||||
3. https://myarchive.example.com/warcs/example.warc.gz
|
||||
4. Looking up example.warc.gz in /archive/warc-paths.txt
|
||||
|
||||
|
||||
Proxy Mode Access
|
||||
-----------------
|
||||
|
||||
An OpenWayback configuration may include many beans to support proxy mode, eg:
|
||||
|
||||
.. code:: xml
|
||||
|
||||
<bean id="proxyreplaydispatcher" class="org.archive.wayback.replay.SelectorReplayDispatcher">
|
||||
...
|
||||
<property name="renderer">
|
||||
<bean class="org.archive.wayback.proxy.HttpsRedirectAndLinksRewriteProxyHTMLMarkupReplayRenderer">
|
||||
...
|
||||
<property name="uriConverter">
|
||||
<bean class="org.archive.wayback.proxy.ProxyHttpsResultURIConverter"/>
|
||||
</property>
|
||||
</bean>
|
||||
</property>
|
||||
</bean>
|
||||
<bean name="proxy" class="org.archive.wayback.webapp.AccessPoint">
|
||||
<property name="internalPort" value="${proxy.port}"/>
|
||||
<property name="accessPointPath" value="${proxy.port}" />
|
||||
<property name="collection" ref="localcdxcollection" />
|
||||
...
|
||||
</bean>
|
||||
|
||||
|
||||
In pywb, the proxy mode can be enabled by adding to the main ``config.yaml`` the name of the collection
|
||||
that should be served in proxy mode:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
proxy:
|
||||
source_coll: wayback
|
||||
|
||||
|
||||
There are some differences between OpenWayback and pywb proxy mode support.
|
||||
|
||||
In OpenWayback, proxy mode is configured using separate access points for different collections on different ports.
|
||||
OpenWayback only supports HTTP proxy and attempts to rewrite HTTPS URLs to HTTP.
|
||||
|
||||
In pywb, proxy mode is enabled on the same port as regular access, and pywb supports HTTP and HTTPS proxy.
|
||||
pywb does not attempt to rewrite HTTPS to HTTP, as most browsers disallow HTTP access as insecure for many sites.
|
||||
pywb supports a default collection that is enabled for proxy mode, and a default timestamp accessed by the proxy mode.
|
||||
(Switching the collection and date accessed is possible but not currently supported without extensions to pywb).
|
||||
|
||||
To support HTTPS access, pywb provides a certificate authority that can be trusted by a browser to rewrite HTTPS content.
|
||||
|
||||
See :ref:`https-proxy` for all of the options of pywb proxy mode configuration.
|
||||
|
80
docs/manual/owb-to-pywb-deploy.rst
Normal file
80
docs/manual/owb-to-pywb-deploy.rst
Normal file
@ -0,0 +1,80 @@
|
||||
Deploying pywb: Collection Paths and routing with Nginx/Apache
|
||||
==============================================================
|
||||
|
||||
In pywb, the collection name is also the access point, and each of the collections in ``config.yaml``
|
||||
can be accessed by their name as the subpath:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
...
|
||||
|
||||
another-collection:
|
||||
...
|
||||
|
||||
If pywb is deployed on port 8080, each collection will be available under:
|
||||
``http://<hostname>/wayback/*/https://example.com/`` and ``http://<hostname>/another-collection/*/https://example.com/``
|
||||
|
||||
To make a collection available under the root, simply set its name to: ``$root``
|
||||
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
$root:
|
||||
...
|
||||
|
||||
another-collection:
|
||||
...
|
||||
|
||||
|
||||
Now, the first collection is available at: ``http://<hostname>/*/https://example.com/``.
|
||||
|
||||
|
||||
To deploy pywb on a subdirectory, eg. ``http://<hostname>/pywb/another-collection/*/https://example.com/``,
|
||||
|
||||
and in general, for production use, it is recommended to deploy pywb behind an Nginx or Apache reverse proxy.
|
||||
|
||||
|
||||
Nginx and Apache Reverse Proxy
|
||||
------------------------------
|
||||
|
||||
The recommended deployment for pywb is with uWSGI and behind an Nginx or Apache frontend.
|
||||
|
||||
This configuration allows for a more robust deployment and allows these servers to handle static files.
|
||||
|
||||
|
||||
See the :ref:`nginx-deploy` and :ref:`apache-deploy` sections for more info on deploying with Nginx and Apache.
|
||||
|
||||
|
||||
Working Docker Compose Examples
|
||||
-------------------------------
|
||||
|
||||
The pywb `Deployment Examples <https://github.com/webrecorder/pywb/blob/main/sample-deploy/>`_ include working examples of deploying pywb with Nginx, Apache and OutbackCDX
|
||||
in Docker using Docker Compose, widely available container orchestration tools.
|
||||
|
||||
See `Installing Docker <https://docs.docker.com/get-docker/>`_ and `Installing Docker Compose <https://docs.docker.com/compose/install/>`_ for instructions on how to install these tools.
|
||||
|
||||
The examples are available in the ``sample-deploy`` directory of the pywb repo. The examples include:
|
||||
|
||||
- ``docker-compose-outback.yaml`` -- Docker Compose config to start OutbackCDX and pywb, and ingest sample data into OutbackCDX
|
||||
- ``docker-compose-nginx.yaml`` -- Docker Compose config to launch pywb and latest Nginx, with pywb running on subdirectory ``/wayback`` and Nginx serving static files from pywb.
|
||||
- ``docker-compose-apache.yaml`` -- Docker Compose config to launch pywb and latest Apache, with pywb running on subdirectory ``/wayback`` and Apache serving static files from pywb.
|
||||
|
||||
|
||||
The examples are designed to be run one at a time, and assume port 8080 is available.
|
||||
|
||||
After installing Docker and Docker Compose, run one of:
|
||||
|
||||
- ``docker-compose -f docker-compose-outback.yaml up``
|
||||
- ``docker-compose -f docker-compose-nginx.yaml up``
|
||||
- ``docker-compose -f docker-compose-apache.yaml up``
|
||||
|
||||
This will download the standard Docker images and start all of the components in Docker.
|
||||
|
||||
If everything works correctly, you should be able to access: ``http://localhost:8080/pywb/https://example.com/`` to view the sample pywb collection.
|
||||
|
||||
Press CTRL+C to interrupt and stop the example in the console.
|
||||
|
||||
|
68
docs/manual/owb-to-pywb-exclusions.rst
Normal file
68
docs/manual/owb-to-pywb-exclusions.rst
Normal file
@ -0,0 +1,68 @@
|
||||
Migrating Exclusion Rules
|
||||
=========================
|
||||
|
||||
pywb includes a new :ref:`access-control` system, which allows granular allow/block/exclude access control rules on paths and subpaths.
|
||||
|
||||
The rules are configured in .aclj files, and a command-line utility exists to import OpenWayback exclusions
|
||||
into the pywb ACLJ format.
|
||||
|
||||
For example, given an OpenWayback exclusion list configuration for a static file:
|
||||
|
||||
.. code:: xml
|
||||
|
||||
<bean id="excluder-factory-static" class="org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory">
|
||||
<property name="file" value="/archive/exclusions.txt"/>
|
||||
<property name="checkInterval" value="600000" />
|
||||
</bean>
|
||||
|
||||
|
||||
The exclusions file can be converted to an .aclj file by running: ::
|
||||
|
||||
wb-manager acl importtxt /archive/exclusions.aclj /archive/exclusions.txt exclude
|
||||
|
||||
|
||||
Then, in the pywb config, specify:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
collections:
|
||||
wayback:
|
||||
index_paths: ...
|
||||
archive_paths: ...
|
||||
acl_paths: /archive/exclusions.aclj
|
||||
|
||||
|
||||
It is possible to specify multiple access control files, which will all be applied.
|
||||
|
||||
Using ``block`` instead of ``exclude`` will result in pywb returning a 451 error, indicating that URLs are in the index but blocked.
|
||||
|
||||
|
||||
CLI Tool
|
||||
--------
|
||||
|
||||
After exclusions have been imported, it is recommended to use the ``wb-manager acl`` command-line tool for managing exclusions:
|
||||
|
||||
|
||||
To add an exclusion, run: ::
|
||||
|
||||
wb-manager acl add /archive/exclusions.aclj http://httpbin.org/anything/something exclude
|
||||
|
||||
To remove an exclusion, run: ::
|
||||
|
||||
wb-manager acl remove /archive/exclusions.aclj http://httpbin.org/anything/something
|
||||
|
||||
|
||||
For more options, see the full :ref:`access-control` documentation or run ``wb-manager acl --help``.
|
||||
|
||||
|
||||
Not Yet Supported
|
||||
-----------------
|
||||
|
||||
Some OpenWayback exclusion options are not yet supported in pywb.
|
||||
The following is not yet supported in the access control system:
|
||||
|
||||
- Exclusions/Access Control By specific date range
|
||||
- Regex based exclusions
|
||||
- Date Range Embargo on All URLs
|
||||
- Robots.txt-based exclusions
|
||||
|
21
docs/manual/owb-transition.rst
Normal file
21
docs/manual/owb-transition.rst
Normal file
@ -0,0 +1,21 @@
|
||||
.. _transition-openwayback:
|
||||
|
||||
OpenWayback Transition Guide
|
||||
============================
|
||||
|
||||
This guide provides guidelines for transitioning from OpenWayback to pywb,
|
||||
with additional recommendations. The main recommendation is to run pywb along
|
||||
with OutbackCDX and nginx, and this configuration is covered below, along with additional options.
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
owb-pywb-terms
|
||||
outbackcdx
|
||||
migrating-cdx
|
||||
owb-to-pywb-config
|
||||
owb-to-pywb-exclusions
|
||||
owb-to-pywb-deploy
|
||||
|
||||
|
@ -7,6 +7,8 @@ pywb includes a sophisticated server and client-side rewriting systems, includin
|
||||
configuration for domain and content-specific rewriting rules, fuzzy index matching for replay,
|
||||
and a thorough client-side JS rewriting system.
|
||||
|
||||
With pywb 2.3.0, the client-side rewriting system exists in a separate module at ``https://github.com/webrecorder/wombat``
|
||||
|
||||
|
||||
URL Rewriting
|
||||
-------------
|
||||
@ -90,7 +92,7 @@ Configuring Rewriters
|
||||
---------------------
|
||||
|
||||
pywb provides customizable rewriting based on content-type, the available types are configured
|
||||
in the :py:mod:`pywb.rewriter.default_rewriter`, which specifies rewriter classes per known type,
|
||||
in the :py:mod:`pywb.rewrite.default_rewriter`, which specifies rewriter classes per known type,
|
||||
and mapping of content-types to rewriters.
|
||||
|
||||
|
||||
@ -116,6 +118,7 @@ JS Rewriting
|
||||
The JS rewriter is applied to inline ``<script>`` blocks, or inline attribute js, and any files determined to be JavaScript (based on content type and ``js_`` modifier).
|
||||
|
||||
The default JS rewriter does not rewrite any links. Instead, the JS rewriter performs limited regular expression rewriting on the following:
|
||||
|
||||
* ``postMessage`` calls
|
||||
* certain ``this`` property accessors
|
||||
* specific ``location =`` assignment
|
||||
@ -124,7 +127,7 @@ Then, the entire script block is wrapped in a special code block to be executed
|
||||
|
||||
The server-side rewriting is to aid the client-side execution of wrapped code.
|
||||
|
||||
For more information, see :py:mod:`pywb.rewriter.regex_rewriters.JSWombatProxyRewriterMixin`
|
||||
For more information, see :py:mod:`pywb.rewrite.regex_rewriters.JSWombatProxyRewriterMixin`
|
||||
|
||||
|
||||
JSONP Rewriting
|
||||
@ -138,7 +141,7 @@ For example, a requested url might be ``/my-coll/http://example.com?callback=jQu
|
||||
|
||||
To ensure the JSONP callback works as expected, the content is rewritten to ``jQuery123(...)`` -> ``jQuery456(...)``
|
||||
|
||||
For more information, see :py:mod:`pywb.rewriter.jsonp_rewriter`
|
||||
For more information, see :py:mod:`pywb.rewrite.jsonp_rewriter`
|
||||
|
||||
|
||||
DASH and HLS Rewriting
|
||||
@ -146,5 +149,5 @@ DASH and HLS Rewriting
|
||||
|
||||
To support recording and replaying adaptive streaming formats (DASH and HLS), pywb can perform special rewriting on the manifests for these formats to remove all but one possible resolution/format. As a result, the non-deterministic format selection is reduced to a single consistent format.
|
||||
|
||||
For more information, see :py:mod:`pywb.rewriter.rewrite_hls` and :py:mod:`pywb.rewriter.rewrite_dash` and the tests in ``pywb/rewrite/test/test_content_rewriter.py``
|
||||
For more information, see :py:mod:`pywb.rewrite.rewrite_hls` and :py:mod:`pywb.rewrite.rewrite_dash` and the tests in ``pywb/rewrite/test/test_content_rewriter.py``
|
||||
|
||||
|
367
docs/manual/template-guide.rst
Normal file
367
docs/manual/template-guide.rst
Normal file
@ -0,0 +1,367 @@
|
||||
.. _template-guide:
|
||||
|
||||
Template Guide
|
||||
==============
|
||||
|
||||
Introduction
|
||||
------------
|
||||
|
||||
This guide provides a reference of all of the templates available in pywb and how they could be modified.
|
||||
|
||||
These templates are found in the ``pywb/templates`` directory and can be overridden as needed, one HTML page at a time.
|
||||
|
||||
Template variables are listed as ``{{ variable }}`` to indicate the syntax used for rendering the value of the variable in Jinja2.
|
||||
|
||||
Copying a Template For Modification
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To modify a template, it is often useful to start with the default template. To do so, simply copy a default template
|
||||
to a local ``templates`` directory.
|
||||
|
||||
For convenience, you can also run: ``wb-manager template --add <template-name>`` to add the template automatically.
|
||||
|
||||
For a list of available templates that can be overridden in this way, run ``wb-manager template --list``.
|
||||
|
||||
|
||||
Per-Collection Templates
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Certain templates can be customized per-collection, instead of for all of pywb.
|
||||
|
||||
To override a template for a specific collection only, run ``wb-manager template --add <template-name> <coll-name>``
|
||||
|
||||
For example:
|
||||
|
||||
|
||||
.. code:: console
|
||||
|
||||
wb-manager init my-coll
|
||||
wb-manager template --add search_html my-coll
|
||||
|
||||
This will create the file ``collections/my-coll/templates/search.html``, a copy of the default search.html, but configured to be used only
|
||||
for the collection ``my-coll``.
|
||||
|
||||
|
||||
|
||||
Base Templates (and supporting templates)
|
||||
-----------------------------------------
|
||||
|
||||
File: ``base.html``
|
||||
|
||||
This template includes the HTML added to all pages other than framed replay. Shared JS and CSS includes meant for pages other than framed replay can be added here.
|
||||
|
||||
To customize the default pywb UI across multiple pages, the following additional templates
|
||||
can also be overridden:
|
||||
|
||||
* ``head.html`` -- Template containing content to be added to the ``<head>`` of the ``base`` template
|
||||
|
||||
* ``header.html`` -- Template to be added as the first content of the ``<body>`` tag of the ``base`` template
|
||||
|
||||
* ``footer.html`` -- Template for adding content as the "footer" of the ``<body>`` tag of the ``base`` template
|
||||
|
||||
|
||||
Note: The default pywb ``head.html`` and ``footer.html`` are currently blank. They can be populated to customize the rendering, add analytics, etc... as needed. Content such as styles or JS code (for example for analytics) must be added to the ``frame_insert.html`` template as well (details on that template below) to also be included in framed replay.
|
||||
|
||||
|
||||
The ``base.html`` template also provides five blocks that can be supplied by templates that extend it.
|
||||
|
||||
* ``title`` -- Block for supplying the title for the page
|
||||
|
||||
* ``head`` -- Block for adding content to the ``<head>``, includes ``head.html`` template
|
||||
|
||||
* ``header`` -- Block for adding content to the ``<body>`` before the ``body`` block, includes the ``header.html`` template
|
||||
|
||||
* ``body`` -- Block for adding the primary content to template
|
||||
|
||||
* ``footer`` -- Block for adding content to the ``<body>`` after the ``body`` block, includes the ``footer.html`` template
|
||||
|
||||
|
||||
Home, Collection and Search Templates
|
||||
-------------------------------------
|
||||
|
||||
|
||||
Home Page Template
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``index.html``
|
||||
|
||||
This template renders the home page for pywb, and by default renders a list of available collections.
|
||||
|
||||
|
||||
Template variables:
|
||||
|
||||
* ``{{ routes }}`` - a list of available collection routes.
|
||||
|
||||
* ``{{ all_metadata }}`` - a dictionary of all metadata for all collections, keyed by collection id. See :ref:`custom-metadata` for more info on the custom metadata.
|
||||
|
||||
|
||||
Additionally, the :ref:`shared-template-vars` are also available to the home page template, as well as all other templates.
|
||||
|
||||
|
||||
Collection Page Template
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``search.html``
|
||||
|
||||
The 'collection page' template is the page rendered when no URL is specified, e.g. ``http://localhost:8080/my-collection/``.
|
||||
|
||||
The default template renders a search page that can be used to start searching for URLs.
|
||||
|
||||
Template variables:
|
||||
|
||||
* ``{{ coll }}`` - the collection name identifier.
|
||||
|
||||
* ``{{ metadata }}`` - an optional dictionary of metadata. See :ref:`custom-metadata` for more info.
|
||||
|
||||
* ``{{ ui }}`` - an optional ``ui`` dictionary from ``config.yaml``, if any
|
||||
|
||||
|
||||
.. _custom-metadata:
|
||||
|
||||
Custom Metadata
|
||||
"""""""""""""""
|
||||
|
||||
If custom collection metadata is provided, this page will automatically show this metadata as well.
|
||||
|
||||
It is possible to also add custom metadata per-collection that will be available to the collection.
|
||||
|
||||
For dynamic collections, any fields placed in ``<coll_name>/metadata.yaml`` files can be accessed
|
||||
|
||||
via the ``{{ metadata }}`` variable.
|
||||
|
||||
For example, if the metadata file contains:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
somedata: value
|
||||
|
||||
Accessing ``{{ metadata.somedata }}`` will resolve to ``value``.
|
||||
|
||||
The metadata can also be added via commandline: ``wb-manager metadata myCollection --set somedata=value``.
|
||||
|
||||
|
||||
URL Query/Calendar Page Template
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``query.html``
|
||||
|
||||
This template is rendered for any URL search response pages, either a single URL or more complex queries.
|
||||
|
||||
For example, the page ``http://localhost:8080/my-collection/*/https://example.com/`` will be rendered using this template, with functionality provided by a Vue application.
|
||||
|
||||
Template variables:
|
||||
|
||||
* ``{{ url }}`` - the URL being queried, e.g. ``https://example.com/``
|
||||
|
||||
* ``{{ prefix }}`` - the collection prefix that will be used for replay, e.g. ``http://localhost:8080/my-collection/``
|
||||
|
||||
* ``{{ ui }}`` - an optional ``ui`` dictionary from ``config.yaml``, if any
|
||||
|
||||
* ``{{ static_prefix }}`` - the prefix from which static files will be accessed, e.g. ``http://localhost:8080/static/``.
|
||||
|
||||
|
||||
Replay and Banner Templates
|
||||
---------------------------
|
||||
|
||||
The following templates are used to configure the replay view itself.
|
||||
|
||||
|
||||
Banner Template
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``banner.html``
|
||||
|
||||
This template is used to render the banner for framed replay. It is only rendered in the top/outer frame.
|
||||
|
||||
Template variables:
|
||||
|
||||
* ``{{ url }}`` - the URL being replayed.
|
||||
|
||||
* ``{{ timestamp }}`` - the timestamp being replayed, e.g. ``20211226`` in ``http://localhost:8080/pywb/20211226/mp_/https://example.com/``
|
||||
|
||||
* ``{{ is_framed }}`` - true/false if currently in framed mode.
|
||||
|
||||
* ``{{ wb_prefix }}`` - the collection prefix, e.g. ``http://localhost:8080/pywb/``
|
||||
|
||||
* ``{{ host_prefix }}`` - the pywb server origin, e.g. ``http://localhost:8080``
|
||||
|
||||
* ``{{ config }}`` - provides the contents of the ``config.yaml`` as a dictionary.
|
||||
|
||||
* ``{{ ui }}`` - an optional ``ui`` dictionary from ``config.yaml``, if any.
|
||||
|
||||
The default banner creates the UI dynamically in JavaScript using Vue in the ``frame_insert.html`` template.
|
||||
|
||||
|
||||
Custom Banner Template
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``custom_banner.html``
|
||||
|
||||
This template can be used to render a custom banner for frameless replay. It is blank by default.
|
||||
|
||||
In frameless replay, the content of this template is injected into the ``head_insert.html`` template to render the banner.
|
||||
|
||||
|
||||
Head Insert Template
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``head_insert.html``
|
||||
|
||||
This template represents the HTML injected into every replay page to add support for client-side rewriting via ``wombat.js``.
|
||||
|
||||
This template is part of the core pywb replay, and modifying this template is not recommended.
|
||||
|
||||
For customizing the banner, modify the ``banner.html`` (framed replay) or ``custom_banner.html`` (frameless replay) template instead.
|
||||
|
||||
|
||||
Top Frame Template
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``frame_insert.html``
|
||||
|
||||
This template represents the top-level frame that is inserted to render the replay in framed mode.
|
||||
|
||||
By design, this template does *not* extend from the base template.
|
||||
|
||||
This template is responsible for creating the iframe that will render the content.
|
||||
|
||||
This template only renders the banner and is designed *not* to set the encoding to allow the browser to 'detect' the encoding for the containing iframe.
|
||||
For this reason, the template should only contain ASCII text, and %-encode any non-ASCII characters.
|
||||
|
||||
Content such as analytics code that is desired in the top frame of framed replay pages should be added to this template.
|
||||
|
||||
Template variables:
|
||||
|
||||
* ``{{ url }}`` - the URL being replayed.
|
||||
|
||||
* ``{{ timestamp }}`` - the timestamp being replayed, e.g. ``20211226`` in ``http://localhost:8080/pywb/20211226/mp_/https://example.com/``
|
||||
|
||||
* ``{{ wb_url }}`` - A complete ``WbUrl`` object, which contains the ``url``, ``timestamp`` and ``mod`` properties, representing the replay url.
|
||||
|
||||
* ``{{ wb_prefix }}`` - the collection prefix, e.g. ``http://localhost:8080/pywb/``
|
||||
|
||||
* ``{{ is_proxy }}`` - set to true if page is being loaded via an HTTP/S proxy (checks if WSGI env has ``wsgiprox.proxy_host`` set)
|
||||
|
||||
* ``{{ ui }}`` - an optional ``ui`` dictionary from ``config.yaml``, if any.
|
||||
|
||||
|
||||
.. _custom-top-frame:
|
||||
|
||||
Customizing the Top Frame Template
|
||||
""""""""""""""""""""""""""""""""""
|
||||
|
||||
The top-frame used for framed replay can be replaced or augmented
|
||||
by modifying the ``frame_insert.html``.
|
||||
|
||||
To start with modifying the default outer page, you can add it to the current
|
||||
templates directory by running ``wb-manager template --add frame_insert_html``.
|
||||
|
||||
To initialize the replay, the outer page should include ``wb_frame.js``,
|
||||
create an ``<iframe>`` element and pass the id (or element itself) to the ``ContentFrame`` constructor:
|
||||
|
||||
.. code-block:: html
|
||||
|
||||
<script src='{{ host_prefix }}/{{ static_path }}/wb_frame.js'> </script>
|
||||
<script>
|
||||
var cframe = new ContentFrame({"url": "{{ url }}" + window.location.hash,
|
||||
"prefix": "{{ wb_prefix }}",
|
||||
"request_ts": "{{ wb_url.timestamp }}",
|
||||
"iframe": "#replay_iframe"});
|
||||
</script>
|
||||
|
||||
|
||||
The outer frame can receive notifications of changes to the replay via ``postMessage``.
|
||||
|
||||
For example, to detect when the content frame changed and log the new url and timestamp,
|
||||
use the following script in the outer frame html:
|
||||
|
||||
.. code-block:: javascript
|
||||
|
||||
window.addEventListener("message", function(event) {
|
||||
if (event.data.wb_type == "load" || event.data.wb_type == "replace-url") {
|
||||
console.log("New Url: " + event.data.url);
|
||||
console.log("New Timestamp: " + event.data.ts);
|
||||
}
|
||||
});
|
||||
|
||||
The ``load`` message is sent when a new page is first loaded, while ``replace-url`` is used
|
||||
for url changes caused by content frame History navigation.
|
||||
|
||||
|
||||
Error Templates
|
||||
---------------
|
||||
|
||||
The following templates are used to render errors.
|
||||
|
||||
|
||||
Page Not Found Template
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``not_found.html`` - template for 404 error pages.
|
||||
|
||||
This template is used to render any 404/page not found errors that can occur when loading a URL that is not in the web archive.
|
||||
|
||||
Template variables:
|
||||
|
||||
* ``{{ url }}`` - the URL of the page
|
||||
|
||||
* ``{{ wbrequest }}`` - the full ``WbRequest`` object which can be used to get additional info about the request.
|
||||
|
||||
|
||||
(The default template checks ``{{ wbrequest and wbrequest.env.pywb_proxy_magic }}`` to determine if the request is via an :ref:`https-proxy` connection or a regular request).
|
||||
|
||||
|
||||
Generic Error Template
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
File: ``error.html`` - generic error template.
|
||||
|
||||
|
||||
This template is used to render all other errors that are not 'page not found'.
|
||||
|
||||
Template variables:
|
||||
|
||||
* ``{{ err_msg }}`` - a shorter error message indicating what went wrong.
|
||||
|
||||
* ``{{ err_details }}`` - additional details about the error.
|
||||
|
||||
|
||||
|
||||
|
||||
.. _shared-template-vars:
|
||||
|
||||
Shared Template Variables
|
||||
-------------------------
|
||||
|
||||
The following template variables are available to all templates.
|
||||
|
||||
* ``{{ env }}`` - contains environment variables passed to pywb.
|
||||
|
||||
* ``{{ env.pywb_proxy_magic }}`` - if set, indicates pywb is accessed via proxy. See :ref:`https-proxy`
|
||||
|
||||
* ``{{ static_prefix }}`` - URL path to use for loading static files.
|
||||
|
||||
|
||||
UI Configuration
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
Starting with pywb 2.7.0, the ``ui`` block in ``config.yaml`` can contain any custom ui-specific settings.
|
||||
|
||||
This block is provided to the ``search.html``, ``query.html`` and ``banner.html`` templates.
|
||||
|
||||
|
||||
Localization Globals
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The Localization system (see: :ref:`localization`) adds several additional template globals, to facilitate listing available locales and getting URLs to switch locales, including:
|
||||
|
||||
* ``{{ _Q() }}`` - a function used to mark certain text for localization, e.g. ``{{ _Q('localize this text') }}``
|
||||
|
||||
* ``{{ env.pywb_lang }}`` - indicates current locale language code used for localization.
|
||||
|
||||
* ``{{ locales }}`` - a list of all available locale language codes, used for iterating over all locales.
|
||||
|
||||
* ``{{ get_locale_prefixes() }}`` - a function which returns the prefixes to use to switch locales.
|
||||
|
||||
* ``{{ switch_locale() }}`` - a function used to render a URL to switch locale for the current page. Ex: ``<a href="{{ switch_locale(locale) }}">{{ locale }}</a>`` renders a link to switch to a specific locale.
|
||||
|
9
docs/manual/ui-customization.rst
Normal file
9
docs/manual/ui-customization.rst
Normal file
@ -0,0 +1,9 @@
|
||||
UI Customization
|
||||
================
|
||||
|
||||
.. toctree::
|
||||
|
||||
ui-guide
|
||||
vue-ui
|
||||
template-guide
|
||||
|
91
docs/manual/ui-guide.rst
Normal file
91
docs/manual/ui-guide.rst
Normal file
@ -0,0 +1,91 @@
|
||||
.. _ui-customizations:
|
||||
|
||||
Customization Guide
|
||||
===================
|
||||
|
||||
Most aspects of the pywb user-interface can be customized by changing the default styles, or overriding the HTML templates.
|
||||
|
||||
This guide covers a few different options for customizing the UI.
|
||||
|
||||
|
||||
New Vue-based UI
|
||||
----------------
|
||||
|
||||
With pywb 2.7.0, pywb includes a brand new UI which includes a visual calendar mode and a histogram-based banner.
|
||||
|
||||
See :ref:`vue-ui` for more information on how to enable this UI.
|
||||
|
||||
|
||||
Customizing UI Templates
|
||||
------------------------
|
||||
|
||||
pywb renders HTML using the Jinja2 templating engine, loading default templates from the ``pywb/templates`` directory.
|
||||
|
||||
If running from a custom directory, templates can be placed in the ``templates`` directory and will override the defaults.
|
||||
|
||||
See :ref:`template-guide` for more details on customizing the templates.
|
||||
|
||||
|
||||
Static Files
|
||||
------------
|
||||
|
||||
pywb will automatically support static files placed under the following directories:
|
||||
|
||||
* Files under the root ``static`` directory: ``static/my-file.js`` can be accessed via ``http://localhost:8080/static/my-file.js``
|
||||
|
||||
|
||||
* Files under the per-collection directory: ``./collections/my-coll/static/my-file.js`` can be accessed via ``http://localhost:8080/static/_/my-coll/my-file.js``
|
||||
|
||||
|
||||
It is possible to change these settings via ``config.yaml``:
|
||||
|
||||
* ``static_prefix`` - sets the URL path used in pywb to serve static content (default ``static``)
|
||||
|
||||
* ``static_dir`` - sets the directory name used to read static files on disk (default ``static``)
|
||||
|
||||
While pywb can serve static files, it is recommended to use an existing web server to serve static files, especially if already using it in production.
|
||||
|
||||
For example, this can be done via nginx with:
|
||||
|
||||
|
||||
.. code:: text
|
||||
|
||||
location /wayback/static {
|
||||
alias /pywb/pywb/static;
|
||||
}
|
||||
|
||||
|
||||
Loading Custom Metadata
|
||||
-----------------------
|
||||
|
||||
pywb includes a default mechanism for loading externally defined metadata, loaded from a per-collection ``metadata.yaml`` YAML file at runtime.
|
||||
|
||||
See :ref:`custom-metadata` for more details.
|
||||
|
||||
Additionally, the banner template has access to the contents of the ``config.yaml`` via the ``{{ config }}`` template variable,
|
||||
allowing for passing in arbitrary config information.
|
||||
|
||||
For more dynamic loading of data, the banner and all of the templates can load additional data via JS ``fetch()`` calls.
|
||||
|
||||
|
||||
Embedding pywb in frames
|
||||
------------------------
|
||||
|
||||
It should be possible to embed pywb replay itself as an iframe as needed.
|
||||
|
||||
For customizing the top-level page and banner, see :ref:`custom-top-frame`.
|
||||
|
||||
However, there may be other reasons to embed pywb in an iframe.
|
||||
|
||||
This can be done simply by including something like:
|
||||
|
||||
.. code:: html
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<body>
|
||||
<div>Embedding pywb replay</div>
|
||||
<iframe style="width: 100%; height: 100%" src="http://localhost:8080/pywb/20130729195151/http://test@example.com/"></iframe>
|
||||
</body>
|
||||
</html>
|
||||
|
@ -20,9 +20,13 @@ and introduces many new features, including:
|
||||
|
||||
* Flexible rewriting system with pluggable rewriters for different content-types.
|
||||
|
||||
* Significantly improved client-side rewriting to handle most modern web sites.
|
||||
* Significantly improved :ref:`wombat` to handle most modern web sites.
|
||||
|
||||
* Improved 'calendar' query UI, grouping results by year and month, and updated replay banner.
|
||||
* Improved 'calendar' query UI with incremental loading, grouping results by year and month, and updated replay banner.
|
||||
|
||||
* New in 2.4: Extensible :ref:`ui-customizations` for modifying all aspects of the UI.
|
||||
|
||||
* New in 2.4: Robust :ref:`access-control` system for blocking or excluding URLs, by prefix or by exact match.
|
||||
|
||||
|
||||
Getting Started
|
||||
@ -91,8 +95,8 @@ add the WARC to a new collection and start pywb:
|
||||
|
||||
docker pull webrecorder/pywb
|
||||
docker run -e INIT_COLLECTION=my-web-archive -v /pywb-data:/webarchive \
|
||||
-v /path/to:/source webrecorder/pywb wb-manager add default /path/to/my_warc.warc.gz
|
||||
docker run -p 8080:8080 -v /pywb-data/:/webarchive wayback
|
||||
-v /path/to:/source webrecorder/pywb wb-manager add my-web-archive /source/my_warc.warc.gz
|
||||
docker run -p 8080:8080 -v /pywb-data/:/webarchive webrecorder/pywb wayback
|
||||
|
||||
This example is equivalent to the non-Docker example above.
|
||||
|
||||
@ -110,6 +114,8 @@ Using Existing Web Archive Collections
|
||||
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
|
||||
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
|
||||
|
||||
In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
|
||||
|
||||
By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.
|
||||
|
||||
If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process.
|
||||
@ -150,32 +156,40 @@ To enable auto-indexing, run with ``wayback -a`` or ``wayback -a --auto-interval
|
||||
Creating a Web Archive
|
||||
----------------------
|
||||
|
||||
Using Webrecorder
|
||||
^^^^^^^^^^^^^^^^^
|
||||
Using ArchiveWeb.page
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If you do not have a web archive to test, one easy way to create one is to use `Webrecorder <https://webrecorder.io>`_
|
||||
If you do not have a web archive to test, one easy way to create one is to use the `ArchiveWeb.page <https://archiveweb.page>`_ browser extension for Chrome and other Chromium-based browsers such as Brave Browser. ArchiveWeb.page records pages visited during an archiving session in the browser, and provides means of both replaying and downloading the archived items created.
|
||||
|
||||
After recording, you can click **Stop** and then click `Download Collection` to receive a WARC (`.warc.gz`) file.
|
||||
Follow the instructions in `How To Create Web Archives with ArchiveWeb.page <https://archiveweb.page/en/usage/>`_. After recording, press **Stop** and then `download your collection <https://archiveweb.page/en/download/>`_ to receive a WARC (`.warc.gz`) file. If you choose to download your collection in the WACZ format, the WARC files can be found inside the zipped WACZ in the ``archive/`` directory.
|
||||
|
||||
You can then use this with work with pywb.
|
||||
You can then use your WARCs to work with pywb.
|
||||
|
||||
|
||||
Using pywb Recorder
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The core recording functionality in Webrecorder is also part of :mod:`pywb`. If you want to create a WARC locally, this can be
|
||||
Recording functionality is also part of :mod:`pywb`. If you want to create a WARC locally, this can be
|
||||
done by directly recording into your pywb collection:
|
||||
|
||||
1. Create a collection: ``wb-manager init my-web-archive`` (if you haven't already created a web archive collection)
|
||||
2. Run: ``wayback --record --live -a --auto-interval 10``
|
||||
3. Point your browser to ``http://localhost:8080/my-web-archive/record/<url>``
|
||||
|
||||
For example, to record ``http://example.com/``, visit ``http://localhost:8080/my-web-archive/record/<url>``
|
||||
For example, to record ``http://example.com/``, visit ``http://localhost:8080/my-web-archive/record/http://example.com/``
|
||||
|
||||
In this configuration, indexing happens every 10 seconds. After 10 seconds, the recorded URL will be accessible for replay, e.g.:
|
||||
``http://localhost:8080/my-web-archive/http://example.com/``
|
||||
|
||||
|
||||
Using Browsertrix
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
For a more automated browser-based web archiving experience, `Browsertrix <https://browsertrix.com/>`_ provides a web interface for configuring, scheduling, running, reviewing, and curating crawls of web content. Crawl activity is shown in a live screencast of the browsers used for crawling and all web archives created in Browsertrix can be easily downloaded from the application in the WACZ format.
|
||||
|
||||
`Browsertrix Crawler <https://crawler.docs.browsertrix.com/>`_, which provides the underlying crawling functionality of Browsertrix, can also be run standalone in a Docker container on your local computer.
|
||||
|
||||
|
||||
HTTP/S Proxy Mode Access
|
||||
------------------------
|
||||
|
||||
@ -202,6 +216,21 @@ pywb uses the gevent coroutine library, and the default app will support many co
|
||||
|
||||
For larger scale production deployments, running with `uwsgi <http://uwsgi-docs.readthedocs.io/>`_ server application is recommended. The ``uwsgi.ini`` script provided can be used to launch pywb with uwsgi. uwsgi can be scaled to multiple processes to support the necessary workload, and pywb must be run with the `Gevent Loop Engine <http://uwsgi-docs.readthedocs.io/en/latest/Gevent.html>`_. Nginx or Apache can be used as an additional frontend for uwsgi.
|
||||
|
||||
It is recommended to install uwsgi and its dependencies in a Python virtual environment (virtualenv). Consult the uwsgi documentation for `virtualenv support <https://uwsgi-docs.readthedocs.io/en/latest/Python.html#virtualenv-support>`_ for details on how to specify the virtualenv to uwsgi.
|
||||
|
||||
Installation of uwsgi in a virtualenv will avoid known issues with installing uwsgi in some Debian-based OSes with Python 3.9+. As an example, in Ubuntu 22.04 with Python 3.10, it is recommended to install uwsgi like so: ::
|
||||
|
||||
sudo apt install -y python3-pip \
|
||||
python3-dev \
|
||||
build-essential \
|
||||
libssl-dev \
|
||||
libffi-dev \
|
||||
python3-setuptools \
|
||||
python3-venv
|
||||
python3 -m venv pywbenv
|
||||
source pywbenv/bin/activate
|
||||
pip install wheel uwsgi pywb
|
||||
|
||||
Although uwsgi does not provide a way to specify command-line options, all command-line options can alternatively be configured via ``config.yaml``. See :ref:`configuring-pywb` for more info on available configuration options.
|
||||
|
||||
Docker Deployment
|
||||
@ -214,18 +243,20 @@ The following will run pywb in Docker directly on port 80:
|
||||
|
||||
.. code:: console
|
||||
|
||||
docker run -p 80:8080 -v /webarchive-data/:/webarchive
|
||||
docker run -p 80:8080 -v /webarchive-data/:/webarchive webrecorder/pywb
|
||||
|
||||
To run pywb in Docker behind a local nginx (as shown below), port 8081 should also be mapped:
|
||||
|
||||
.. code:: console
|
||||
|
||||
docker run -p 8081:8081 -v /webarchive-data/:/webarchive
|
||||
docker run -p 8081:8081 -v /webarchive-data/:/webarchive webrecorder/pywb
|
||||
|
||||
|
||||
See :ref:`getting-started-docker` for more info on using pywb with Docker.
|
||||
|
||||
|
||||
.. _nginx-deploy:
|
||||
|
||||
Sample Nginx Configuration
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
@ -259,29 +290,99 @@ See the `Nginx Docs <https://nginx.org/en/docs/>`_ for a lot more details on how
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
.. _apache-deploy:
|
||||
|
||||
Sample Apache Configuration
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The following Apache configuration snippet can be used to deploy pywb *without* uwsgi. A configuration with uwsgi is also probably possible but this covers the simplest case of launching the `wayback` binary directly.
|
||||
The recommended Apache configuration is to use pywb with ``mod_proxy`` and ``mod_proxy_uwsgi``.
|
||||
|
||||
The configuration assumes pywb is running on port 8080 on localhost, but it could be on a different machine as well.
|
||||
To enable these, ensure that your httpd.conf includes:
|
||||
|
||||
.. code:: apache
|
||||
|
||||
LoadModule proxy_module modules/mod_proxy.so
|
||||
LoadModule proxy_uwsgi_module modules/mod_proxy_uwsgi.so
|
||||
|
||||
|
||||
|
||||
Then, in your config, simply include:
|
||||
|
||||
.. code:: apache
|
||||
|
||||
<VirtualHost *:80>
|
||||
ServerName proxy.example.com
|
||||
Redirect / https://proxy.example.com/
|
||||
DocumentRoot /var/www/html/
|
||||
ProxyPass / uwsgi://pywb:8081/
|
||||
</VirtualHost>
|
||||
|
||||
<VirtualHost *:443>
|
||||
ServerName proxy.example.com
|
||||
SSLEngine on
|
||||
DocumentRoot /var/www/html/
|
||||
ErrorDocument 404 /404.html
|
||||
ProxyPreserveHost On
|
||||
ProxyPass /.well-known/ !
|
||||
ProxyPass / http://localhost:8080/
|
||||
ProxyPassReverse / http://localhost:8080/
|
||||
RequestHeader set "X-Forwarded-Proto" expr=%{REQUEST_SCHEME}
|
||||
</VirtualHost>
|
||||
The configuration assumes uwsgi is started with ``uwsgi uwsgi.ini``.
|
||||
|
||||
|
||||
.. _config-acl-header:
|
||||
|
||||
Configuring Access Control Header
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The :ref:`access-control` system allows users to be granted different access settings based on the value of an ACL header, ``X-pywb-ACL-user``.
|
||||
|
||||
The header can be set via Nginx or Apache to grant custom access privileges based on IP address, password, or other combination of rules.
|
||||
|
||||
For example, to set the value of the header to ``staff`` if the IP of the request is from designated local IP ranges (127.0.0.1, 192.168.1.0/24), the following settings can be added to the configs:
|
||||
|
||||
For Nginx::
|
||||
|
||||
geo $acl_user {
|
||||
# ensure user is set to empty by default
|
||||
default "";
|
||||
|
||||
# optional: add IP ranges to allow privileged access
|
||||
127.0.0.1 "staff";
|
||||
192.168.1.0/24 "staff";
|
||||
}
|
||||
|
||||
...
|
||||
location /wayback/ {
|
||||
...
|
||||
uwsgi_param HTTP_X_PYWB_ACL_USER $acl_user;
|
||||
}
|
||||
|
||||
|
||||
For Apache::
|
||||
|
||||
<If "-R '192.168.1.0/24' || -R '127.0.0.1'">
|
||||
RequestHeader set X-Pywb-ACL-User staff
|
||||
</If>
|
||||
# ensure header is cleared if no match
|
||||
<Else>
|
||||
RequestHeader set X-Pywb-ACL-User ""
|
||||
</Else>
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
Running on Subdirectory Path
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To run pywb on a subdirectory, rather than at the root of the web server, the recommended configuration is to adjust the ``uwsgi.ini`` to include the subdirectory.
|
||||
For example, to deploy pywb under the ``/wayback`` subdirectory, the ``uwsgi.ini`` can be configured as follows:
|
||||
|
||||
.. code:: ini
|
||||
|
||||
mount = /wayback=./pywb/apps/wayback.py
|
||||
manage-script-name = true
|
||||
|
||||
|
||||
.. _example-deploy:
|
||||
|
||||
Deployment Examples
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The ``sample-deploy`` directory includes working Docker Compose examples for deploying pywb with Nginx and Apache on the ``/wayback`` subdirectory.
|
||||
|
||||
See:
|
||||
- `Docker Compose Nginx <https://github.com/webrecorder/pywb/blob/main/sample-deploy/docker-compose-nginx.yaml>`_ for sample Nginx config.
|
||||
- `Docker Compose Apache <https://github.com/webrecorder/pywb/blob/main/sample-deploy/docker-compose-apache.yaml>`_ for sample Apache config.
|
||||
- `uwsgi_subdir.ini <https://github.com/webrecorder/pywb/blob/main/sample-deploy/uwsgi_subdir.ini>`_ for example subdirectory uwsgi config.
|
||||
|
||||
|
126
docs/manual/vue-ui.rst
Normal file
126
docs/manual/vue-ui.rst
Normal file
@ -0,0 +1,126 @@
|
||||
.. _vue-ui:
|
||||
|
||||
|
||||
Vue-based UI
|
||||
================
|
||||
|
||||
With 2.7.0, pywb introduces a new `Vue UI <https://vuejs.org/>`_ based system, which provides a more feature-rich representation of a web archive.
|
||||
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
Calendar UI
|
||||
^^^^^^^^^^^
|
||||
|
||||
The new calendar UI provides a histogram and a clickable calendar representation of a web archive.
|
||||
|
||||
The calendar is rendered in place of the URL query page from versions before 2.7.0.
|
||||
|
||||
.. image:: images/vue-cal.png
|
||||
:width: 600
|
||||
:alt: Calendar UI Screenshot
|
||||
|
||||
|
||||
Banner Replay UI
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
The new banner histogram allows for zooming in on captures per year, month, week, and day.
|
||||
|
||||
Navigation preserves the different levels. The full calendar UI is also available as a dropdown by clicking the calendar icon.
|
||||
|
||||
The new banner should allow for faster navigation across multiple captures.
|
||||
|
||||
.. image:: images/vue-banner.png
|
||||
:width: 600
|
||||
:alt: Banner Replay UI Screenshot
|
||||
|
||||
|
||||
Custom Logo
|
||||
^^^^^^^^^^^
|
||||
|
||||
It is possible to configure a custom logo by setting ``ui.logo`` in ``config.yaml`` to a static file.
|
||||
|
||||
If omitted, the standard pywb logo will be used by default.
|
||||
|
||||
If set, the logo should point to a file in the static directory (default is ``static`` but can be changed via the ``static_dir`` config option).
|
||||
|
||||
For example, to use the file ``./static/my-logo.png`` as the logo, set:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
ui:
|
||||
logo: my-logo.png
|
||||
|
||||
|
||||
Logo URL
|
||||
^^^^^^^^
|
||||
|
||||
It is possible to configure the logo to link to any URL by setting ``ui.logo_home_url`` in ``config.yaml`` to the URL of your choice.
|
||||
|
||||
If omitted, the logo will not link to any page.
|
||||
|
||||
For example, to have the logo redirect to ``https://example.com/web-archive-landing-page``, set:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
ui:
|
||||
logo_home_url: https://example.com/web-archive-landing-page
|
||||
|
||||
|
||||
Printing
|
||||
^^^^^^^^
|
||||
|
||||
As of pywb 2.8, the replay header includes a print button that prints the contents of the replay iframe.
|
||||
|
||||
This button can be disabled by setting ``ui.disable_printing`` in ``config.yaml`` to any value.
|
||||
|
||||
For example:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
ui:
|
||||
disable_printing: true
|
||||
|
||||
|
||||
Banner Colors
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
It is possible to configure the background color, text color, and button outlines of the header by setting values in the ``ui`` section of ``config.yaml``.
|
||||
|
||||
To customize the header background color, set ``ui.navbar_background_hex`` to the color's hex value, with the initial hash symbol (``#``) omitted. If this setting is not provided, ``#f8f9fa`` (Bootstrap 4's ``light``) will be used by default.
|
||||
|
||||
For example, to use the color ``#cff3ff`` as the banner color, set:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
ui:
|
||||
navbar_background_hex: cff3ff
|
||||
|
||||
The navbar text color can similarly be set using the ``ui.navbar_color_hex`` setting.
|
||||
|
||||
The banner's buttons default to Bootstrap 4's ``btn-outline-dark``. To use light-outlined buttons instead, set ``ui.navbar_light_buttons`` equal to any value.
|
||||
|
||||
|
||||
Updating the Vue UI
|
||||
-------------------
|
||||
|
||||
The UI is contained within the ``pywb/vueui`` directory.
|
||||
|
||||
The Vue component sources can be found in ``pywb/vueui/src``.
|
||||
|
||||
Updating the UI requires ``node`` and ``yarn``.
|
||||
|
||||
To install and build, run:
|
||||
|
||||
|
||||
.. code:: console
|
||||
|
||||
cd pywb/vueui
|
||||
yarn install
|
||||
yarn build
|
||||
|
||||
|
||||
This will generate the output to ``pywb/static/vue/vueui.js`` which is loaded from the default templates when the Vue UI rendering is enabled.
|
||||
|
||||
Additional styles for the banner are loaded from ``pywb/static/vue_banner.css``.
|
@ -320,12 +320,12 @@ which does not use a YAML config
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
server = BaseWarcServer()
|
||||
app = BaseWarcServer()
|
||||
|
||||
# /live endpoint
|
||||
live_agg = SimpleAggregator({'live': LiveIndexSource()})
|
||||
|
||||
server.add_route('/live', DefaultResourceHandler(live_agg))
|
||||
app.add_route('/live', DefaultResourceHandler(live_agg))
|
||||
|
||||
|
||||
# /memento endpoint
|
||||
|
@ -2,5 +2,8 @@ certauth
|
||||
youtube-dl
|
||||
boto3
|
||||
uwsgi
|
||||
git+https://github.com/esnme/ultrajson.git
|
||||
ujson
|
||||
pysocks
|
||||
lxml
|
||||
babel
|
||||
translate_toolkit
|
||||
|
@ -1,4 +0,0 @@
|
||||
NODE_BIN_DIR=../node_modules/.bin
|
||||
|
||||
test:
|
||||
$(NODE_BIN_DIR)/karma start --single-run
|
@ -1,9 +0,0 @@
|
||||
<html>
|
||||
<head><meta charset="UTF-8"></head>
|
||||
<body>
|
||||
<!-- This is a dummy page used in
|
||||
tests of Wombat's live-rewriting
|
||||
functionality.
|
||||
!-->
|
||||
</body>
|
||||
</html>
|
@ -1,108 +0,0 @@
|
||||
var sauceLabsConfig = {
|
||||
testName: 'pywb Client Tests',
|
||||
};
|
||||
|
||||
// see https://github.com/karma-runner/karma-sauce-launcher/issues/73
|
||||
if (process.env.TRAVIS_JOB_NUMBER) {
|
||||
sauceLabsConfig.startConnect = false;
|
||||
sauceLabsConfig.tunnelIdentifier = process.env.TRAVIS_JOB_NUMBER;
|
||||
}
|
||||
|
||||
var WOMBAT_JS_PATH = 'pywb/static/wombat.js';
|
||||
|
||||
var sauceLaunchers = {
|
||||
sl_chrome: {
|
||||
base: 'SauceLabs',
|
||||
browserName: 'chrome',
|
||||
},
|
||||
|
||||
sl_firefox: {
|
||||
base: 'SauceLabs',
|
||||
browserName: 'firefox',
|
||||
},
|
||||
|
||||
sl_safari: {
|
||||
base: 'SauceLabs',
|
||||
browserName: 'safari',
|
||||
platform: 'OS X 10.11',
|
||||
version: '9.0',
|
||||
},
|
||||
|
||||
sl_edge: {
|
||||
base: 'SauceLabs',
|
||||
browserName: 'MicrosoftEdge',
|
||||
},
|
||||
};
|
||||
|
||||
var localLaunchers = {
|
||||
localFirefox: {
|
||||
base: 'Firefox',
|
||||
},
|
||||
};
|
||||
|
||||
var customLaunchers = {};
|
||||
|
||||
if (process.env['SAUCE_USERNAME'] && process.env['SAUCE_ACCESS_KEY']) {
|
||||
customLaunchers = sauceLaunchers;
|
||||
} else {
|
||||
console.error('Sauce Labs account details not set, ' +
|
||||
'Karma tests will be run only against local browsers.' +
|
||||
'Set SAUCE_USERNAME and SAUCE_ACCESS_KEY environment variables to ' +
|
||||
'run tests against Sauce Labs browsers');
|
||||
customLaunchers = localLaunchers;
|
||||
}
|
||||
|
||||
module.exports = function(config) {
|
||||
config.set({
|
||||
basePath: '../',
|
||||
|
||||
frameworks: ['mocha', 'chai'],
|
||||
|
||||
files: [
|
||||
{
|
||||
pattern: WOMBAT_JS_PATH,
|
||||
watched: true,
|
||||
included: false,
|
||||
served: true,
|
||||
},
|
||||
{
|
||||
pattern: 'karma-tests/dummy.html',
|
||||
included: false,
|
||||
served: true,
|
||||
},
|
||||
'karma-tests/*.spec.js',
|
||||
],
|
||||
|
||||
preprocessors: {},
|
||||
|
||||
reporters: ['progress'],
|
||||
|
||||
port: 9876,
|
||||
|
||||
colors: true,
|
||||
|
||||
logLevel: config.LOG_INFO,
|
||||
|
||||
autoWatch: true,
|
||||
|
||||
sauceLabs: sauceLabsConfig,
|
||||
|
||||
// Set extended timeouts to account for the slowness
|
||||
// in connecting to remote browsers (eg. when using
|
||||
// Sauce Labs)
|
||||
//
|
||||
// See https://oligofren.wordpress.com/2014/05/27/running-karma-tests-on-browserstack/
|
||||
captureTimeout: 3 * 60000,
|
||||
browserNoActivityTimeout: 30 * 1000,
|
||||
browserDisconnectTimeout: 10 * 1000,
|
||||
browserDisconnectTolerance: 1,
|
||||
|
||||
customLaunchers: customLaunchers,
|
||||
|
||||
browsers: Object.keys(customLaunchers),
|
||||
|
||||
singleRun: false,
|
||||
|
||||
concurrency: Infinity
|
||||
})
|
||||
};
|
@ -1,225 +0,0 @@
|
||||
var DEFAULT_TIMEOUT = 20000;
|
||||
|
||||
// creates a new document in an <iframe> and runs
|
||||
// a WombatJS test case in it.
|
||||
//
|
||||
// A new <iframe> is used for each test so that each
|
||||
// case is run with fresh Document and Window objects,
|
||||
// since Wombat monkey-patches many Document and Window
|
||||
// functions
|
||||
//
|
||||
function runWombatTest(testCase, done) {
|
||||
// create an <iframe>
|
||||
var testFrame = document.createElement('iframe');
|
||||
testFrame.src = '/base/karma-tests/dummy.html';
|
||||
document.body.appendChild(testFrame);
|
||||
|
||||
testFrame.contentWindow.addEventListener('load', function () {
|
||||
var testDocument = testFrame.contentDocument;
|
||||
|
||||
function runFunctionInIFrame(func) {
|
||||
testFrame.contentWindow.eval('(' + func.toString() + ')()');
|
||||
}
|
||||
|
||||
// expose an error reporting function to the <iframe>
|
||||
window.reportError = function(ex) {
|
||||
done(new Error(ex));
|
||||
};
|
||||
|
||||
// expose utility methods for assertion testing in tests.
|
||||
// (We used to expose chai asserts here but Karma's default
|
||||
// error reporter replaces URLs in exception messages with
|
||||
// the corresponding file paths, which is unhelpful for us
|
||||
// since assert.equal() will often be called with URLs in our tests)
|
||||
window.assert = {
|
||||
equal: function (a, b) {
|
||||
if (a !== b) {
|
||||
console.error('Mismatch between', a, 'and', b);
|
||||
throw new Error('AssertionError');
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
runFunctionInIFrame(function () {
|
||||
// re-assign the iframe's console object to the parent window's
|
||||
// console so that messages are intercepted by Karma
|
||||
// and output to wherever it is configured to send
|
||||
// console logs (typically stdout)
|
||||
console = window.parent.console;
|
||||
window.onerror = function (message, url, line, col, error) {
|
||||
if (error) {
|
||||
console.log(error.stack);
|
||||
}
|
||||
reportError(new Error(message));
|
||||
};
|
||||
|
||||
// expose chai's assertion testing API to the test script
|
||||
window.assert = window.parent.assert;
|
||||
window.reportError = window.parent.reportError;
|
||||
|
||||
// helpers which check whether DOM property overrides are supported
|
||||
// in the current browser
|
||||
window.domTests = {
|
||||
areDOMPropertiesConfigurable: function () {
|
||||
var descriptor = Object.getOwnPropertyDescriptor(Node.prototype, 'baseURI');
|
||||
if (descriptor && !descriptor.configurable) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
try {
|
||||
runFunctionInIFrame(testCase.initScript);
|
||||
} catch (e) {
|
||||
throw new Error('Configuring Wombat failed: ' + e.toString());
|
||||
}
|
||||
|
||||
try {
|
||||
testFrame.contentWindow.eval(testCase.wombatScript);
|
||||
runFunctionInIFrame(function () {
|
||||
new window._WBWombat(window, wbinfo);
|
||||
});
|
||||
} catch (e) {
|
||||
console.error(e.stack);
|
||||
throw new Error('Initializing WombatJS failed: ' + e.toString());
|
||||
}
|
||||
|
||||
if (testCase.html) {
|
||||
testDocument.body.innerHTML = testCase.html;
|
||||
}
|
||||
|
||||
if (testCase.testScript) {
|
||||
try {
|
||||
runFunctionInIFrame(testCase.testScript);
|
||||
} catch (e) {
|
||||
throw new Error('Test script failed: ' + e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
testFrame.remove();
|
||||
done();
|
||||
});
|
||||
}
|
||||
|
||||
describe('WombatJS', function () {
|
||||
this.timeout(DEFAULT_TIMEOUT);
|
||||
|
||||
var wombatScript;
|
||||
|
||||
before(function (done) {
|
||||
// load the source of the WombatJS content
|
||||
// rewriting script
|
||||
var req = new XMLHttpRequest();
|
||||
req.open('GET', '/base/pywb/static/wombat.js');
|
||||
req.onload = function () {
|
||||
wombatScript = req.responseText;
|
||||
done();
|
||||
};
|
||||
req.send();
|
||||
});
|
||||
|
||||
it('should load', function (done) {
|
||||
runWombatTest({
|
||||
initScript: function () {
|
||||
wbinfo = {
|
||||
wombat_opts: {},
|
||||
wombat_ts: '',
|
||||
is_live: false,
|
||||
top_url: ''
|
||||
};
|
||||
},
|
||||
wombatScript: wombatScript,
|
||||
}, done);
|
||||
});
|
||||
|
||||
describe('anchor rewriting', function () {
|
||||
var config;
|
||||
beforeEach(function () {
|
||||
config = {
|
||||
initScript: function () {
|
||||
wbinfo = {
|
||||
wombat_opts: {},
|
||||
wombat_scheme: 'http',
|
||||
prefix: window.location.origin,
|
||||
wombat_ts: '',
|
||||
is_live: false,
|
||||
top_url: ''
|
||||
};
|
||||
},
|
||||
wombatScript: wombatScript,
|
||||
html: '<a href="foobar.html" id="link">A link</a>',
|
||||
};
|
||||
});
|
||||
|
||||
it('should rewrite links in dynamically injected <a> tags', function (done) {
|
||||
config.testScript = function () {
|
||||
if (domTests.areDOMPropertiesConfigurable()) {
|
||||
var link = document.getElementById('link');
|
||||
assert.equal(link.href, 'http:///base/karma-tests/foobar.html');
|
||||
}
|
||||
};
|
||||
|
||||
runWombatTest(config, done);
|
||||
});
|
||||
|
||||
it('toString() should return the rewritten URL', function (done) {
|
||||
config.testScript = function () {
|
||||
if (domTests.areDOMPropertiesConfigurable()) {
|
||||
var link = document.getElementById('link');
|
||||
assert.equal(link.href, link.toString());
|
||||
}
|
||||
};
|
||||
runWombatTest(config, done);
|
||||
});
|
||||
});
|
||||
|
||||
describe('base URL overrides', function () {
|
||||
it('document.baseURI should return the original URL', function (done) {
|
||||
runWombatTest({
|
||||
initScript: function () {
|
||||
wbinfo = {
|
||||
wombat_opts: {},
|
||||
prefix: window.location.origin,
|
||||
wombat_ts: '',
|
||||
wombat_scheme: 'http',
|
||||
is_live: false,
|
||||
top_url: ''
|
||||
};
|
||||
},
|
||||
wombatScript: wombatScript,
|
||||
testScript: function () {
|
||||
var baseURI = document.baseURI;
|
||||
if (typeof baseURI !== 'string') {
|
||||
throw new Error('baseURI is not a string');
|
||||
}
|
||||
if (domTests.areDOMPropertiesConfigurable()) {
|
||||
assert.equal(baseURI, 'http:///base/karma-tests/dummy.html');
|
||||
}
|
||||
},
|
||||
}, done);
|
||||
});
|
||||
|
||||
it('should allow base.href to be assigned', function (done) {
|
||||
runWombatTest({
|
||||
initScript: function () {
|
||||
wbinfo = {
|
||||
wombat_opts: {},
|
||||
wombat_scheme: 'http',
|
||||
is_live: false,
|
||||
top_url: ''
|
||||
};
|
||||
},
|
||||
wombatScript: wombatScript,
|
||||
testScript: function () {
|
||||
'use strict';
|
||||
var baseElement = document.createElement('base');
|
||||
baseElement.href = 'http://foobar.com/base';
|
||||
assert.equal(baseElement.href, 'http://foobar.com/base');
|
||||
},
|
||||
}, done);
|
||||
});
|
||||
});
|
||||
});
|
34
package.json
34
package.json
@ -1,34 +0,0 @@
|
||||
{
|
||||
"name": "pywb",
|
||||
"version": "1.0.0",
|
||||
"description": "Web archival replay tools",
|
||||
"main": "index.js",
|
||||
"directories": {
|
||||
"doc": "doc",
|
||||
"test": "tests"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/ikreymer/pywb.git"
|
||||
},
|
||||
"author": "",
|
||||
"license": "GPL-3.0",
|
||||
"bugs": {
|
||||
"url": "https://github.com/ikreymer/pywb/issues"
|
||||
},
|
||||
"homepage": "https://github.com/ikreymer/pywb#readme",
|
||||
"devDependencies": {
|
||||
"chai": "^3.4.1",
|
||||
"karma": "^0.13.15",
|
||||
"karma-chai": "^0.1.0",
|
||||
"karma-chrome-launcher": "^0.2.1",
|
||||
"karma-firefox-launcher": "^0.1.7",
|
||||
"karma-html2js-preprocessor": "^0.1.0",
|
||||
"karma-mocha": "^0.2.1",
|
||||
"karma-sauce-launcher": "^0.3.0",
|
||||
"mocha": "^2.3.4"
|
||||
}
|
||||
}
|
@ -2,6 +2,13 @@ from gevent.monkey import patch_all; patch_all()
|
||||
from argparse import ArgumentParser
|
||||
|
||||
import logging
|
||||
import pkg_resources
|
||||
|
||||
|
||||
#=============================================================================
|
||||
def get_version():
|
||||
"""Get version of the pywb"""
|
||||
return "pywb " + pkg_resources.get_distribution("pywb").version
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -40,6 +47,8 @@ class BaseCli(object):
|
||||
:param str desc: The description for the application to be started
|
||||
"""
|
||||
parser = ArgumentParser(description=desc)
|
||||
parser.add_argument("-V", "--version", action="version", version=get_version())
|
||||
|
||||
parser.add_argument('-p', '--port', type=int, default=default_port,
|
||||
help='Port to listen on (default %s)' % default_port)
|
||||
parser.add_argument('-b', '--bind', default='0.0.0.0',
|
||||
@ -110,7 +119,7 @@ class BaseCli(object):
|
||||
self.extra_config['debug'] = True
|
||||
|
||||
if self.r.record:
|
||||
self.extra_config['recorder'] = 'live'
|
||||
self.extra_config['recorder'] = {'source_coll': 'live'}
|
||||
|
||||
def run(self):
|
||||
"""Start the application"""
|
||||
|
@ -1,29 +1,29 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
|
||||
#from bottle import run, Bottle, request, response, debug
|
||||
from werkzeug.routing import Map, Rule
|
||||
from werkzeug.exceptions import HTTPException, NotFound
|
||||
from werkzeug.wsgi import pop_path_info
|
||||
from six.moves.urllib.parse import urljoin
|
||||
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
|
||||
from wsgiref.util import shift_path_info
|
||||
from six.moves.urllib.parse import urljoin, parse_qsl
|
||||
from six import iteritems
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
from warcio.utils import to_native_str
|
||||
from warcio.timeutils import iso_date_to_timestamp
|
||||
from warcio.timeutils import iso_date_to_timestamp, timestamp_to_iso_date
|
||||
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
||||
|
||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||
from pywb.recorder.recorderapp import RecorderApp
|
||||
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
||||
from pywb.recorder.redisindexer import WritableRedisIndexer, RedisPendingCounterTempBuffer
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
from pywb.utils.io import StreamIter
|
||||
from pywb.utils.wbexception import WbException, AppPageNotFound
|
||||
|
||||
from pywb.warcserver.warcserver import WarcServer
|
||||
|
||||
from pywb.rewrite.templateview import BaseInsertView
|
||||
|
||||
from pywb.apps.static_handler import StaticHandler
|
||||
from pywb.apps.rewriterapp import RewriterApp, UpstreamException
|
||||
from pywb.apps.rewriterapp import RewriterApp
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
import os
|
||||
@ -44,6 +44,8 @@ class FrontEndApp(object):
|
||||
- WSGIProxMiddleware (Optional): If proxy mode is enabled, performs pywb's HTTP(s) proxy functionality
|
||||
- AutoIndexer (Optional): If auto-indexing is enabled for the collections it is started here
|
||||
- RecorderApp (Optional): Recording functionality, available when recording mode is enabled
|
||||
|
||||
The RewriterApp is configurable and can be set via the class var `REWRITER_APP_CLS`, defaults to RewriterApp
|
||||
"""
|
||||
|
||||
REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq'
|
||||
@ -57,16 +59,23 @@ class FrontEndApp(object):
|
||||
|
||||
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
|
||||
|
||||
REWRITER_APP_CLS = RewriterApp
|
||||
|
||||
ALL_DIGITS = re.compile(r'^\d+$')
|
||||
|
||||
def __init__(self, config_file='./config.yaml', custom_config=None):
|
||||
def __init__(self, config_file=None, custom_config=None):
|
||||
"""
|
||||
:param str config_file: Path to the config file
|
||||
:param dict custom_config: Dictionary containing additional configuration information
|
||||
:param str|None config_file: Path to the config file
|
||||
:param dict|None custom_config: Dictionary containing additional configuration information
|
||||
"""
|
||||
config_file = config_file or './config.yaml'
|
||||
self.handler = self.handle_request
|
||||
self.warcserver = WarcServer(config_file=config_file,
|
||||
custom_config=custom_config)
|
||||
self.recorder = None
|
||||
self.recorder_path = None
|
||||
self.put_custom_record_path = None
|
||||
self.proxy_default_timestamp = None
|
||||
|
||||
config = self.warcserver.config
|
||||
|
||||
@ -76,6 +85,7 @@ class FrontEndApp(object):
|
||||
|
||||
self.proxy_prefix = None # the URL prefix to be used for the collection with proxy mode (e.g. /coll/id_/)
|
||||
self.proxy_coll = None # the name of the collection that has proxy mode enabled
|
||||
self.proxy_record = False # indicate if proxy recording
|
||||
self.init_proxy(config)
|
||||
|
||||
self.init_recorder(config.get('recorder'))
|
||||
@ -86,28 +96,32 @@ class FrontEndApp(object):
|
||||
self.static_handler = StaticHandler(static_path)
|
||||
|
||||
self.cdx_api_endpoint = config.get('cdx_api_endpoint', '/cdx')
|
||||
|
||||
self._init_routes()
|
||||
self.query_limit = config.get('query_limit')
|
||||
|
||||
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||
|
||||
framed_replay = config.get('framed_replay', True)
|
||||
self.rewriterapp = RewriterApp(framed_replay,
|
||||
config=config,
|
||||
paths=upstream_paths)
|
||||
self.rewriterapp = self.REWRITER_APP_CLS(framed_replay,
|
||||
config=config,
|
||||
paths=upstream_paths)
|
||||
|
||||
self.templates_dir = config.get('templates_dir', 'templates')
|
||||
self.static_dir = config.get('static_dir', 'static')
|
||||
self.static_prefix = config.get('static_prefix', 'static')
|
||||
self.default_locale = config.get('default_locale', '')
|
||||
|
||||
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
||||
self.metadata_cache = MetadataCache(metadata_templ)
|
||||
|
||||
self._init_routes()
|
||||
|
||||
def _init_routes(self):
|
||||
"""Initialize the routes and based on the configuration file makes available
|
||||
specific routes (proxy mode, record)"""
|
||||
specific routes (proxy mode, record)
|
||||
"""
|
||||
self.url_map = Map()
|
||||
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
|
||||
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
|
||||
self.url_map.add(Rule('/{0}/_/<coll>/<path:filepath>'.format(self.static_prefix), endpoint=self.serve_static))
|
||||
self.url_map.add(Rule('/{0}/<path:filepath>'.format(self.static_prefix), endpoint=self.serve_static))
|
||||
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
|
||||
|
||||
if self.is_valid_coll('$root'):
|
||||
@ -116,18 +130,57 @@ class FrontEndApp(object):
|
||||
coll_prefix = '/<coll>'
|
||||
self.url_map.add(Rule('/', endpoint=self.serve_home))
|
||||
|
||||
self.url_map.add(Rule(coll_prefix + self.cdx_api_endpoint, endpoint=self.serve_cdx))
|
||||
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
|
||||
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||
|
||||
if self.recorder_path:
|
||||
self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
|
||||
self._init_coll_routes(coll_prefix)
|
||||
|
||||
if self.proxy_prefix is not None:
|
||||
# Add the proxy-fetch endpoint to enable PreservationWorker to make CORS fetches worry free in proxy mode
|
||||
self.url_map.add(Rule('/proxy-fetch/<path:url>', endpoint=self.proxy_fetch,
|
||||
methods=['GET', 'HEAD', 'OPTIONS']))
|
||||
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
|
||||
|
||||
def _init_coll_routes(self, coll_prefix):
|
||||
"""Initialize and register the routes for specified collection path
|
||||
|
||||
:param str coll_prefix: The collection path
|
||||
:rtype: None
|
||||
"""
|
||||
routes = self._make_coll_routes(coll_prefix)
|
||||
|
||||
# init loc routes, if any
|
||||
loc_keys = list(self.rewriterapp.loc_map.keys())
|
||||
if loc_keys:
|
||||
routes.append(Rule('/', endpoint=self.serve_home))
|
||||
|
||||
submount_route = ', '.join(loc_keys)
|
||||
submount_route = '/<any({0}):lang>'.format(submount_route)
|
||||
|
||||
self.url_map.add(Submount(submount_route, routes))
|
||||
|
||||
for route in routes:
|
||||
self.url_map.add(route)
|
||||
|
||||
def _make_coll_routes(self, coll_prefix):
|
||||
"""Creates a list of standard collection routes for the
|
||||
specified collection path
|
||||
|
||||
:param str coll_prefix: The collection path
|
||||
:return: A list of route rules for the supplied collection
|
||||
:rtype: list[Rule]
|
||||
"""
|
||||
routes = [
|
||||
Rule(coll_prefix + self.cdx_api_endpoint, endpoint=self.serve_cdx),
|
||||
Rule(coll_prefix + '/', endpoint=self.serve_coll_page),
|
||||
Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content),
|
||||
Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content)
|
||||
]
|
||||
|
||||
if self.recorder_path:
|
||||
routes.append(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
|
||||
|
||||
# enable PUT of custom data as 'resource' records
|
||||
if self.put_custom_record_path:
|
||||
routes.append(Rule(coll_prefix + self.RECORD_ROUTE, endpoint=self.put_custom_record, methods=["PUT"]))
|
||||
|
||||
return routes
|
||||
|
||||
def get_upstream_paths(self, port):
|
||||
"""Retrieve a dictionary containing the full URLs of the upstream apps
|
||||
@ -137,9 +190,9 @@ class FrontEndApp(object):
|
||||
:rtype: dict[str, str]
|
||||
"""
|
||||
base_paths = {
|
||||
'replay': self.REPLAY_API % port,
|
||||
'cdx-server': self.CDX_API % port,
|
||||
}
|
||||
'replay': self.REPLAY_API % port,
|
||||
'cdx-server': self.CDX_API % port,
|
||||
}
|
||||
|
||||
if self.recorder_path:
|
||||
base_paths['record'] = self.recorder_path
|
||||
@ -147,7 +200,11 @@ class FrontEndApp(object):
|
||||
return base_paths
|
||||
|
||||
def init_recorder(self, recorder_config):
|
||||
"""Initialize the recording functionality of pywb. If recording_config is None this function is a no op"""
|
||||
"""Initialize the recording functionality of pywb. If recording_config is None this function is a no op
|
||||
|
||||
:param str|dict|None recorder_config: The configuration for the recorder app
|
||||
:rtype: None
|
||||
"""
|
||||
if not recorder_config:
|
||||
self.recorder = None
|
||||
self.recorder_path = None
|
||||
@ -159,22 +216,60 @@ class FrontEndApp(object):
|
||||
else:
|
||||
recorder_coll = recorder_config['source_coll']
|
||||
|
||||
# TODO: support dedup
|
||||
dedup_index = None
|
||||
# cache mode
|
||||
self.rec_cache_mode = recorder_config.get('cache', 'default')
|
||||
|
||||
dedup_policy = recorder_config.get('dedup_policy')
|
||||
dedup_by_url = False
|
||||
|
||||
if dedup_policy == 'none':
|
||||
dedup_policy = ''
|
||||
|
||||
if dedup_policy == 'keep':
|
||||
dedup_policy = WriteDupePolicy()
|
||||
elif dedup_policy == 'revisit':
|
||||
dedup_policy = WriteRevisitDupePolicy()
|
||||
elif dedup_policy == 'skip':
|
||||
dedup_policy = SkipDupePolicy()
|
||||
dedup_by_url = True
|
||||
elif dedup_policy:
|
||||
msg = 'Invalid option for dedup_policy: {0}'
|
||||
raise Exception(msg.format(dedup_policy))
|
||||
|
||||
if dedup_policy:
|
||||
dedup_index = WritableRedisIndexer(redis_url=self.warcserver.dedup_index_url,
|
||||
dupe_policy=dedup_policy,
|
||||
rel_path_template=self.warcserver.root_dir + '/{coll}/archive')
|
||||
else:
|
||||
dedup_index = None
|
||||
|
||||
|
||||
warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
|
||||
max_size=int(recorder_config.get('rollover_size', 1000000000)),
|
||||
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
|
||||
filename_template=recorder_config.get('filename_template'),
|
||||
dedup_index=dedup_index)
|
||||
dedup_index=dedup_index,
|
||||
dedup_by_url=dedup_by_url)
|
||||
|
||||
if dedup_policy:
|
||||
pending_counter = self.warcserver.dedup_index_url.replace(':cdxj', ':pending')
|
||||
pending_timeout = recorder_config.get('pending_timeout', 30)
|
||||
create_buff_func = lambda params, name: RedisPendingCounterTempBuffer(512 * 1024, pending_counter, params, name, pending_timeout)
|
||||
else:
|
||||
create_buff_func = None
|
||||
|
||||
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
|
||||
accept_colls=recorder_config.get('source_filter'))
|
||||
|
||||
accept_colls=recorder_config.get('source_filter'),
|
||||
create_buff_func=create_buff_func)
|
||||
|
||||
recorder_server = GeventServer(self.recorder, port=0)
|
||||
|
||||
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
|
||||
|
||||
# enable PUT of custom data as 'resource' records
|
||||
if recorder_config.get('enable_put_custom_record'):
|
||||
self.put_custom_record_path = self.recorder_path + '&put_record={rec_type}&url={url}'
|
||||
|
||||
def init_autoindex(self, auto_interval):
|
||||
"""Initialize and start the auto-indexing of the collections. If auto_interval is None this is a no op.
|
||||
|
||||
@ -200,6 +295,12 @@ class FrontEndApp(object):
|
||||
indexer.start()
|
||||
|
||||
def is_proxy_enabled(self, environ):
|
||||
"""Returns T/F indicating if proxy mode is enabled
|
||||
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:return: T/F indicating if proxy mode is enabled
|
||||
:rtype: bool
|
||||
"""
|
||||
return self.proxy_prefix is not None and 'wsgiprox.proxy_host' in environ
|
||||
|
||||
def serve_home(self, environ):
|
||||
@ -246,28 +347,28 @@ class FrontEndApp(object):
|
||||
if proxy_enabled:
|
||||
response.add_access_control_headers(env=environ)
|
||||
return response
|
||||
except:
|
||||
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
|
||||
except Exception:
|
||||
self.raise_not_found(environ, 'static_file_not_found', filepath)
|
||||
|
||||
def get_metadata(self, coll):
|
||||
"""Retrieve the metadata associated with a collection
|
||||
def get_coll_config(self, coll):
|
||||
"""Retrieve the collection config, including metadata, associated with a collection
|
||||
|
||||
:param str coll: The name of the collection to receive metadata for
|
||||
:return: The collections metadata if it exists
|
||||
:param str coll: The name of the collection to receive config info for
|
||||
:return: The collections config
|
||||
:rtype: dict
|
||||
"""
|
||||
#if coll == self.all_coll:
|
||||
# coll = '*'
|
||||
|
||||
metadata = {'coll': coll,
|
||||
'type': 'replay'}
|
||||
coll_config = {'coll': coll,
|
||||
'type': 'replay'}
|
||||
|
||||
if coll in self.warcserver.list_fixed_routes():
|
||||
metadata.update(self.warcserver.get_coll_config(coll))
|
||||
coll_config.update(self.warcserver.get_coll_config(coll))
|
||||
else:
|
||||
metadata.update(self.metadata_cache.load(coll))
|
||||
coll_config['metadata'] = self.metadata_cache.load(coll) or {}
|
||||
|
||||
return metadata
|
||||
if 'ui' in self.warcserver.config:
|
||||
coll_config['ui'] = self.warcserver.config['ui']
|
||||
|
||||
return coll_config
|
||||
|
||||
def serve_coll_page(self, environ, coll='$root'):
|
||||
"""Render and serve a collections search page (search.html).
|
||||
@ -278,22 +379,26 @@ class FrontEndApp(object):
|
||||
:rtype: WbResponse
|
||||
"""
|
||||
if not self.is_valid_coll(coll):
|
||||
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
|
||||
self.raise_not_found(environ, 'coll_not_found', coll)
|
||||
|
||||
self.setup_paths(environ, coll)
|
||||
|
||||
metadata = self.get_metadata(coll)
|
||||
coll_config = self.get_coll_config(coll)
|
||||
metadata = coll_config.get('metadata')
|
||||
ui = coll_config.get('ui', {})
|
||||
|
||||
view = BaseInsertView(self.rewriterapp.jinja_env, 'search.html')
|
||||
|
||||
wb_prefix = environ.get('SCRIPT_NAME')
|
||||
wb_prefix = environ.get('SCRIPT_NAME', '')
|
||||
if wb_prefix:
|
||||
wb_prefix += '/'
|
||||
|
||||
content = view.render_to_string(environ,
|
||||
wb_prefix=wb_prefix,
|
||||
coll=coll,
|
||||
coll_config=coll_config,
|
||||
metadata=metadata,
|
||||
coll=coll)
|
||||
ui=ui)
|
||||
|
||||
return WbResponse.text_response(content, content_type='text/html; charset="utf-8"')
|
||||
|
||||
@ -307,22 +412,41 @@ class FrontEndApp(object):
|
||||
"""
|
||||
base_url = self.rewriterapp.paths['cdx-server']
|
||||
|
||||
#if coll == self.all_coll:
|
||||
# if coll == self.all_coll:
|
||||
# coll = '*'
|
||||
|
||||
config = self.warcserver.get_coll_config(coll)
|
||||
is_live = config.get("index") == "$live"
|
||||
|
||||
if is_live:
|
||||
cache_control = "no-store, no-cache"
|
||||
else:
|
||||
cache_control = "max-age=86400, must-revalidate"
|
||||
|
||||
cdx_url = base_url.format(coll=coll)
|
||||
|
||||
if environ.get('QUERY_STRING'):
|
||||
cdx_url += '&' if '?' in cdx_url else '?'
|
||||
cdx_url += environ.get('QUERY_STRING')
|
||||
|
||||
try:
|
||||
res = requests.get(cdx_url, stream=True)
|
||||
if self.query_limit:
|
||||
cdx_url += '&' if '?' in cdx_url else '?'
|
||||
cdx_url += 'limit=' + str(self.query_limit)
|
||||
|
||||
try:
|
||||
headers = {}
|
||||
for key in environ.keys():
|
||||
if key.startswith("HTTP_X_"):
|
||||
headers[key[5:].replace("_", "-")] = environ[key]
|
||||
res = requests.get(cdx_url, stream=True, headers=headers)
|
||||
|
||||
status_line = '{} {}'.format(res.status_code, res.reason)
|
||||
content_type = res.headers.get('Content-Type')
|
||||
|
||||
return WbResponse.bin_stream(StreamIter(res.raw),
|
||||
content_type=content_type)
|
||||
content_type=content_type,
|
||||
status=status_line,
|
||||
headers=[("Cache-Control", cache_control)])
|
||||
|
||||
except Exception as e:
|
||||
return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request')
|
||||
@ -354,7 +478,7 @@ class FrontEndApp(object):
|
||||
:rtype: WbResponse
|
||||
"""
|
||||
if not self.is_valid_coll(coll):
|
||||
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
|
||||
self.raise_not_found(environ, 'coll_not_found', coll)
|
||||
|
||||
self.setup_paths(environ, coll, record)
|
||||
|
||||
@ -369,20 +493,58 @@ class FrontEndApp(object):
|
||||
if environ.get('QUERY_STRING'):
|
||||
wb_url_str += '?' + environ.get('QUERY_STRING')
|
||||
|
||||
metadata = self.get_metadata(coll)
|
||||
coll_config = self.get_coll_config(coll)
|
||||
if record:
|
||||
metadata['type'] = 'record'
|
||||
coll_config['type'] = 'record'
|
||||
coll_config['cache'] = self.rec_cache_mode
|
||||
|
||||
if timemap_output:
|
||||
metadata['output'] = timemap_output
|
||||
coll_config['output'] = timemap_output
|
||||
# ensure that the timemap path information is not included
|
||||
wb_url_str = wb_url_str.replace('timemap/{0}/'.format(timemap_output), '')
|
||||
try:
|
||||
response = self.rewriterapp.render_content(wb_url_str, metadata, environ)
|
||||
except UpstreamException as ue:
|
||||
response = self.rewriterapp.handle_error(environ, ue)
|
||||
raise HTTPException(response=response)
|
||||
return response
|
||||
|
||||
return self.rewriterapp.render_content(wb_url_str, coll_config, environ)
|
||||
|
||||
def put_custom_record(self, environ, coll="$root"):
|
||||
""" When recording, PUT a custom WARC record to the specified collection
|
||||
(Available only when recording)
|
||||
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:param str coll: The name of the collection the record is to be served from
|
||||
"""
|
||||
chunks = []
|
||||
while True:
|
||||
buff = environ["wsgi.input"].read()
|
||||
if not buff:
|
||||
break
|
||||
|
||||
chunks.append(buff)
|
||||
|
||||
data = b"".join(chunks)
|
||||
|
||||
params = dict(parse_qsl(environ.get("QUERY_STRING")))
|
||||
|
||||
rec_type = "resource"
|
||||
|
||||
headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}
|
||||
|
||||
target_uri = params.get("url")
|
||||
|
||||
if not target_uri:
|
||||
return WbResponse.json_response({"error": "no url"}, status="400 Bad Request")
|
||||
|
||||
timestamp = params.get("timestamp")
|
||||
if timestamp:
|
||||
headers["WARC-Date"] = timestamp_to_iso_date(timestamp)
|
||||
|
||||
put_url = self.put_custom_record_path.format(
|
||||
url=target_uri, coll=coll, rec_type=rec_type
|
||||
)
|
||||
res = requests.put(put_url, headers=headers, data=data)
|
||||
|
||||
res = res.json()
|
||||
|
||||
return WbResponse.json_response(res)
|
||||
|
||||
def setup_paths(self, environ, coll, record=False):
|
||||
"""Populates the WSGI environment dictionary with the path information necessary to perform a response for
|
||||
@ -396,9 +558,9 @@ class FrontEndApp(object):
|
||||
return
|
||||
|
||||
if coll != '$root':
|
||||
pop_path_info(environ)
|
||||
shift_path_info(environ)
|
||||
if record:
|
||||
pop_path_info(environ)
|
||||
shift_path_info(environ)
|
||||
|
||||
paths = [self.warcserver.root_dir]
|
||||
|
||||
@ -419,7 +581,7 @@ class FrontEndApp(object):
|
||||
"""
|
||||
result = {'fixed': self.warcserver.list_fixed_routes(),
|
||||
'dynamic': self.warcserver.list_dynamic_routes()
|
||||
}
|
||||
}
|
||||
|
||||
return WbResponse.json_response(result)
|
||||
|
||||
@ -430,20 +592,21 @@ class FrontEndApp(object):
|
||||
:return: True if the collection is valid, false otherwise
|
||||
:rtype: bool
|
||||
"""
|
||||
#if coll == self.all_coll:
|
||||
# if coll == self.all_coll:
|
||||
# return True
|
||||
|
||||
return (coll in self.warcserver.list_fixed_routes() or
|
||||
coll in self.warcserver.list_dynamic_routes())
|
||||
|
||||
def raise_not_found(self, environ, msg):
|
||||
def raise_not_found(self, environ, err_type, url):
|
||||
"""Utility function for raising a werkzeug.exceptions.NotFound execption with the supplied WSGI environment
|
||||
and message.
|
||||
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:param str msg: The error message
|
||||
:param str err_type: The identifier for type of error that occurred
|
||||
:param str url: The url of the archived page that was requested
|
||||
"""
|
||||
raise NotFound(response=self.rewriterapp._error_response(environ, msg))
|
||||
raise AppPageNotFound(err_type, url)
|
||||
|
||||
def _check_refer_redirect(self, environ):
|
||||
"""Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header
|
||||
@ -463,8 +626,6 @@ class FrontEndApp(object):
|
||||
inx = referer[1:].find('http')
|
||||
if not inx:
|
||||
inx = referer[1:].find('///')
|
||||
if inx > 0:
|
||||
inx + 1
|
||||
|
||||
if inx < 0:
|
||||
return
|
||||
@ -480,6 +641,13 @@ class FrontEndApp(object):
|
||||
return WbResponse.redir_response(full_url, '307 Redirect')
|
||||
|
||||
def __call__(self, environ, start_response):
|
||||
"""Handles a request
|
||||
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:param start_response:
|
||||
:return: The WbResponse for the request
|
||||
:rtype: WbResponse
|
||||
"""
|
||||
return self.handler(environ, start_response)
|
||||
|
||||
def handle_request(self, environ, start_response):
|
||||
@ -493,25 +661,47 @@ class FrontEndApp(object):
|
||||
urls = self.url_map.bind_to_environ(environ)
|
||||
try:
|
||||
endpoint, args = urls.match()
|
||||
|
||||
self.rewriterapp.prepare_env(environ)
|
||||
|
||||
# store original script_name (original prefix) before modifications are made
|
||||
environ['pywb.app_prefix'] = environ.get('SCRIPT_NAME')
|
||||
environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')
|
||||
|
||||
lang = args.pop('lang', '')
|
||||
if lang:
|
||||
shift_path_info(environ)
|
||||
|
||||
if lang:
|
||||
environ['pywb_lang'] = lang
|
||||
elif self.default_locale:
|
||||
environ['pywb_lang'] = self.default_locale
|
||||
|
||||
response = endpoint(environ, **args)
|
||||
return response(environ, start_response)
|
||||
|
||||
except HTTPException as e:
|
||||
except RequestRedirect as rr:
|
||||
# if werkzeug throws this, likely a missing slash redirect
|
||||
# also check referrer here to avoid another redirect later
|
||||
redir = self._check_refer_redirect(environ)
|
||||
if redir:
|
||||
return redir(environ, start_response)
|
||||
|
||||
return e(environ, start_response)
|
||||
response = WbResponse.redir_response(rr.new_url, '307 Redirect')
|
||||
|
||||
except WbException as wbe:
|
||||
if wbe.status_code == 404:
|
||||
redir = self._check_refer_redirect(environ)
|
||||
if redir:
|
||||
return redir(environ, start_response)
|
||||
|
||||
response = self.rewriterapp.handle_error(environ, wbe)
|
||||
|
||||
except Exception as e:
|
||||
if self.debug:
|
||||
traceback.print_exc()
|
||||
|
||||
response = self.rewriterapp._error_response(environ, 'Internal Error: ' + str(e), '500 Server Error')
|
||||
return response(environ, start_response)
|
||||
response = self.rewriterapp._error_response(environ, WbException('Internal Error: ' + str(e)))
|
||||
|
||||
return response(environ, start_response)
|
||||
|
||||
@classmethod
|
||||
def create_app(cls, port):
|
||||
@ -552,24 +742,28 @@ class FrontEndApp(object):
|
||||
if proxy_coll in self.warcserver.list_fixed_routes():
|
||||
raise Exception('Can not record into fixed collection')
|
||||
|
||||
proxy_coll += self.RECORD_ROUTE
|
||||
proxy_route = proxy_coll + self.RECORD_ROUTE
|
||||
if not config.get('recorder'):
|
||||
config['recorder'] = 'live'
|
||||
|
||||
self.proxy_record = True
|
||||
|
||||
else:
|
||||
logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
|
||||
self.proxy_record = False
|
||||
proxy_route = proxy_coll
|
||||
|
||||
if proxy_config.get('enable_content_rewrite', True):
|
||||
self.proxy_prefix = '/{0}/bn_/'.format(proxy_coll)
|
||||
self.proxy_prefix = '/{0}/bn_/'.format(proxy_route)
|
||||
else:
|
||||
self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)
|
||||
self.proxy_prefix = '/{0}/id_/'.format(proxy_route)
|
||||
|
||||
self.proxy_default_timestamp = proxy_config.get('default_timestamp')
|
||||
if self.proxy_default_timestamp:
|
||||
if not self.ALL_DIGITS.match(self.proxy_default_timestamp):
|
||||
try:
|
||||
self.proxy_default_timestamp = iso_date_to_timestamp(self.proxy_default_timestamp)
|
||||
except:
|
||||
except Exception:
|
||||
raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format')
|
||||
|
||||
self.proxy_coll = proxy_coll
|
||||
@ -611,14 +805,14 @@ class FrontEndApp(object):
|
||||
return WbResponse.options_response(env)
|
||||
|
||||
# ensure full URL
|
||||
request_url = env['REQUEST_URI']
|
||||
# replace with /id_ so we do not get rewritten
|
||||
url = request_url.replace('/proxy-fetch', '/id_')
|
||||
# update WSGI environment object
|
||||
env['REQUEST_URI'] = self.proxy_coll + url
|
||||
env['PATH_INFO'] = env['PATH_INFO'].replace('/proxy-fetch', self.proxy_coll + '/id_')
|
||||
url = env['REQUEST_URI'].split('/proxy-fetch/', 1)[-1]
|
||||
|
||||
env['REQUEST_URI'] = self.proxy_prefix + url
|
||||
env['PATH_INFO'] = self.proxy_prefix + env['PATH_INFO'].split('/proxy-fetch/', 1)[-1]
|
||||
|
||||
# make request using normal serve_content
|
||||
response = self.serve_content(env, self.proxy_coll, url)
|
||||
response = self.serve_content(env, self.proxy_coll, url, record=self.proxy_record)
|
||||
|
||||
# for WR
|
||||
if isinstance(response, WbResponse):
|
||||
response.add_access_control_headers(env=env)
|
||||
@ -653,7 +847,7 @@ class MetadataCache(object):
|
||||
try:
|
||||
mtime = os.path.getmtime(path)
|
||||
obj = self.cache.get(path)
|
||||
except:
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
if not obj:
|
||||
@ -695,5 +889,3 @@ class MetadataCache(object):
|
||||
if __name__ == "__main__":
|
||||
app_server = FrontEndApp.create_app(port=8080)
|
||||
app_server.join()
|
||||
|
||||
|
||||
|
@ -1,56 +1,46 @@
|
||||
from io import BytesIO
|
||||
|
||||
import requests
|
||||
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
|
||||
from pywb.utils.memento import MementoUtils
|
||||
|
||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
||||
from fakeredis import FakeStrictRedis
|
||||
from six.moves.urllib.parse import unquote, urlencode, urlsplit, urlunsplit, parse_qsl
|
||||
from warcio.bufferedreaders import BufferedReader
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
||||
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
from pywb.rewrite.cookies import CookieTracker
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
||||
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||
|
||||
|
||||
from io import BytesIO
|
||||
from copy import copy
|
||||
|
||||
import gevent
|
||||
import json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class UpstreamException(WbException):
|
||||
def __init__(self, status_code, url, details):
|
||||
super(UpstreamException, self).__init__(url=url, msg=details)
|
||||
self.status_code = status_code
|
||||
|
||||
|
||||
# ============================================================================
|
||||
#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
|
||||
# pass
|
||||
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
|
||||
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.wbexception import NotFoundException, UpstreamException
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RewriterApp(object):
|
||||
"""Primary application for rewriting the content served by pywb (if it is to be rewritten).
|
||||
|
||||
This class is also responsible for rendering the archive's templates
|
||||
"""
|
||||
VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
|
||||
|
||||
DEFAULT_CSP = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"
|
||||
|
||||
def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
|
||||
"""Initialize a new instance of RewriterApp
|
||||
|
||||
:param bool framed_replay: Is rewriting happening in framed replay mode
|
||||
:param JinjaEnv|None jinja_env: Optional JinjaEnv instance to be used for
|
||||
rendering static files
|
||||
:param dict|None config: Optional config dictionary
|
||||
:param dict|None paths: Optional dictionary containing a mapping
|
||||
of path names to URLs
|
||||
"""
|
||||
self.loader = ArcWarcRecordLoader()
|
||||
|
||||
self.config = config or {}
|
||||
@ -65,27 +55,38 @@ class RewriterApp(object):
|
||||
self.frame_mod = None
|
||||
self.replay_mod = ''
|
||||
|
||||
self.enable_prefer = self.config.get('enable_prefer', False)
|
||||
|
||||
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
|
||||
config=config)
|
||||
|
||||
self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)
|
||||
|
||||
if not jinja_env:
|
||||
jinja_env = JinjaEnv(globals={'static_path': 'static'})
|
||||
jinja_env = JinjaEnv(globals={'static_path': 'static'},
|
||||
extensions=['jinja2.ext.i18n'])
|
||||
jinja_env.jinja_env.install_null_translations()
|
||||
|
||||
self.jinja_env = jinja_env
|
||||
self.loc_map = {}
|
||||
|
||||
self.jinja_env.init_loc(self.config.get('locales_root_dir'),
|
||||
self.config.get('locales'),
|
||||
self.loc_map,
|
||||
self.config.get('default_locale'))
|
||||
|
||||
self.redirect_to_exact = config.get('redirect_to_exact')
|
||||
|
||||
self.banner_view = BaseInsertView(self.jinja_env, self._html_templ('banner_html'))
|
||||
self.custom_banner_view = BaseInsertView(self.jinja_env, self._html_templ('custom_banner_html'))
|
||||
|
||||
self.head_insert_view = HeadInsertView(self.jinja_env,
|
||||
self._html_templ('head_insert_html'),
|
||||
self.banner_view)
|
||||
self.custom_banner_view)
|
||||
|
||||
self.frame_insert_view = TopFrameView(self.jinja_env,
|
||||
self._html_templ('frame_insert_html'),
|
||||
self.banner_view)
|
||||
self._html_templ('frame_insert_html'),
|
||||
self.banner_view)
|
||||
|
||||
self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
|
||||
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
|
||||
@ -93,10 +94,12 @@ class RewriterApp(object):
|
||||
|
||||
self.use_js_obj_proxy = config.get('use_js_obj_proxy', True)
|
||||
|
||||
self.cookie_tracker = None
|
||||
self.cookie_tracker = self._init_cookie_tracker()
|
||||
|
||||
self.enable_memento = self.config.get('enable_memento')
|
||||
|
||||
self.static_prefix = self.config.get('static_prefix', 'static')
|
||||
|
||||
csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
|
||||
if csp_header:
|
||||
self.csp_header = ('Content-Security-Policy', csp_header)
|
||||
@ -106,22 +109,65 @@ class RewriterApp(object):
|
||||
# deprecated: Use X-Forwarded-Proto header instead!
|
||||
self.force_scheme = config.get('force_scheme')
|
||||
|
||||
def _init_cookie_tracker(self, redis=None):
|
||||
"""Initialize the CookieTracker
|
||||
|
||||
:param redis: Optional redis instance to be used
|
||||
Defaults to FakeStrictRedis
|
||||
:return: The initialized cookie tracker
|
||||
:rtype: CookieTracker
|
||||
"""
|
||||
if redis is None:
|
||||
redis = FakeStrictRedis()
|
||||
return CookieTracker(redis)
|
||||
|
||||
def add_csp_header(self, wb_url, status_headers):
|
||||
"""Adds Content-Security-Policy headers to the supplied
|
||||
StatusAndHeaders instance if the wb_url's mod is equal
|
||||
to the replay mod
|
||||
|
||||
:param WbUrl wb_url: The WbUrl for the URL being operated on
|
||||
:param warcio.StatusAndHeaders status_headers: The status and
|
||||
headers instance for the reply to the URL
|
||||
"""
|
||||
if self.csp_header and wb_url.mod == self.replay_mod:
|
||||
status_headers.headers.append(self.csp_header)
|
||||
|
||||
def _html_templ(self, name):
|
||||
"""Returns the html file name for the supplied
|
||||
html template name.
|
||||
|
||||
:param str name: The name of the html template
|
||||
:return: The file name for the template
|
||||
:rtype: str|None
|
||||
"""
|
||||
value = self.config.get(name)
|
||||
if not value:
|
||||
value = name.replace('_html', '.html')
|
||||
return value
|
||||
|
||||
def is_framed_replay(self, wb_url):
|
||||
"""Returns T/F indicating if the rewriter app is configured to
|
||||
be operating in framed replay mode and the supplied WbUrl
|
||||
is also operating in framed replay mode
|
||||
|
||||
:param WbUrl wb_url: The WbUrl instance to check
|
||||
:return: T/F if in framed replay mode
|
||||
:rtype: bool
|
||||
"""
|
||||
return (self.framed_replay and
|
||||
wb_url.mod == self.frame_mod and
|
||||
wb_url.is_replay())
|
||||
|
||||
def _check_accept_dt(self, wb_url, environ):
|
||||
"""Returns T/F indicating if the supplied WbUrl instance
|
||||
is for a timegate request
|
||||
|
||||
:param WbUrl wb_url: The URL to be checked
|
||||
:param dict environ: The wsgi environment object for the request
|
||||
:return: T/F indicating if the WbUrl is for timegate request
|
||||
:rtype: bool
|
||||
"""
|
||||
is_timegate = False
|
||||
if wb_url.is_latest_replay():
|
||||
accept_dt = environ.get('HTTP_ACCEPT_DATETIME')
|
||||
@ -129,9 +175,9 @@ class RewriterApp(object):
|
||||
if accept_dt:
|
||||
try:
|
||||
wb_url.timestamp = http_date_to_timestamp(accept_dt)
|
||||
except:
|
||||
except Exception:
|
||||
raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
|
||||
#return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
|
||||
# return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
|
||||
|
||||
wb_url.type = wb_url.REPLAY
|
||||
|
||||
@ -141,7 +187,46 @@ class RewriterApp(object):
|
||||
|
||||
return is_timegate
|
||||
|
||||
def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy):
|
||||
"""Returns the default rewrite modifier and rewrite modifier based on the
|
||||
value of the Prefer HTTP header if it is present
|
||||
|
||||
:param WbUrl wb_url: The WbUrl for the URL being rewritten
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:param content_rw: The content rewriter instance
|
||||
:param bool is_proxy: Is the rewrite operating in proxy mode
|
||||
:return: A tuple containing the default rewrite modifier and rewrite modifier based
|
||||
on the value of the Prefer HTTP header if it is present
|
||||
:rtype: tuple[str|None, str|None]
|
||||
"""
|
||||
if not self.enable_prefer:
|
||||
return None, None
|
||||
|
||||
prefer = environ.get('HTTP_PREFER')
|
||||
if not prefer:
|
||||
return None, content_rw.mod_to_prefer(wb_url.mod)
|
||||
|
||||
mod = content_rw.prefer_to_mod(prefer)
|
||||
|
||||
if mod is None:
|
||||
raise UpstreamException(400, url=wb_url.url, details='Invalid Prefer: ' + prefer)
|
||||
|
||||
if is_proxy and mod == self.replay_mod:
|
||||
mod = 'bn_'
|
||||
prefer = content_rw.mod_to_prefer('bn_')
|
||||
|
||||
return mod, prefer
|
||||
|
||||
def _check_range(self, inputreq, wb_url):
|
||||
"""Checks the input request if it is a range request returning
|
||||
the start and end of the range as well as T/F if the request should
|
||||
be skipped as a tuple.
|
||||
|
||||
:param RewriteInputRequest inputreq: The input request to check range
|
||||
:param WbUrl wb_url: The WbUrl associated with the request
|
||||
:return: A tuple with the start, end, and T/F should skip request
|
||||
:rtype: tuple[int|None, int|None, bool]
|
||||
"""
|
||||
skip_record = False
|
||||
range_start = None
|
||||
range_end = None
|
||||
@ -163,7 +248,7 @@ class RewriterApp(object):
|
||||
range_start = start
|
||||
range_end = end
|
||||
|
||||
#if start with 0, load from upstream, but add range after
|
||||
# if start with 0, load from upstream, but add range after
|
||||
if start == 0:
|
||||
del inputreq.env['HTTP_RANGE']
|
||||
else:
|
||||
@ -193,11 +278,6 @@ class RewriterApp(object):
|
||||
|
||||
if range_start >= content_length or range_end >= content_length:
|
||||
details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
|
||||
try:
|
||||
r.raw.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
raise UpstreamException(416, url=wb_url.url, details=details)
|
||||
|
||||
range_len = range_end - range_start + 1
|
||||
@ -224,28 +304,90 @@ class RewriterApp(object):
|
||||
|
||||
return resp
|
||||
|
||||
def render_content(self, wb_url, kwargs, environ):
|
||||
wb_url = wb_url.replace('#', '%23')
|
||||
wb_url = WbUrl(wb_url)
|
||||
def prepare_env(self, environ):
|
||||
""" setup environ path prefixes and scheme """
|
||||
if 'pywb.host_prefix' in environ:
|
||||
return
|
||||
|
||||
proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)
|
||||
|
||||
if proto:
|
||||
environ['wsgi.url_scheme'] = proto
|
||||
|
||||
environ['pywb.host_prefix'] = self.get_host_prefix(environ)
|
||||
environ['pywb.app_prefix'] = environ.get('SCRIPT_NAME', '')
|
||||
environ['pywb.static_prefix'] = environ['pywb.host_prefix'] + environ['pywb.app_prefix'] + '/' + self.static_prefix
|
||||
|
||||
def render_content(self, wb_url, kwargs, environ):
|
||||
wb_url = wb_url.replace('#', '%23')
|
||||
wb_url = WbUrl(wb_url)
|
||||
|
||||
history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
|
||||
if history_page:
|
||||
wb_url.url = history_page
|
||||
is_ajax = True
|
||||
else:
|
||||
is_ajax = self.is_ajax(environ)
|
||||
|
||||
is_timegate = self._check_accept_dt(wb_url, environ)
|
||||
|
||||
host_prefix = self.get_host_prefix(environ)
|
||||
self.prepare_env(environ)
|
||||
|
||||
host_prefix = environ['pywb.host_prefix']
|
||||
rel_prefix = self.get_rel_prefix(environ)
|
||||
full_prefix = host_prefix + rel_prefix
|
||||
|
||||
pywb_static_prefix = environ['pywb.static_prefix'] + '/'
|
||||
is_proxy = ('wsgiprox.proxy_host' in environ)
|
||||
|
||||
response = self.handle_custom_response(environ, wb_url,
|
||||
full_prefix, host_prefix,
|
||||
kwargs)
|
||||
# if OPTIONS in proxy mode, just generate the proxy response
|
||||
if is_proxy and self.is_preflight(environ):
|
||||
return WbResponse.options_response(environ)
|
||||
|
||||
if response:
|
||||
if self.use_js_obj_proxy:
|
||||
content_rw = self.js_proxy_rw
|
||||
else:
|
||||
content_rw = self.default_rw
|
||||
|
||||
# no redirects if in proxy
|
||||
redirect_to_exact = self.redirect_to_exact and not is_proxy
|
||||
|
||||
# Check Prefer
|
||||
pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
|
||||
content_rw, is_proxy)
|
||||
|
||||
response = None
|
||||
keep_frame_response = False
|
||||
|
||||
# prefer overrides custom response?
|
||||
if pref_mod is not None:
|
||||
# fast-redirect to preferred
|
||||
if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
|
||||
new_url = full_prefix + wb_url.to_str(mod=pref_mod)
|
||||
headers = [('Preference-Applied', pref_applied),
|
||||
('Vary', 'Prefer')]
|
||||
|
||||
return WbResponse.redir_response(new_url,
|
||||
'307 Temporary Redirect',
|
||||
headers=headers)
|
||||
else:
|
||||
wb_url.mod = pref_mod
|
||||
else:
|
||||
if kwargs.get('output'):
|
||||
response = self.handle_timemap(wb_url, kwargs, full_prefix)
|
||||
|
||||
elif wb_url.is_query():
|
||||
response = self.handle_query(environ, wb_url, kwargs, full_prefix)
|
||||
|
||||
else:
|
||||
response = self.handle_custom_response(environ, wb_url,
|
||||
full_prefix, host_prefix,
|
||||
kwargs)
|
||||
|
||||
keep_frame_response = (not kwargs.get('no_timegate_check') and is_timegate and not is_proxy) or redirect_to_exact
|
||||
|
||||
|
||||
if response and not keep_frame_response:
|
||||
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)
|
||||
|
||||
if is_proxy:
|
||||
@ -257,7 +399,8 @@ class RewriterApp(object):
|
||||
urlrewriter = UrlRewriter(wb_url,
|
||||
prefix=full_prefix,
|
||||
full_prefix=full_prefix,
|
||||
rel_prefix=rel_prefix)
|
||||
rel_prefix=rel_prefix,
|
||||
pywb_static_prefix=pywb_static_prefix)
|
||||
|
||||
framed_replay = self.framed_replay
|
||||
|
||||
@ -269,13 +412,6 @@ class RewriterApp(object):
|
||||
|
||||
urlkey = canonicalize(wb_url.url)
|
||||
|
||||
environ['pywb.host_prefix'] = host_prefix
|
||||
|
||||
if self.use_js_obj_proxy:
|
||||
content_rw = self.js_proxy_rw
|
||||
else:
|
||||
content_rw = self.default_rw
|
||||
|
||||
inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)
|
||||
|
||||
inputreq.include_method_query(wb_url.url)
|
||||
@ -283,10 +419,15 @@ class RewriterApp(object):
|
||||
range_start, range_end, skip_record = self._check_range(inputreq, wb_url)
|
||||
|
||||
setcookie_headers = None
|
||||
cookie_key = None
|
||||
if self.cookie_tracker:
|
||||
cookie_key = self.get_cookie_key(kwargs)
|
||||
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
|
||||
inputreq.extra_cookie, setcookie_headers = res
|
||||
if cookie_key:
|
||||
res = self.cookie_tracker.get_cookie_headers(wb_url.url,
|
||||
urlrewriter,
|
||||
cookie_key,
|
||||
environ.get('HTTP_COOKIE', ''))
|
||||
inputreq.extra_cookie, setcookie_headers = res
|
||||
|
||||
r = self._do_req(inputreq, wb_url, kwargs, skip_record)
|
||||
|
||||
@ -294,9 +435,10 @@ class RewriterApp(object):
|
||||
error = None
|
||||
try:
|
||||
error = r.raw.read()
|
||||
r.raw.close()
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
no_except_close(r.raw)
|
||||
|
||||
if error:
|
||||
error = error.decode('utf-8')
|
||||
@ -304,7 +446,11 @@ class RewriterApp(object):
|
||||
error = ''
|
||||
|
||||
details = dict(args=kwargs, error=error)
|
||||
raise UpstreamException(r.status_code, url=wb_url.url, details=details)
|
||||
if r.status_code == 404:
|
||||
raise NotFoundException(url=wb_url.url, msg=details)
|
||||
|
||||
else:
|
||||
raise UpstreamException(r.status_code, url=wb_url.url, details=details)
|
||||
|
||||
cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))
|
||||
|
||||
@ -314,13 +460,19 @@ class RewriterApp(object):
|
||||
# add trailing slash
|
||||
new_path = url_parts.path + '/'
|
||||
|
||||
try:
|
||||
r.raw.close()
|
||||
except:
|
||||
pass
|
||||
no_except_close(r.raw)
|
||||
|
||||
return self.send_redirect(new_path, url_parts, urlrewriter)
|
||||
|
||||
|
||||
# only redirect to exact if not live, otherwise set to false
|
||||
redirect_to_exact = redirect_to_exact and not cdx.get('is_live')
|
||||
|
||||
# return top-frame timegate response, with timestamp from cdx
|
||||
if response and keep_frame_response and (not redirect_to_exact or not is_timegate):
|
||||
no_except_close(r.raw)
|
||||
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy, cdx['timestamp'])
|
||||
|
||||
stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
|
||||
record = self.loader.parse_record_stream(stream,
|
||||
ensure_http_headers=True)
|
||||
@ -328,9 +480,9 @@ class RewriterApp(object):
|
||||
memento_dt = r.headers.get('Memento-Datetime')
|
||||
target_uri = r.headers.get('WARC-Target-URI')
|
||||
|
||||
#cdx['urlkey'] = urlkey
|
||||
#cdx['timestamp'] = http_date_to_timestamp(memento_dt)
|
||||
#cdx['url'] = target_uri
|
||||
# cdx['urlkey'] = urlkey
|
||||
# cdx['timestamp'] = http_date_to_timestamp(memento_dt)
|
||||
# cdx['url'] = target_uri
|
||||
|
||||
set_content_loc = False
|
||||
|
||||
@ -338,11 +490,9 @@ class RewriterApp(object):
|
||||
if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
|
||||
set_content_loc = True
|
||||
|
||||
# if redir to exact, redir if url or ts are different
|
||||
if self.redirect_to_exact:
|
||||
if (set_content_loc or
|
||||
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
|
||||
|
||||
# if redirect to exact timestamp (only set if not live)
|
||||
if redirect_to_exact:
|
||||
if set_content_loc or is_timegate or wb_url.timestamp != cdx.get('timestamp'):
|
||||
new_url = urlrewriter.get_new_url(url=target_uri,
|
||||
timestamp=cdx['timestamp'],
|
||||
mod=wb_url.mod)
|
||||
@ -353,7 +503,10 @@ class RewriterApp(object):
|
||||
self._add_memento_links(target_uri, full_prefix,
|
||||
memento_dt, cdx['timestamp'],
|
||||
resp.status_headers,
|
||||
is_timegate, is_proxy)
|
||||
is_timegate, is_proxy,
|
||||
pref_applied=pref_applied,
|
||||
mod=pref_mod,
|
||||
is_memento=False)
|
||||
|
||||
else:
|
||||
resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')
|
||||
@ -365,26 +518,31 @@ class RewriterApp(object):
|
||||
if self._add_range(record, wb_url, range_start, range_end):
|
||||
wb_url.mod = 'id_'
|
||||
|
||||
is_ajax = self.is_ajax(environ)
|
||||
|
||||
if is_ajax:
|
||||
head_insert_func = None
|
||||
urlrewriter.rewrite_opts['is_ajax'] = True
|
||||
else:
|
||||
top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
|
||||
head_insert_func = (self.head_insert_view.
|
||||
create_insert_func(wb_url,
|
||||
full_prefix,
|
||||
host_prefix,
|
||||
top_url,
|
||||
environ,
|
||||
framed_replay,
|
||||
coll=kwargs.get('coll', ''),
|
||||
replay_mod=self.replay_mod,
|
||||
config=self.config))
|
||||
create_insert_func(wb_url,
|
||||
full_prefix,
|
||||
host_prefix,
|
||||
top_url,
|
||||
environ,
|
||||
framed_replay,
|
||||
coll=kwargs.get('coll', ''),
|
||||
replay_mod=self.replay_mod,
|
||||
metadata=kwargs.get('metadata', {}),
|
||||
ui=kwargs.get('ui', {}),
|
||||
config=self.config))
|
||||
|
||||
cookie_rewriter = None
|
||||
if self.cookie_tracker:
|
||||
if self.cookie_tracker and cookie_key:
|
||||
# skip add cookie if service worker is not 200
|
||||
# it seems cookie headers from service workers are not applied, so don't update in cache
|
||||
if wb_url.mod == 'sw_':
|
||||
cookie_key = None
|
||||
|
||||
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
|
||||
cookie_key)
|
||||
|
||||
@ -394,6 +552,17 @@ class RewriterApp(object):
|
||||
|
||||
status_headers, gen, is_rw = result
|
||||
|
||||
if history_page:
|
||||
title = DefaultRewriter._extract_title(gen)
|
||||
if not title:
|
||||
title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))
|
||||
|
||||
if not title:
|
||||
title = history_page
|
||||
|
||||
self._add_history_page(cdx, kwargs, title)
|
||||
return WbResponse.json_response({'title': title})
|
||||
|
||||
if setcookie_headers:
|
||||
status_headers.headers.extend(setcookie_headers)
|
||||
|
||||
@ -403,21 +572,29 @@ class RewriterApp(object):
|
||||
if not is_ajax and self.enable_memento:
|
||||
self._add_memento_links(cdx['url'], full_prefix,
|
||||
memento_dt, cdx['timestamp'], status_headers,
|
||||
is_timegate, is_proxy, cdx.get('source-coll'))
|
||||
is_timegate, is_proxy, cdx.get('source-coll'),
|
||||
mod=pref_mod, pref_applied=pref_applied)
|
||||
|
||||
set_content_loc = True
|
||||
|
||||
if set_content_loc and not self.redirect_to_exact:
|
||||
if set_content_loc and not redirect_to_exact and not is_proxy:
|
||||
status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
|
||||
url=cdx['url'])))
|
||||
|
||||
if not is_proxy:
|
||||
self.add_csp_header(wb_url, status_headers)
|
||||
|
||||
response = WbResponse(status_headers, gen)
|
||||
|
||||
if is_proxy and environ.get('HTTP_ORIGIN'):
|
||||
response.add_access_control_headers(environ)
|
||||
|
||||
if r.status_code == 200 and kwargs.get('cache') == 'always' and environ.get('HTTP_REFERER'):
|
||||
response.status_headers['Cache-Control'] = 'public, max-age=31536000, immutable'
|
||||
|
||||
return response
|
||||
|
||||
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
|
||||
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy, timegate_closest_ts=None):
|
||||
memento_ts = None
|
||||
if not isinstance(response, WbResponse):
|
||||
content_type = 'text/html'
|
||||
@ -426,34 +603,53 @@ class RewriterApp(object):
|
||||
if not self.is_framed_replay(wb_url):
|
||||
content_type += '; charset=utf-8'
|
||||
else:
|
||||
memento_ts = wb_url.timestamp
|
||||
memento_ts = timegate_closest_ts or wb_url.timestamp
|
||||
|
||||
response = WbResponse.text_response(response, content_type=content_type)
|
||||
|
||||
if self.enable_memento:
|
||||
if self.enable_memento and response.status_headers.statusline.startswith('200'):
|
||||
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
|
||||
response.status_headers, is_timegate, is_proxy)
|
||||
response.status_headers, is_timegate, is_proxy, is_memento=not is_timegate)
|
||||
return response
|
||||
|
||||
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
|
||||
status_headers, is_timegate, is_proxy, coll=None):
|
||||
status_headers, is_timegate, is_proxy, coll=None,
|
||||
pref_applied=None, mod=None, is_memento=True):
|
||||
"""Adds the memento link headers to supplied StatusAndHeaders instance
|
||||
|
||||
:param str url: The URI-R being rewritten
|
||||
:param str full_prefix: The replay prefix
|
||||
:param str|None memento_dt: The memento datetime for the URI-R being rewritten
|
||||
:param str memento_ts: The memento timestamp
|
||||
:param warcio.StatusAndHeaders status_headers:
|
||||
:param bool is_timegate: Are we returning a response for a timegate
|
||||
:param bool is_proxy: Are we operating in proxy mode
|
||||
:param str|None coll: The collection the URI-R is from
|
||||
:param str|None pref_applied:
|
||||
:param str|None mod: The rewrite modifier
|
||||
:param bool is_memento:
|
||||
:rtype: None
|
||||
"""
|
||||
|
||||
replay_mod = mod or self.replay_mod
|
||||
|
||||
# memento url + header
|
||||
if not memento_dt and memento_ts:
|
||||
memento_dt = timestamp_to_http_date(memento_ts)
|
||||
|
||||
if memento_dt:
|
||||
status_headers.headers.append(('Memento-Datetime', memento_dt))
|
||||
if is_memento:
|
||||
status_headers.headers.append(('Memento-Datetime', memento_dt))
|
||||
|
||||
if is_proxy:
|
||||
memento_url = url
|
||||
else:
|
||||
memento_url = full_prefix + memento_ts + self.replay_mod
|
||||
memento_url = full_prefix + memento_ts + replay_mod
|
||||
memento_url += '/' + url
|
||||
else:
|
||||
memento_url = None
|
||||
|
||||
timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)
|
||||
timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix, mod)
|
||||
|
||||
link = []
|
||||
if not is_proxy:
|
||||
@ -468,14 +664,23 @@ class RewriterApp(object):
|
||||
|
||||
status_headers.headers.append(('Link', link_str))
|
||||
|
||||
vary = ''
|
||||
if is_timegate:
|
||||
status_headers.headers.append(('Vary', 'accept-datetime'))
|
||||
vary = 'accept-datetime'
|
||||
|
||||
def _get_timegate_timemap(self, url, full_prefix):
|
||||
if pref_applied:
|
||||
vary = 'Prefer' if not vary else vary + ', Prefer'
|
||||
status_headers.headers.append(('Preference-Applied', pref_applied))
|
||||
|
||||
if vary:
|
||||
status_headers.headers.append(('Vary', vary))
|
||||
|
||||
def _get_timegate_timemap(self, url, full_prefix, mod):
|
||||
# timegate url
|
||||
timegate_url = full_prefix
|
||||
if self.replay_mod:
|
||||
timegate_url += self.replay_mod + '/'
|
||||
mod = ''
|
||||
if mod:
|
||||
timegate_url += mod + '/'
|
||||
|
||||
timegate_url += url
|
||||
|
||||
@ -484,38 +689,38 @@ class RewriterApp(object):
|
||||
return timegate_url, timemap_url
|
||||
|
||||
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
|
||||
top_url = full_prefix
|
||||
top_url += wb_url.to_str(mod='')
|
||||
top_url = full_prefix + wb_url.to_str(mod='')
|
||||
return top_url
|
||||
|
||||
def handle_error(self, environ, ue):
|
||||
if ue.status_code == 404:
|
||||
return self._not_found_response(environ, ue.url)
|
||||
|
||||
def handle_error(self, environ, wbe):
|
||||
if isinstance(wbe, NotFoundException):
|
||||
return self._not_found_response(environ, wbe.url)
|
||||
else:
|
||||
status = str(ue.status_code) + ' ' + HTTP_STATUS_CODES.get(ue.status_code, 'Unknown Error')
|
||||
return self._error_response(environ, ue.url, ue.msg,
|
||||
status=status)
|
||||
return self._error_response(environ, wbe)
|
||||
|
||||
def _not_found_response(self, environ, url):
|
||||
resp = self.not_found_view.render_to_string(environ, url=url)
|
||||
resp = self.not_found_view.render_to_string(environ, url=url, err_msg="Not Found")
|
||||
|
||||
return WbResponse.text_response(resp, status='404 Not Found', content_type='text/html')
|
||||
|
||||
def _error_response(self, environ, msg='', details='', status='404 Not Found'):
|
||||
def _error_response(self, environ, wbe):
|
||||
status = wbe.status()
|
||||
|
||||
resp = self.error_view.render_to_string(environ,
|
||||
err_msg=msg,
|
||||
err_details=details)
|
||||
err_msg=wbe.url,
|
||||
err_details=wbe.msg,
|
||||
err_status=wbe.status_code)
|
||||
|
||||
return WbResponse.text_response(resp, status=status, content_type='text/html')
|
||||
|
||||
|
||||
def _do_req(self, inputreq, wb_url, kwargs, skip_record):
|
||||
req_data = inputreq.reconstruct_request(wb_url.url)
|
||||
|
||||
headers = {'Content-Length': str(len(req_data)),
|
||||
'Content-Type': 'application/request'}
|
||||
|
||||
headers.update(inputreq.warcserver_headers)
|
||||
|
||||
if skip_record:
|
||||
headers['Recorder-Skip'] = '1'
|
||||
|
||||
@ -524,10 +729,7 @@ class RewriterApp(object):
|
||||
else:
|
||||
closest = wb_url.timestamp
|
||||
|
||||
params = {}
|
||||
params['url'] = wb_url.url
|
||||
params['closest'] = closest
|
||||
params['matchType'] = 'exact'
|
||||
params = {'url': wb_url.url, 'closest': closest, 'matchType': 'exact'}
|
||||
|
||||
if wb_url.mod == 'vi_':
|
||||
params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE
|
||||
@ -542,11 +744,25 @@ class RewriterApp(object):
|
||||
return r
|
||||
|
||||
def do_query(self, wb_url, kwargs):
|
||||
params = {}
|
||||
params['url'] = wb_url.url
|
||||
params['output'] = kwargs.get('output', 'json')
|
||||
params['from'] = wb_url.timestamp
|
||||
params['to'] = wb_url.end_timestamp
|
||||
"""Performs the timemap query request for the supplied WbUrl
|
||||
returning the response
|
||||
|
||||
:param WbUrl wb_url: The WbUrl to be queried
|
||||
:param dict kwargs: Optional keyword arguments
|
||||
:return: The query's response
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
params = {
|
||||
'url': wb_url.url,
|
||||
'output': kwargs.get('output', 'json'),
|
||||
'from': wb_url.timestamp,
|
||||
'to': wb_url.end_timestamp
|
||||
}
|
||||
if 'memento_format' in kwargs:
|
||||
params['memento_format'] = kwargs['memento_format']
|
||||
|
||||
if 'limit' in kwargs:
|
||||
params['limit'] = kwargs['limit']
|
||||
|
||||
upstream_url = self.get_upstream_url(wb_url, kwargs, params)
|
||||
upstream_url = upstream_url.replace('/resource/postreq', '/index')
|
||||
@ -568,7 +784,7 @@ class RewriterApp(object):
|
||||
status = str(res.status_code) + ' ' + res.reason
|
||||
|
||||
if res.status_code == 200 and output == 'link':
|
||||
timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix)
|
||||
timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix, wb_url.mod)
|
||||
|
||||
text = MementoUtils.wrap_timemap_header(wb_url.url,
|
||||
timegate,
|
||||
@ -580,14 +796,24 @@ class RewriterApp(object):
|
||||
|
||||
def handle_timemap(self, wb_url, kwargs, full_prefix):
|
||||
output = kwargs.get('output')
|
||||
kwargs['memento_format'] = full_prefix + '{timestamp}' + self.replay_mod + '/{url}'
|
||||
res = self.do_query(wb_url, kwargs)
|
||||
return self.make_timemap(wb_url, res, full_prefix, output)
|
||||
|
||||
def handle_query(self, environ, wb_url, kwargs, full_prefix):
|
||||
prefix = self.get_full_prefix(environ)
|
||||
|
||||
res = dict(parse_qsl(environ.get("QUERY_STRING")))
|
||||
is_advanced = res.get("matchType", "exact") != "exact" or res.get("url", "").endswith("*")
|
||||
|
||||
# vue ui not supported for advanced search for now
|
||||
ui = kwargs.get("ui", {})
|
||||
if is_advanced:
|
||||
ui["vue_calendar_ui"] = False
|
||||
|
||||
params = dict(url=wb_url.url,
|
||||
prefix=prefix)
|
||||
prefix=prefix,
|
||||
ui=ui)
|
||||
|
||||
return self.query_view.render_to_string(environ, **params)
|
||||
|
||||
@ -616,8 +842,8 @@ class RewriterApp(object):
|
||||
return scheme + host
|
||||
|
||||
def get_rel_prefix(self, environ):
|
||||
#return request.script_name
|
||||
return environ.get('SCRIPT_NAME') + '/'
|
||||
# return request.script_name
|
||||
return environ.get('SCRIPT_NAME', '') + '/'
|
||||
|
||||
def get_full_prefix(self, environ):
    """Return the host prefix followed by the relative prefix for this request.

    :param dict environ: The WSGI environment dictionary for the request
    :return: Concatenation of ``get_host_prefix`` and ``get_rel_prefix``
    """
    host_part = self.get_host_prefix(environ)
    rel_part = self.get_rel_prefix(environ)
    return host_part + rel_part
|
||||
@ -641,11 +867,35 @@ class RewriterApp(object):
|
||||
if value and value.lower() == 'xmlhttprequest':
|
||||
return True
|
||||
|
||||
|
||||
# additional checks for proxy mode only
|
||||
if not ('wsgiprox.proxy_host' in environ):
|
||||
return False
|
||||
|
||||
# if Chrome Sec-Fetch-Mode is set and is set to 'cors', then
|
||||
# a fetch / ajax request
|
||||
sec_fetch_mode = environ.get('HTTP_SEC_FETCH_MODE')
|
||||
if sec_fetch_mode and sec_fetch_mode == 'cors':
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def is_preflight(self, environ):
    """Return True if the request looks like a CORS preflight request.

    A request qualifies when its method is OPTIONS, it carries a non-empty
    Origin header, and at least one of the Access-Control-Request-Method /
    Access-Control-Request-Headers headers is present and non-empty.

    :param dict environ: The WSGI environment dictionary for the request
    :rtype: bool
    """
    if environ.get('REQUEST_METHOD') != 'OPTIONS' or not environ.get('HTTP_ORIGIN'):
        return False

    requested = (environ.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD') or
                 environ.get('HTTP_ACCESS_CONTROL_REQUEST_HEADERS'))
    return bool(requested)
|
||||
|
||||
|
||||
def get_base_url(self, wb_url, kwargs):
|
||||
type = kwargs.get('type')
|
||||
return self.paths[type].format(**kwargs)
|
||||
type_ = kwargs.get('type')
|
||||
return self.paths[type_].format(**kwargs)
|
||||
|
||||
def get_upstream_url(self, wb_url, kwargs, params):
|
||||
base_url = self.get_base_url(wb_url, kwargs)
|
||||
@ -656,21 +906,25 @@ class RewriterApp(object):
|
||||
return base_url
|
||||
|
||||
def get_cookie_key(self, kwargs):
|
||||
raise NotImplemented()
|
||||
# note: currently this is per-collection, so enabled only for live or recording
|
||||
# to support multiple users recording/live, would need per user cookie
|
||||
if kwargs.get('index') == '$live' or kwargs.get('type') == 'record':
|
||||
return 'cookie:' + kwargs['coll']
|
||||
else:
|
||||
return None
|
||||
|
||||
def _add_history_page(self, cdx, kwargs, doc_title):
|
||||
pass
|
||||
|
||||
def _add_custom_params(self, cdx, headers, kwargs, record):
|
||||
pass
|
||||
|
||||
def get_top_frame_params(self, wb_url, kwargs):
|
||||
return None
|
||||
return {'metadata': kwargs.get('metadata', {}),
|
||||
'ui': kwargs.get('ui', {})
|
||||
}
|
||||
|
||||
def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
|
||||
if kwargs.get('output'):
|
||||
return self.handle_timemap(wb_url, kwargs, full_prefix)
|
||||
|
||||
if wb_url.is_query():
|
||||
return self.handle_query(environ, wb_url, kwargs, full_prefix)
|
||||
|
||||
if self.is_framed_replay(wb_url):
|
||||
extra_params = self.get_top_frame_params(wb_url, kwargs)
|
||||
return self.frame_insert_view.get_top_frame(wb_url,
|
||||
|
@ -20,6 +20,9 @@ class StaticHandler(object):
|
||||
def __call__(self, environ, url_str):
|
||||
url = url_str.split('?')[0]
|
||||
|
||||
if url.endswith('/'):
|
||||
url += 'index.html'
|
||||
|
||||
full_path = environ.get('pywb.static_dir')
|
||||
if full_path:
|
||||
full_path = os.path.join(full_path, url)
|
||||
|
@ -26,7 +26,7 @@ class TestRewriterApp(FakeRedisTests, BaseTestClass):
|
||||
resp = self.testapp.get('/live/mp_/http://example.com/')
|
||||
resp.charset = 'utf-8'
|
||||
|
||||
assert '"http://localhost:80/live/mp_/http://www.iana.org/domains/example"' in resp.text
|
||||
assert '"http://localhost:80/live/mp_/https://www.iana.org/domains/example"' in resp.text
|
||||
|
||||
assert '"http://example.com/"'
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.utils.io import no_except_close
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError: # pragma: no cover
|
||||
@ -140,6 +142,18 @@ class WbResponse(object):
|
||||
response.add_access_control_headers(env=env)
|
||||
return response
|
||||
|
||||
def try_fix_errors(self):
    """Utility method to try remove faulty headers from response.

    Every header whose value cannot be encoded as latin-1 is removed
    from ``self.status_headers`` by name.

    :return:
    :rtype: None
    """
    # Iterate over a snapshot: remove_header() mutates the live header list,
    # and deleting from a list while iterating it skips the element that
    # follows each removal (so two faulty headers in a row left one behind).
    for header in list(self.status_headers.headers):
        try:
            header[1].encode('latin1')
        except UnicodeError:
            self.status_headers.remove_header(header[0])
|
||||
|
||||
def __call__(self, env, start_response):
|
||||
"""Callable definition to allow WbResponse control over how the response is sent
|
||||
|
||||
@ -147,12 +161,17 @@ class WbResponse(object):
|
||||
:param function start_response: The WSGI start_response function
|
||||
:return: The response body
|
||||
"""
|
||||
start_response(self.status_headers.statusline,
|
||||
self.status_headers.headers)
|
||||
try:
|
||||
start_response(self.status_headers.statusline,
|
||||
self.status_headers.headers)
|
||||
except (UnicodeError, TypeError):
|
||||
self.try_fix_errors()
|
||||
start_response(self.status_headers.statusline,
|
||||
self.status_headers.headers)
|
||||
|
||||
request_method = env['REQUEST_METHOD']
|
||||
if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'):
|
||||
if hasattr(self.body, 'close'):
|
||||
self.body.close()
|
||||
no_except_close(self.body)
|
||||
return []
|
||||
|
||||
return self.body
|
||||
@ -185,14 +204,15 @@ class WbResponse(object):
|
||||
allowed_methods = allowed_methods + ', ' + r_method
|
||||
acr_headers = env.get('HTTP_ACCESS_CONTROL_REQUEST_HEADERS')
|
||||
if acr_headers is not None:
|
||||
self.status_headers.add_header('Access-Control-Allow-Headers', acr_headers)
|
||||
self.status_headers.replace_header('Access-Control-Allow-Headers', acr_headers)
|
||||
allowed_origin = env.get('HTTP_ORIGIN', env.get('HTTP_REFERER', allowed_origin))
|
||||
if allowed_origin is None:
|
||||
allowed_origin = '*'
|
||||
self.status_headers.replace_header('Access-Control-Allow-Origin', allowed_origin)
|
||||
self.status_headers.add_header('Access-Control-Allow-Methods', allowed_methods)
|
||||
self.status_headers.add_header('Access-Control-Allow-Credentials', 'true')
|
||||
self.status_headers.add_header('Access-Control-Max-Age', '1800')
|
||||
self.status_headers.replace_header('Access-Control-Allow-Methods', allowed_methods)
|
||||
self.status_headers.replace_header('Access-Control-Allow-Credentials', 'true')
|
||||
self.status_headers.replace_header('Access-Control-Max-Age', '1800')
|
||||
self.status_headers.replace_header('Cross-Origin-Resource-Policy', 'cross-origin')
|
||||
return self
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -3,15 +3,24 @@ collections_root: collections
|
||||
# Per-Collection Paths
|
||||
archive_paths: archive
|
||||
index_paths: indexes
|
||||
acl_paths: acl
|
||||
static_path: static
|
||||
|
||||
default_access: allow
|
||||
|
||||
templates_dir: templates
|
||||
|
||||
# Template HTML
|
||||
banner_html: banner.html
|
||||
custom_banner_html: custom_banner.html
|
||||
head_insert_html: head_insert.html
|
||||
frame_insert_html: frame_insert.html
|
||||
|
||||
base_html: base.html
|
||||
header_html: header.html
|
||||
footer_html: footer.html
|
||||
head_html: head.html
|
||||
|
||||
query_html: query.html
|
||||
search_html: search.html
|
||||
not_found_html: not_found.html
|
||||
@ -28,6 +37,7 @@ info_json: collinfo.json
|
||||
# HTML Templates List
|
||||
html_templates:
|
||||
- banner_html
|
||||
- custom_banner_html
|
||||
- head_insert_html
|
||||
- frame_insert_html
|
||||
|
||||
@ -36,6 +46,12 @@ html_templates:
|
||||
- not_found_html
|
||||
|
||||
- home_html
|
||||
|
||||
- base_html
|
||||
- header_html
|
||||
- head_html
|
||||
- footer_html
|
||||
|
||||
- error_html
|
||||
- proxy_cert_download_html
|
||||
- proxy_select_html
|
||||
|
@ -75,6 +75,9 @@ class ArchiveIndexEntryMixin(object):
|
||||
self['urlkey'] = canonicalize(new_url, surt_ordered)
|
||||
other['urlkey'] = self['urlkey']
|
||||
|
||||
self['method'] = post_query.method
|
||||
self['requestBody'] = post_query.query
|
||||
|
||||
referer = other.record.http_headers.get_header('referer')
|
||||
if referer:
|
||||
self['_referer'] = referer
|
||||
|
@ -1,5 +1,9 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import warcio
|
||||
|
||||
# Use ujson if available
|
||||
try:
|
||||
@ -27,7 +31,6 @@ except ImportError: # pragma: no cover
|
||||
|
||||
|
||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||
from bisect import insort
|
||||
|
||||
from six import StringIO
|
||||
|
||||
@ -167,9 +170,10 @@ class SortedCDXWriter(BaseCDXWriter):
|
||||
super(SortedCDXWriter, self).write(entry, filename)
|
||||
line = self.out.getvalue()
|
||||
if line:
|
||||
insort(self.sortlist, line)
|
||||
self.sortlist.append(line)
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.sortlist.sort()
|
||||
self.actual_out.write(''.join(self.sortlist))
|
||||
return False
|
||||
|
||||
@ -298,8 +302,11 @@ def write_multi_cdx_index(output, inputs, **options):
|
||||
with open(fullpath, 'rb') as infile:
|
||||
entry_iter = record_iter(infile)
|
||||
|
||||
for entry in entry_iter:
|
||||
writer.write(entry, filename)
|
||||
try:
|
||||
for entry in entry_iter:
|
||||
writer.write(entry, filename)
|
||||
except warcio.exceptions.ArchiveLoadFailed:
|
||||
logging.error('Error while indexing file %s, %s',filename,traceback.format_exc())
|
||||
|
||||
return writer
|
||||
|
||||
@ -331,13 +338,13 @@ are supported.
|
||||
Some examples:
|
||||
|
||||
* Create "example.cdx" index from example.warc.gz
|
||||
{0} ./cdx/example.cdx ./warcs/example.warc.gz
|
||||
{0} --output ./cdx/example.cdx ./warcs/example.warc.gz
|
||||
|
||||
* Create "combined.cdx", a combined, sorted index of all warcs in ./warcs/
|
||||
{0} --sort combined.cdx ./warcs/
|
||||
{0} --sort --output combined.cdx ./warcs/
|
||||
|
||||
* Create a sorted cdx per file in ./cdx/ for each archive file in ./warcs/
|
||||
{0} --sort ./cdx/ ./warcs/
|
||||
{0} --sort --output ./cdx/ ./warcs/
|
||||
""".format(os.path.basename(sys.argv[0]))
|
||||
|
||||
sort_help = """
|
||||
@ -377,7 +384,7 @@ url timestamp { ... }
|
||||
|
||||
output_help = """
|
||||
Output file or directory.
|
||||
- If directory, each input file is written to a seperate output file
|
||||
- If directory, each input file is written to a separate output file
|
||||
with a .cdx extension
|
||||
- If output is '-', output is written to stdout
|
||||
"""
|
||||
@ -451,7 +458,9 @@ instead of current working directory
|
||||
action='store_true',
|
||||
help=minimal_json_help)
|
||||
|
||||
parser.add_argument('output', nargs='?', default='-', help=output_help)
|
||||
parser.add_argument('-o', '--output',
|
||||
default='-', help=output_help)
|
||||
|
||||
parser.add_argument('inputs', nargs='+', help=input_help)
|
||||
|
||||
cmd = parser.parse_args(args=args)
|
||||
|
@ -79,7 +79,7 @@ org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/
|
||||
>>> print_cdx_index('example-wpull.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
|
||||
# bad arcs -- test error edge cases
|
||||
>>> print_cdx_index('bad.arc', include_all=True)
|
||||
@ -101,9 +101,9 @@ org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar applica
|
||||
# post append
|
||||
>>> print_cdx_index('post-test.warc.gz', append_post=True)
|
||||
CDX N b a m s k r M S V g
|
||||
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
|
||||
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
|
||||
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
|
||||
|
||||
# no post append, requests included
|
||||
>>> print_cdx_index('post-test.warc.gz', include_all=True)
|
||||
@ -118,12 +118,12 @@ org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar applica
|
||||
# post append + requests included
|
||||
>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post=True)
|
||||
CDX N b a m s k r M S V g
|
||||
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
|
||||
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
|
||||
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
|
||||
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
|
||||
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
|
||||
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
|
||||
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
|
||||
|
||||
# post append + minimal = error
|
||||
>>> print_cdx_index('example.arc.gz', append_post=True, minimal=True)
|
||||
@ -149,25 +149,25 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
|
||||
#=================================================================
|
||||
|
||||
# test sort, multiple inputs
|
||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||
>>> cli_lines(['--sort', '-o', '-', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
Total: 212
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
Total: 213
|
||||
|
||||
# test sort, multiple inputs, recursive, from base test dir
|
||||
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
||||
>>> cli_lines(['--sort', '-r', '-o', '-', get_test_dir()])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
||||
Total: 212
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
||||
Total: 213
|
||||
|
||||
# test sort, 9-field, multiple inputs, all records + post query
|
||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
||||
Total: 407
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
||||
Total: 408
|
||||
|
||||
# test writing to stdout
|
||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||
>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
Total: 4
|
||||
@ -178,7 +178,7 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
Total: 4
|
||||
|
||||
# test custom root dir for cdx filenames, singlw warc
|
||||
# test custom root dir for cdx filenames, single warc
|
||||
>>> cli_lines(['--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR + 'example.warc.gz'])
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 ../warcs/example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 ../warcs/example.warc.gz
|
||||
@ -187,8 +187,8 @@ Total: 4
|
||||
# test custom root dir for cdx filenames, dir input
|
||||
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
||||
Total: 212
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
||||
Total: 213
|
||||
|
||||
# test writing to temp dir, also use unicode filename
|
||||
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
|
||||
@ -265,7 +265,7 @@ def cli_lines_with_dir(input_):
|
||||
tmp_dir = None
|
||||
tmp_dir = tempfile.mkdtemp()
|
||||
|
||||
main([tmp_dir, input_])
|
||||
main(['-o', tmp_dir, input_])
|
||||
|
||||
filename = cdx_filename(os.path.basename(input_))
|
||||
|
||||
@ -463,6 +463,104 @@ com,example)/xyz.pdf 20140401052011 http://example.com/xyz.pdf application/http
|
||||
"""
|
||||
|
||||
|
||||
def test_multipart_form():
|
||||
test_data = b'\
|
||||
WARC/1.0\r\n\
|
||||
WARC-Type: response\r\n\
|
||||
WARC-Record-ID: <urn:uuid:073fac44-c383-4a2b-980d-76fec83bd20d>\r\n\
|
||||
WARC-Date: 2020-11-19T19:54:34Z\r\n\
|
||||
WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\
|
||||
Content-Type: application/http;msgtype=response\r\n\
|
||||
WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\r\n\
|
||||
Content-Length: 48\r\n\
|
||||
WARC-Block-Digest: sha1:XN45YTSBLG5PLJ4HA7DRDYGJBM5VW4UO\r\n\
|
||||
\r\n\
|
||||
Content-Type: text/html; charset="utf-8"\r\n\
|
||||
\r\n\
|
||||
ABCD\r\n\
|
||||
\r\n\
|
||||
\r\n\
|
||||
\r\n\
|
||||
WARC/1.0\r\n\
|
||||
WARC-Type: request\r\n\
|
||||
WARC-Record-ID: <urn:uuid:3084e79c-ae58-4bfd-8590-fcf2830fe896>\r\n\
|
||||
WARC-Date: 2020-11-19T19:54:34Z\r\n\
|
||||
WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\
|
||||
WARC-Concurrent-To: <urn:uuid:073fac44-c383-4a2b-980d-76fec83bd20d>\r\n\
|
||||
WARC-Block-Digest: sha1:LNYP3X3NWXQLUGDI745P4L4FK27XGP24\r\n\
|
||||
Content-Type: application/http;msgtype=request\r\n\
|
||||
Content-Length: 321\r\n\
|
||||
\r\n\
|
||||
POST /ajax/bz?foo=bar HTTP/1.1\r\n\
|
||||
Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\
|
||||
content-Length: 199\r\n\
|
||||
\r\n\
|
||||
------WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\
|
||||
Content-Disposition: form-data; name="q"\r\n\
|
||||
\r\n\
|
||||
[{"webSessionId":"pb2tr7:vx83uz:fdi8ta","user":"0"}]\r\n\
|
||||
------WebKitFormBoundaryWUBf9liofZK0nuJd--\r\n\
|
||||
\r\n\
|
||||
'
|
||||
options = dict(include_all=True, append_post=True)
|
||||
buff = BytesIO()
|
||||
test_record = BytesIO(test_data)
|
||||
write_cdx_index(buff, test_record, 'test.warc.gz', **options)
|
||||
print(buff.getvalue())
|
||||
assert buff.getvalue() == b"""\
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ajax/bz?__wb_method=post&foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar unk text/html; 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 420 0 test.warc.gz
|
||||
com,example)/ajax/bz?__wb_method=post&foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar multipart/form-data - - - - 701 428 test.warc.gz
|
||||
"""
|
||||
|
||||
|
||||
def test_multipart_form_no_boundary():
|
||||
test_data = b'\
|
||||
WARC/1.0\r\n\
|
||||
WARC-Type: response\r\n\
|
||||
WARC-Record-ID: <urn:uuid:3bc1606a-d517-487e-a6d5-bfeaebda2ec3>\r\n\
|
||||
WARC-Date: 2020-11-19T14:02:52Z\r\n\
|
||||
WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\
|
||||
WARC-IP-Address: 18.221.6.219\r\n\
|
||||
Content-Type: application/http;msgtype=response\r\n\
|
||||
WARC-Payload-Digest: sha1:SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5\r\n\
|
||||
Content-Length: 41\r\n\
|
||||
WARC-Block-Digest: sha1:JXKKZNALIPOW7J2FX5XUTGQZXKBSGZLU\r\n\
|
||||
\r\n\
|
||||
Content-Type: multipart/form-data\r\n\
|
||||
\r\n\
|
||||
ABCD\r\n\
|
||||
\r\n\
|
||||
\r\n\
|
||||
\r\n\
|
||||
WARC/1.0\r\n\
|
||||
WARC-Type: request\r\n\
|
||||
WARC-Record-ID: <urn:uuid:d5e7186f-5725-4ed1-b199-56fbdf4bd805>\r\n\
|
||||
WARC-Date: 2020-11-19T14:02:52Z\r\n\
|
||||
WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\
|
||||
WARC-Concurrent-To: <urn:uuid:3bc1606a-d517-487e-a6d5-bfeaebda2ec3>\r\n\
|
||||
WARC-Block-Digest: sha1:QJ2YUIKEWDSCLK5A2DHGLQ7WWEKYMO3W\r\n\
|
||||
Content-Type: application/http;msgtype=request\r\n\
|
||||
Content-Length: 111\r\n\
|
||||
\r\n\
|
||||
POST /core/story?v=77797 HTTP/1.1\r\n\
|
||||
Content-Length: 19\r\n\
|
||||
Content-Type: multipart/form-data\r\n\
|
||||
\r\n\
|
||||
{"text": "default"}\r\n\
|
||||
\r\n\
|
||||
'
|
||||
options = dict(include_all=True, append_post=True)
|
||||
buff = BytesIO()
|
||||
test_record = BytesIO(test_data)
|
||||
write_cdx_index(buff, test_record, 'test.warc.gz', **options)
|
||||
assert buff.getvalue() == b"""\
|
||||
CDX N b a m s k r M S V g
|
||||
com,connatix,capi)/core/story?__wb_method=post&__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 unk multipart/form-data SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5 - - 453 0 test.warc.gz
|
||||
com,connatix,capi)/core/story?__wb_method=post&__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 multipart/form-data - - - - 500 461 test.warc.gz
|
||||
"""
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
363
pywb/manager/aclmanager.py
Normal file
363
pywb/manager/aclmanager.py
Normal file
@ -0,0 +1,363 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
from pywb.manager.manager import CollectionsManager
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.warcserver.access_checker import AccessChecker
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class ACLManager(CollectionsManager):
|
||||
SURT_RX = re.compile('([^:.]+[,)])+')
|
||||
|
||||
VALID_ACCESS = ('allow', 'block', 'exclude', 'allow_ignore_embargo')
|
||||
|
||||
DEFAULT_FILE = 'access-rules.aclj'
|
||||
|
||||
def __init__(self, r):
    """Create an ACL manager for the collection (or file) named in ``r.coll_name``.

    :param argparse.Namespace r: Parsed result from ArgumentParser
    :rtype: None
    """
    self.rules = []

    # The name is only forwarded to the base class when it refers to an
    # existing auto collection; otherwise an empty name is used.
    requested = r.coll_name
    resolved_coll = requested if self.is_valid_auto_coll(requested) else ''

    # The raw argument is kept as the target regardless of the check above.
    self.target = r.coll_name

    super(ACLManager, self).__init__(resolved_coll, must_exist=False)

    self.acl_file = None
|
||||
|
||||
def process(self, r):
    """
    Process acl command

    Chooses the ACL file to operate on, loads it as the operation
    requires, checks rule order (unless the operation is 'validate'
    itself) and finally dispatches to ``r.acl_func``.

    :param argparse.Namespace r: Parsed result from ArgumentParser
    :rtype: None
    """

    # if target exists as a file, use that
    if os.path.isfile(self.target):
        self.acl_file = self.target

    # otherwise, if auto collection, use default file in ./collections/<coll>/acl/<DEFAULT_FILE>
    elif os.path.isdir(self.curr_coll_dir):
        self.acl_file = os.path.join(self.acl_dir, self.DEFAULT_FILE)

    # else, assume filename (may not exist yet)
    else:
        self.acl_file = self.target

    # for add/import, file doesn't have to exist
    if r.op in ('add', 'importtxt'):
        self.load_acl(False)

    # for other ops (except matching), ensure entire file loads successfully, log errors
    # NOTE: compared explicitly; the previous ``not in ('match')`` was a
    # substring test against the string 'match', not a tuple membership test.
    elif r.op != 'match':
        if not self.load_acl(True):
            sys.exit(2)
            return

    # if 'validate', the command itself is validation
    if r.op != 'validate':
        self.validate()

    r.acl_func(self, r)
|
||||
|
||||
def is_valid_auto_coll(self, coll_name):
    """Tell whether ``coll_name`` names an existing auto collection.

    The name must match ``self.COLL_RX`` and a directory of that name
    must exist under ``self.COLLS_DIR``.

    :param coll_name: The collection name to check
    :return: T/F indicating a valid collection
    :rtype: bool
    """
    name_ok = self.COLL_RX.match(coll_name) is not None
    if not name_ok:
        return False

    coll_path = os.path.join(self.COLLS_DIR, coll_name)
    return os.path.isdir(coll_path)
|
||||
|
||||
def load_acl(self, must_exist=True):
    """Read ``self.acl_file`` and append one CDXObject per line to ``self.rules``.

    :param bool must_exist: If True, an I/O error while opening/reading the
        file is reported on stdout; if False it is silent
    :return: True when the whole file was read, False on any error
    :rtype: bool
    """
    try:
        with open(self.acl_file, 'rb') as acl_fh:
            for raw_line in acl_fh:
                if not raw_line:
                    continue
                self.rules.append(CDXObject(raw_line))

    except IOError as io_err:
        # a missing/unreadable file is only worth reporting when it was required
        if must_exist:
            print('Error Occurred: ' + str(io_err))
        return False

    except Exception as err:
        print('Error Occurred: ' + str(err))
        return False

    return True
|
||||
|
||||
def save_acl(self, r=None):
    """Write every rule in ``self.rules`` to ``self.acl_file`` as UTF-8 cdxj.

    The parent directory is created first on a best-effort basis; any
    failure while writing is reported on stdout rather than raised.

    :param argparse.Namespace|None r: Not used
    :rtype: None
    """
    parent_dir = os.path.dirname(self.acl_file)
    try:
        os.makedirs(parent_dir)
    except OSError:
        # directory already exists (or cannot be created); open() below decides
        pass

    try:
        with open(self.acl_file, 'wb') as out:
            out.writelines(rule.to_cdxj().encode('utf-8') for rule in self.rules)

    except Exception as err:
        print('Error Saving ACL Rules: ' + str(err))
|
||||
|
||||
def to_key(self, url_or_surt, exact_match=False):
    """Convert a URL or SURT into an acl key.

    Input that ``SURT_RX`` finds a match in is used unchanged; anything
    else is passed through ``canonicalize``. When ``exact_match`` is set,
    ``AccessChecker.EXACT_SUFFIX`` is appended.

    :param str url_or_surt: The url or surt to be converted to an acl key
    :param bool exact_match: Should the exact match suffix be added to key
    :rtype: str
    """
    looks_like_surt = self.SURT_RX.search(url_or_surt)
    key = url_or_surt if looks_like_surt else canonicalize(url_or_surt)

    if not exact_match:
        return key

    return key + AccessChecker.EXACT_SUFFIX
|
||||
|
||||
def validate_access(self, access):
    """Check ``access`` against ``VALID_ACCESS``.

    An unknown value prints the list of valid values and exits the
    process with status 1.

    :param str access: The access value to be validated
    :return: True if valid
    :rtype: bool
    """
    is_known = access in self.VALID_ACCESS
    if not is_known:
        allowed = ', '.join(self.VALID_ACCESS)
        print('Valid access values are: ' + allowed)
        sys.exit(1)

    return True
|
||||
|
||||
def add_rule(self, r):
    """Add a rule to the ACL manager from parsed command-line arguments.

    :param argparse.Namespace r: The argparse namespace representing the rule to be added
        (its ``url``, ``access``, ``exact_match`` and ``user`` attributes are used)
    :return: The result of ``_add_rule``
    """
    url = r.url
    access = r.access
    exact_match = r.exact_match
    user = r.user
    return self._add_rule(url, access, exact_match, user)
|
||||
|
||||
def _add_rule(self, url, access, exact_match=False, user=None):
    """Add a rule to the acl file, replacing an existing equivalent rule.

    A rule is considered the same as an existing one when urlkey,
    timestamp and user all match. Otherwise the new rule is inserted in
    front of the first existing rule it compares greater than, or
    appended when there is none. The file is saved afterwards.

    :param str url: The URL for the rule
    :param str access: The access value for the rule
    :param bool exact_match: Is the rule to be added an exact match
    :param str|None user: Optional user the rule applies to
    :rtype: None
    """
    if not self.validate_access(access):
        return

    acl = CDXObject()
    acl['urlkey'] = self.to_key(url, exact_match)
    acl['timestamp'] = '-'
    acl['access'] = access
    acl['url'] = url
    if user:
        acl['user'] = user

    # default: no earlier position found, so the rule goes at the end
    position = len(self.rules)
    replace = False

    for index, rule in enumerate(self.rules):
        same_rule = (acl['urlkey'] == rule['urlkey'] and
                     acl['timestamp'] == rule['timestamp'] and
                     acl.get('user') == rule.get('user'))

        if same_rule or acl > rule:
            replace = same_rule
            position = index
            break

    if replace:
        print('Existing Rule Found, Replacing:')
        self.print_rule(self.rules[position])
        print('with:')
        self.print_rule(acl)
        self.rules[position] = acl
    else:
        print('Added new Rule:')
        self.print_rule(acl)
        self.rules.insert(position, acl)

    self.save_acl()
|
||||
|
||||
def validate_save(self, r=None, log=False):
    """Run ``validate`` with correction enabled.

    :param argparse.Namespace|None r: Not used
    :param bool log: Should a report be printed to stdout
    :rtype: None
    """
    self.validate(correct=True, log=log)
|
||||
|
||||
def validate(self, log=False, correct=False):
    """Check that the acl rules are in descending order.

    The rules count as out of order as soon as one rule compares greater
    than the rule before it.

    :param bool log: Should the results of validating be logged to stdout
    :param bool correct: Should out-of-order rules be re-sorted (descending) and saved
    :rtype: None
    """
    out_of_order = any(
        previous and current > previous
        for previous, current in zip(self.rules, self.rules[1:])
    )

    if not out_of_order:
        if log:
            print('Rules in order')
        return

    if log:
        print('Rules out of order, resorting')

    if correct:
        self.rules.sort(reverse=True)
        self.save_acl()
|
||||
|
||||
def remove_rule(self, r):
    """Remove the first rule matching the given url and user, then save

    A rule matches when its ``urlkey`` equals the key computed from
    ``r.url``/``r.exact_match`` and its optional ``user`` equals ``r.user``.
    If no rule matches, a message is printed and nothing is saved.

    :param argparse.Namespace r: Parsed result from ArgumentParser
    :rtype: None
    """
    urlkey = self.to_key(r.url, r.exact_match)

    for index, candidate in enumerate(self.rules):
        if urlkey != candidate['urlkey'] or r.user != candidate.get('user'):
            continue

        removed = self.rules.pop(index)
        print('Removed Rule:')
        self.print_rule(removed)
        self.save_acl()
        return

    print('Rule to remove not found!')
|
||||
|
||||
def list_rules(self, r):
    """Write every acl rule to stdout, framed by a header and blank lines

    Each rule is emitted exactly as returned by its ``to_cdxj()`` method,
    without any added separator.

    :param argparse.Namespace|None r: Not used
    :rtype: None
    """
    header = 'Rules for {0} from {1}:'.format(self.target, self.acl_file)
    print(header)
    print('')

    for entry in self.rules:
        serialized = entry.to_cdxj()
        sys.stdout.write(serialized)

    print('')
|
||||
|
||||
def find_match(self, r):
    """Look up and print the acl rule that applies to ``r.url``

    A fresh ``AccessChecker`` is built on this manager's acl file and asked
    for the rule matching the url (and optional user). A returned rule with
    an empty ``urlkey`` is reported as "no match".

    :param argparse.Namespace r: Parsed result from ArgumentParser
    :rtype: None
    """
    checker = AccessChecker(self.acl_file, '<default>')
    matched = checker.find_access_rule(r.url, acl_user=r.user)

    print('Matched rule:')
    print('')

    if matched['urlkey'] != '':
        self.print_rule(matched)
        return

    print(' <No Match, Using Default Rule>')
    print('')
|
||||
|
||||
def add_excludes(self, r):
    """
    Import old-style excludes, in url-per-line format

    Every non-blank line of ``r.filename`` is decoded as UTF-8, stripped and
    added as a rule with access ``r.access``. Blank and whitespace-only
    lines are skipped so they neither create an empty-url rule nor inflate
    the reported count. Any error while importing is printed and the
    process exits with status 1.

    :param argparse.Namespace r: Parsed result from ArgumentParser
    :rtype: None
    """
    if not self.validate_access(r.access):
        return

    try:
        count = 0
        with open(r.filename, 'rb') as fh:
            for raw_line in fh:
                url = raw_line.decode('utf-8').strip()
                if not url:
                    # tolerate empty / trailing lines in the exclude list
                    continue

                self._add_rule(url, r.access)
                count += 1

        print('Added or replaced {0} rules from '.format(count) + r.filename)

    except Exception as e:
        print('Error Importing: ' + str(e))
        sys.exit(1)
|
||||
|
||||
def print_rule(self, rule):
    """Print one rule to stdout, prefixed with a single space

    :param CDXObject rule: The rule to be printed
    :rtype: None
    """
    serialized = rule.to_cdxj()
    print(' ' + serialized)
|
||||
|
||||
@classmethod
def init_parser(cls, parser):
    """Register the acl sub-commands on the given argument parser

    Each sub-command stores its handler under ``acl_func`` and the chosen
    sub-command name under ``op``. A sub-command is mandatory.

    :param argparse.ArgumentParser parser: The parser to be initialized
    :rtype: None
    """
    subparsers = parser.add_subparsers(dest='op')
    subparsers.required = True

    def command(name, *args, **kwargs):
        op = subparsers.add_parser(name)

        for positional in args:
            # 'default_access' is the only optional positional; it falls
            # back to 'allow' when omitted
            if positional == 'default_access':
                extra = {'nargs': '?', 'default': 'allow'}
            else:
                extra = {}
            op.add_argument(positional, **extra)

        if kwargs.get('user_opt'):
            op.add_argument('-u', '--user')

        if kwargs.get('exact_opt'):
            op.add_argument('-e', '--exact-match', action='store_true', default=False)

        op.set_defaults(acl_func=kwargs['func'])

    # (sub-command, positional arguments, options)
    commands = (
        ('add', ('coll_name', 'url', 'access'),
         dict(func=cls.add_rule, exact_opt=True, user_opt=True)),
        ('remove', ('coll_name', 'url'),
         dict(func=cls.remove_rule, exact_opt=True, user_opt=True)),
        ('list', ('coll_name',), dict(func=cls.list_rules)),
        ('validate', ('coll_name',), dict(func=cls.validate_save)),
        ('match', ('coll_name', 'url', 'default_access'),
         dict(func=cls.find_match, user_opt=True)),
        ('importtxt', ('coll_name', 'filename', 'access'),
         dict(func=cls.add_excludes)),
    )

    for name, positionals, options in commands:
        command(name, *positionals, **options)
|
||||
|
115
pywb/manager/locmanager.py
Normal file
115
pywb/manager/locmanager.py
Normal file
@ -0,0 +1,115 @@
|
||||
import os
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
try:
|
||||
from babel.messages.frontend import CommandLineInterface
|
||||
|
||||
from translate.convert.po2csv import main as po2csv
|
||||
from translate.convert.csv2po import main as csv2po
|
||||
loc_avail = True
|
||||
except:
|
||||
loc_avail = False
|
||||
|
||||
|
||||
ROOT_DIR = 'i18n'
|
||||
|
||||
TRANSLATIONS = os.path.join(ROOT_DIR, 'translations')
|
||||
|
||||
MESSAGES = os.path.join(ROOT_DIR, 'messages.pot')
|
||||
|
||||
# ============================================================================
|
||||
class LocManager:
    """Drives the localization sub-commands (extract/update/remove/list)

    Message catalogs live under ``TRANSLATIONS``; extraction, catalog
    creation/update and compilation are delegated to the ``pybabel``
    command line interface, and po<->csv conversion to ``po2csv``/``csv2po``.
    """

    def process(self, r):
        """Dispatch a parsed command line to the handler stored in ``r.loc_func``

        :param argparse.Namespace r: Parsed result from ArgumentParser
        :rtype: None
        """
        if r.name == 'list':
            r.loc_func(self)
        elif r.name == 'remove':
            r.loc_func(self, r.locale)
        else:
            r.loc_func(self, r.locale, r.no_csv)

    def extract_loc(self, locale, no_csv):
        """Extract messages and create or update the catalog of each locale

        :param list[str] locale: Locale names to process
        :param bool no_csv: If true, skip exporting each catalog to csv
        :rtype: None
        """
        self.extract_text()

        for loc in locale:
            loc_dir = os.path.join(TRANSLATIONS, loc)
            if os.path.isdir(loc_dir):
                self.update_catalog(loc)
            else:
                os.makedirs(loc_dir)
                self.init_catalog(loc)

            if not no_csv:
                base = os.path.join(TRANSLATIONS, loc, 'LC_MESSAGES')
                po = os.path.join(base, 'messages.po')
                csv = os.path.join(base, 'messages.csv')
                po2csv([po, csv])

        self.compile_catalog()

    def update_loc(self, locale, no_csv):
        """Import edited csv files back into the catalogs and recompile

        :param list[str] locale: Locale names to process
        :param bool no_csv: If true, skip the csv import and only recompile
        :rtype: None
        """
        for loc in locale:
            if not no_csv:
                base = os.path.join(TRANSLATIONS, loc, 'LC_MESSAGES')
                po = os.path.join(base, 'messages.po')
                csv = os.path.join(base, 'messages.csv')

                # a locale without a csv export is left untouched
                if os.path.isfile(csv):
                    csv2po([csv, po])

        self.compile_catalog()

    def remove_loc(self, locale):
        """Delete the translation directory of each given locale

        A locale that does not exist is reported and skipped; the remaining
        locales are still processed.

        :param list[str] locale: Locale names to remove
        :rtype: None
        """
        for loc in locale:
            loc_dir = os.path.join(TRANSLATIONS, loc)
            if not os.path.isdir(loc_dir):
                print('Locale "{0}" does not exist'.format(loc))
                # keep going: one unknown name must not abort the others
                continue

            shutil.rmtree(loc_dir)
            print('Removed locale "{0}"'.format(loc))

    def list_loc(self):
        """Print the entries of the translations directory

        Prints an empty list when the directory has not been created yet.

        :rtype: None
        """
        locales = os.listdir(TRANSLATIONS) if os.path.isdir(TRANSLATIONS) else []
        print('Current locales:')
        print('\n'.join(' - ' + x for x in locales))
        print('')

    def extract_text(self):
        """Run ``pybabel extract`` over the current directory into ``MESSAGES``"""
        os.makedirs(ROOT_DIR, exist_ok=True)

        CommandLineInterface().run(['pybabel', 'extract', '-F', 'babel.ini', '-k', '_ _Q gettext ngettext', '-o', MESSAGES, './', '--omit-header'])

    def init_catalog(self, loc):
        """Run ``pybabel init`` to create a new catalog for ``loc``"""
        CommandLineInterface().run(['pybabel', 'init', '-l', loc, '-i', MESSAGES, '-d', TRANSLATIONS])

    def update_catalog(self, loc):
        """Run ``pybabel update`` to refresh the existing catalog for ``loc``"""
        CommandLineInterface().run(['pybabel', 'update', '-l', loc, '-i', MESSAGES, '-d', TRANSLATIONS, '--previous'])

    def compile_catalog(self):
        """Run ``pybabel compile`` over the translations directory"""
        CommandLineInterface().run(['pybabel', 'compile', '-d', TRANSLATIONS])

    @classmethod
    def init_parser(cls, parser):
        """Initializes an argument parser for the localization commands

        Every sub-command except ``list`` takes one or more locale names;
        every sub-command except ``list`` and ``remove`` also accepts
        ``--no-csv``.

        :param argparse.ArgumentParser parser: The parser to be initialized
        :rtype: None
        """
        subparsers = parser.add_subparsers(dest='op')
        subparsers.required = True

        def command(name, func):
            op = subparsers.add_parser(name)
            if name != 'list':
                op.add_argument('locale', nargs='+')
                if name != 'remove':
                    op.add_argument('--no-csv', action='store_true')

            op.set_defaults(loc_func=func, name=name)

        command('extract', cls.extract_loc)
        command('update', cls.update_loc)
        command('remove', cls.remove_loc)
        command('list', cls.list_loc)
|
@ -5,12 +5,16 @@ import logging
|
||||
import heapq
|
||||
import yaml
|
||||
import re
|
||||
import gzip
|
||||
import six
|
||||
import pathlib
|
||||
|
||||
from distutils.util import strtobool
|
||||
from pkg_resources import resource_string
|
||||
from pkg_resources import resource_string, get_distribution
|
||||
|
||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||
from tempfile import mkdtemp, TemporaryDirectory
|
||||
from zipfile import ZipFile
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from warcio.timeutils import timestamp20_now
|
||||
@ -19,6 +23,7 @@ from pywb import DEFAULT_CONFIG
|
||||
|
||||
from six.moves import input
|
||||
|
||||
|
||||
#=============================================================================
|
||||
# to allow testing by mocking get_input
|
||||
|
||||
@ -27,8 +32,12 @@ def get_input(msg): # pragma: no cover
|
||||
return input(msg)
|
||||
|
||||
#=============================================================================
|
||||
def get_version():
    """Return the version banner: ``wb-manager`` plus the installed pywb version"""
    pywb_dist = get_distribution("pywb")
    return "wb-manager " + pywb_dist.version
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class CollectionsManager(object):
|
||||
""" This utility is designed to
|
||||
simplify the creation and management of web archive collections
|
||||
@ -42,6 +51,9 @@ directory structure expected by pywb
|
||||
|
||||
COLLS_DIR = 'collections'
|
||||
|
||||
WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$')
|
||||
WACZ_RX = re.compile(r'.*\.wacz$')
|
||||
|
||||
def __init__(self, coll_name, colls_dir=None, must_exist=True):
|
||||
colls_dir = colls_dir or self.COLLS_DIR
|
||||
self.default_config = load_yaml_config(DEFAULT_CONFIG)
|
||||
@ -66,6 +78,8 @@ directory structure expected by pywb
|
||||
self.static_dir = self._get_dir('static_path')
|
||||
self.templates_dir = self._get_dir('templates_dir')
|
||||
|
||||
self.acl_dir = self._get_dir('acl_paths')
|
||||
|
||||
def list_colls(self):
|
||||
print('Collections:')
|
||||
if not os.path.isdir(self.colls_dir):
|
||||
@ -108,19 +122,142 @@ directory structure expected by pywb
|
||||
'To create a new collection, run\n\n{1} init {0}')
|
||||
raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
||||
|
||||
def add_warcs(self, warcs):
|
||||
def add_archives(self, archives, unpack_wacz=False):
|
||||
if not os.path.isdir(self.archive_dir):
|
||||
raise IOError('Directory {0} does not exist'.
|
||||
format(self.archive_dir))
|
||||
|
||||
full_paths = []
|
||||
for filename in warcs:
|
||||
filename = os.path.abspath(filename)
|
||||
shutil.copy2(filename, self.archive_dir)
|
||||
full_paths.append(os.path.join(self.archive_dir, filename))
|
||||
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
||||
invalid_archives = []
|
||||
warc_paths = []
|
||||
for archive in archives:
|
||||
if self.WARC_RX.match(archive):
|
||||
full_path = self._add_warc(archive)
|
||||
if full_path:
|
||||
warc_paths.append(full_path)
|
||||
elif self.WACZ_RX.match(archive):
|
||||
if unpack_wacz:
|
||||
self._add_wacz_unpacked(archive)
|
||||
else:
|
||||
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
|
||||
'\'--unpack-wacz\' flag to add the wacz\'s content.')
|
||||
else:
|
||||
invalid_archives.append(archive)
|
||||
|
||||
self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE)
|
||||
self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE)
|
||||
|
||||
if invalid_archives:
|
||||
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
|
||||
|
||||
def _rename_warc(self, warc_basename):
|
||||
dupe_idx = 1
|
||||
ext = ''.join(pathlib.Path(warc_basename).suffixes)
|
||||
pre_ext_name = warc_basename.split(ext)[0]
|
||||
|
||||
while True:
|
||||
new_basename = f'{pre_ext_name}-{dupe_idx}{ext}'
|
||||
if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
|
||||
break
|
||||
dupe_idx += 1
|
||||
|
||||
return new_basename
|
||||
|
||||
def _add_warc(self, warc):
|
||||
warc_source = os.path.abspath(warc)
|
||||
source_dir, warc_basename = os.path.split(warc_source)
|
||||
|
||||
# don't overwrite existing warcs with duplicate names
|
||||
if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
|
||||
warc_basename = self._rename_warc(warc_basename)
|
||||
logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')
|
||||
|
||||
warc_dest = os.path.join(self.archive_dir, warc_basename)
|
||||
shutil.copy2(warc_source, warc_dest)
|
||||
logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
|
||||
return warc_dest
|
||||
|
||||
def _add_wacz_unpacked(self, wacz):
|
||||
wacz = os.path.abspath(wacz)
|
||||
temp_dir = mkdtemp()
|
||||
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
|
||||
cdx_regex = re.compile(r'.+\.cdx(\.gz)?$')
|
||||
with ZipFile(wacz, 'r') as wacz_zip_file:
|
||||
archive_members = wacz_zip_file.namelist()
|
||||
warc_files = [file for file in archive_members if warc_regex.match(file)]
|
||||
if not warc_files:
|
||||
logging.warning(f'WACZ {wacz} does not contain any warc files.')
|
||||
return
|
||||
|
||||
# extract warc files
|
||||
for warc_file in warc_files:
|
||||
wacz_zip_file.extract(warc_file, temp_dir)
|
||||
|
||||
cdx_files = [file for file in archive_members if cdx_regex.match(file)]
|
||||
if not cdx_files:
|
||||
logging.warning(f'WACZ {wacz} does not contain any indices.')
|
||||
return
|
||||
|
||||
for cdx_file in cdx_files:
|
||||
wacz_zip_file.extract(cdx_file, temp_dir)
|
||||
|
||||
# copy extracted warc files to collections archive dir, use wacz filename as filename with added index if
|
||||
# multiple warc files exist
|
||||
warc_filename_mapping = {}
|
||||
full_paths = []
|
||||
for idx, extracted_warc_file in enumerate(warc_files):
|
||||
_, warc_ext = os.path.splitext(extracted_warc_file)
|
||||
if warc_ext == '.gz':
|
||||
warc_ext = '.warc.gz'
|
||||
warc_filename = os.path.basename(wacz)
|
||||
warc_filename, _ = os.path.splitext(warc_filename)
|
||||
warc_filename = f'{warc_filename}-{idx}{warc_ext}'
|
||||
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
|
||||
|
||||
if os.path.exists(warc_destination_path):
|
||||
warc_filename = self._rename_warc(warc_filename)
|
||||
logging.info(f'Warc {warc_destination_path} already exists - renamed to {warc_filename}.')
|
||||
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
|
||||
|
||||
warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
|
||||
shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)
|
||||
full_paths.append(warc_destination_path)
|
||||
|
||||
# rewrite filenames in wacz indices and merge them with collection index file
|
||||
for cdx_file in cdx_files:
|
||||
self._add_wacz_index(os.path.join(self.indexes_dir, self.DEF_INDEX_FILE), os.path.join(temp_dir, cdx_file),
|
||||
warc_filename_mapping)
|
||||
|
||||
# delete temporary files
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping):
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
# rewrite wacz index to temporary index file
|
||||
tempdir = TemporaryDirectory()
|
||||
wacz_index_name = os.path.basename(wacz_index_path)
|
||||
rewritten_index_path = os.path.join(tempdir.name, wacz_index_name)
|
||||
|
||||
with open(rewritten_index_path, 'w') as rewritten_index:
|
||||
if wacz_index_path.endswith('.gz'):
|
||||
wacz_index = gzip.open(wacz_index_path, 'rb')
|
||||
else:
|
||||
wacz_index = open(wacz_index_path, 'rb')
|
||||
|
||||
for line in wacz_index:
|
||||
cdx_object = CDXObject(cdxline=line)
|
||||
if cdx_object['filename'] in filename_mapping:
|
||||
cdx_object['filename'] = filename_mapping[cdx_object['filename']]
|
||||
rewritten_index.write(cdx_object.to_cdxj())
|
||||
|
||||
if not os.path.isfile(collection_index_path):
|
||||
shutil.move(rewritten_index_path, collection_index_path)
|
||||
return
|
||||
|
||||
temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now()
|
||||
self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path)
|
||||
shutil.move(temp_coll_index_path, collection_index_path)
|
||||
|
||||
tempdir.cleanup()
|
||||
|
||||
def reindex(self):
|
||||
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
||||
@ -173,20 +310,24 @@ directory structure expected by pywb
|
||||
|
||||
merged_file = temp_file + '.merged'
|
||||
|
||||
last_line = None
|
||||
|
||||
with open(cdx_file, 'rb') as orig_index:
|
||||
with open(temp_file, 'rb') as new_index:
|
||||
with open(merged_file, 'w+b') as merged:
|
||||
for line in heapq.merge(orig_index, new_index):
|
||||
if last_line != line:
|
||||
merged.write(line)
|
||||
last_line = line
|
||||
self._merge_indices(cdx_file, temp_file, merged_file)
|
||||
|
||||
shutil.move(merged_file, cdx_file)
|
||||
#os.rename(merged_file, cdx_file)
|
||||
os.remove(temp_file)
|
||||
|
||||
@staticmethod
|
||||
def _merge_indices(index1, index2, dest):
|
||||
last_line = None
|
||||
|
||||
with open(index1, 'rb') as index1_f:
|
||||
with open(index2, 'rb') as index2_f:
|
||||
with open(dest, 'wb') as dest_f:
|
||||
for line in heapq.merge(index1_f, index2_f):
|
||||
if last_line != line:
|
||||
dest_f.write(line)
|
||||
last_line = line
|
||||
|
||||
def set_metadata(self, namevalue_pairs):
|
||||
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
|
||||
metadata = None
|
||||
@ -230,17 +371,20 @@ directory structure expected by pywb
|
||||
v = defaults[n]
|
||||
print('- {0}: (pywb/{1})'.format(n, v))
|
||||
|
||||
def _confirm_overwrite(self, full_path, msg):
|
||||
def _confirm_overwrite(self, full_path, msg, ignore=False):
|
||||
if not os.path.isfile(full_path):
|
||||
return True
|
||||
|
||||
if ignore:
|
||||
return False
|
||||
|
||||
res = get_input(msg)
|
||||
try:
|
||||
res = strtobool(res)
|
||||
except ValueError:
|
||||
res = False
|
||||
|
||||
if not res:
|
||||
if not res and not ignore:
|
||||
raise IOError('Skipping, {0} already exists'.format(full_path))
|
||||
|
||||
def _get_template_path(self, template_name, verb):
|
||||
@ -261,7 +405,7 @@ directory structure expected by pywb
|
||||
|
||||
return full_path, filename
|
||||
|
||||
def add_template(self, template_name, force=False):
|
||||
def add_template(self, template_name, force=False, ignore=False):
|
||||
full_path, filename = self._get_template_path(template_name, 'add')
|
||||
|
||||
msg = ('Template file "{0}" ({1}) already exists. ' +
|
||||
@ -269,7 +413,11 @@ directory structure expected by pywb
|
||||
msg = msg.format(full_path, template_name)
|
||||
|
||||
if not force:
|
||||
self._confirm_overwrite(full_path, msg)
|
||||
res = self._confirm_overwrite(full_path, msg, ignore)
|
||||
if ignore and not res:
|
||||
return
|
||||
|
||||
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
||||
|
||||
data = resource_string('pywb', filename)
|
||||
with open(full_path, 'w+b') as fh:
|
||||
@ -279,6 +427,9 @@ directory structure expected by pywb
|
||||
msg = 'Copied default template "{0}" to "{1}"'
|
||||
print(msg.format(filename, full_path))
|
||||
|
||||
if template_name != "base_html":
|
||||
self.add_template("base_html", force=False, ignore=True)
|
||||
|
||||
def remove_template(self, template_name, force=False):
|
||||
full_path, filename = self._get_template_path(template_name, 'remove')
|
||||
|
||||
@ -332,6 +483,8 @@ Create manage file based web archive collections
|
||||
# epilog=epilog,
|
||||
formatter_class=RawTextHelpFormatter)
|
||||
|
||||
parser.add_argument("-V", "--version", action="version", version=get_version())
|
||||
|
||||
subparsers = parser.add_subparsers(dest='type')
|
||||
subparsers.required = True
|
||||
|
||||
@ -354,16 +507,23 @@ Create manage file based web archive collections
|
||||
listcmd = subparsers.add_parser('list', help=list_help)
|
||||
listcmd.set_defaults(func=do_list)
|
||||
|
||||
# Add Warcs
|
||||
# Add Warcs or Waczs
|
||||
def do_add(r):
|
||||
m = CollectionsManager(r.coll_name)
|
||||
m.add_warcs(r.files)
|
||||
m.add_archives(r.files, r.unpack_wacz)
|
||||
|
||||
addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
|
||||
addwarc = subparsers.add_parser('add', help=addwarc_help)
|
||||
addwarc.add_argument('coll_name')
|
||||
addwarc.add_argument('files', nargs='+')
|
||||
addwarc.set_defaults(func=do_add)
|
||||
add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
|
||||
add_unpack_wacz_help = 'Copy WARCs from WACZ to collection directory and reindex'
|
||||
add_archives = subparsers.add_parser('add', help=add_archives_help)
|
||||
add_archives.add_argument(
|
||||
'--unpack-wacz',
|
||||
dest='unpack_wacz',
|
||||
action='store_true',
|
||||
help=add_unpack_wacz_help
|
||||
)
|
||||
add_archives.add_argument('coll_name')
|
||||
add_archives.add_argument('files', nargs='+')
|
||||
add_archives.set_defaults(func=do_add)
|
||||
|
||||
# Reindex All
|
||||
def do_reindex(r):
|
||||
@ -427,6 +587,34 @@ Create manage file based web archive collections
|
||||
migrate.add_argument('-f', '--force', action='store_true')
|
||||
migrate.set_defaults(func=do_migrate)
|
||||
|
||||
# ACL
|
||||
from pywb.manager.aclmanager import ACLManager
|
||||
def do_acl(r):
|
||||
acl = ACLManager(r)
|
||||
acl.process(r)
|
||||
|
||||
acl_help = 'Configure Access Control Lists (ACL) for a collection'
|
||||
acl = subparsers.add_parser('acl', help=acl_help)
|
||||
ACLManager.init_parser(acl)
|
||||
acl.set_defaults(func=do_acl)
|
||||
|
||||
# LOC
|
||||
from pywb.manager.locmanager import LocManager, loc_avail
|
||||
|
||||
def do_loc(r):
|
||||
if not loc_avail:
|
||||
print("You must install i18n extensions with 'pip install pywb[i18n]' to use localization features")
|
||||
return
|
||||
|
||||
loc = LocManager()
|
||||
loc.process(r)
|
||||
|
||||
loc_help = 'Generate strings for i18n/localization'
|
||||
loc = subparsers.add_parser('i18n', help=loc_help)
|
||||
if loc_avail:
|
||||
LocManager.init_parser(loc)
|
||||
loc.set_defaults(func=do_loc)
|
||||
|
||||
# Parse
|
||||
r = parser.parse_args(args=args)
|
||||
r.func(r)
|
||||
|
@ -2,15 +2,14 @@ import base64
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
|
||||
import traceback
|
||||
|
||||
import portalocker
|
||||
|
||||
from warcio.timeutils import timestamp20_now
|
||||
from warcio.warcwriter import BaseWARCWriter
|
||||
|
||||
from pywb.utils.format import res_template
|
||||
from pywb.utils.io import no_except_close
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -31,6 +30,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
self.dir_template = dir_template
|
||||
self.key_template = kwargs.get('key_template', self.dir_template)
|
||||
self.dedup_index = kwargs.get('dedup_index')
|
||||
self.dedup_by_url = kwargs.get('dedup_by_url')
|
||||
self.filename_template = filename_template
|
||||
self.max_size = max_size
|
||||
if max_idle_secs > 0:
|
||||
@ -49,7 +49,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
|
||||
try:
|
||||
url = record.rec_headers.get_header('WARC-Target-URI')
|
||||
digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
||||
digest = record.rec_headers.get_header('WARC-Payload-Digest') if not self.dedup_by_url else None
|
||||
iso_dt = record.rec_headers.get_header('WARC-Date')
|
||||
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
|
||||
except Exception as e:
|
||||
@ -85,7 +85,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
|
||||
try:
|
||||
os.makedirs(path)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
fh = open(filename, 'a+b')
|
||||
@ -99,11 +99,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
try:
|
||||
if os.name != 'nt':
|
||||
portalocker.lock(fh, portalocker.LOCK_UN)
|
||||
fh.close()
|
||||
return True
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return False
|
||||
finally:
|
||||
no_except_close(fh)
|
||||
|
||||
def get_dir_key(self, params):
|
||||
return res_template(self.key_template, params)
|
||||
@ -249,7 +250,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
for dir_key, out, filename in self.iter_open_files():
|
||||
try:
|
||||
mtime = os.path.getmtime(filename)
|
||||
except:
|
||||
except Exception:
|
||||
self.close_key(dir_key)
|
||||
return
|
||||
|
||||
|
@ -1,26 +1,21 @@
|
||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
||||
from pywb.utils.format import ParamFormatter, res_template
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter
|
||||
|
||||
from six.moves.urllib.parse import parse_qsl
|
||||
import six
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
|
||||
import traceback
|
||||
|
||||
import gevent.queue
|
||||
import gevent
|
||||
import gevent.queue
|
||||
import requests
|
||||
import six
|
||||
from six.moves.urllib.parse import parse_qsl
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from pywb.recorder.filters import CollectionFilter, SkipRangeRequestFilter
|
||||
from pywb.utils.format import ParamFormatter
|
||||
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# ==============================================================================
|
||||
class RecorderApp(object):
|
||||
def __init__(self, upstream_host, writer, skip_filters=None, **kwargs):
|
||||
self.upstream_host = upstream_host
|
||||
@ -29,8 +24,7 @@ class RecorderApp(object):
|
||||
|
||||
self.rec_source_name = kwargs.get('name', 'recorder')
|
||||
|
||||
self.create_buff_func = kwargs.get('create_buff_func',
|
||||
self.default_create_buffer)
|
||||
self.create_buff_func = kwargs.get('create_buff_func') or self.default_create_buffer
|
||||
|
||||
self.write_queue = gevent.queue.Queue()
|
||||
gevent.spawn(self._write_loop)
|
||||
@ -52,13 +46,13 @@ class RecorderApp(object):
|
||||
|
||||
@staticmethod
|
||||
def default_create_buffer(params, name):
|
||||
return tempfile.SpooledTemporaryFile(max_size=512*1024)
|
||||
return tempfile.SpooledTemporaryFile(max_size=512 * 1024)
|
||||
|
||||
def _write_loop(self):
|
||||
while True:
|
||||
try:
|
||||
self._write_one()
|
||||
except:
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
|
||||
def _write_one(self):
|
||||
@ -88,14 +82,13 @@ class RecorderApp(object):
|
||||
else:
|
||||
self.writer.write_record(resp, params)
|
||||
|
||||
|
||||
finally:
|
||||
try:
|
||||
if req_pay:
|
||||
req_pay.close()
|
||||
no_except_close(req_pay)
|
||||
|
||||
if resp_pay:
|
||||
resp_pay.close()
|
||||
no_except_close(resp_pay)
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
@ -155,7 +148,7 @@ class RecorderApp(object):
|
||||
|
||||
finally:
|
||||
if req_stream:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
|
||||
return self.send_message(msg,
|
||||
'200 OK',
|
||||
@ -169,8 +162,7 @@ class RecorderApp(object):
|
||||
def __call__(self, environ, start_response):
|
||||
try:
|
||||
return self.handle_call(environ, start_response)
|
||||
except:
|
||||
import traceback
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
|
||||
def handle_call(self, environ, start_response):
|
||||
@ -217,15 +209,15 @@ class RecorderApp(object):
|
||||
|
||||
try:
|
||||
res = requests.request(url=self.upstream_host + request_uri,
|
||||
method=method,
|
||||
data=data,
|
||||
headers=headers,
|
||||
allow_redirects=False,
|
||||
stream=True)
|
||||
method=method,
|
||||
data=data,
|
||||
headers=headers,
|
||||
allow_redirects=False,
|
||||
stream=True)
|
||||
res.raise_for_status()
|
||||
except Exception as e:
|
||||
if req_is_wrapped:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
return self.send_error(e, start_response)
|
||||
|
||||
if not skipping:
|
||||
@ -233,8 +225,7 @@ class RecorderApp(object):
|
||||
req_stream.headers,
|
||||
res.headers,
|
||||
params)
|
||||
for x in self.skip_filters)
|
||||
|
||||
for x in self.skip_filters)
|
||||
|
||||
if not skipping:
|
||||
resp_stream = RespWrapper(res.raw,
|
||||
@ -248,7 +239,7 @@ class RecorderApp(object):
|
||||
else:
|
||||
resp_stream = res.raw
|
||||
if req_is_wrapped:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
|
||||
resp_iter = StreamIter(resp_stream)
|
||||
|
||||
@ -260,7 +251,7 @@ class RecorderApp(object):
|
||||
return resp_iter
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# ==============================================================================
|
||||
class Wrapper(object):
|
||||
def __init__(self, stream, params, create_func):
|
||||
self.stream = stream
|
||||
@ -280,7 +271,7 @@ class Wrapper(object):
|
||||
return buff
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# ==============================================================================
|
||||
class RespWrapper(Wrapper):
|
||||
def __init__(self, stream, headers, req,
|
||||
params, queue, path, create_func):
|
||||
@ -319,23 +310,20 @@ class RespWrapper(Wrapper):
|
||||
entry = (self.req.headers, self.req.out,
|
||||
self.headers, self.out, self.params)
|
||||
self.queue.put(entry)
|
||||
except:
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
skipping = True
|
||||
|
||||
finally:
|
||||
try:
|
||||
if skipping:
|
||||
self.out.close()
|
||||
self.req.out.close()
|
||||
except:
|
||||
traceback.print_exc()
|
||||
if skipping:
|
||||
no_except_close(self.out)
|
||||
no_except_close(self.req.out)
|
||||
|
||||
self.req.close()
|
||||
no_except_close(self.req)
|
||||
self.req = None
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# ==============================================================================
|
||||
class ReqWrapper(Wrapper):
|
||||
def __init__(self, stream, req_headers, params, create_func):
|
||||
super(ReqWrapper, self).__init__(stream, params, create_func)
|
||||
@ -348,5 +336,3 @@ class ReqWrapper(Wrapper):
|
||||
def close(self):
|
||||
# no need to close wsgi.input
|
||||
pass
|
||||
|
||||
|
||||
|
@ -2,6 +2,7 @@ from warcio.timeutils import iso_date_to_timestamp
|
||||
|
||||
from io import BytesIO
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from pywb.utils.canonicalize import calc_search_range
|
||||
from pywb.utils.format import res_template
|
||||
@ -48,9 +49,11 @@ class WritableRedisIndexer(RedisIndexSource):
|
||||
return base_name
|
||||
|
||||
def add_warc_file(self, full_filename, params):
|
||||
base_filename = self._get_rel_or_base_name(full_filename, params)
|
||||
file_key = res_template(self.file_key_template, params)
|
||||
if not file_key:
|
||||
return
|
||||
|
||||
base_filename = self._get_rel_or_base_name(full_filename, params)
|
||||
full_load_path = self.full_warc_prefix + full_filename
|
||||
|
||||
self.redis.hset(file_key, base_filename, full_load_path)
|
||||
@ -99,3 +102,29 @@ class WritableRedisIndexer(RedisIndexSource):
|
||||
return res
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RedisPendingCounterTempBuffer(tempfile.SpooledTemporaryFile):
    """Spooled temp buffer that is tracked by a counter stored in redis

    The counter key (taken from the templated redis url) is incremented when
    the buffer is created and decremented when it is closed; its expiry is
    refreshed to ``timeout`` seconds on creation, on every write and on close.
    """

    def __init__(self, max_size, redis_url, params, name, timeout=30):
        """
        :param int max_size: Passed to ``SpooledTemporaryFile`` as ``max_size``
        :param str redis_url: Redis url template, expanded with ``params``
        :param dict params: Values used to expand ``redis_url``
        :param str name: Not used by this class
        :param int timeout: Expiry (seconds) set on the counter key
        """
        redis_url = res_template(redis_url, params)
        super(RedisPendingCounterTempBuffer, self).__init__(max_size=max_size)
        self.redis, self.key = RedisIndexSource.parse_redis_url(redis_url)
        self.timeout = timeout

        self.redis.incrby(self.key, 1)
        self.redis.expire(self.key, self.timeout)

    def write(self, buf):
        """Write ``buf``, refresh the counter expiry and return the write result"""
        written = super(RedisPendingCounterTempBuffer, self).write(buf)
        self.redis.expire(self.key, self.timeout)
        return written

    def close(self):
        """Close the buffer and decrement the counter exactly once

        Errors from the underlying close are printed, not raised, so the
        counter is still decremented. Calling ``close`` on an already closed
        buffer does nothing, which keeps the counter from going negative.
        """
        if self.closed:
            return

        try:
            super(RedisPendingCounterTempBuffer, self).close()
        except Exception:
            # imported here because no module-level traceback import is
            # visible for this file -- NOTE(review): confirm and hoist
            import traceback
            traceback.print_exc()

        self.redis.incrby(self.key, -1)
        self.redis.expire(self.key, self.timeout)
|
||||
|
||||
|
@ -71,8 +71,8 @@ class TestRecorder(LiveServerTests, HttpBinLiveTests, FakeRedisTests, TempDirTes
|
||||
|
||||
return dedup_index
|
||||
|
||||
def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
|
||||
url = 'http://' + host + path
|
||||
def _test_warc_write(self, recorder_app, host, path, other_params='', link_url='', protocol='http'):
|
||||
url = protocol + '://' + host + path
|
||||
req_url = '/live/resource/postreq?url=' + url + other_params
|
||||
testapp = webtest.TestApp(recorder_app)
|
||||
resp = testapp.post(req_url, general_req_data.format(host=host, path=path).encode('utf-8'))
|
||||
@ -231,8 +231,9 @@ class TestRecorder(LiveServerTests, HttpBinLiveTests, FakeRedisTests, TempDirTes
|
||||
PerRecordWARCWriter(warc_path, header_filter=header_filter),
|
||||
accept_colls='live')
|
||||
|
||||
resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
|
||||
assert b'HTTP/1.1 302' in resp.body
|
||||
resp = self._test_warc_write(recorder_app, 'www.google.com', '/', protocol='https')
|
||||
print(resp.body.decode('utf-8'))
|
||||
#assert b'HTTP/1.1 302' in resp.body
|
||||
|
||||
buff = BytesIO(resp.body)
|
||||
record = ArcWarcRecordLoader().parse_record_stream(buff)
|
||||
@ -606,6 +607,8 @@ class TestRecorder(LiveServerTests, HttpBinLiveTests, FakeRedisTests, TempDirTes
|
||||
writer.close()
|
||||
assert len(writer.fh_cache) == 0
|
||||
|
||||
#@pytest.mark.skipif(os.environ.get('CI') is not None, reason='Skip Test on CI')
|
||||
@pytest.mark.skip
|
||||
def test_record_video_metadata(self):
|
||||
pytest.importorskip('youtube_dl')
|
||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||
|
@ -1,31 +1,74 @@
|
||||
from io import BytesIO
|
||||
|
||||
import codecs
|
||||
import json
|
||||
import re
|
||||
import tempfile
|
||||
from contextlib import closing
|
||||
|
||||
import webencodings
|
||||
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
import re
|
||||
import webencodings
|
||||
import tempfile
|
||||
import json
|
||||
import codecs
|
||||
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
|
||||
from pywb.utils.loaders import load_py_name, load_yaml_config
|
||||
|
||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config, load_py_name
|
||||
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class BaseContentRewriter(object):
|
||||
CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
|
||||
|
||||
TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)
|
||||
|
||||
# set via html_rewriter since it overrides the default one
|
||||
html_unescape = None
|
||||
|
||||
@classmethod
|
||||
def set_unescape(cls, unescape):
|
||||
cls.html_unescape = unescape
|
||||
|
||||
@classmethod
|
||||
def _extract_title(cls, gen):
|
||||
title_res = list(gen)
|
||||
if not title_res or not title_res[0]:
|
||||
return
|
||||
|
||||
m = cls.TITLE.search(title_res[0].decode('utf-8'))
|
||||
if not m:
|
||||
return
|
||||
|
||||
title_res = m.group(1)
|
||||
title_res = title_res.strip()
|
||||
try:
|
||||
title_res = cls.html_unescape(title_res)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
return title_res
|
||||
|
||||
def __init__(self, rules_file, replay_mod=''):
|
||||
self.rules = []
|
||||
self.all_rewriters = []
|
||||
self.load_rules(rules_file)
|
||||
self.replay_mod = replay_mod
|
||||
|
||||
self._mod_to_pref = {}
|
||||
self._pref_to_mod = {}
|
||||
|
||||
def add_prefer_mod(self, pref, mod):
|
||||
self._mod_to_pref[mod] = pref
|
||||
self._pref_to_mod[pref] = mod
|
||||
|
||||
def mod_to_prefer(self, mod):
|
||||
pref = self._mod_to_pref.get(mod)
|
||||
if not pref:
|
||||
pref = self._mod_to_pref.get(self.replay_mod)
|
||||
|
||||
return pref
|
||||
|
||||
def prefer_to_mod(self, pref):
|
||||
return self._pref_to_mod.get(pref)
|
||||
|
||||
def add_rewriter(self, rw):
|
||||
self.all_rewriters[rw.name] = rw
|
||||
|
||||
@ -342,13 +385,13 @@ class StreamingRewriter(object):
|
||||
yield buff.encode(charset)
|
||||
|
||||
finally:
|
||||
stream.close()
|
||||
no_except_close(stream)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RewriteInfo(object):
|
||||
TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<')
|
||||
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
|
||||
TAG_REGEX2 = re.compile(b'^.*<[!]?\w+[\s>]')
|
||||
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
|
||||
|
||||
JSONP_CONTAINS = ['callback=jQuery',
|
||||
@ -423,8 +466,8 @@ class RewriteInfo(object):
|
||||
def _resolve_text_type(self, text_type):
|
||||
mod = self.url_rewriter.wburl.mod
|
||||
|
||||
if mod == 'sw_' or mod == 'wkr_':
|
||||
return None
|
||||
if mod in WORKER_MODS:
|
||||
return 'js-worker'
|
||||
|
||||
if text_type == 'css' and mod == 'js_':
|
||||
text_type = 'css'
|
||||
@ -445,7 +488,7 @@ class RewriteInfo(object):
|
||||
else:
|
||||
return text_type
|
||||
|
||||
buff = self.read_and_keep(128)
|
||||
buff = self.read_and_keep(1024)
|
||||
|
||||
# check if doesn't start with a tag, then likely not html
|
||||
if self.TAG_REGEX.match(buff):
|
||||
@ -481,7 +524,7 @@ class RewriteInfo(object):
|
||||
if not self.text_type:
|
||||
return False
|
||||
|
||||
if self.url_rewriter.wburl.mod == 'id_':
|
||||
if self.is_identity():
|
||||
return False
|
||||
|
||||
if self.url_rewriter.rewrite_opts.get('is_ajax'):
|
||||
@ -494,9 +537,11 @@ class RewriteInfo(object):
|
||||
|
||||
return True
|
||||
|
||||
def is_identity(self):
|
||||
return self.url_rewriter.wburl.mod in ('id_', 'ir_')
|
||||
|
||||
def is_url_rw(self):
|
||||
if self.url_rewriter.wburl.mod in ('id_', 'bn_', 'sw_', 'wkr_'):
|
||||
if self.url_rewriter.wburl.mod in ('id_', 'bn_', 'wkrf_'):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
@ -2,6 +2,9 @@ from six.moves.http_cookies import SimpleCookie, CookieError
|
||||
import six
|
||||
import re
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
#================================================================
|
||||
class WbUrlBaseCookieRewriter(object):
|
||||
@ -17,9 +20,8 @@ class WbUrlBaseCookieRewriter(object):
|
||||
cookie_str = self.REMOVE_EXPIRES.sub('', cookie_str)
|
||||
try:
|
||||
cookie = SimpleCookie(cookie_str)
|
||||
except CookieError:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
except CookieError as e:
|
||||
logger.info(e, exc_info=True)
|
||||
return results
|
||||
|
||||
for name, morsel in six.iteritems(cookie):
|
||||
@ -39,10 +41,10 @@ class WbUrlBaseCookieRewriter(object):
|
||||
then assume its meant to be a prefix, and likely needed for
|
||||
other content.
|
||||
Set cookie with same prefix but for all common modifiers:
|
||||
(mp_, js_, cs_, oe_, if_)
|
||||
(mp_, js_, cs_, oe_, if_, sw_, wkrf_)
|
||||
"""
|
||||
curr_mod = self.url_rewriter.wburl.mod
|
||||
if curr_mod not in ('mp_', 'if_'):
|
||||
if curr_mod not in ('mp_', 'if_', 'sw_'):
|
||||
return False
|
||||
|
||||
if not morsel.get('httponly'):
|
||||
@ -52,7 +54,7 @@ class WbUrlBaseCookieRewriter(object):
|
||||
if not path or not path.endswith('/'):
|
||||
return False
|
||||
|
||||
for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'):
|
||||
for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_', 'sw_', 'wkrf_'):
|
||||
new_path = path.replace(curr_mod + '/', mod + '/')
|
||||
morsel['path'] = new_path
|
||||
results.append((header, morsel.OutputString()))
|
||||
|
@ -19,7 +19,8 @@ class CookieTracker(object):
|
||||
def get_rewriter(self, url_rewriter, cookie_key):
|
||||
return DomainCacheCookieRewriter(url_rewriter, self, cookie_key)
|
||||
|
||||
def get_cookie_headers(self, url, url_rewriter, cookie_key):
|
||||
def get_cookie_headers(self, url, url_rewriter, cookie_key, existing_cookie):
|
||||
existing_cookie = existing_cookie or ''
|
||||
subds = self.get_subdomains(url)
|
||||
host_cookie_rewriter = HostScopeNoFilterCookieRewriter(url_rewriter)
|
||||
|
||||
@ -46,7 +47,14 @@ class CookieTracker(object):
|
||||
n = n.decode('utf-8')
|
||||
v = v.decode('utf-8')
|
||||
|
||||
full = n + '=' + v
|
||||
n += '='
|
||||
|
||||
# if cookie already in existing cookie, don't add duplicate
|
||||
# also, don't add to set-cookie again (to avoid exceeding cookie size)
|
||||
if n in existing_cookie:
|
||||
continue
|
||||
|
||||
full = n + v
|
||||
cookies.append(full.split(';')[0])
|
||||
|
||||
full += '; Max-Age=' + str(self.expire_time)
|
||||
@ -108,7 +116,7 @@ class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
# if domain set, no choice but to expand cookie path to root
|
||||
domain = morsel.pop('domain', '')
|
||||
|
||||
if domain:
|
||||
if domain and self.cookie_key:
|
||||
#if morsel.get('max-age'):
|
||||
# morsel['max-age'] = int(morsel['max-age'])
|
||||
|
||||
|
@ -15,10 +15,12 @@ from pywb.rewrite.rewrite_dash import RewriteDASH
|
||||
from pywb.rewrite.rewrite_hls import RewriteHLS
|
||||
from pywb.rewrite.rewrite_amf import RewriteAMF
|
||||
|
||||
from pywb.rewrite.rewrite_js_workers import JSWorkerRewriter
|
||||
|
||||
from pywb import DEFAULT_RULES_FILE
|
||||
|
||||
import copy
|
||||
from werkzeug.useragents import UserAgent
|
||||
from ua_parser import user_agent_parser
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -32,8 +34,9 @@ class DefaultRewriter(BaseContentRewriter):
|
||||
|
||||
'css': CSSRewriter,
|
||||
|
||||
'js': JSLocationOnlyRewriter,
|
||||
'js': JSWombatProxyRewriter,
|
||||
'js-proxy': JSNoneRewriter,
|
||||
'js-worker': JSWorkerRewriter,
|
||||
|
||||
'json': JSONPRewriter,
|
||||
|
||||
@ -95,9 +98,15 @@ class DefaultRewriter(BaseContentRewriter):
|
||||
def __init__(self, replay_mod='', config=None):
|
||||
config = config or {}
|
||||
rules_file = config.get('rules_file', DEFAULT_RULES_FILE)
|
||||
|
||||
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
|
||||
self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS)
|
||||
|
||||
self.add_prefer_mod('raw', 'ir_')
|
||||
self.add_prefer_mod('raw', 'id_')
|
||||
self.add_prefer_mod('banner-only', 'bn_')
|
||||
self.add_prefer_mod('rewritten', replay_mod)
|
||||
|
||||
def init_js_regex(self, regexs):
|
||||
return RegexRewriter.parse_rules_from_config(regexs)
|
||||
|
||||
@ -111,33 +120,44 @@ class RewriterWithJSProxy(DefaultRewriter):
|
||||
super(RewriterWithJSProxy, self).__init__(*args, **kwargs)
|
||||
|
||||
def get_rewriter(self, rw_type, rwinfo=None):
|
||||
if rw_type == 'js' and rwinfo:
|
||||
# check if UA allows this
|
||||
if self.ua_allows_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
|
||||
return JSWombatProxyRewriter
|
||||
if rw_type != 'js' or not rwinfo:
|
||||
return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)
|
||||
|
||||
# otherwise, return default rewriter
|
||||
return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)
|
||||
# check if should use old non-proxy rewriter
|
||||
if self.ua_no_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
|
||||
print("loc only")
|
||||
return JSLocationOnlyRewriter
|
||||
else:
|
||||
# otherwise, return default, js proxy-capable rewriter
|
||||
return JSWombatProxyRewriter
|
||||
|
||||
def ua_allows_obj_proxy(self, opts):
|
||||
def ua_no_obj_proxy(self, opts):
|
||||
ua = opts.get('ua')
|
||||
if not ua:
|
||||
ua_string = opts.get('ua_string')
|
||||
if ua_string:
|
||||
ua = UserAgent(ua_string)
|
||||
ua = user_agent_parser.ParseUserAgent(ua_string)
|
||||
|
||||
if ua is None:
|
||||
return True
|
||||
return False
|
||||
|
||||
supported = {
|
||||
'chrome': '49.0',
|
||||
'firefox': '44.0',
|
||||
'safari': '10.0',
|
||||
'opera': '36.0',
|
||||
'edge': '12.0',
|
||||
'msie': None,
|
||||
'chrome': 49,
|
||||
'firefox': 4,
|
||||
'safari': 10,
|
||||
'opera': 36,
|
||||
'edge': 12,
|
||||
'ie': 1000,
|
||||
}
|
||||
|
||||
min_vers = supported.get(ua.browser)
|
||||
min_vers = supported.get(ua.get("family", "").lower())
|
||||
if not min_vers:
|
||||
return False
|
||||
|
||||
try:
|
||||
ua_version = int(ua.get("major", 0))
|
||||
except:
|
||||
return False
|
||||
|
||||
return ua_version < min_vers
|
||||
|
||||
return (min_vers and ua.version >= min_vers)
|
||||
|
@ -38,7 +38,7 @@ class DefaultHeaderRewriter(object):
|
||||
'content-security-policy-report-only': 'prefix',
|
||||
'content-type': 'keep',
|
||||
|
||||
'date': 'keep',
|
||||
'date': 'prefix',
|
||||
|
||||
'etag': 'prefix',
|
||||
'expires': 'prefix',
|
||||
|
@ -4,31 +4,40 @@ from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||
|
||||
# ============================================================================
|
||||
class HTMLInsertOnlyRewriter(StreamingRewriter):
|
||||
""" Insert custom string into HTML <head> tag
|
||||
""" Insert custom string into HTML into the head, before any tag not <head> or <html>
|
||||
no other rewriting performed
|
||||
"""
|
||||
HEAD_REGEX = re.compile('<\s*head\\b[^>]*[>]+', re.I)
|
||||
NOT_HEAD_REGEX = re.compile(r'(<\s*\b)(?!(html|head))', re.I)
|
||||
|
||||
XML_HEADER = re.compile(r'<\?xml.*\?>')
|
||||
|
||||
def __init__(self, url_rewriter, **kwargs):
|
||||
super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False)
|
||||
self.head_insert = kwargs['head_insert']
|
||||
|
||||
self.done = False
|
||||
self.first = True
|
||||
|
||||
def rewrite(self, string):
|
||||
if self.first:
|
||||
if self.url_rewriter.rewrite_opts.get('is_ajax') and self.XML_HEADER.search(string):
|
||||
self.done = True
|
||||
|
||||
self.first = False
|
||||
|
||||
if self.done:
|
||||
return string
|
||||
|
||||
# only try to find <head> in first buffer
|
||||
self.done = True
|
||||
m = self.HEAD_REGEX.search(string)
|
||||
m = self.NOT_HEAD_REGEX.search(string)
|
||||
if m:
|
||||
inx = m.end()
|
||||
inx = m.start()
|
||||
buff = string[:inx]
|
||||
buff += self.head_insert
|
||||
buff += string[inx:]
|
||||
self.done = True
|
||||
return buff
|
||||
else:
|
||||
return string
|
||||
|
||||
|
||||
def final_read(self):
|
||||
return '' if self.done else self.head_insert
|
||||
|
@ -11,7 +11,7 @@ from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
|
||||
|
||||
from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||
from pywb.rewrite.content_rewriter import StreamingRewriter, BaseContentRewriter
|
||||
|
||||
from six import text_type
|
||||
|
||||
@ -20,9 +20,16 @@ import six.moves.html_parser
|
||||
try:
|
||||
orig_unescape = six.moves.html_parser.unescape
|
||||
six.moves.html_parser.unescape = lambda x: x
|
||||
BaseContentRewriter.set_unescape(orig_unescape)
|
||||
except:
|
||||
orig_unescape = None
|
||||
|
||||
@staticmethod
|
||||
def __unescape(x):
|
||||
return HTMLParser().unescape(x)
|
||||
|
||||
BaseContentRewriter.set_unescape(__unescape)
|
||||
|
||||
|
||||
try:
|
||||
import _markupbase as markupbase
|
||||
@ -58,7 +65,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
'embed': {'src': 'oe_'},
|
||||
'head': {'': defmod}, # for head rewriting
|
||||
'iframe': {'src': 'if_'},
|
||||
'image': {'src': 'im_', 'xlink:href': 'im_'},
|
||||
'image': {'src': 'im_', 'xlink:href': 'im_', 'href': 'im_'},
|
||||
'img': {'src': 'im_',
|
||||
'srcset': 'im_'},
|
||||
'ins': {'cite': defmod},
|
||||
@ -74,17 +81,13 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
'q': {'cite': defmod},
|
||||
'ref': {'href': 'oe_'},
|
||||
'script': {'src': 'js_', 'xlink:href': 'js_'}, # covers both HTML and SVG script tags
|
||||
'source': {'src': 'oe_'},
|
||||
'source': {'src': 'oe_', 'srcset': 'oe_'},
|
||||
'video': {'src': 'oe_',
|
||||
'poster': 'im_'},
|
||||
}
|
||||
|
||||
return rewrite_tags
|
||||
|
||||
# tags allowed in the <head> of an html document
|
||||
HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta',
|
||||
'title', 'style', 'script', 'object', 'bgsound']
|
||||
|
||||
BEFORE_HEAD_TAGS = ['html', 'head']
|
||||
|
||||
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
|
||||
@ -167,6 +170,16 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
|
||||
ADD_WINDOW = re.compile('(?<![.])(WB_wombat_)')
|
||||
|
||||
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
|
||||
|
||||
def _rewrite_srcset(self, value, mod=''):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
|
||||
values = [self._rewrite_url(v.split(' ')[0].strip()) + (' ' + ' '.join(v.split(' ')[1:])).rstrip() for v in values if v]
|
||||
return ', '.join(values)
|
||||
|
||||
def _rewrite_meta_refresh(self, meta_refresh):
|
||||
if not meta_refresh:
|
||||
return ''
|
||||
@ -255,7 +268,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
unesc_value = self.try_unescape(value)
|
||||
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
|
||||
|
||||
# if no rewriting has occured, ensure we return original, not reencoded value
|
||||
# if no rewriting has occurred, ensure we return original, not reencoded value
|
||||
if rewritten_value == value:
|
||||
return orig_value
|
||||
|
||||
@ -265,7 +278,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
return rewritten_value
|
||||
|
||||
def try_unescape(self, value):
|
||||
if not value.startswith('http'):
|
||||
if '&#' not in value:
|
||||
return value
|
||||
|
||||
try:
|
||||
@ -278,22 +291,18 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
|
||||
return new_value
|
||||
|
||||
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
|
||||
|
||||
def _rewrite_srcset(self, value, mod=''):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
|
||||
values = [self._rewrite_url(v.strip()) for v in values]
|
||||
return ', '.join(values)
|
||||
|
||||
def _rewrite_css(self, css_content):
|
||||
if css_content:
|
||||
return self.css_rewriter.rewrite_complete(css_content)
|
||||
else:
|
||||
if not css_content:
|
||||
return ''
|
||||
|
||||
unesc_css = self.try_unescape(css_content)
|
||||
rw_css = self.css_rewriter.rewrite_complete(unesc_css)
|
||||
|
||||
if unesc_css == rw_css:
|
||||
return css_content
|
||||
else:
|
||||
return rw_css
|
||||
|
||||
def _rewrite_script(self, script_content, inline_attr=False):
|
||||
if not script_content:
|
||||
return ''
|
||||
@ -407,12 +416,6 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
rw_mod = handler.get(attr_name)
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
# special case: data- attrs, conditional rewrite
|
||||
elif attr_name and attr_value and attr_name.startswith('data-'):
|
||||
if attr_value.startswith(self.DATA_RW_PROTOCOLS):
|
||||
rw_mod = 'oe_'
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
# special case: base tag
|
||||
elif (tag == 'base') and (attr_name == 'href') and attr_value:
|
||||
rw_mod = handler.get(attr_name)
|
||||
@ -430,6 +433,12 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
# URL not skipped, likely src='js/....', forcing abs to make sure, cause PHP MIME(JS) === HTML
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod, True)
|
||||
self._write_attr('__wb_orig_src', ov, empty_attr=None)
|
||||
|
||||
elif attr_name == 'target':
|
||||
target = attr_value
|
||||
if target in ('_blank', '_parent', '_top'):
|
||||
attr_value = '___wb_replay_top_frame'
|
||||
|
||||
else:
|
||||
# rewrite url using tag handler
|
||||
rw_mod = handler.get(attr_name)
|
||||
@ -460,7 +469,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
rw_mod = self.PRELOAD_TYPES.get(preload, rw_mod)
|
||||
|
||||
# for html imports with an optional as (google exclusive)
|
||||
elif rel == 'import':
|
||||
elif rel == 'import' or rel == 'alternate':
|
||||
rw_mod = 'mp_'
|
||||
|
||||
elif rel == 'stylesheet':
|
||||
@ -659,7 +668,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
if self.parse_comments:
|
||||
#data = self._rewrite_script(data)
|
||||
|
||||
# Rewrite with seperate HTMLRewriter
|
||||
# Rewrite with separate HTMLRewriter
|
||||
comment_rewriter = HTMLRewriter(self.url_rewriter,
|
||||
defmod=self.defmod)
|
||||
|
||||
|
@ -4,12 +4,14 @@ from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||
|
||||
# ============================================================================
|
||||
class JSONPRewriter(StreamingRewriter):
|
||||
JSONP = re.compile(r'^(?:\s*\/\*(?:.*)\*\/)*\s*(\w+)\(\{')
|
||||
#JSONP = re.compile(r'^(?:\s*\/\*(?:.*)\*\/)*\s*(\w+)\(\{')
|
||||
# Match a single /* and // style comments at the beginning
|
||||
JSONP = re.compile(r'(?:^[ \t]*(?:(?:\/\*[^\*]*\*\/)|(?:\/\/[^\n]+[\n])))*[ \t]*(\w+)\(\{', re.M)
|
||||
CALLBACK = re.compile(r'[?].*callback=([^&]+)')
|
||||
|
||||
def rewrite(self, string):
|
||||
# see if json is jsonp, starts with callback func
|
||||
m_json = self.JSONP.search(string)
|
||||
m_json = self.JSONP.match(string)
|
||||
if not m_json:
|
||||
return string
|
||||
|
||||
@ -17,6 +19,12 @@ class JSONPRewriter(StreamingRewriter):
|
||||
m_callback = self.CALLBACK.search(self.url_rewriter.wburl.url)
|
||||
if not m_callback:
|
||||
return string
|
||||
if m_callback.group(1) == '?':
|
||||
# this is a very sharp edge case e.g. callback=?
|
||||
# since we only have this string[m_json.end(1):]
|
||||
# would cut off the name of the CB if any is included
|
||||
# so we just pass the string through
|
||||
return string
|
||||
|
||||
string = m_callback.group(1) + string[m_json.end(1):]
|
||||
return string
|
||||
|
@ -13,8 +13,21 @@ class RxRules(object):
|
||||
return string.replace("https", "http")
|
||||
|
||||
@staticmethod
|
||||
def replace_str(replacer):
|
||||
return lambda x, _: x.replace('this', replacer)
|
||||
def replace_str(replacer, match='this'):
|
||||
return lambda x, _: x.replace(match, replacer)
|
||||
|
||||
@staticmethod
|
||||
def replace_prefix_from(prefix, match):
|
||||
def do_replace(x, _):
|
||||
start = x.find(match)
|
||||
if start == 0:
|
||||
return prefix
|
||||
if start > 0:
|
||||
return x[:start] + prefix
|
||||
return x
|
||||
|
||||
return do_replace
|
||||
|
||||
|
||||
@staticmethod
|
||||
def format(template):
|
||||
@ -25,8 +38,8 @@ class RxRules(object):
|
||||
return lambda _, _2: string
|
||||
|
||||
@staticmethod
|
||||
def archival_rewrite():
|
||||
return lambda string, rewriter: rewriter.rewrite(string)
|
||||
def archival_rewrite(mod=None):
|
||||
return lambda string, rewriter: rewriter.rewrite(string, mod)
|
||||
|
||||
@staticmethod
|
||||
def add_prefix(prefix):
|
||||
@ -42,7 +55,7 @@ class RxRules(object):
|
||||
regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])
|
||||
|
||||
# ensure it's not middle of a word, wrap in non-capture group
|
||||
regex_str = '(?<!\w)(?:' + regex_str + ')'
|
||||
regex_str = '(?:' + regex_str + ')'
|
||||
|
||||
return re.compile(regex_str, re.M)
|
||||
|
||||
@ -63,50 +76,69 @@ class RxRules(object):
|
||||
class JSWombatProxyRules(RxRules):
|
||||
def __init__(self):
|
||||
local_init_func = '\nvar {0} = function(name) {{\
|
||||
return (self._wb_wombat && self._wb_wombat.local_init &&\
|
||||
return (self._wb_wombat && self._wb_wombat.local_init && \
|
||||
self._wb_wombat.local_init(name)) || self[name]; }};\n\
|
||||
if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
|
||||
if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ this.__WB_source = obj; return this; }} }}\n\
|
||||
{{\n'
|
||||
|
||||
local_init_func_name = '_____WB$wombat$assign$function_____'
|
||||
|
||||
local_init_func = local_init_func.format(local_init_func_name)
|
||||
|
||||
local_var_line = 'let {0} = {1}("{0}");'
|
||||
|
||||
this_rw = '(this && this._WB_wombat_obj_proxy || this)'
|
||||
# we must use a function to perform the this check because most minifiers reduce the number of statements
|
||||
# by turning everything into one or more expressions. Our previous rewrite was a logical expression,
|
||||
# (this && this._WB_wombat_obj_proxy || this), that would cause the outer expression to be invalid when
|
||||
# it was used as the LHS of certain expressions.
|
||||
# e.g. assignment expressions containing non parenthesized logical expression.
|
||||
# By using a function the expression injected is a call expression that plays nice in those cases
|
||||
this_rw = '_____WB$wombat$check$this$function_____(this)'
|
||||
|
||||
check_loc = '(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = '
|
||||
check_loc = '((self.__WB_check_loc && self.__WB_check_loc(location, arguments)) || {}).href = '
|
||||
|
||||
eval_str = 'WB_wombat_runEval2((_______eval_arg, isGlobal) => { var ge = eval; return isGlobal ? ge(_______eval_arg) : eval(_______eval_arg); }).eval(this, (function() { return arguments })(),'
|
||||
|
||||
self.local_objs = [
|
||||
'window',
|
||||
'self',
|
||||
'document',
|
||||
'location',
|
||||
'top',
|
||||
'parent',
|
||||
'frames',
|
||||
'opener']
|
||||
|
||||
'window',
|
||||
'self',
|
||||
'document',
|
||||
'location',
|
||||
'top',
|
||||
'parent',
|
||||
'frames',
|
||||
'opener'
|
||||
]
|
||||
|
||||
local_declares = '\n'.join([local_var_line.format(obj, local_init_func_name) for obj in self.local_objs])
|
||||
local_declares += "\nlet arguments;"
|
||||
|
||||
prop_str = '|'.join(self.local_objs)
|
||||
|
||||
rules = [
|
||||
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0),
|
||||
(r'(?<![$.])\s*location\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
|
||||
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
|
||||
(r'(?<=[\n])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + this_rw), 0),
|
||||
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
|
||||
(r'(?<=[=])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0),
|
||||
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
|
||||
(r'(?<=[^|&][|&]{2})\s*this\b\s*(?)', self.replace_str(this_rw), 0),
|
||||
# rewriting 'eval(...)' - invocation
|
||||
(r'(?<!function)(?:\s|^)\beval\s*\(', self.replace_prefix_from(eval_str, 'eval'), 0),
|
||||
# rewriting 'x = eval' - no invocation
|
||||
(r'(?<=[=,])\s*\beval\b\s*(?![(:.$])', self.replace_str('self.eval', 'eval'), 0),
|
||||
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0),
|
||||
(r'(?<![$.])\s*\blocation\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
|
||||
# rewriting 'return this'
|
||||
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
|
||||
# rewriting 'this.' special properties access
|
||||
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
|
||||
# rewrite '= this' or ', this'
|
||||
(r'(?<=[=,])\s*this\b\s*(?![:.$])', self.replace_str(this_rw), 0),
|
||||
# rewrite ')(this)'
|
||||
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
|
||||
# rewrite this in && or || expr?
|
||||
(r'(?<=[^|&][|&]{2})\s*this\b\s*(?)', self.replace_str(this_rw), 0),
|
||||
]
|
||||
|
||||
super(JSWombatProxyRules, self).__init__(rules)
|
||||
|
||||
self.first_buff = local_init_func.format(local_init_func_name) + local_declares
|
||||
self.first_buff = local_init_func + local_declares + '\n\n{'
|
||||
|
||||
self.last_buff = '\n\n}'
|
||||
self.last_buff = '\n\n}}'
|
||||
|
||||
|
||||
# =================================================================
|
||||
@ -306,13 +338,12 @@ class JSReplaceFuzzy(object):
|
||||
class CSSRules(RxRules):
|
||||
CSS_URL_REGEX = "url\\s*\\(\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*([^)'\"]+)\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*\\)"
|
||||
|
||||
CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
|
||||
"(?!url[\\s\\(])([\w.:/\\\\-]+)")
|
||||
CSS_IMPORT_REGEX = ("@import\\s+(?:url\\s*)?\\(?\\s*['\"]?([\w.:/\\\\-]+)")
|
||||
|
||||
def __init__(self):
|
||||
rules = [
|
||||
(self.CSS_URL_REGEX, self.archival_rewrite(), 1),
|
||||
(self.CSS_IMPORT_NO_URL_REGEX, self.archival_rewrite(), 1),
|
||||
(self.CSS_URL_REGEX, self.archival_rewrite('oe_'), 1),
|
||||
(self.CSS_IMPORT_REGEX, self.archival_rewrite('cs_'), 1),
|
||||
]
|
||||
|
||||
super(CSSRules, self).__init__(rules)
|
||||
@ -326,7 +357,7 @@ class CSSRewriter(RegexRewriter):
|
||||
class XMLRules(RxRules):
|
||||
def __init__(self):
|
||||
rules = [
|
||||
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
|
||||
('(?<![\w])([A-Za-z:]+[\s=]+)?["\'\s]*(' +
|
||||
self.HTTPX_MATCH_STR + ')',
|
||||
self.archival_rewrite(), 2),
|
||||
]
|
||||
|
@ -59,21 +59,64 @@ class RewriteDASH(BufferedRewriter):
|
||||
|
||||
# ============================================================================
|
||||
def rewrite_fb_dash(string, *args):
|
||||
DASH_SPLIT = r'\n",dash_prefetched_representation_ids:'
|
||||
inx = string.find(DASH_SPLIT)
|
||||
DASH_SPLITS = [r'\n",dash_prefetched_representation_ids:', r'\n","dash_prefetched_representation_ids":']
|
||||
|
||||
inx = -1
|
||||
split = None
|
||||
for split in DASH_SPLITS:
|
||||
inx = string.find(split)
|
||||
if inx >= 0:
|
||||
break
|
||||
|
||||
if inx < 0:
|
||||
return string
|
||||
return
|
||||
|
||||
string = string[:inx]
|
||||
|
||||
buff = string.encode('utf-8').decode('unicode-escape')
|
||||
buff = buff.replace('\\/', '/')
|
||||
buff = buff.encode('utf-8')
|
||||
io = BytesIO(buff)
|
||||
io, best_ids = RewriteDASH().rewrite_dash(io, None)
|
||||
string = json.dumps(io.read().decode('utf-8'))
|
||||
buff = io.read().decode('utf-8')
|
||||
string = json.dumps(buff)
|
||||
string = string[1:-1].replace('<', r'\x3C')
|
||||
|
||||
string += DASH_SPLIT
|
||||
string += split
|
||||
string += json.dumps(best_ids)
|
||||
return string
|
||||
|
||||
def rewrite_tw_dash(string, *args):
    """Reduce a JSON video config to its single best mp4 variant.

    ``string`` is parsed as JSON and its ``variants`` list is scanned.
    Variants whose ``content_type`` or ``type`` is present and not
    ``video/mp4`` are ignored. Among the rest:

    * variants with a ``bitrate`` are ranked by bitrate, capped at
      ``max_bitrate``;
    * variants without a ``bitrate`` are ranked by comparing their ``src``
      strings, and are used only when no bitrate-ranked variant was found.

    The src ranking is kept separate so that a lower-bitrate (or over-cap)
    variant can never displace the best bitrate pick merely because its
    ``src`` string sorts higher.

    :param str string: JSON text containing a ``variants`` list
    :param args: ignored
    :return: JSON text with ``variants`` reduced to the chosen variant; the
        re-serialized input if nothing qualified; the input unchanged if it
        could not be processed
    :rtype: str
    """
    max_bitrate = 5000000

    try:
        best_by_bitrate = None
        best_bitrate = 0

        best_by_src = None
        best_src = ""

        data = json.loads(string)
        for variant in data["variants"]:
            if (("content_type" in variant and variant["content_type"] != "video/mp4") or
                ("type" in variant and variant["type"] != "video/mp4")):
                continue

            bitrate = variant.get("bitrate")
            src = variant.get("src")

            if bitrate:
                if best_bitrate < bitrate <= max_bitrate:
                    best_by_bitrate = variant
                    best_bitrate = bitrate

            # no bitrate available: just compare src strings with dimensions
            elif src and src > best_src:
                best_by_src = variant
                best_src = src

        best_variant = best_by_bitrate or best_by_src
        if best_variant:
            data["variants"] = [best_variant]

        string = json.dumps(data)

    except Exception as e:
        # best-effort rewrite: on any failure, report and pass through as-is
        print(e)

    return string
|
||||
|
||||
|
30
pywb/rewrite/rewrite_js_workers.py
Normal file
30
pywb/rewrite/rewrite_js_workers.py
Normal file
@ -0,0 +1,30 @@
|
||||
from pywb.rewrite.content_rewriter import StreamingRewriter, WORKER_MODS
|
||||
|
||||
__all__ = ["JSWorkerRewriter"]
|
||||
|
||||
INJECT = "(function() { self.importScripts('%s'); new WBWombat(%s); })();"
|
||||
INIT = "{'prefix': '%s', 'prefixMod': '%s/', 'originalURL': '%s'}"
|
||||
|
||||
|
||||
class JSWorkerRewriter(StreamingRewriter):
    """Rewriter for web worker / service worker scripts.

    The script body itself is left as-is; the only change is that, for
    worker modifiers, the first buffer is replaced with a snippet that
    imports wombatWorkers.js and instantiates WBWombat.
    """

    def __init__(self, url_rewriter, align_to_line=True, first_buff=''):
        """Set up the rewriter and, for worker mods, the injected init code.

        :param UrlRewriter url_rewriter: The url rewriter for this rewrite
        :param bool align_to_line: Should the response stream be aligned to line boundaries
        :param str first_buff: The first string to be added to the rewrite
        :rtype: None
        """
        super(JSWorkerRewriter, self).__init__(url_rewriter, align_to_line, first_buff)

        wburl = self.url_rewriter.wburl
        if wburl.mod not in WORKER_MODS:
            # not a worker request: keep the first_buff passed by the caller
            return

        worker_script = self.url_rewriter.pywb_static_prefix + "wombatWorkers.js"
        replay_prefix = self.url_rewriter.full_prefix
        init_opts = INIT % (replay_prefix, replay_prefix + 'wkrf_', wburl.url)
        self.first_buff = INJECT % (worker_script, init_opts)
|
@ -26,6 +26,7 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
self.url = url
|
||||
self.rewriter = rewriter
|
||||
self.extra_cookie = None
|
||||
self.warcserver_headers = {}
|
||||
|
||||
is_proxy = ('wsgiprox.proxy_host' in env)
|
||||
|
||||
@ -82,6 +83,11 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
elif name in ('HTTP_IF_MODIFIED_SINCE', 'HTTP_IF_UNMODIFIED_SINCE'):
|
||||
continue
|
||||
|
||||
elif name == 'HTTP_X_PYWB_ACL_USER':
|
||||
name = name[5:].title().replace('_', '-')
|
||||
self.warcserver_headers[name] = value
|
||||
continue
|
||||
|
||||
elif name == 'HTTP_X_FORWARDED_PROTO':
|
||||
name = 'X-Forwarded-Proto'
|
||||
if self.splits:
|
||||
|
@ -3,9 +3,9 @@ from warcio.timeutils import timestamp_now
|
||||
|
||||
from pywb.utils.loaders import load
|
||||
|
||||
from six.moves.urllib.parse import urlsplit
|
||||
from six.moves.urllib.parse import urlsplit, quote
|
||||
|
||||
from jinja2 import Environment, TemplateNotFound
|
||||
from jinja2 import Environment, TemplateNotFound, pass_context, select_autoescape
|
||||
from jinja2 import FileSystemLoader, PackageLoader, ChoiceLoader
|
||||
|
||||
from webassets.ext.jinja2 import AssetsExtension
|
||||
@ -15,6 +15,7 @@ from webassets.env import Resolver
|
||||
from pkg_resources import resource_filename
|
||||
|
||||
import os
|
||||
import logging
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
@ -75,10 +76,12 @@ class JinjaEnv(object):
|
||||
|
||||
if overlay:
|
||||
jinja_env = overlay.jinja_env.overlay(loader=loader,
|
||||
autoescape=select_autoescape(),
|
||||
trim_blocks=True,
|
||||
extensions=extensions)
|
||||
else:
|
||||
jinja_env = RelEnvironment(loader=loader,
|
||||
autoescape=select_autoescape(),
|
||||
trim_blocks=True,
|
||||
extensions=extensions)
|
||||
|
||||
@ -96,6 +99,8 @@ class JinjaEnv(object):
|
||||
assets_env.resolver = PkgResResolver()
|
||||
jinja_env.assets_environment = assets_env
|
||||
|
||||
self.default_locale = ''
|
||||
|
||||
def _make_loaders(self, paths, packages):
|
||||
"""Initialize the template loaders based on the supplied paths and packages.
|
||||
|
||||
@ -115,6 +120,97 @@ class JinjaEnv(object):
|
||||
|
||||
return loaders
|
||||
|
||||
def init_loc(self, locales_root_dir, locales, loc_map, default_locale):
    """Set up localization and register the locale helpers as template globals.

    Translation catalogs are loaded through ``babel.support.Translations``
    for every entry of ``locales`` and stored in ``loc_map``.  If loading
    fails (for example because the i18n extras are not installed), a
    warning is logged and the locales are ignored.

    The following jinja globals are registered on ``self.jinja_env``:
    ``gettext``, ``ngettext``, ``_Q``, ``locales``, ``default_locale``,
    ``switch_locale`` and ``get_locale_prefixes``.

    :param str|None locales_root_dir: directory holding the translations,
        defaults to ``i18n/translations``
    :param list|None locales: the locales to load, defaults to none
    :param dict loc_map: mapping of locale -> translations, filled in place
    :param str|None default_locale: fallback locale, defaults to ``'en'``
    :rtype: None
    """
    locales = locales or []
    locales_root_dir = locales_root_dir or os.path.join('i18n', 'translations')
    default_locale = default_locale or 'en'
    self.default_locale = default_locale

    if locales:
        try:
            from babel.support import Translations
            for loc in locales:
                loc_map[loc] = Translations.load(locales_root_dir, [loc, default_locale])
        except Exception:
            # still best-effort, but no longer traps KeyboardInterrupt/SystemExit;
            # logging.warning replaces the deprecated logging.warn alias
            logging.warning("Ignoring Locales. You must install i18n extensions with 'pip install pywb[i18n]' to use localization features")

    def get_translate(context):
        # translations for the request's 'pywb_lang', None if not loaded
        loc = context.get('env', {}).get('pywb_lang', default_locale)
        return loc_map.get(loc)

    def override_func(jinja_env, name):
        @pass_context
        def get_override(context, text, *args):
            translate = get_translate(context)
            if not translate:
                # No catalog available. For a (singular, plural, n) call
                # pick the form by count, as the stdlib NullTranslations
                # does; a single-argument call returns the text unchanged.
                if len(args) == 2 and args[1] != 1:
                    return args[0]
                return text

            func = getattr(translate, name)
            # extra args (plural form and count for ngettext) are passed on
            return func(text, *args)

        jinja_env.globals[name] = get_override

    # standard gettext() translation function
    override_func(self.jinja_env, 'gettext')

    # single/plural form translation function
    override_func(self.jinja_env, 'ngettext')

    # Special _Q() function to return %-encoded text, necessary for use
    # with text in banner
    @pass_context
    def quote_gettext(context, text):
        translate = get_translate(context)
        if not translate:
            return text

        text = translate.gettext(text)
        return quote(text, safe='/: ')

    self.jinja_env.globals['locales'] = list(loc_map.keys())
    self.jinja_env.globals['_Q'] = quote_gettext
    self.jinja_env.globals['default_locale'] = default_locale

    @pass_context
    def switch_locale(context, locale):
        """Return the current request path rewritten for ``locale``."""
        environ = context.get('env')
        curr_loc = environ.get('pywb_lang', '')

        request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO'))

        if curr_loc and request_uri.startswith('/' + curr_loc + '/'):
            return request_uri.replace(curr_loc, locale, 1)

        app_prefix = environ.get('pywb.app_prefix', '')

        if app_prefix and request_uri.startswith(app_prefix):
            # strip only the leading prefix; str.replace() would also
            # remove any later occurrence inside the path or query
            request_uri = request_uri[len(app_prefix):]

        return app_prefix + '/' + locale + request_uri

    @pass_context
    def get_locale_prefixes(context):
        """Return a dict mapping each loaded locale to its path prefix."""
        environ = context.get('env')
        locale_prefixes = {}

        orig_prefix = environ.get('pywb.app_prefix', '')
        coll = environ.get('SCRIPT_NAME', '')

        if orig_prefix and coll.startswith(orig_prefix):
            coll = coll[len(orig_prefix):]

        curr_loc = environ.get('pywb_lang', '')
        if curr_loc and coll.startswith('/' + curr_loc):
            coll = coll[len(curr_loc) + 1:]

        for locale in loc_map.keys():
            locale_prefixes[locale] = orig_prefix + '/' + locale + coll + '/'

        return locale_prefixes

    self.jinja_env.globals['switch_locale'] = switch_locale
    self.jinja_env.globals['get_locale_prefixes'] = get_locale_prefixes
|
||||
|
||||
def template_filter(self, param=None):
|
||||
"""Returns a decorator that adds the wrapped function to dictionary of template filters.
|
||||
|
||||
@ -226,7 +322,7 @@ class BaseInsertView(object):
|
||||
kwargs.update(params)
|
||||
|
||||
kwargs['env'] = env
|
||||
kwargs['static_prefix'] = env.get('pywb.host_prefix', '') + env.get('pywb.app_prefix', '') + '/static'
|
||||
kwargs['static_prefix'] = env.get('pywb.static_prefix', '/static')
|
||||
|
||||
|
||||
return template.render(**kwargs)
|
||||
@ -275,7 +371,7 @@ class HeadInsertView(BaseInsertView):
|
||||
|
||||
if self.banner_view:
|
||||
banner_html = self.banner_view.render_to_string(env, cdx=cdx, **params)
|
||||
params['banner_html'] = banner_html
|
||||
params['custom_banner_html'] = banner_html
|
||||
|
||||
return self.render_to_string(env, cdx=cdx, **params)
|
||||
|
||||
@ -309,10 +405,11 @@ class TopFrameView(BaseInsertView):
|
||||
|
||||
embed_url = wb_url.to_str(mod=replay_mod)
|
||||
|
||||
timestamp = ''
|
||||
if wb_url.timestamp:
|
||||
timestamp = wb_url.timestamp
|
||||
else:
|
||||
timestamp = timestamp_now()
|
||||
#else:
|
||||
# timestamp = timestamp_now()
|
||||
|
||||
is_proxy = 'wsgiprox.proxy_host' in env
|
||||
|
||||
|
@ -13,7 +13,7 @@ from pywb.utils.io import chunk_encode_iter
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
from pywb.rewrite.default_rewriter import RewriterWithJSProxy
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
@ -39,8 +39,7 @@ def headers(request):
|
||||
class TestContentRewriter(object):
|
||||
@classmethod
|
||||
def setup_class(self):
|
||||
self.content_rewriter = DefaultRewriter()
|
||||
self.js_proxy_content_rewriter = RewriterWithJSProxy()
|
||||
self.content_rewriter = RewriterWithJSProxy()
|
||||
|
||||
def _create_response_record(self, url, headers, payload, warc_headers):
|
||||
writer = BufferWARCWriter()
|
||||
@ -65,7 +64,6 @@ class TestContentRewriter(object):
|
||||
record = self._create_response_record(url, headers, content, warc_headers)
|
||||
|
||||
wburl = WbUrl(ts + '/' + (request_url or url))
|
||||
url_rewriter = UrlRewriter(wburl, prefix)
|
||||
|
||||
cdx = CDXObject()
|
||||
cdx['url'] = url
|
||||
@ -79,11 +77,13 @@ class TestContentRewriter(object):
|
||||
return ''
|
||||
|
||||
if use_js_proxy:
|
||||
rewriter = self.js_proxy_content_rewriter
|
||||
rewrite_opts = {}
|
||||
else:
|
||||
rewriter = self.content_rewriter
|
||||
rewrite_opts = {'ua_string': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/10.0 Safari/537.36'}
|
||||
|
||||
return rewriter(record, url_rewriter, cookie_rewriter=None,
|
||||
url_rewriter = UrlRewriter(wburl, prefix, rewrite_opts=rewrite_opts)
|
||||
|
||||
return self.content_rewriter(record, url_rewriter, cookie_rewriter=None,
|
||||
head_insert_func=insert_func,
|
||||
cdx=cdx,
|
||||
environ=environ)
|
||||
@ -141,6 +141,17 @@ class TestContentRewriter(object):
|
||||
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_html_ignore_bom(self):
|
||||
headers = {'Content-Type': 'text/html'}
|
||||
content = u'\ufeff\ufeff\ufeff<!DOCTYPE html>\n<head>\n<a href="http://example.com"></a></body></html>'
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||
|
||||
exp = '\ufeff\ufeff\ufeff<!DOCTYPE html>\n<head>\n<a href="http://localhost:8080/prefix/201701/http://example.com"></a></body></html>'
|
||||
assert is_rw
|
||||
assert ('Content-Type', 'text/html') in headers.headers
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_html_utf_8_anchor(self):
|
||||
headers = {'Content-Type': 'text/html; charset=utf-8'}
|
||||
content = u'<html><body><a href="#éxample-tésté"></a></body></html>'
|
||||
@ -235,24 +246,22 @@ class TestContentRewriter(object):
|
||||
|
||||
def test_rewrite_sw_add_headers(self):
|
||||
headers = {'Content-Type': 'application/x-javascript'}
|
||||
content = 'function() { location.href = "http://example.com/"; }'
|
||||
content = "function() { location.href = 'http://example.com/'; }"
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701sw_')
|
||||
|
||||
assert ('Content-Type', 'application/x-javascript') in headers.headers
|
||||
assert ('Service-Worker-Allowed', 'http://localhost:8080/prefix/201701mp_/http://example.com/') in headers.headers
|
||||
|
||||
exp = 'function() { location.href = "http://example.com/"; }'
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
assert "self.importScripts('wombatWorkers.js');" in b''.join(gen).decode('utf-8')
|
||||
|
||||
def test_rewrite_worker(self):
|
||||
headers = {'Content-Type': 'application/x-javascript'}
|
||||
content = 'importScripts("http://example.com/js.js")'
|
||||
content = "importScripts('http://example.com/js.js')"
|
||||
|
||||
rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701wkr_')
|
||||
|
||||
exp = 'importScripts("http://example.com/js.js")'
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
assert "self.importScripts('wombatWorkers.js');" in b''.join(gen).decode('utf-8')
|
||||
|
||||
def test_banner_only_no_cookie_rewrite(self):
|
||||
headers = {'Set-Cookie': 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/',
|
||||
@ -284,12 +293,12 @@ class TestContentRewriter(object):
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||
|
||||
mods = set()
|
||||
assert len(headers.headers) == 6
|
||||
assert len(headers.headers) == 8
|
||||
for name, value in headers.headers:
|
||||
assert name == 'Set-Cookie'
|
||||
mods.add(re.search('Path=/prefix/201701([^/]+)', value).group(1))
|
||||
|
||||
assert mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'}
|
||||
assert mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_', 'sw_', 'wkrf_'}
|
||||
assert is_rw == False
|
||||
|
||||
def test_rewrite_http_cookie_no_all_mods_no_slash(self):
|
||||
@ -459,7 +468,23 @@ class TestContentRewriter(object):
|
||||
|
||||
def test_rewrite_js_as_json_generic_jsonp(self):
|
||||
headers = {'Content-Type': 'application/json'}
|
||||
content = '/**/ jsonpCallbackABCDEF({"foo": "bar"});'
|
||||
content = '/*abc*/ jsonpCallbackABCDEF({"foo": "bar"});'
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_',
|
||||
url='http://example.com/path/file?callback=jsonpCallback12345')
|
||||
|
||||
# content-type unchanged
|
||||
assert ('Content-Type', 'application/json') in headers.headers
|
||||
|
||||
exp = 'jsonpCallback12345({"foo": "bar"});'
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_js_as_json_generic_jsonp_multiline_comment(self):
|
||||
headers = {'Content-Type': 'application/json'}
|
||||
content = """\
|
||||
// A comment
|
||||
// Another?
|
||||
jsonpCallbackABCDEF({"foo": "bar"});"""
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_',
|
||||
url='http://example.com/path/file?callback=jsonpCallback12345')
|
||||
@ -704,6 +729,25 @@ http://example.com/video_4.m3u8
|
||||
assert 'dash_prefetched_representation_ids:["1", "7"]' in result
|
||||
assert rep_ids not in result
|
||||
|
||||
def test_dash_fb_in_js_2(self):
|
||||
headers = {'Content-Type': 'text/javascript'}
|
||||
with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh:
|
||||
content = 'dash_manifest:"' + fh.read().encode('unicode-escape').decode('utf-8')
|
||||
|
||||
rep_ids = r'\n","dash_prefetched_representation_ids":["4","5"]'
|
||||
content += rep_ids
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_',
|
||||
url='http://facebook.com/example/dash/manifest.mpd')
|
||||
|
||||
assert headers.headers == [('Content-Type', 'text/javascript')]
|
||||
|
||||
result = b''.join(gen).decode('utf-8')
|
||||
|
||||
# 4, 5 representations removed, replaced with default 1, 7
|
||||
assert '"dash_prefetched_representation_ids":["1", "7"]' in result
|
||||
assert rep_ids not in result
|
||||
|
||||
def test_dash_custom_max_resolution(self):
|
||||
headers = {'Content-Type': 'application/dash+xml'}
|
||||
with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh:
|
||||
|
@ -42,7 +42,7 @@ class TestHeaderRewriter(object):
|
||||
|
||||
res = """\
|
||||
HTTP/1.0 200 OK\r\n\
|
||||
Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
|
||||
X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
|
||||
X-Archive-Orig-Content-Length: 5\r\n\
|
||||
Content-Type: text/html;charset=UTF-8\r\n\
|
||||
"""
|
||||
|
40
pywb/rewrite/test/test_html_insert_rewriter.py
Normal file
40
pywb/rewrite/test/test_html_insert_rewriter.py
Normal file
@ -0,0 +1,40 @@
|
||||
|
||||
|
||||
|
||||
r'''
|
||||
>>> parse('<html><head><some-tag></head</html>')
|
||||
'<html><head><!--Insert--><some-tag></head</html>'
|
||||
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
'<HTML><!--Insert--><A Href="page.html">Text</a></hTmL>'
|
||||
|
||||
>>> parse('<html> < head> <link>')
|
||||
'<html> < head> <!--Insert--><link>'
|
||||
|
||||
>>> parse('< head> <link> <html>')
|
||||
'< head> <!--Insert--><link> <html>'
|
||||
|
||||
>>> parse('<head></head>text')
|
||||
'<head></head>text<!--Insert-->'
|
||||
|
||||
>>> parse('<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>')
|
||||
'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<html xmlns="http://www.w3.org/1999/xhtml"><!--Insert--><body></body></html>'
|
||||
|
||||
# ajax leave unchanged?
|
||||
>>> parse('<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>', is_ajax=True)
|
||||
'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>'
|
||||
'''
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter
|
||||
|
||||
def parse(html_text, is_ajax=False):
    """Rewrite ``html_text`` with an HTMLInsertOnlyRewriter and return the result.

    The rewriter is created with ``'<!--Insert-->'`` as its head insert.

    :param str html_text: the markup to pass through the rewriter
    :param bool is_ajax: if true, set the ``is_ajax`` rewrite option on the
        url rewriter before rewriting
    :return: the output of ``rewrite()`` followed by that of ``final_read()``
    """
    url_rw = UrlRewriter('20131226101010/https://example.com/some/path.html', '/web/')
    if is_ajax:
        url_rw.rewrite_opts['is_ajax'] = True

    insert_rewriter = HTMLInsertOnlyRewriter(url_rw, head_insert='<!--Insert-->')

    rewritten = insert_rewriter.rewrite(html_text)
    return rewritten + insert_rewriter.final_read()
|
||||
|
@ -107,9 +107,11 @@ r"""
|
||||
#<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||
|
||||
# entity unescaping
|
||||
#>>> parse('<a href="http://www.example.com/path/file.html">')
|
||||
<a href="/web/20131226101010/http://www.example.com/path/file.html">
|
||||
>>> parse('<a href="http://www.example.com/path/file.html">')
|
||||
<a href="/web/20131226101010/http://www.example.com/path/file.html">
|
||||
|
||||
>>> parse('<a href="//www.example.com/path/file.html">')
|
||||
<a href="/web/20131226101010///www.example.com/path/file.html">
|
||||
|
||||
# Meta tag
|
||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
@ -136,9 +138,9 @@ r"""
|
||||
>>> parse('<meta http-equiv="Content-Security-Policy" content="default-src http://example.com" />')
|
||||
<meta http-equiv="Content-Security-Policy" _content="default-src http://example.com"/>
|
||||
|
||||
# Custom -data attribs
|
||||
# Don't rewrite Custom -data attribs
|
||||
>>> parse('<div data-url="http://example.com/a/b/c.html" data-some-other-value="http://example.com/img.gif">')
|
||||
<div data-url="/web/20131226101010oe_/http://example.com/a/b/c.html" data-some-other-value="/web/20131226101010oe_/http://example.com/img.gif">
|
||||
<div data-url="http://example.com/a/b/c.html" data-some-other-value="http://example.com/img.gif">
|
||||
|
||||
# param tag -- rewrite conditionally if url
|
||||
>>> parse('<param value="http://example.com/"/>')
|
||||
@ -183,6 +185,10 @@ r"""
|
||||
>>> parse('<img srcset="//example.com/1x,1x 2w, //example1.com/foo 2x, http://example.com/bar,bar 4x">')
|
||||
<img srcset="/web/20131226101010///example.com/1x,1x 2w, /web/20131226101010///example1.com/foo 2x, /web/20131226101010/http://example.com/bar,bar 4x">
|
||||
|
||||
# complex srcset attrib
|
||||
>>> parse('<img srcset="http://test.com/yaşar-kunduz.jpg 320w, http://test.com/yaşar-konçalves-273x300.jpg 273w">')
|
||||
<img srcset="/web/20131226101010/http://test.com/ya%C5%9Far-kunduz.jpg 320w, /web/20131226101010/http://test.com/ya%C5%9Far-konc%CC%A7alves-273x300.jpg 273w">
|
||||
|
||||
# empty srcset attrib
|
||||
>>> parse('<img srcset="">')
|
||||
<img srcset="">
|
||||
@ -242,26 +248,29 @@ r"""
|
||||
<div style="background: url('abc.html')" onblah on-click="location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<div style="background: url(\'/other_path/abc.html\')" onblah onclick="window.location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/other_path/abc.html')" onblah onclick="window.WB_wombat_location = 'redirect.html'"></div>
|
||||
<div style="background: url('/web/20131226101010oe_/http://example.com/other_path/abc.html')" onblah onclick="window.WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<i style="background-image: url(http://foo-.bar_.example.com/)"></i>')
|
||||
<i style="background-image: url(/web/20131226101010/http://foo-.bar_.example.com/)"></i>
|
||||
<i style="background-image: url(/web/20131226101010oe_/http://foo-.bar_.example.com/)"></i>
|
||||
|
||||
>>> parse('<i style=\'background-image: url("http://foo.example.com/")\'></i>')
|
||||
<i style="background-image: url("/web/20131226101010/http://foo.example.com/")"></i>
|
||||
<i style="background-image: url("/web/20131226101010oe_/http://foo.example.com/")"></i>
|
||||
|
||||
>>> parse('<i style=\'background-image: url("http://foo.example.com/")\'></i>')
|
||||
<i style="background-image: url("/web/20131226101010/http://foo.example.com/")"></i>
|
||||
<i style="background-image: url("/web/20131226101010oe_/http://foo.example.com/")"></i>
|
||||
|
||||
>>> parse('<i style=\'background-image: url('http://foo.example.com/')\'></i>')
|
||||
<i style="background-image: url('/web/20131226101010oe_/http://foo.example.com/')"></i>
|
||||
|
||||
>>> parse("<i style='background-image: url('http://foo.example.com/')'></i>")
|
||||
<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
|
||||
<i style="background-image: url('/web/20131226101010oe_/http://foo.example.com/')"></i>
|
||||
|
||||
#>>> parse('<i style=\'background-image: url("http://исп/")\'></i>')
|
||||
<i style="background-image: url("/web/20131226101010/http://%D0%B8%D1%81%D0%BF/")"></i>
|
||||
|
||||
# Style
|
||||
>>> parse('<style>@import "/styles.css" .a { font-face: url(\'../myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/myfont.ttf') }</style>
|
||||
<style>@import "/web/20131226101010cs_/http://example.com/styles.css" .a { font-face: url('/web/20131226101010oe_/http://example.com/some/myfont.ttf') }</style>
|
||||
|
||||
# Unterminated style tag, handle and auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
@ -389,7 +398,7 @@ r"""
|
||||
|
||||
# parse attr with js proxy, rewrite location assignment
|
||||
>>> parse('<html><a href="javascript:location=\'foo.html\'"></a></html>', js_proxy=True)
|
||||
<html><a href="javascript:{ location=(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = 'foo.html' }"></a></html>
|
||||
<html><a href="javascript:{ location=((self.__WB_check_loc && self.__WB_check_loc(location, arguments)) || {}).href = 'foo.html' }"></a></html>
|
||||
|
||||
# parse attr with js proxy, assigning to location.href, no location assignment rewrite needed
|
||||
>>> parse('<html><a href="javascript:location.href=\'foo.html\'"></a></html>', js_proxy=True)
|
||||
@ -407,6 +416,9 @@ r"""
|
||||
>>> parse('<!--[if !IE]> --><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
|
||||
<!--[if !IE]> --><html><![endif]><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
|
||||
|
||||
# Test tag with a target
|
||||
>>> parse('<HTML><A Href=\"page.html\" target=\"_blank\">Text</a></hTmL>')
|
||||
<html><a href="page.html" target="___wb_replay_top_frame">Text</a></html>
|
||||
|
||||
# Test blank
|
||||
>>> parse('')
|
||||
|
@ -8,13 +8,18 @@ class TestJSONPRewriter(object):
|
||||
cls.rewriter = JSONPRewriter(urlrewriter)
|
||||
|
||||
urlrewriter = UrlRewriter('20161226/http://example.com/', '/web/', 'https://localhost/web/')
|
||||
cls.rewriter_no_cb = JSONPRewriter(urlrewriter)
|
||||
cls.rewriter_missing_cb = JSONPRewriter(urlrewriter)
|
||||
|
||||
def test_jsonp_rewrite_1(self):
|
||||
string = 'jQuery_1234({"foo": "bar", "some": "data"})'
|
||||
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter.rewrite(string) == expect
|
||||
|
||||
def test_jsonp_rewrite_1_with_whitespace(self):
|
||||
string = ' jQuery_1234({"foo": "bar", "some": "data"})'
|
||||
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter.rewrite(string) == expect
|
||||
|
||||
def test_jsonp_rewrite_2(self):
|
||||
string = ' /**/ jQuery_1234({"foo": "bar", "some": "data"})'
|
||||
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||
@ -25,6 +30,34 @@ class TestJSONPRewriter(object):
|
||||
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter.rewrite(string) == expect
|
||||
|
||||
def test_jsonp_rewrite_4(self):
|
||||
string = """// some comment
|
||||
jQuery_1234({"foo": "bar", "some": "data"})"""
|
||||
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter.rewrite(string) == expect
|
||||
|
||||
def test_jsonp_rewrite_5(self):
|
||||
string = """// some comment
|
||||
// blah = 4;
|
||||
jQuery_1234({"foo": "bar", "some": "data"})"""
|
||||
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter.rewrite(string) == expect
|
||||
|
||||
# JSONP valid but 'callback=' missing in url tests
|
||||
def test_no_jsonp_rewrite_missing_callback_1(self):
|
||||
""" JSONP valid but callback is missing in url
|
||||
"""
|
||||
string = 'jQuery_1234({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter_missing_cb.rewrite(string) == string
|
||||
|
||||
def test_no_jsonp_rewrite_missing_callback_2(self):
|
||||
string = """// some comment
|
||||
jQuery_1234({"foo": "bar", "some": "data"})"""
|
||||
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter_missing_cb.rewrite(string) == string
|
||||
|
||||
|
||||
# Invalid JSONP Tests
|
||||
def test_no_jsonp_rewrite_1(self):
|
||||
string = ' /* comment jQuery_1234({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter.rewrite(string) == string
|
||||
@ -37,8 +70,14 @@ class TestJSONPRewriter(object):
|
||||
string = 'var foo = ({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter.rewrite(string) == string
|
||||
|
||||
def test_no_jsonp_rewrite_no_callback_1(self):
|
||||
string = 'jQuery_1234({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter_no_cb.rewrite(string) == string
|
||||
def test_jsonp_rewrite_3(self):
|
||||
string = ' abc /* some comment */ jQuery_1234({"foo": "bar", "some": "data"})'
|
||||
assert self.rewriter.rewrite(string) == string
|
||||
|
||||
def test_no_jsonp_multiline_rewrite_2(self):
|
||||
string = """// some comment
|
||||
blah = 4;
|
||||
jQuery_1234({"foo": "bar", "some": "data"})"""
|
||||
assert self.rewriter.rewrite(string) == string
|
||||
|
||||
|
||||
|
@ -131,49 +131,52 @@ r"""
|
||||
#=================================================================
|
||||
|
||||
>>> _test_js_obj_proxy('var foo = this; location = bar')
|
||||
'var foo = (this && this._WB_wombat_obj_proxy || this); location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = bar'
|
||||
'var foo = _____WB$wombat$check$this$function_____(this); location = ((self.__WB_check_loc && self.__WB_check_loc(location, arguments)) || {}).href = bar'
|
||||
|
||||
>>> _test_js_obj_proxy('var that = this\n location = bar')
|
||||
'var that = (this && this._WB_wombat_obj_proxy || this)\n location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = bar'
|
||||
'var that = _____WB$wombat$check$this$function_____(this)\n location = ((self.__WB_check_loc && self.__WB_check_loc(location, arguments)) || {}).href = bar'
|
||||
|
||||
>>> _test_js_obj_proxy('location = "xyz"')
|
||||
'location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = "xyz"'
|
||||
'location = ((self.__WB_check_loc && self.__WB_check_loc(location, arguments)) || {}).href = "xyz"'
|
||||
|
||||
>>> _test_js_obj_proxy('var foo = this.location')
|
||||
'var foo = (this && this._WB_wombat_obj_proxy || this).location'
|
||||
'var foo = _____WB$wombat$check$this$function_____(this).location'
|
||||
|
||||
>>> _test_js_obj_proxy('A = B\nthis.location = "foo"')
|
||||
'A = B\n;(this && this._WB_wombat_obj_proxy || this).location = "foo"'
|
||||
'A = B\n_____WB$wombat$check$this$function_____(this).location = "foo"'
|
||||
|
||||
>>> _test_js_obj_proxy('var foo = this.location2')
|
||||
'var foo = this.location2'
|
||||
|
||||
>>> _test_js_obj_proxy('func(Function("return this"));')
|
||||
'func(Function("return (this && this._WB_wombat_obj_proxy || this)"));'
|
||||
'func(Function("return _____WB$wombat$check$this$function_____(this)"));'
|
||||
|
||||
>>> _test_js_obj_proxy('A.call(function() { return this });')
|
||||
'A.call(function() { return (this && this._WB_wombat_obj_proxy || this) });'
|
||||
>>> _test_js_obj_proxy('A.call(function() { return this });')
|
||||
'A.call(function() { return _____WB$wombat$check$this$function_____(this) });'
|
||||
|
||||
>>> _test_js_obj_proxy('this.document.location = foo')
|
||||
'(this && this._WB_wombat_obj_proxy || this).document.location = foo'
|
||||
'_____WB$wombat$check$this$function_____(this).document.location = foo'
|
||||
|
||||
>>> _test_js_obj_proxy('if (that != this) { ... }')
|
||||
'if (that != (this && this._WB_wombat_obj_proxy || this)) { ... }'
|
||||
'if (that != _____WB$wombat$check$this$function_____(this)) { ... }'
|
||||
|
||||
>>> _test_js_obj_proxy('function(){...} (this)')
|
||||
'function(){...} ((this && this._WB_wombat_obj_proxy || this))'
|
||||
'function(){...} (_____WB$wombat$check$this$function_____(this))'
|
||||
|
||||
>>> _test_js_obj_proxy('function(){...} ) (this); foo(this)')
|
||||
'function(){...} ) ((this && this._WB_wombat_obj_proxy || this)); foo(this)'
|
||||
'function(){...} ) (_____WB$wombat$check$this$function_____(this)); foo(this)'
|
||||
|
||||
>>> _test_js_obj_proxy('var foo = that || this ;')
|
||||
'var foo = that || (this && this._WB_wombat_obj_proxy || this) ;'
|
||||
'var foo = that || _____WB$wombat$check$this$function_____(this) ;'
|
||||
|
||||
>>> _test_js_obj_proxy('a||this||that')
|
||||
'a||(this && this._WB_wombat_obj_proxy || this)||that'
|
||||
'a||_____WB$wombat$check$this$function_____(this)||that'
|
||||
|
||||
>>> _test_js_obj_proxy('a||this)')
|
||||
'a||(this && this._WB_wombat_obj_proxy || this))'
|
||||
'a||_____WB$wombat$check$this$function_____(this))'
|
||||
|
||||
>>> _test_js_obj_proxy(r'(a,b,Q.contains(i[t], this))')
|
||||
'(a,b,Q.contains(i[t], _____WB$wombat$check$this$function_____(this)))'
|
||||
|
||||
# not rewritten
|
||||
>>> _test_js_obj_proxy('var window = this$')
|
||||
@ -194,6 +197,9 @@ r"""
|
||||
>>> _test_js_obj_proxy('return this.foo')
|
||||
'return this.foo'
|
||||
|
||||
>>> _test_js_obj_proxy('{foo: bar, this: other}')
|
||||
'{foo: bar, this: other}'
|
||||
|
||||
>>> _test_js_obj_proxy(r'this.$location = http://example.com/')
|
||||
'this.$location = http://example.com/'
|
||||
|
||||
@ -207,8 +213,57 @@ r"""
|
||||
'this. alocation = http://example.com/'
|
||||
|
||||
>>> _test_js_obj_proxy(r'this. location = http://example.com/')
|
||||
'this. location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = http://example.com/'
|
||||
'this. location = ((self.__WB_check_loc && self.__WB_check_loc(location, arguments)) || {}).href = http://example.com/'
|
||||
|
||||
>>> _test_js_obj_proxy('eval(a)')
|
||||
'WB_wombat_runEval2((_______eval_arg, isGlobal) => { var ge = eval; return isGlobal ? ge(_______eval_arg) : eval(_______eval_arg); }).eval(this, (function() { return arguments })(),a)'
|
||||
|
||||
>>> _test_js_obj_proxy('abc eval(a)')
|
||||
'abc WB_wombat_runEval2((_______eval_arg, isGlobal) => { var ge = eval; return isGlobal ? ge(_______eval_arg) : eval(_______eval_arg); }).eval(this, (function() { return arguments })(),a)'
|
||||
|
||||
|
||||
|
||||
>>> _test_js_obj_proxy(',eval(a)')
|
||||
',eval(a)'
|
||||
|
||||
>>> _test_js_obj_proxy('this.$eval(a)')
|
||||
'this.$eval(a)'
|
||||
|
||||
>>> _test_js_obj_proxy('x = this.$eval; x(a);')
|
||||
'x = this.$eval; x(a);'
|
||||
|
||||
>>> _test_js_obj_proxy('x = eval; x(a);')
|
||||
'x = self.eval; x(a);'
|
||||
|
||||
>>> _test_js_obj_proxy('$eval = eval; $eval(a);')
|
||||
'$eval = self.eval; $eval(a);'
|
||||
|
||||
>>> _test_js_obj_proxy('foo(a, eval(data));')
|
||||
'foo(a, WB_wombat_runEval2((_______eval_arg, isGlobal) => { var ge = eval; return isGlobal ? ge(_______eval_arg) : eval(_______eval_arg); }).eval(this, (function() { return arguments })(),data));'
|
||||
|
||||
>>> _test_js_obj_proxy('function eval() {}')
|
||||
'function eval() {}'
|
||||
|
||||
>>> _test_js_obj_proxy('window.eval(a);')
|
||||
'window.eval(a);'
|
||||
|
||||
>>> _test_js_obj_proxy('x = window.eval; x(a);')
|
||||
'x = window.eval; x(a);'
|
||||
|
||||
>>> _test_js_obj_proxy('obj = { eval : 1 }')
|
||||
'obj = { eval : 1 }'
|
||||
|
||||
>>> _test_js_obj_proxy('x = obj.eval')
|
||||
'x = obj.eval'
|
||||
|
||||
>>> _test_js_obj_proxy('x = obj.eval(a)')
|
||||
'x = obj.eval(a)'
|
||||
|
||||
>>> _test_js_obj_proxy('x = obj._eval(a)')
|
||||
'x = obj._eval(a)'
|
||||
|
||||
>>> _test_js_obj_proxy('x = obj.$eval(a)')
|
||||
'x = obj.$eval(a)'
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -232,28 +287,28 @@ r"""
|
||||
#=================================================================
|
||||
|
||||
>>> _test_css("background: url('/some/path.html')")
|
||||
"background: url('/web/20131010/http://example.com/some/path.html')"
|
||||
"background: url('/web/20131010oe_/http://example.com/some/path.html')"
|
||||
|
||||
>>> _test_css("background: url('../path.html')")
|
||||
"background: url('/web/20131010/http://example.com/path.html')"
|
||||
"background: url('/web/20131010oe_/http://example.com/path.html')"
|
||||
|
||||
>>> _test_css("background: url(\"http://domain.com/path.html\")")
|
||||
'background: url("/web/20131010/http://domain.com/path.html")'
|
||||
'background: url("/web/20131010oe_/http://domain.com/path.html")'
|
||||
|
||||
>>> _test_css('background: url(" http://domain.com/path.html ")')
|
||||
'background: url(" /web/20131010/http://domain.com/path.html ")'
|
||||
'background: url(" /web/20131010oe_/http://domain.com/path.html ")'
|
||||
|
||||
>>> _test_css('background: url(" http://domain.com/path.html x ")')
|
||||
'background: url(" /web/20131010/http://domain.com/path.html x ")'
|
||||
'background: url(" /web/20131010oe_/http://domain.com/path.html x ")'
|
||||
|
||||
>>> _test_css("background: url(file.jpeg)")
|
||||
'background: url(file.jpeg)'
|
||||
|
||||
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
|
||||
"background:#abc url('/web/20131010/http://example.com/static/images/layout/logo.png')"
|
||||
"background:#abc url('/web/20131010oe_/http://example.com/static/images/layout/logo.png')"
|
||||
|
||||
>>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')")
|
||||
"background:#000 url('/web/20131010/http://example.com/images/layout/logo.png')"
|
||||
"background:#000 url('/web/20131010oe_/http://example.com/images/layout/logo.png')"
|
||||
|
||||
>>> _test_css("background: url('')")
|
||||
"background: url('')"
|
||||
@ -262,7 +317,7 @@ r"""
|
||||
'background: url ("weirdpath\')'
|
||||
|
||||
>>> _test_css("@import url ('/path.css')")
|
||||
"@import url ('/web/20131010/http://example.com/path.css')"
|
||||
"@import url ('/web/20131010cs_/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import url('path.css')")
|
||||
"@import url('path.css')"
|
||||
@ -271,19 +326,19 @@ r"""
|
||||
"@import ( 'path.css')"
|
||||
|
||||
>>> _test_css("@import \"/path.css\"")
|
||||
'@import "/web/20131010/http://example.com/path.css"'
|
||||
'@import "/web/20131010cs_/http://example.com/path.css"'
|
||||
|
||||
>>> _test_css("@import ('../path.css\"")
|
||||
'@import (\'/web/20131010/http://example.com/path.css"'
|
||||
'@import (\'/web/20131010cs_/http://example.com/path.css"'
|
||||
|
||||
>>> _test_css("@import ('../url.css\"")
|
||||
'@import (\'/web/20131010/http://example.com/url.css"'
|
||||
'@import (\'/web/20131010cs_/http://example.com/url.css"'
|
||||
|
||||
>>> _test_css("@import (\"url.css\")")
|
||||
'@import ("url.css")'
|
||||
|
||||
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
|
||||
'@import url(/web/20131010/http://example.com/url.css)\n@import url(/web/20131010/http://example.com/anotherurl.css)\n @import url(/web/20131010/http://example.com/and_a_third.css)'
|
||||
'@import url(/web/20131010cs_/http://example.com/url.css)\n@import url(/web/20131010cs_/http://example.com/anotherurl.css)\n @import url(/web/20131010cs_/http://example.com/and_a_third.css)'
|
||||
|
||||
"""
|
||||
|
||||
@ -312,7 +367,6 @@ def _test_xml(string):
|
||||
def _test_css(string):
|
||||
return CSSRewriter(urlrewriter).rewrite(string)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -23,7 +23,7 @@ class UrlRewriter(object):
|
||||
REL_PATH = '/'
|
||||
|
||||
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
|
||||
root_path=None, cookie_scope=None, rewrite_opts=None):
|
||||
root_path=None, cookie_scope=None, rewrite_opts=None, pywb_static_prefix=None):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
self.prefix = prefix
|
||||
self.full_prefix = full_prefix or prefix
|
||||
@ -36,10 +36,22 @@ class UrlRewriter(object):
|
||||
self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)
|
||||
self.cookie_scope = cookie_scope
|
||||
self.rewrite_opts = rewrite_opts or {}
|
||||
self._pywb_static_prefix = pywb_static_prefix
|
||||
|
||||
if self.rewrite_opts.get('punycode_links'):
|
||||
self.wburl._do_percent_encode = False
|
||||
|
||||
@property
|
||||
def pywb_static_prefix(self):
|
||||
"""Returns the static path URL
|
||||
:rtype: str
|
||||
"""
|
||||
if self._pywb_static_prefix is None:
|
||||
return ''
|
||||
if self._pywb_static_prefix.startswith(self.PROTOCOLS):
|
||||
return self._pywb_static_prefix
|
||||
return self.urljoin(self.full_prefix, self._pywb_static_prefix)
|
||||
|
||||
def rewrite(self, url, mod=None, force_abs=False):
|
||||
# if special protocol, no rewriting at all
|
||||
if url.startswith(self.NO_REWRITE_URI_PREFIX):
|
||||
|
@ -1,5 +1,8 @@
|
||||
# Default Filters
|
||||
default_filters:
|
||||
# limit to fuzzy match prefix results
|
||||
fuzzy_search_limit: '100'
|
||||
|
||||
# exts that should *not* be treated as files (ignore all query args)
|
||||
not_exts:
|
||||
- asp
|
||||
@ -47,10 +50,18 @@ default_filters:
|
||||
- match: '[?&](\w*(bust|ts)\w*=1[\d]{12,15})(?=&|$)'
|
||||
replace: ''
|
||||
|
||||
# remove Facebook link ID (fbclid) when pywb URLs are shared on Facebook
|
||||
- match: '[?&](fbclid)=(.*)+(?=&|$)'
|
||||
replace: ''
|
||||
|
||||
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# twitter rules
|
||||
#=================================================================
|
||||
|
||||
- url_prefix: 'com,twitter)/i/profiles/show/'
|
||||
|
||||
fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
|
||||
@ -64,6 +75,24 @@ rules:
|
||||
- url_prefix: 'com,twitter)/i/videos/tweet'
|
||||
|
||||
fuzzy_lookup: '()'
|
||||
|
||||
- url_prefix: ['com,twitter,api)/2/', 'com,twitter)/i/api/2/', 'com,twitter)/i/api/graphql/']
|
||||
|
||||
rewrite:
|
||||
js_regexs:
|
||||
- match: 'video_info":(.*?}]})'
|
||||
group: 1
|
||||
function: 'pywb.rewrite.rewrite_dash:rewrite_tw_dash'
|
||||
|
||||
|
||||
- url_prefix: ['com,twimg,syndication,cdn)/tweet-result']
|
||||
|
||||
rewrite:
|
||||
js_regexs:
|
||||
- match: 'video":(.*?viewCount":\d+})'
|
||||
group: 1
|
||||
function: 'pywb.rewrite.rewrite_dash:rewrite_tw_dash'
|
||||
|
||||
|
||||
|
||||
# facebook rules
|
||||
@ -81,7 +110,7 @@ rules:
|
||||
|
||||
fuzzy_lookup:
|
||||
match: '("(?:cursor|cursorindex)":["\d\w]+)'
|
||||
find_all: true
|
||||
re_type: findall
|
||||
|
||||
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline'
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))'
|
||||
@ -146,7 +175,7 @@ rules:
|
||||
|
||||
fuzzy_lookup:
|
||||
match: '("q[\d]+":|after:\\"[^"]+)'
|
||||
find_all: true
|
||||
re_type: findall
|
||||
|
||||
- url_prefix: 'com,facebook)/pages_reaction_units/more'
|
||||
|
||||
@ -163,10 +192,13 @@ rules:
|
||||
- match: 'Bootloader\.configurePage.*?;'
|
||||
replace: '/* {0} */'
|
||||
|
||||
- match: 'dash_manifest:"(.*",dash_prefetched_representation_ids:.*?])'
|
||||
- match: 'dash_manifest"?:"(.*","?dash_prefetched_representation_ids"?:.*?])'
|
||||
group: 1
|
||||
function: 'pywb.rewrite.rewrite_dash:rewrite_fb_dash'
|
||||
|
||||
- match: '"debugNoBatching\s?":(?:false|0)'
|
||||
replace: '"debugNoBatching":true'
|
||||
|
||||
parse_comments: true
|
||||
|
||||
- url_prefix: 'com,facebook'
|
||||
@ -193,6 +225,14 @@ rules:
|
||||
|
||||
- url_prefix: 'com,instagram)/'
|
||||
|
||||
rewrite:
|
||||
js_regexs:
|
||||
- match: '"is_dash_eligible":true'
|
||||
replace: '"is_dash_eligible":false'
|
||||
|
||||
- match: '"debugNoBatching\s?":(?:false|0)'
|
||||
replace: '"debugNoBatching":true'
|
||||
|
||||
fuzzy_lookup: '()'
|
||||
|
||||
|
||||
@ -278,6 +318,10 @@ rules:
|
||||
# soundcloud
|
||||
#=================================================================
|
||||
|
||||
- url_prefix: 'com,sndcdn,cf-media)/'
|
||||
|
||||
fuzzy_lookup: '()'
|
||||
|
||||
- url_prefix: 'com,soundcloud,api)/i1/tracks/'
|
||||
|
||||
rewrite:
|
||||
@ -287,6 +331,15 @@ rules:
|
||||
replace: '"__hls'
|
||||
|
||||
|
||||
- url_prefix: 'com,soundcloud,api-v2)/'
|
||||
|
||||
rewrite:
|
||||
live_only: true
|
||||
js_regexs:
|
||||
- match: 'hls'
|
||||
replace: 'mp3'
|
||||
|
||||
|
||||
# vimeo rules
|
||||
#=================================================================
|
||||
|
||||
@ -328,7 +381,7 @@ rules:
|
||||
- videoFileId
|
||||
- signature
|
||||
|
||||
- url_prefix: 'net,akamaized,gcs-vimeo)/'
|
||||
- url_prefix: ['net,akamaized,gcs-vimeo)/', 'net,akamaized,vod)/', 'net,akamaized,vod-progressive)/']
|
||||
|
||||
fuzzy_lookup:
|
||||
match: '([/\d]+\.mp4)$'
|
||||
@ -394,6 +447,15 @@ rules:
|
||||
- action_load_comments
|
||||
- filter
|
||||
|
||||
- url_prefix: ['com,youtube)/embed', 'com,youtube-nocookie)/embed']
|
||||
|
||||
fuzzy_lookup:
|
||||
match: '()'
|
||||
|
||||
- url_prefix: ['com,youtube)/youtubei/v1', 'com,youtube-nocookie)/youtubei/v1']
|
||||
|
||||
fuzzy_lookup:
|
||||
- videoid
|
||||
|
||||
- url_prefix: 'com,googlevideo,'
|
||||
|
||||
@ -440,9 +502,16 @@ rules:
|
||||
- match: 'yt\.setConfig.*PLAYER_CONFIG.*args":\s*{'
|
||||
replace: '{0} "dash": "0", dashmpd: "", '
|
||||
|
||||
- match: '"player":.*"args":{'
|
||||
- match: 'yt\.setConfig.*PLAYER_VARS.*?{'
|
||||
replace: '{0}"dash":"0","dashmpd":"",'
|
||||
|
||||
- match: '(?:"player":|ytplayer\.config).*"args":\s*{'
|
||||
replace: '{0}"dash":"0","dashmpd":"",'
|
||||
|
||||
- match: '"0"\s*?==\s*?\w+\.dash\&\&'
|
||||
replace: '1&&'
|
||||
|
||||
|
||||
# testing rules -- not for valid domain
|
||||
#=================================================================
|
||||
# this rule block is a non-existent prefix merely for testing
|
||||
@ -475,6 +544,12 @@ rules:
|
||||
rewrite:
|
||||
js_rewrite_location: urls
|
||||
|
||||
- url_prefix: 'com,example)/matched'
|
||||
fuzzy_lookup:
|
||||
re_type: sub
|
||||
match: 'matched'
|
||||
replace: 'replaced'
|
||||
|
||||
# all domain rules -- fallback to this dataset
|
||||
#=================================================================
|
||||
# Applies to all urls -- should be last
|
||||
|
@ -1,352 +1,399 @@
|
||||
'use strict';
|
||||
// thanks wombat
|
||||
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
|
||||
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
|
||||
var IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
|
||||
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
|
||||
var DefaultNumImFetches = 30;
|
||||
var FullImgQDrainLen = 10;
|
||||
var DefaultNumAvFetches = 5;
|
||||
var FullAVQDrainLen = 5;
|
||||
var MaxRunningFetches = 15;
|
||||
var DataURLPrefix = 'data:';
|
||||
var seen = {};
|
||||
// array of URLs to be fetched
|
||||
var queue = [];
|
||||
var runningFetches = 0;
|
||||
// a URL to resolve relative URLs found in the cssText of CSSMedia rules.
|
||||
var currentResolver = null;
|
||||
|
||||
// the autofetcher instance for this worker
|
||||
var autofetcher = null;
|
||||
var config = {
|
||||
havePromise: typeof self.Promise !== 'undefined',
|
||||
haveFetch: typeof self.fetch !== 'undefined',
|
||||
proxyMode: false,
|
||||
mod: null,
|
||||
prefix: null,
|
||||
prefixMod: null,
|
||||
relative: null,
|
||||
rwRe: null,
|
||||
defaultFetchOptions: {
|
||||
cache: 'force-cache',
|
||||
mode: 'cors'
|
||||
}
|
||||
};
|
||||
|
||||
if (!config.havePromise) {
|
||||
// not kewl we must polyfill Promise
|
||||
self.Promise = function(executor) {
|
||||
executor(noop, noop);
|
||||
};
|
||||
self.Promise.prototype.then = function(cb) {
|
||||
if (cb) cb();
|
||||
return this;
|
||||
};
|
||||
self.Promise.prototype.catch = function() {
|
||||
return this;
|
||||
};
|
||||
self.Promise.all = function(values) {
|
||||
return new Promise(noop);
|
||||
};
|
||||
}
|
||||
|
||||
if (!config.haveFetch) {
|
||||
// not kewl we must polyfill fetch.
|
||||
self.fetch = function(url) {
|
||||
return new Promise(function(resolve) {
|
||||
var xhr = new XMLHttpRequest();
|
||||
xhr.open('GET', url, true);
|
||||
xhr.onreadystatechange = function() {
|
||||
if (xhr.readyState === 4) {
|
||||
if (!config.havePromise) {
|
||||
fetchDone();
|
||||
}
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
xhr.send();
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
// Configure the worker from the `init` query parameter of its own URL.
// When the query string carries no `init` value the worker switches to proxy
// mode and issues its fetches with mode 'no-cors'.
if (location.search.indexOf('init') !== -1) {
  (function() {
    var init;
    if (typeof self.URL === 'function') {
      var loc = new self.URL(location.href);
      init = JSON.parse(loc.searchParams.get('init'));
    } else {
      // No URL constructor available: pull the JSON out of the query string by hand.
      var search = decodeURIComponent(location.search.split('?')[1]).split('&');
      init = JSON.parse(search[0].slice(search[0].indexOf('=') + 1));
      init.prefix = decodeURIComponent(init.prefix);
      // Decode baseURI from its own value; it was previously overwritten with
      // the decoded prefix.
      if (init.baseURI != null) {
        init.baseURI = decodeURIComponent(init.baseURI);
      }
    }
    config.prefix = init.prefix;
    config.mod = init.mod;
    config.prefixMod = init.prefix + init.mod;
    config.rwRe = new RegExp(init.rwRe);
    // part of the prefix that follows this worker's origin
    config.relative = init.prefix.split(location.origin)[1];
    config.schemeless = '/' + config.relative;
  })();
} else {
  config.proxyMode = true;
  config.defaultFetchOptions.mode = 'no-cors';
}
|
||||
|
||||
// Worker message entry point: dispatches on the `type` field of the posted
// data. 'values' goes to autoFetch, 'fetch-all' goes to justFetch; any other
// type is ignored.
self.onmessage = function(event) {
  var message = event.data;
  var kind = message.type;
  if (kind === 'values') {
    autoFetch(message);
  } else if (kind === 'fetch-all') {
    justFetch(message);
  }
};
|
||||
|
||||
function noop() {}
|
||||
|
||||
if (typeof self.Promise === 'undefined') {
|
||||
// not kewl we must polyfill Promise
|
||||
self.Promise = function (executor) {
|
||||
executor(noop, noop);
|
||||
};
|
||||
self.Promise.prototype.then = function (cb) {
|
||||
if (cb) cb();
|
||||
return this;
|
||||
};
|
||||
self.Promise.prototype.catch = function () {
|
||||
return this;
|
||||
};
|
||||
self.Promise.all = function (values) {
|
||||
return new Promise(noop);
|
||||
};
|
||||
function fetchDone() {
|
||||
runningFetches -= 1;
|
||||
fetchFromQ();
|
||||
}
|
||||
|
||||
if (typeof self.fetch === 'undefined') {
|
||||
// not kewl we must polyfill fetch.
|
||||
self.fetch = function (url) {
|
||||
return new Promise(function (resolve) {
|
||||
var xhr = new XMLHttpRequest();
|
||||
xhr.open('GET', url);
|
||||
xhr.send();
|
||||
resolve();
|
||||
});
|
||||
};
|
||||
function fetchErrored(err) {
|
||||
console.warn('Fetch Failed: ' + err);
|
||||
fetchDone();
|
||||
}
|
||||
|
||||
self.onmessage = function (event) {
|
||||
var data = event.data;
|
||||
switch (data.type) {
|
||||
case 'values':
|
||||
autofetcher.autoFetch(data);
|
||||
break;
|
||||
}
|
||||
};
|
||||
/**
|
||||
* Fetches the supplied URL and increments the {@link runningFetches} variable
|
||||
* to represent an inflight request.
|
||||
* If the url to be fetched is an object then its a fetch-as-page and the
|
||||
* fetch is configured using its supplied options and url properties.
|
||||
*
|
||||
* Otherwise, the fetch is made using cache mode force-cache and if we
|
||||
* are operating in proxy mode the fetch mode no-cors is used.
|
||||
* @param {string|Object} toBeFetched - The URL to be fetched
|
||||
*/
|
||||
function fetchURL(toBeFetched) {
|
||||
runningFetches += 1;
|
||||
|
||||
function AutoFetcher(init) {
|
||||
if (!(this instanceof AutoFetcher)) {
|
||||
return new AutoFetcher(init);
|
||||
}
|
||||
this.prefix = init.prefix;
|
||||
this.mod = init.mod;
|
||||
this.prefixMod = init.prefix + init.mod;
|
||||
this.rwRe = new RegExp(init.rwRe);
|
||||
// relative url, WorkerLocation is set by owning document
|
||||
this.relative = init.prefix.split(location.origin)[1];
|
||||
// schemeless url
|
||||
this.schemeless = '/' + this.relative;
|
||||
// local cache of URLs fetched, to reduce server load
|
||||
this.seen = {};
|
||||
// array of URLs to be fetched
|
||||
this.queue = [];
|
||||
this.avQueue = [];
|
||||
// should we queue a URL or not
|
||||
this.queuing = false;
|
||||
this.queuingAV = false;
|
||||
this.urlExtractor = this.urlExtractor.bind(this);
|
||||
this.imgFetchDone = this.imgFetchDone.bind(this);
|
||||
this.avFetchDone = this.avFetchDone.bind(this);
|
||||
var url;
|
||||
var options = config.defaultFetchOptions;
|
||||
|
||||
if (typeof toBeFetched === 'object') {
|
||||
url = toBeFetched.url;
|
||||
options = toBeFetched.options;
|
||||
} else {
|
||||
url = toBeFetched;
|
||||
}
|
||||
|
||||
fetch(url, options)
|
||||
.then(fetchDone)
|
||||
.catch(fetchErrored);
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.delay = function () {
|
||||
// 2 second delay seem reasonable
|
||||
return new Promise(function (resolve, reject) {
|
||||
setTimeout(resolve, 2000);
|
||||
});
|
||||
};
|
||||
/**
 * Fetches the supplied item right away, or queues it when the number of
 * running fetches has reached MaxRunningFetches.
 * Items with an empty URL, a data: URL, or a URL already recorded in `seen`
 * are ignored.
 * @param {string|Object} toBeFetched - A URL string, or an object whose `url`
 * property holds the URL
 */
function queueOrFetch(toBeFetched) {
  // typeof null is 'object', so exclude it before reading `.url`
  var isRequestObject = typeof toBeFetched === 'object' && toBeFetched !== null;
  var url = isRequestObject ? toBeFetched.url : toBeFetched;
  if (!url || url.indexOf(DataURLPrefix) === 0) {
    return;
  }
  // Own-property check: a plain lookup would also hit inherited keys such as
  // "constructor" and wrongly report those URLs as already seen.
  if (Object.prototype.hasOwnProperty.call(seen, url)) {
    return;
  }
  seen[url] = true;
  if (runningFetches >= MaxRunningFetches) {
    queue.push(toBeFetched);
    return;
  }
  fetchURL(toBeFetched);
}
|
||||
|
||||
AutoFetcher.prototype.imgFetchDone = function () {
|
||||
if (this.queue.length > 0) {
|
||||
// we have a Q of some length drain it
|
||||
var autofetcher = this;
|
||||
this.delay().then(function () {
|
||||
autofetcher.queuing = false;
|
||||
autofetcher.fetchImgs();
|
||||
});
|
||||
} else {
|
||||
this.queuing = false;
|
||||
}
|
||||
};
|
||||
/**
 * Starts fetches for queued items, oldest first, until the queue is empty
 * or the number of running fetches is no longer below MaxRunningFetches.
 */
function fetchFromQ() {
  for (;;) {
    if (!queue.length) break;
    if (!(runningFetches < MaxRunningFetches)) break;
    var next = queue.shift();
    fetchURL(next);
  }
}
|
||||
|
||||
AutoFetcher.prototype.avFetchDone = function () {
|
||||
if (this.avQueue.length > 0) {
|
||||
// we have a Q of some length drain it
|
||||
var autofetcher = this;
|
||||
this.delay().then(function () {
|
||||
autofetcher.queuingAV = false;
|
||||
autofetcher.fetchAV();
|
||||
});
|
||||
} else {
|
||||
this.queuingAV = false;
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.fetchAV = function () {
|
||||
if (this.queuingAV || this.avQueue.length === 0) {
|
||||
return;
|
||||
}
|
||||
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
|
||||
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
|
||||
// we add them to the current batch. Because audio video resources might be big
|
||||
// we limit how many we fetch at a time drastically
|
||||
this.queuingAV = true;
|
||||
var runningFetchers = [];
|
||||
while (this.avQueue.length > 0 && runningFetchers.length <= DefaultNumAvFetches) {
|
||||
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
|
||||
}
|
||||
if (this.avQueue.length <= FullAVQDrainLen) {
|
||||
while (this.avQueue.length > 0) {
|
||||
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
|
||||
}
|
||||
}
|
||||
Promise.all(runningFetchers)
|
||||
.then(this.avFetchDone)
|
||||
.catch(this.avFetchDone);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.fetchImgs = function () {
|
||||
if (this.queuing || this.queue.length === 0) {
|
||||
return;
|
||||
}
|
||||
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
|
||||
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
|
||||
// we add them to the current batch
|
||||
this.queuing = true;
|
||||
var runningFetchers = [];
|
||||
while (this.queue.length > 0 && runningFetchers.length <= DefaultNumImFetches) {
|
||||
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
|
||||
}
|
||||
if (this.queue.length <= FullImgQDrainLen) {
|
||||
while (this.queue.length > 0) {
|
||||
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
|
||||
}
|
||||
}
|
||||
Promise.all(runningFetchers)
|
||||
.then(this.imgFetchDone)
|
||||
.catch(this.imgFetchDone);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.queueNonAVURL = function (url) {
|
||||
// ensure we do not request data urls
|
||||
if (url.indexOf(DataURLPrefix) === 0) return;
|
||||
// check to see if we have seen this url before in order
|
||||
// to lessen the load against the server content is fetched from
|
||||
if (this.seen[url] != null) return;
|
||||
this.seen[url] = true;
|
||||
this.queue.push(url);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.queueAVURL = function (url) {
|
||||
// ensure we do not request data urls
|
||||
if (url.indexOf(DataURLPrefix) === 0) return;
|
||||
// check to see if we have seen this url before in order
|
||||
// to lessen the load against the server content is fetched from
|
||||
if (this.seen[url] != null) return;
|
||||
this.seen[url] = true;
|
||||
this.avQueue.push(url);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.maybeResolveURL = function (url, base) {
|
||||
// given a url and base url returns a resolved full URL or
|
||||
// null if resolution was unsuccessful
|
||||
try {
|
||||
var _url = new URL(url, base);
|
||||
return _url.href;
|
||||
} catch (e) {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) {
|
||||
// attempt to ensure rewritten relative or schemeless URLs become full URLS!
|
||||
// otherwise returns null if this did not happen
|
||||
if (url.indexOf(this.relative) === 0) {
|
||||
return url.replace(this.relative, this.prefix);
|
||||
}
|
||||
if (url.indexOf(this.schemeless) === 0) {
|
||||
return url.replace(this.schemeless, this.prefix);
|
||||
}
|
||||
function maybeResolveURL(url, base) {
|
||||
// given a url and base url returns a resolved full URL or
|
||||
// null if resolution was unsuccessful
|
||||
try {
|
||||
var _url = new URL(url, base);
|
||||
return _url.href;
|
||||
} catch (e) {
|
||||
return null;
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.maybeFixUpURL = function (url, resolveOpts) {
|
||||
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
|
||||
if (this.rwRe.test(url)) {
|
||||
return url;
|
||||
}
|
||||
var mod = resolveOpts.mod || 'mp_';
|
||||
// first check for / (relative) or // (schemeless) rewritten urls
|
||||
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
|
||||
if (maybeFixed != null) {
|
||||
return maybeFixed;
|
||||
}
|
||||
// resolve URL against tag src
|
||||
if (resolveOpts.tagSrc != null) {
|
||||
maybeFixed = this.maybeResolveURL(url, resolveOpts.tagSrc);
|
||||
if (maybeFixed != null) {
|
||||
return this.prefix + mod + '/' + maybeFixed;
|
||||
}
|
||||
}
|
||||
// finally last attempt resolve the originating documents base URI
|
||||
if (resolveOpts.docBaseURI) {
|
||||
maybeFixed = this.maybeResolveURL(url, resolveOpts.docBaseURI);
|
||||
if (maybeFixed != null) {
|
||||
return this.prefix + mod + '/' + maybeFixed;
|
||||
}
|
||||
}
|
||||
// not much to do now.....
|
||||
return this.prefixMod + '/' + url;
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
|
||||
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
|
||||
this.queueNonAVURL(n2);
|
||||
return n1 + n2 + n3;
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.handleMedia = function (mediaRules) {
|
||||
// this is a broken down rewrite_style
|
||||
if (mediaRules == null || mediaRules.length === 0) return;
|
||||
// var rules = mediaRules.values;
|
||||
for (var i = 0; i < mediaRules.length; i++) {
|
||||
mediaRules[i]
|
||||
.replace(STYLE_REGEX, this.urlExtractor)
|
||||
.replace(IMPORT_REGEX, this.urlExtractor);
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.handleSrc = function (srcValues, context) {
|
||||
var resolveOpts = { 'docBaseURI': context.docBaseURI };
|
||||
if (srcValues.value) {
|
||||
resolveOpts.mod = srcValues.mod;
|
||||
if (resolveOpts.mod === 1) {
|
||||
return this.queueNonAVURL(this.maybeFixUpURL(srcValues.value.trim(), resolveOpts));
|
||||
}
|
||||
return this.queueAVURL(this.maybeFixUpURL(srcValues.value.trim(), resolveOpts));
|
||||
}
|
||||
var len = srcValues.values.length;
|
||||
for (var i = 0; i < len; i++) {
|
||||
var value = srcValues.values[i];
|
||||
resolveOpts.mod = value.mod;
|
||||
if (resolveOpts.mod === 'im_') {
|
||||
this.queueNonAVURL(this.maybeFixUpURL(value.src, resolveOpts));
|
||||
} else {
|
||||
this.queueAVURL(this.maybeFixUpURL(value.src, resolveOpts));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.extractSrcSetNotPreSplit = function (ssV, resolveOpts) {
|
||||
// was from extract from local doc so we need to duplicate work
|
||||
var srcsetValues = ssV.split(srcsetSplit);
|
||||
for (var i = 0; i < srcsetValues.length; i++) {
|
||||
// grab the URL not width/height key
|
||||
if (srcsetValues[i]) {
|
||||
var value = srcsetValues[i].trim().split(' ')[0];
|
||||
var maybeResolvedURL = this.maybeFixUpURL(value.trim(), resolveOpts);
|
||||
if (resolveOpts.mod === 'im_') {
|
||||
this.queueNonAVURL(maybeResolvedURL);
|
||||
} else {
|
||||
this.queueAVURL(maybeResolvedURL);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
|
||||
// was rewrite_srcset and only need to q
|
||||
for (var i = 0; i < srcsets.length; i++) {
|
||||
// grab the URL not width/height key
|
||||
var url = srcsets[i].split(' ')[0];
|
||||
if (context.mod === 'im_') {
|
||||
this.queueNonAVURL(url);
|
||||
} else {
|
||||
this.queueAVURL(url);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.handleSrcset = function (srcset, context) {
|
||||
var resolveOpts = { 'docBaseURI': context.docBaseURI };
|
||||
if (srcset.value) {
|
||||
// we have a single value, this srcset came from either
|
||||
// preserveDataSrcset (not presplit) preserveSrcset (presplit)
|
||||
resolveOpts.mod = srcset.mod;
|
||||
if (!srcset.presplit) {
|
||||
// extract URLs from the srcset string
|
||||
return this.extractSrcSetNotPreSplit(srcset.value, resolveOpts);
|
||||
}
|
||||
// we have an array of srcset URL strings
|
||||
return this.extractSrcset(srcset.value, resolveOpts);
|
||||
}
|
||||
// we have an array of values, these srcsets came from extractFromLocalDoc
|
||||
var len = srcset.values.length;
|
||||
for (var i = 0; i < len; i++) {
|
||||
var ssv = srcset.values[i];
|
||||
resolveOpts.mod = ssv.mod;
|
||||
resolveOpts.tagSrc = ssv.tagSrc;
|
||||
this.extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
AutoFetcher.prototype.autoFetch = function (data) {
|
||||
// we got a message and now we autofetch!
|
||||
// these calls turn into no ops if they have no work
|
||||
if (data.media) {
|
||||
this.handleMedia(data.media);
|
||||
}
|
||||
|
||||
if (data.src) {
|
||||
this.handleSrc(data.src, data.context || {});
|
||||
}
|
||||
|
||||
if (data.srcset) {
|
||||
this.handleSrcset(data.srcset, data.context || {});
|
||||
}
|
||||
|
||||
this.fetchImgs();
|
||||
this.fetchAV();
|
||||
};
|
||||
|
||||
// initialize ourselves from the query params :)
|
||||
try {
|
||||
var loc = new self.URL(location.href);
|
||||
autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init')));
|
||||
} catch (e) {
|
||||
// likely we are in an older version of safari
|
||||
var search = decodeURIComponent(location.search.split('?')[1]).split('&');
|
||||
var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
|
||||
init.prefix = decodeURIComponent(init.prefix);
|
||||
init.baseURI = decodeURIComponent(init.baseURI);
|
||||
autofetcher = new AutoFetcher(init);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Resolves a URL against a resolver (base URL) without ever throwing.
 * The input URL is returned unchanged when no resolver is supplied or when
 * the URL constructor rejects the url/resolver pair.
 * @param {string} url - The URL to resolve
 * @param {?string} resolver - The base to resolve against, may be null/undefined
 * @return {string} The resolved href, or the original url
 */
function safeResolve(url, resolver) {
  if (!resolver) {
    return url;
  }
  try {
    return new URL(url, resolver).href;
  } catch (error) {
    // bad url or bad resolver: pass the input through
    return url;
  }
}
|
||||
|
||||
/**
 * Turns a URL that starts with config.relative or config.schemeless into a
 * full URL by swapping that leading part for config.prefix.
 * @param {string} url - The URL to inspect
 * @return {?string} The fixed-up URL, or null when neither prefix leads the URL
 */
function maybeFixUpRelSchemelessPrefix(url) {
  // config.relative is tried first, then config.schemeless
  var leadingParts = [config.relative, config.schemeless];
  for (var idx = 0; idx < leadingParts.length; idx++) {
    var leading = leadingParts[idx];
    if (url.indexOf(leading) === 0) {
      return url.replace(leading, config.prefix);
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Best-effort conversion of a URL into its rewritten form.
 * Steps, first success wins:
 *  1. the URL already matches config.rwRe: returned untouched
 *  2. the URL starts with the relative/schemeless prefix: prefix swapped in
 *  3. the URL resolves against resolveOpts.tagSrc (when not null/undefined)
 *  4. the URL resolves against resolveOpts.docBaseURI (when truthy)
 *  5. fallback: config.prefixMod + '/' + url
 * Steps 3 and 4 yield config.prefix + modifier + '/' + resolved URL, where the
 * modifier is resolveOpts.mod or 'mp_' when that is falsy.
 * @param {string} url - The URL to fix up
 * @param {Object} resolveOpts - Holds optional mod, tagSrc and docBaseURI
 * @return {string}
 */
function maybeFixUpURL(url, resolveOpts) {
  if (config.rwRe.test(url)) {
    return url;
  }
  var modifier = resolveOpts.mod ? resolveOpts.mod : 'mp_';
  var prefixed = maybeFixUpRelSchemelessPrefix(url);
  if (prefixed != null) {
    return prefixed;
  }
  var resolved = null;
  if (resolveOpts.tagSrc != null) {
    resolved = maybeResolveURL(url, resolveOpts.tagSrc);
  }
  if (resolved == null && resolveOpts.docBaseURI) {
    resolved = maybeResolveURL(url, resolveOpts.docBaseURI);
  }
  if (resolved == null) {
    // nothing resolved, fall back to plain concatenation
    return config.prefixMod + '/' + url;
  }
  return config.prefix + modifier + '/' + resolved;
}
|
||||
|
||||
/**
 * String.prototype.replace callback used with STYLE_REGEX / IMPORT_REGEX.
 * The second capture group (n2) is the URL; it is handed to queueOrFetch and
 * the three groups are glued back together as the replacement text.
 */
function urlExtractor(match, n1, n2, n3, offset, string) {
  var extractedURL = n2;
  queueOrFetch(extractedURL);
  var replacement = n1 + extractedURL;
  return replacement + n3;
}
|
||||
|
||||
/**
 * Proxy mode variant of the replace callback used with STYLE_REGEX /
 * IMPORT_REGEX. The URL (n2) is first resolved against currentResolver via
 * safeResolve, then handed to queueOrFetch. The replacement text is the three
 * capture groups joined back together.
 */
function urlExtractorProxyMode(match, n1, n2, n3, offset, string) {
  var resolvedURL = safeResolve(n2, currentResolver);
  queueOrFetch(resolvedURL);
  var replacement = n1 + n2;
  return replacement + n3;
}
|
||||
|
||||
/**
 * Runs every supplied CSS text through STYLE_REGEX and then IMPORT_REGEX with
 * urlExtractor as the replace callback. The replaced strings are discarded;
 * only the callback invocations matter.
 * @param {?Array<string>} mediaRules - CSS texts to scan, may be null or empty
 */
function handleMedia(mediaRules) {
  if (!(mediaRules != null && mediaRules.length !== 0)) return;
  var idx = 0;
  while (idx < mediaRules.length) {
    var afterStylePass = mediaRules[idx].replace(STYLE_REGEX, urlExtractor);
    afterStylePass.replace(IMPORT_REGEX, urlExtractor);
    idx += 1;
  }
}
|
||||
|
||||
/**
 * Proxy mode counterpart of handleMedia. Each entry supplies `cssText` and
 * `resolve`. Before an entry's cssText is scanned, `resolve` is stored in
 * currentResolver, which the urlExtractorProxyMode callback depends on. The
 * text is run through STYLE_REGEX and then IMPORT_REGEX with that callback.
 * @param {?Array<{cssText: string, resolve: string}>} mediaRules
 */
function handleMediaProxyMode(mediaRules) {
  if (!(mediaRules != null && mediaRules.length !== 0)) return;
  var idx = 0;
  while (idx < mediaRules.length) {
    // a shared variable carries the base URL so no closure is needed per entry
    currentResolver = mediaRules[idx].resolve;
    var afterStylePass = mediaRules[idx].cssText.replace(
      STYLE_REGEX,
      urlExtractorProxyMode
    );
    afterStylePass.replace(IMPORT_REGEX, urlExtractorProxyMode);
    idx += 1;
  }
}
|
||||
|
||||
/**
 * Queues the URL(s) carried by a src message.
 * When `srcValues.value` is truthy it is treated as a single URL: trimmed,
 * fixed up with `srcValues.mod` and queued. Otherwise every entry of
 * `srcValues.values` is processed using its own `mod` and `src`.
 * @param {Object} srcValues - Either {value, mod} or {values: [{src, mod}]}
 * @param {Object} context - Supplies docBaseURI for URL resolution
 */
function handleSrc(srcValues, context) {
  var opts = { docBaseURI: context.docBaseURI, mod: null };
  if (srcValues.value) {
    opts.mod = srcValues.mod;
    var single = maybeFixUpURL(srcValues.value.trim(), opts);
    return queueOrFetch(single);
  }
  var total = srcValues.values.length;
  var idx = 0;
  while (idx < total) {
    var entry = srcValues.values[idx];
    // the same options object is reused, only the modifier changes per entry
    opts.mod = entry.mod;
    queueOrFetch(maybeFixUpURL(entry.src, opts));
    idx += 1;
  }
}
|
||||
|
||||
/**
 * Proxy-mode handler for `src` data: every entry carries a `src` value and a
 * `resolve` value; the pair is passed to `safeResolve` and the result is
 * queued for fetching.
 *
 * @param {?Array<{src: string, resolve: *}>} srcValues - entries to queue
 * @returns {void}
 */
function handleSrcProxyMode(srcValues) {
  // a missing list has nothing to iterate; an empty one never enters the loop
  if (srcValues == null) return;
  for (var idx = 0, entry; idx < srcValues.length; idx += 1) {
    entry = srcValues[idx];
    var resolved = safeResolve(entry.src, entry.resolve);
    queueOrFetch(resolved);
  }
}
|
||||
|
||||
/**
 * Extracts the URLs from a raw (not pre-split) srcset string and queues each
 * one for fetching after passing it through `maybeFixUpURL`.
 *
 * The string is split into image candidates with `srcsetSplit`; for each
 * candidate only the URL (the text before the first run of whitespace) is
 * kept, and the width/density descriptor is dropped.
 *
 * @param {?string} ssV - raw srcset attribute value; falsy values are ignored
 * @param {Object} resolveOpts - options forwarded unchanged to `maybeFixUpURL`
 * @returns {void}
 */
function extractSrcSetNotPreSplit(ssV, resolveOpts) {
  if (!ssV) return;
  var candidates = ssV.split(srcsetSplit);
  for (var i = 0; i < candidates.length; i++) {
    // splitting can yield empty or undefined pieces
    if (!candidates[i]) continue;
    var candidate = candidates[i].trim();
    // whitespace-only pieces carry no URL
    if (!candidate) continue;
    // the URL ends at the first ASCII whitespace (space, tab, LF, FF, CR),
    // not only at a space character
    var url = candidate.split(/[ \t\n\f\r]+/)[0];
    queueOrFetch(maybeFixUpURL(url, resolveOpts));
  }
}
|
||||
|
||||
/**
 * Queues the URL of every entry in an already-split srcset list.
 *
 * Each entry is an image candidate string ("URL descriptor"); only the URL
 * (the text before the first run of whitespace) is queued.
 *
 * @param {?Array<string>} srcsets - pre-split srcset candidates
 * @returns {void}
 */
function extractSrcset(srcsets) {
  if (srcsets == null) return;
  for (var i = 0; i < srcsets.length; i++) {
    // skip holes / empty entries rather than queueing an empty URL
    if (!srcsets[i]) continue;
    var candidate = srcsets[i].trim();
    if (!candidate) continue;
    // the URL ends at the first ASCII whitespace (space, tab, LF, FF, CR),
    // not only at a space character
    queueOrFetch(candidate.split(/[ \t\n\f\r]+/)[0]);
  }
}
|
||||
|
||||
/**
 * Queues the URLs found in srcset data for fetching.
 *
 * Two message shapes are handled:
 *  - a single value `{ value, mod, presplit }` (per the original notes this
 *    comes from preserveDataSrcset when not pre-split, or preserveSrcset when
 *    pre-split): a raw srcset string is handed to `extractSrcSetNotPreSplit`,
 *    a pre-split list to `extractSrcset`
 *  - a list `{ values: Array<{ srcset, mod, tagSrc }> }` (per the original
 *    notes, from extractFromLocalDoc): each entry's raw srcset string is
 *    handed to `extractSrcSetNotPreSplit`
 *
 * @param {?Object} srcset - the srcset data described above
 * @param {?{docBaseURI: *}} context - supplies the base URI used for fix-up;
 *   a missing context is treated as `{ docBaseURI: null }`
 * @returns {*} whatever the delegated extractor returns for the single-value
 *   shape, otherwise undefined
 */
function handleSrcset(srcset, context) {
  if (srcset == null) return;
  var resolveOpts = {
    docBaseURI: context != null ? context.docBaseURI : null,
    mod: null,
    tagSrc: null
  };
  if (srcset.value) {
    resolveOpts.mod = srcset.mod;
    if (!srcset.presplit) {
      // raw srcset string: the URLs still have to be extracted
      return extractSrcSetNotPreSplit(srcset.value, resolveOpts);
    }
    // already an array of srcset candidate strings
    return extractSrcset(srcset.value);
  }
  // an empty `value` with no `values` list means there is no work; do not
  // dereference a missing list
  var values = srcset.values;
  if (values == null) return;
  for (var i = 0; i < values.length; i++) {
    var ssv = values[i];
    if (ssv == null) continue;
    // resolveOpts is reused across iterations; mod and tagSrc vary per entry
    resolveOpts.mod = ssv.mod;
    resolveOpts.tagSrc = ssv.tagSrc;
    extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
  }
}
|
||||
|
||||
/**
 * Proxy-mode handler for srcset data.
 *
 * In proxy mode each entry carries the raw value of an element's srcset
 * attribute (`srcset`) plus the value relative URLs must be resolved against
 * (`resolve`), so the rewrite_srcset splitting logic is recreated here: the
 * string is split into candidates with `srcsetSplit`, the URL part of each
 * candidate is passed to `safeResolve`, and the result is queued.
 *
 * @param {?Array<{srcset: string, resolve: *}>} srcsets - entries to process
 * @returns {void}
 */
function handleSrcsetProxyMode(srcsets) {
  if (srcsets == null) return;
  for (var i = 0; i < srcsets.length; i++) {
    var extractedSrcSet = srcsets[i];
    // skip holes and entries without a srcset string instead of throwing
    if (extractedSrcSet == null || !extractedSrcSet.srcset) continue;
    var candidates = extractedSrcSet.srcset.split(srcsetSplit);
    for (var j = 0; j < candidates.length; j++) {
      // splitting can yield empty or undefined pieces
      if (!candidates[j]) continue;
      var candidate = candidates[j].trim();
      if (!candidate) continue;
      // the URL ends at the first ASCII whitespace (space, tab, LF, FF, CR),
      // not only at a space character
      var url = candidate.split(/[ \t\n\f\r]+/)[0];
      queueOrFetch(safeResolve(url, extractedSrcSet.resolve));
    }
  }
}
|
||||
|
||||
/**
 * Dispatches an auto-fetch message to the media / src / srcset handlers.
 *
 * Each section of the message is only processed when present, and for each
 * one `config.proxyMode` selects between the proxy-mode handler and the
 * regular handler. The regular src and srcset handlers additionally receive
 * `data.context`, defaulting to `{ docBaseURI: null }` when it is absent.
 *
 * @param {Object} data - message payload with optional `media`, `src`,
 *   `srcset` and `context` properties
 * @returns {void}
 */
function autoFetch(data) {
  // media rules (CSS text)
  if (data.media) {
    if (!config.proxyMode) {
      handleMedia(data.media);
    } else {
      handleMediaProxyMode(data.media);
    }
  }

  // src attribute values
  if (data.src) {
    if (!config.proxyMode) {
      handleSrc(data.src, data.context || { docBaseURI: null });
    } else {
      handleSrcProxyMode(data.src);
    }
  }

  // srcset attribute values
  if (data.srcset) {
    if (!config.proxyMode) {
      handleSrcset(data.srcset, data.context || { docBaseURI: null });
    } else {
      handleSrcsetProxyMode(data.srcset);
    }
  }
}
|
||||
|
||||
/**
 * Handles a message that contains nothing but URLs to fetch: every entry of
 * `data.values` is handed to `queueOrFetch` in order.
 *
 * @param {?{values: ?Array<string>}} data - message payload; a missing
 *   payload or a missing `values` list is ignored
 * @returns {void}
 */
function justFetch(data) {
  var hasValues = data != null && data.values != null;
  if (!hasValues) return;
  var urls = data.values;
  for (var idx = 0; idx < urls.length; idx += 1) {
    queueOrFetch(urls[idx]);
  }
}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user