Mirror of https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Compare commits
209 Commits
Commit SHA1s:

369e8a4657 66ad775188 fbed60ff38 d6b9058e3b bfe18aeaf1 6028e523f3
7ce00f001c 0e565889e1 01832c3cc5 ef774f5f29 c3ce3b160a 14d2a0c005
aef8ca7012 701b659510 10d36cc943 a65b8b82b9 6756ba60fa 2068c037ea
f00ca5c336 c0ea6ef00f f7d4286b54 56e0b17dc9 af52dec469 848c089afa
9fd5a22502 3d653e023c 4cb8e0d5dc a20ad226cb bc0da12c48 8f0039de02
c620d7dd19 4fbf523a3e 3b5d9d8ef0 5e779af2e9 a90c9c3dd4 99a825c055
c01d58df78 6eb2bd1265 d864ea91ee 83c109bc9b 1cc08233d6 ca02c22ff7
1fd3b2c7a1 ba14480a2d 50a4f35e5f 9973d28de9 ee9e375560 c008c2eca7
7958921053 329fef31a8 d253ea85c3 8418fe10ba fcd9b2b3bd 731cfe80cc
9521042a23 daa925db17 d96dd5d842 1e3d22aba4 5ae1291e37 05daafa19e
ade2373711 3a234d0cec 366ed5155f c027659001 9e8ea5bb45 bc3d1e6d00
6b372e2f3f 5d8fbf7038 a969430b37 aeecb6515f e1eddb8fa7 d7aec77597
bcaf293081 7d4c8dcb4e da089e0a92 3eeccd0016 5e5a74f204 b67f1ad0f3
e6a1a7dd7e e744075913 1476bfec8c b57ec9c589 e61099ff5f 0e23a31a31
7f406b7942 5f1c8c75fa e0732ffaf4 b8057825d8 e2e2c02802 f19ead0058
36784de174 ce1f32dc41 ae11daedc1 456698fe06 d90367f21f 8078ee7af9
c649355285 21351094ec edeae3b21a b34419543f 5e397e9bca d0b21f5dc4
36711c0148 a5e9c27223 de9219e646 5c15582be5 47731c61c1 90fba01514
a8cd53bfe4 ee6bc151e1 ca0197330d 469b41773a 91fcc054c4 3f5251ed60
f54e1b37c7 47ec5d7644 4ceebe1fa9 e88a88f247 f9c9443d2f ac959c6db5
ad652b407c fe19bb268f f77c152037 22d786f72e 52e83632dd 1f852f5f36
a34b7be431 d1b52f8d80 da9c4b0b4e af0fe2892c a09901dcef 407e890258
8460a670b2 6536516375 8f20fc014e 84a46e4323 88a7f79a7e a8cd219da7
2b408b3af0 1aa6b0c5d6 fce1c3d722 932001c921 a4253d5425 48d96fbc79
c0fcf59c86 79aab697e2 51c4f6d622 8c52bd8442 81a945e840 0abb1808b2
4ca10a22d8 740a80bfdb c7f8a8f223 2d6eefd8c6 76abe4b753 d133565061
6ee7ab36a2 957bd079e8 8c31ec2916 bbf3fad1dc f51f2ec225 2772b80fab
8ed93fea37 5b30dd4576 f0d2898326 89041e83b4 75e789c15f bbe41bc900
89d987a181 41d7f0be53 653dec71ae 1a8c719422 50d29bdf80 16489b99d9
dfc081fff8 ddcde36982 be7048844b 38d6e4337d de01d498cb 3298128e0c
f207e32f50 5de2569430 10327d28c9 0d268659ab 5ced2588d4 98b3c1f80b
21731a2dfe 7560c0946d 2ca84ae023 4893a8eac0 c048b05d46 ac3d238a3d
0cab6fc4bf 794cc29c80 5633ae6a9c 3f08639553 a25971e06b f2eebae641
a291de086d cb2a07bff2 1e0a0ca63a df7b46d94f 436a27b19e b0367a9c82
878ab0977f c8f1c64494 6e6b43eb79 c70bf2e2b9 adca46427d
.travis.yml (deleted, 70 lines)
@@ -1,70 +0,0 @@
-sudo: required
-
-language: python
-python:
-- 3.6
-- 3.5
-- 3.4
-- 2.7
-- pypy
-- pypy3
-- 3.7-dev
-- nightly
-
-matrix:
-  allow_failures:
-  - python: nightly
-  - python: 3.7-dev
-  - python: 2.7
-  - python: pypy
-
-addons:
-  apt:
-    packages:
-    - tor
-
-services:
-- docker
-
-before_install:
-- sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778
-- docker network create --driver=bridge trough
-- docker run --detach --network=trough --hostname=rethinkdb --name=rethinkdb --publish=28015:28015 rethinkdb
-- docker run --detach --network=trough --hostname=hadoop --name=hadoop chalimartines/cdh5-pseudo-distributed
-- docker run --detach --network=trough --hostname=trough --name=trough --volume="$PWD/tests/run-trough.sh:/run-trough.sh" --publish=6111:6111 --publish=6112:6112 --publish=6222:6222 --publish=6444:6444 python:3.6 bash /run-trough.sh
-- cat /etc/hosts
-- echo | sudo tee -a /etc/hosts # travis-ci default doesn't end with a newline 🙄
-- echo 127.0.0.1 rethinkdb | sudo tee -a /etc/hosts
-- echo 127.0.0.1 hadoop | sudo tee -a /etc/hosts
-- echo 127.0.0.1 trough | sudo tee -a /etc/hosts
-- cat /etc/hosts
-- ping -c2 trough
-
-install:
-- pip install . pytest requests warcio mock
-
-before_script:
-- docker exec trough bash -c 'while ! test -e /tmp/trough-read.out ; do sleep 0.5 ; done' || true
-- docker logs --timestamps --details trough
-- ps ww -fHe
-- docker ps
-
-script:
-- py.test -v --tb=native tests
-- py.test -v --tb=native --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests
-- py.test -v --tb=native --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests
-- py.test -v --tb=native --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests
-
-after_script:
-- ps ww -fHe
-- docker exec trough cat /tmp/trough-write.out
-- docker exec trough cat /tmp/trough-segment-manager-server.out
-- docker exec trough cat /tmp/trough-segment-manager-local.out
-- docker exec trough cat /tmp/trough-sync-server.out
-- docker exec trough cat /tmp/trough-sync-local.out
-- docker exec trough cat /tmp/trough-read.out
-
-notifications:
-  slack:
-    secure: UJzNe+kEJ8QhNxrdqObroisJAO2ipr+Sr2+u1e2euQdIkacyX+nZ88jSk6uDKniAemSfFDI8Ty5a7++2wSbE//Hr3jOSNOJMZLzockafzvIYrq9bP7V97j1gQ4u7liWd19VBnbf0pULuwEfy/n5PdOBR/TiPrgMuYjfZseV+alo=
-    secure: S1SK52178uywcWLMO4S5POdjMv1MQjR061CKprjVn2d8x5RBbg8QZtumA6Xt+pByvJzh8vk+ITHCN57tcdi51yL6Z0QauXwxwzTsZmjrhxWOybAO2uOHliqQSDgxKcbXIqJKg7Yv19eLQYWDVJVGuwlMfVBS0hOHtTTpVuLuGuc=
README.rst
@@ -1,7 +1,5 @@
 Warcprox - WARC writing MITM HTTP/S proxy
 *****************************************
-.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
-    :target: https://travis-ci.org/internetarchive/warcprox

 Warcprox is an HTTP proxy designed for web archiving applications. When used in
 parallel with `brozzler <https://github.com/internetarchive/brozzler>`_ it
@@ -89,12 +87,13 @@ for deduplication works similarly to deduplication by `Heritrix
 4. If not found,

    a. Write ``response`` record with full payload
-   b. Store new entry in deduplication database
+   b. Store new entry in deduplication database (can be disabled, see
+      `Warcprox-Meta HTTP request header <api.rst#warcprox-meta-http-request-header>`_)

 The deduplication database is partitioned into different "buckets". URLs are
 deduplicated only against other captures in the same bucket. If specified, the
-``dedup-bucket`` field of the `Warcprox-Meta HTTP request header
-<api.rst#warcprox-meta-http-request-header>`_ determines the bucket. Otherwise,
+``dedup-buckets`` field of the `Warcprox-Meta HTTP request header
+<api.rst#warcprox-meta-http-request-header>`_ determines the bucket(s). Otherwise,
 the default bucket is used.

 Deduplication can be disabled entirely by starting warcprox with the argument
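To make the new ``dedup-buckets`` semantics concrete, here is a minimal, illustrative sketch of how a client might send the header through warcprox. The proxy address, bucket names, and target URL are placeholders, not values taken from the repository; only the header structure reflects the change above.

    import json
    import requests

    # placeholder: warcprox assumed to be listening on localhost:8000
    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}

    warcprox_meta = {
        'warc-prefix': 'example-crawl',
        # per the README/api.rst change above: "rw" buckets are read and updated,
        # "ro" buckets are only consulted for deduplication lookups
        'dedup-buckets': {'example-bucket': 'rw', 'shared-bucket': 'ro'},
    }
    headers = {'Warcprox-Meta': json.dumps(warcprox_meta)}

    response = requests.get(
        'http://example.com/', proxies=proxies, headers=headers, verify=False)
    print(response.status_code)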
__init__.py (new, empty file)
api.rst
@@ -137,14 +137,16 @@ Example::

    Warcprox-Meta: {"warc-prefix": "special-warc"}

-``dedup-bucket`` (string)
+``dedup-buckets`` (string)
 ~~~~~~~~~~~~~~~~~~~~~~~~~
-Specifies the deduplication bucket. For more information about deduplication
+Specifies the deduplication bucket(s). For more information about deduplication
 see `<README.rst#deduplication>`_.

-Example::
+Examples::

-   Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"}
+   Warcprox-Meta: {"dedup-buckets":{"my-dedup-bucket":"rw"}}
+
+   Warcprox-Meta: {"dedup-buckets":{"my-dedup-bucket":"rw", "my-read-only-dedup-bucket": "ro"}}

 ``blocks`` (list)
 ~~~~~~~~~~~~~~~~~
@@ -184,6 +186,22 @@ to evaluate the block rules. In particular, this circumstance prevails when the
 browser controlled by brozzler is requesting images, javascript, css, and so
 on, embedded in a page.

+``compressed_blocks`` (string)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If the ``blocks`` header is large, it may be useful or necessary to compress it.
+``compressed_blocks`` is a string containing a zlib and base64-encoded
+``blocks`` list. If both ``blocks`` and ``compressed_blocks`` are provided,
+warcprox will use the value of ``compressed_blocks``, however this behavior
+is not guaranteed.
+
+Example::
+
+   Warcprox-Meta: {"compressed_blocks": "eJwVykEKgCAQQNGryKwt90F0kGgxlZSgzuCMFIR3r7b//fkBkVoUBgMbJetvTBy9de5U5cFBs+aBnRKG/D8J44XF91XAGpC6ipaQj58u7iIdIfd88oSbBsrjF6gqtOUFJ5YjwQ=="}
+
+Is equivalent to::
+
+   {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}
+
 ``stats`` (dictionary)
 ~~~~~~~~~~~~~~~~~~~~~~
 ``stats`` is a dictionary with only one field understood by warcprox,
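A minimal sketch of producing (and round-tripping) a ``compressed_blocks`` value as described above — zlib-compress the JSON-serialized ``blocks`` list, then base64-encode it. The block rules here are the ones from the api.rst example; everything else is ordinary standard-library usage, not warcprox API.

    import base64
    import json
    import zlib

    blocks = [
        {"ssurt": "com,example,//http:/"},
        {"domain": "malware.us", "substring": "wp-login.php?action=logout"},
    ]

    # zlib-compress the JSON form of the blocks list, then base64-encode it
    compressed_blocks = base64.b64encode(
        zlib.compress(json.dumps(blocks).encode('utf-8'))).decode('ascii')
    warcprox_meta = {"compressed_blocks": compressed_blocks}

    # round-trip: base64-decode, zlib-decompress, parse JSON
    decoded = json.loads(
        zlib.decompress(base64.b64decode(compressed_blocks)).decode('utf-8'))
    assert decoded == blocks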
pyproject.toml (new file, 28 lines)
@@ -0,0 +1,28 @@
+[project]
+name = "warcprox"
+authors = [
+    { name="Noah Levitt", email="nlevitt@archive.org" },
+]
+maintainers = [
+    { name="Vangelis Banos", email="vangelis@archive.org" },
+    { name="Adam Miller", email="adam@archive.org" },
+    { name="Barbara Miller", email="barbara@archive.org" },
+    { name="Alex Dempsey", email="avdempsey@archive.org" },
+]
+description = "WARC writing MITM HTTP/S proxy"
+readme = "README.rst"
+requires-python = ">=3.8"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+]
+dynamic = [ "version", "license", "scripts", "dependencies", "optional-dependencies" ]
+
+[project.urls]
+Homepage = "https://github.com/internetarchive/warcprox"
+Issues = "https://github.com/internetarchive/warcprox/issues"
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
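Since version, license, scripts, dependencies and optional-dependencies are declared ``dynamic``, the setuptools backend continues to take those values from the existing setup.py below; the new pyproject.toml only adds static project metadata and the build-system declaration.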
setup.py
@@ -2,7 +2,7 @@
 '''
 setup.py - setuptools installation configuration for warcprox

-Copyright (C) 2013-2019 Internet Archive
+Copyright (C) 2013-2024 Internet Archive

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -24,15 +24,17 @@ import sys
 import setuptools

 deps = [
-    'certauth==1.1.6',
-    'warctools>=4.10.0,<=4.10.0',
-    'urlcanon>=0.1.dev16,<=0.3.dev28',
-    'doublethink>=0.2.0.dev87,<=0.2.0.dev94',
-    'urllib3>=1.14,<=1.24.1',
-    'requests>=2.0.1,<=2.21.0',
-    'PySocks>=1.6.8,<=1.6.8',
-    'cryptography>=2.3,<=2.5',
-    'idna>=2.5,<=2.8',
+    'warctools>=4.10.0',
+    'urlcanon>=0.3.0',
+    'doublethink==0.4.9',
+    'urllib3>=1.23',
+    'requests>=2.0.1',
+    'PySocks>=1.6.8',
+    'cryptography>=39,<40',
+    'idna',
+    'PyYAML>=5.1',
+    'cachetools',
+    'rfc3986>=1.5.0',
 ]
 try:
     import concurrent.futures
@@ -41,7 +43,7 @@ except:

 setuptools.setup(
     name='warcprox',
-    version='2.4b6',
+    version='2.6.1',
     description='WARC writing MITM HTTP/S proxy',
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',
@@ -50,6 +52,8 @@ setuptools.setup(
     license='GPL',
     packages=['warcprox'],
     install_requires=deps,
+    # preferred trough 'trough @ git+https://github.com/internetarchive/trough.git@jammy_focal'
+    extras_require={'trough': 'trough'},
     setup_requires=['pytest-runner'],
     tests_require=['mock', 'pytest', 'warcio'],
     entry_points={
@@ -64,13 +68,12 @@ setuptools.setup(
         'Development Status :: 5 - Production/Stable',
         'Environment :: Console',
         'License :: OSI Approved :: GNU General Public License (GPL)',
-        'Programming Language :: Python :: 3.4',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
         'Topic :: Internet :: Proxy Servers',
         'Topic :: Internet :: WWW/HTTP',
         'Topic :: Software Development :: Libraries :: Python Modules',
         'Topic :: System :: Archiving',
     ])
-
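With the new ``extras_require`` entry, the optional trough deduplication backend can be pulled in as an extra, e.g. ``pip install warcprox[trough]`` (or ``pip install '.[trough]'`` from a checkout); the commented line records the preferred git source for trough.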
@@ -19,7 +19,7 @@
 # USA.
 #

-FROM phusion/baseimage
+FROM ubuntu:focal-20220404
 MAINTAINER Noah Levitt <nlevitt@archive.org>

 # see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
@@ -28,10 +28,11 @@ MAINTAINER Noah Levitt <nlevitt@archive.org>
 ENV LANG=C.UTF-8

 RUN apt-get update && apt-get --auto-remove -y dist-upgrade
+RUN apt-get install -y ca-certificates curl gnupg wget

 # Add the RethinkDB repository and public key
-RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
-    && echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
+RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add -
+RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \
     && apt-get update && apt-get -y install rethinkdb

 RUN mkdir -vp /etc/service/rethinkdb \
@@ -57,30 +58,59 @@ RUN mkdir -vp /etc/service/tor \
     && chmod a+x /etc/service/tor/run

 # hadoop hdfs for trough
-RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
-    && . /etc/lsb-release \
-    && echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list
-
-RUN apt-get update
-RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
-
-RUN su hdfs -c 'hdfs namenode -format'
-
-RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
-    && cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
-
-RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
-    && cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$, <property>\n <name>dfs.permissions.enabled</name>\n <value>false</value>\n </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
-
-RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
-    && chmod a+x /etc/my_init.d/50_start_hdfs.sh
-
-RUN apt-get install -y libsqlite3-dev
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get install -y openjdk-8-jdk openssh-server
+
+# set java home
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+
+# setup ssh with no passphrase
+RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \
+    && cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
+
+RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \
+    && tar xfz hadoop-2.7.3.tar.gz \
+    && mv /hadoop-2.7.3 /usr/local/hadoop \
+    && rm /hadoop-2.7.3.tar.gz
+
+# hadoop environment variables
+ENV HADOOP_HOME=/usr/local/hadoop
+ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+
+# hadoop-store
+RUN mkdir -p $HADOOP_HOME/hdfs/namenode \
+    && mkdir -p $HADOOP_HOME/hdfs/datanode
+
+# Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html
+COPY config/ /tmp/
+RUN mv /tmp/ssh_config $HOME/.ssh/config \
+    && mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
+    && mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \
+    && mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
+    && mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \
+    && cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \
+    && mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+
+# Add startup script
+ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh
+
+# set permissions
+RUN chmod 744 -R $HADOOP_HOME
+
+# format namenode
+RUN $HADOOP_HOME/bin/hdfs namenode -format
+
+# run hadoop services
+#ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash
+
+RUN apt-get install -y libsqlite3-dev build-essential

 # trough itself
 RUN virtualenv -p python3 /opt/trough-ve3 \
     && . /opt/trough-ve3/bin/activate \
-    && pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string \
+    && pip install git+https://github.com/nlevitt/snakebite.git@py3 \
     && pip install git+https://github.com/internetarchive/trough.git

 RUN mkdir -vp /etc/service/trough-sync-local \
@@ -107,3 +137,4 @@ RUN mkdir -vp /etc/service/trough-segment-manager-server \
     && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \
     && chmod a+x /etc/service/trough-segment-manager-server/run
+
+RUN apt-get install -y daemontools daemontools-run
@@ -31,15 +31,18 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

 docker build -t internetarchive/warcprox-tests $script_dir

-docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
+docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \
         bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
             && (cd /warcprox && git diff HEAD) | patch -p1 \
             && virtualenv -p python3 /tmp/venv \
             && source /tmp/venv/bin/activate \
-            && pip --log-file /tmp/pip.log install . pytest mock requests warcio \
-            && py.test -v tests \
-            && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
+            && pip --log-file /tmp/pip.log install . pytest mock requests warcio trough \
+            && py.test -v tests; \
+            svscan /etc/service & \
+            sleep 10; \
+            py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
             && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
+            && /usr/local/hadoop/hadoop-services.sh \
             && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
         "
@@ -5,7 +5,7 @@

 set -x

-pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string
+pip install git+https://github.com/nlevitt/snakebite.git@py3
 pip install git+https://github.com/internetarchive/trough.git

 mkdir /etc/trough
tests/test_certauth.py (new file, 89 lines)
@@ -0,0 +1,89 @@
+import os
+import shutil
+
+from warcprox.certauth import main, CertificateAuthority
+import tempfile
+from OpenSSL import crypto
+import datetime
+import time
+
+def setup_module():
+    global TEST_CA_DIR
+    TEST_CA_DIR = tempfile.mkdtemp()
+
+    global TEST_CA_ROOT
+    TEST_CA_ROOT = os.path.join(TEST_CA_DIR, 'certauth_test_ca.pem')
+
+def teardown_module():
+    shutil.rmtree(TEST_CA_DIR)
+    assert not os.path.isdir(TEST_CA_DIR)
+    assert not os.path.isfile(TEST_CA_ROOT)
+
+def test_create_root():
+    ret = main([TEST_CA_ROOT, '-c', 'Test Root Cert'])
+    assert ret == 0
+
+def test_create_host_cert():
+    ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '-n', 'example.com'])
+    assert ret == 0
+    certfile = os.path.join(TEST_CA_DIR, 'example.com.pem')
+    assert os.path.isfile(certfile)
+
+def test_create_wildcard_host_cert_force_overwrite():
+    ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '--hostname', 'example.com', '-w', '-f'])
+    assert ret == 0
+    certfile = os.path.join(TEST_CA_DIR, 'example.com.pem')
+    assert os.path.isfile(certfile)
+
+def test_explicit_wildcard():
+    ca = CertificateAuthority(TEST_CA_ROOT, TEST_CA_DIR, 'Test CA')
+    filename = ca.get_wildcard_cert('test.example.proxy')
+    certfile = os.path.join(TEST_CA_DIR, 'example.proxy.pem')
+    assert filename == certfile
+    assert os.path.isfile(certfile)
+    os.remove(certfile)
+
+def test_create_already_exists():
+    ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '-n', 'example.com', '-w'])
+    assert ret == 1
+    certfile = os.path.join(TEST_CA_DIR, 'example.com.pem')
+    assert os.path.isfile(certfile)
+    # remove now
+    os.remove(certfile)
+
+def test_create_root_already_exists():
+    ret = main([TEST_CA_ROOT])
+    # not created, already exists
+    assert ret == 1
+    # remove now
+    os.remove(TEST_CA_ROOT)
+
+def test_create_root_subdir():
+    # create a new cert in a subdirectory
+    subdir = os.path.join(TEST_CA_DIR, 'subdir')
+
+    ca_file = os.path.join(subdir, 'certauth_test_ca.pem')
+
+    ca = CertificateAuthority(ca_file, subdir, 'Test CA',
+                              cert_not_before=-60 * 60,
+                              cert_not_after=60 * 60 * 24 * 3)
+
+    assert os.path.isdir(subdir)
+    assert os.path.isfile(ca_file)
+
+    buff = ca.get_root_PKCS12()
+    assert len(buff) > 0
+
+    expected_not_before = datetime.datetime.utcnow() - datetime.timedelta(seconds=60 * 60)
+    expected_not_after = datetime.datetime.utcnow() + datetime.timedelta(seconds=60 * 60 * 24 * 3)
+
+    cert = crypto.load_pkcs12(buff).get_certificate()
+
+    actual_not_before = datetime.datetime.strptime(
+        cert.get_notBefore().decode('ascii'), '%Y%m%d%H%M%SZ')
+    actual_not_after = datetime.datetime.strptime(
+        cert.get_notAfter().decode('ascii'), '%Y%m%d%H%M%SZ')
+
+    time.mktime(expected_not_before.utctimetuple())
+    assert abs((time.mktime(actual_not_before.utctimetuple()) - time.mktime(expected_not_before.utctimetuple()))) < 10
+    assert abs((time.mktime(actual_not_after.utctimetuple()) - time.mktime(expected_not_after.utctimetuple()))) < 10
@@ -52,6 +52,7 @@ import mock
 import email.message
 import socketserver
 from concurrent import futures
+import urllib.parse

 try:
     import http.server as http_server
@@ -67,6 +68,7 @@ import certauth.certauth

 import warcprox
 import warcprox.main
+import warcprox.crawl_log as crawl_log

 try:
     import http.client as http_client
@@ -93,9 +95,11 @@ logging.basicConfig(
         stream=sys.stdout, level=logging.TRACE,
         format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
         '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

+logging.getLogger("urllib3").setLevel(logging.WARN)
 logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
-warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
-warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
+import urllib3 ; urllib3.disable_warnings()
+import requests.packages.urllib3 ; requests.packages.urllib3.disable_warnings()

 def wait(callback, timeout=10):
     start = time.time()
@@ -144,7 +148,7 @@ def dump_state(signum=None, frame=None):
         stack = traceback.format_stack(sys._current_frames()[th.ident])
         state_strs.append("".join(stack))

-    logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))
+    logging.warning("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))

 signal.signal(signal.SIGQUIT, dump_state)

@@ -173,8 +177,10 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
     def build_response(self):
         m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
         if m is not None:
-            special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
-            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
+            seg1 = urllib.parse.unquote(m.group(1))
+            seg2 = urllib.parse.unquote(m.group(2))
+            special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8')
+            payload = 'I am the warcprox test payload! {}!\n'.format(10*seg2).encode('utf-8')
             headers = (b'HTTP/1.1 200 OK\r\n'
                        + b'Content-Type: text/plain\r\n'
                        + special_header + b'\r\n'
@@ -279,6 +285,21 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
             payload = b'Test.'
             actual_headers = (b'Content-Type: text/plain\r\n'
                               + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
+        elif self.path == '/incomplete-read':
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                       + b'Content-Type: text/plain\r\n'
+                       + b'Transfer-Encoding: chunked\r\n'
+                       + b'\r\n')
+            # payload = b'''1\r\na'''
+            payload = chunkify(
+                b'Server closes connection when client expects next chunk')
+            payload = payload[:-7]
+        elif self.path == '/space_in_content_type':
+            payload = b'test'
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                       + b'Content-Type: \r\n'
+                       + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                       + b'\r\n')
         else:
             payload = b'404 Not Found\n'
             headers = (b'HTTP/1.1 404 Not Found\r\n'
@@ -292,7 +313,9 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
         headers, payload = self.build_response()
         self.connection.sendall(headers)
         self.connection.sendall(payload)
-        if self.path in ('/missing-content-length', '/empty-response'):
+        if self.path in (
+                '/missing-content-length', '/empty-response',
+                '/incomplete-read'):
             # server must close the connection, else client has no idea if
             # there is more data coming
             self.connection.shutdown(socket.SHUT_RDWR)
@@ -446,7 +469,7 @@ def warcprox_(request, http_daemon, https_daemon):
             logging.info('dropping rethinkdb database %r', parsed.database)
             rr.db_drop(parsed.database).run()
         except Exception as e:
-            logging.warn(
+            logging.warning(
                 'problem deleting rethinkdb database %r: %s',
                 parsed.database, e)
     logging.info('deleting working directory %r', work_dir)
@@ -777,7 +800,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port)

     # archive url1 bucket_a
-    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_a"})}
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-buckets":{"bucket_a":"rw"}})}
     response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers)
     assert response.status_code == 200
     assert response.headers['warcprox-test-header'] == 'k!'
@@ -803,7 +826,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     assert dedup_lookup is None

     # archive url2 bucket_b
-    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_b"})}
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-buckets":{"bucket_b":""}})}
     response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers)
     assert response.status_code == 200
     assert response.headers['warcprox-test-header'] == 'k!'
@@ -903,6 +926,71 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     finally:
         fh.close()

+def test_dedup_buckets_readonly(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies):
+    urls_before = warcprox_.proxy.running_stats.urls
+
+    url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port)
+
+    # archive url1
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets_readonly",
+                                            "dedup-buckets":{"bucket_1":"rw", "bucket_2":"ro"}})
+               }
+    response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'k!'
+    assert response.content == b'I am the warcprox test payload! llllllllll!\n'
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
+
+    # check url1 in dedup db bucket_1 (rw)
+    # logging.info('looking up sha1:bc3fac8847c9412f49d955e626fb58a76befbf81 in bucket_1')
+    dedup_lookup = warcprox_.dedup_db.lookup(
+        b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_1")
+    assert dedup_lookup
+    assert dedup_lookup['url'] == url1.encode('ascii')
+    assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id'])
+    assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date'])
+    record_id = dedup_lookup['id']
+    dedup_date = dedup_lookup['date']
+
+    # check url1 not in dedup db bucket_2 (ro)
+    dedup_lookup = warcprox_.dedup_db.lookup(
+        b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_2")
+    assert dedup_lookup is None
+
+    # close the warc
+    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"]
+    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"]
+    warc_path = os.path.join(writer.directory, writer.finalname)
+    assert not os.path.exists(warc_path)
+    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"].close()
+    assert os.path.exists(warc_path)
+
+    # read the warc
+    fh = warctools.ArchiveRecord.open_archive(warc_path)
+    record_iter = fh.read_records(limit=None, offsets=True)
+    try:
+        (offset, record, errors) = next(record_iter)
+        assert record.type == b'warcinfo'
+
+        # url1 bucket_1
+        (offset, record, errors) = next(record_iter)
+        assert record.type == b'response'
+        assert record.url == url1.encode('ascii')
+        # check for duplicate warc record headers
+        assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
+        assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
+        (offset, record, errors) = next(record_iter)
+        assert record.type == b'request'
+
+        # that's all folks
+        assert next(record_iter)[1] == None
+        assert next(record_iter, None) == None
+
+    finally:
+        fh.close()
+
 def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
     revisits_before = warcprox_.proxy.stats_db.value(
@@ -915,7 +1003,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
             http_daemon.server_port, i)
         headers = {"Warcprox-Meta": json.dumps({
             "warc-prefix":"test_dedup_buckets",
-            "dedup-bucket":"bucket_%s" % i})}
+            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
         pool.submit(
             requests.get, url, proxies=archiving_proxies, verify=False,
             headers=headers)
@@ -931,7 +1019,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
             http_daemon.server_port, -i - 1)
         headers = {"Warcprox-Meta": json.dumps({
             "warc-prefix":"test_dedup_buckets",
-            "dedup-bucket":"bucket_%s" % i})}
+            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
         pool.submit(
             requests.get, url, proxies=archiving_proxies, verify=False,
             headers=headers)
@@ -946,7 +1034,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
             http_daemon.server_port, i)
         headers = {"Warcprox-Meta": json.dumps({
             "warc-prefix":"test_dedup_buckets",
-            "dedup-bucket":"bucket_%s" % i})}
+            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
         pool.submit(
             requests.get, url, proxies=archiving_proxies, verify=False,
             headers=headers)
@@ -965,12 +1053,12 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
         },
         {
             "url_match": "SURT_MATCH",
-            "value": "http://(localhost:%s,)/fuh/" % (http_daemon.server_port),
+            "value": "http://(localhost,:%s)/fuh/" % (http_daemon.server_port),
         },
         {
             "url_match": "SURT_MATCH",
             # this rule won't match because of http scheme, https port
-            "value": "http://(localhost:%s,)/fuh/" % (https_daemon.server_port),
+            "value": "http://(localhost,:%s)/fuh/" % (https_daemon.server_port),
         },
         {
             "domain": "bad.domain.com",
@@ -1273,7 +1361,7 @@ def test_domain_data_soft_limit(
     warcprox_.proxy.remote_connection_pool.clear()

     # novel, pushes stats over the limit
-    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port)
+    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/%7E'.format(https_daemon.server_port)
     response = requests.get(
         url, proxies=archiving_proxies, headers=headers, stream=True,
         verify=False)
@@ -1400,7 +1488,7 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
     assert not 'content-length' in response.headers

     # wait for postfetch chain
-    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2, timeout=20)

 def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
     """We try to load a 300k response but we use --max-resource-size=200000 in
@@ -1487,7 +1575,7 @@ def test_dedup_ok_flag(
     assert dedup_lookup is None

     # archive with dedup_ok:False
-    request_meta = {'dedup-bucket':'test_dedup_ok_flag','dedup-ok':False}
+    request_meta = {'dedup-buckets':{'test_dedup_ok_flag':''},'dedup-ok':False}
     headers = {'Warcprox-Meta': json.dumps(request_meta)}
     response = requests.get(
         url, proxies=archiving_proxies, headers=headers, verify=False)
@@ -1505,7 +1593,7 @@ def test_dedup_ok_flag(
     assert dedup_lookup is None

     # archive without dedup_ok:False
-    request_meta = {'dedup-bucket':'test_dedup_ok_flag'}
+    request_meta = {'dedup-buckets':{'test_dedup_ok_flag':''}}
     headers = {'Warcprox-Meta': json.dumps(request_meta)}
     response = requests.get(
         url, proxies=archiving_proxies, headers=headers, verify=False)
@@ -1611,13 +1699,11 @@ def test_controller_with_defaults():
     assert not wwp.writer_pool.default_warc_writer.record_builder.base32
     assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
-
-
 class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor):
     CHAIN_POSITION = 'early'
     def _process_url(self):
         pass


 def test_load_plugin():
     options = warcprox.Options(port=0, plugins=[
         'warcprox.stats.RunningStats',
@@ -1713,13 +1799,13 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
     url = 'http://localhost:%s/b/b' % http_daemon.server_port
     headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})}
     response = requests.get(url, proxies=archiving_proxies, headers=headers)
-    assert response.status_code == 500
+    assert response.status_code == 400
     assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'

     url = 'http://localhost:%s/b/c' % http_daemon.server_port
     headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})}
     response = requests.get(url, proxies=archiving_proxies, headers=headers)
-    assert response.status_code == 500
+    assert response.status_code == 400
     assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'

 def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
@@ -1762,7 +1848,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):

     crawl_log = open(default_crawl_log_path, 'rb').read()
     # tests will fail in year 3000 :)
-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b' 200 '
     assert crawl_log[31:42] == b' 54 '
     fields = crawl_log.split()
@@ -1782,7 +1868,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert extra_info['contentSize'] == 145

     crawl_log_1 = open(file, 'rb').read()
-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_1)
     assert crawl_log_1[24:31] == b' 200 '
     assert crawl_log_1[31:42] == b' 54 '
     fields = crawl_log_1.split()
@@ -1820,7 +1906,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):

     crawl_log_2 = open(file, 'rb').read()

-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_2)
     assert crawl_log_2[24:31] == b' 200 '
     assert crawl_log_2[31:42] == b' 54 '
     fields = crawl_log_2.split()
@@ -1853,7 +1939,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):

     assert os.path.exists(file)
     crawl_log_3 = open(file, 'rb').read()
-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_3)
     assert crawl_log_3[24:31] == b' 200 '
     assert crawl_log_3[31:42] == b' 0 '
     fields = crawl_log_3.split()
@@ -1893,7 +1979,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert os.path.exists(file)
     crawl_log_4 = open(file, 'rb').read()

-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_4)
     assert crawl_log_4[24:31] == b' 204 '
     assert crawl_log_4[31:42] == b' 38 '
     fields = crawl_log_4.split()
@ -1913,6 +1999,155 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
|
|||||||
assert extra_info['contentSize'] == 38
|
assert extra_info['contentSize'] == 38
|
||||||
assert extra_info['method'] == 'WARCPROX_WRITE_RECORD'
|
assert extra_info['method'] == 'WARCPROX_WRITE_RECORD'
|
||||||
|
|
||||||
|
#Empty spae for Content Type
|
||||||
|
    url = 'http://localhost:%s/space_in_content_type' % http_daemon.server_port
    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_5'})}
    response = requests.get(url, proxies=archiving_proxies, headers=headers)

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 6)

    file = os.path.join(
        warcprox_.options.crawl_log_dir,
        'test_crawl_log_5-%s-%s.log' % (hostname, port))

    assert os.path.exists(file)
    crawl_log_5 = open(file, 'rb').read()
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_5)
    assert crawl_log_5[24:31] == b' 200 '
    assert crawl_log_5[31:42] == b' 4 '
    fields = crawl_log_5.split()
    assert len(fields) == 13
    assert fields[3].endswith(b'/space_in_content_type')
    assert fields[4] == b'-'
    assert fields[5] == b'-'
    assert fields[6] == b'-'
    assert fields[7] == b'-'
    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
    assert fields[9] == b'sha1:a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'
    assert fields[10] == b'-'
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))
    assert set(extra_info.keys()) == {
        'contentSize', 'warcFilename', 'warcFileOffset'}
    assert extra_info['contentSize'] == 59

    # Fetch Exception
    url = 'http://localhost-doesnt-exist:%s/connection-error' % http_daemon.server_port
    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_6'})}
    response = requests.get(url, proxies=archiving_proxies, headers=headers)

    # Verify the connection is cleaned up properly after the exception
    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
    response = requests.get(url, proxies=archiving_proxies)
    assert response.status_code == 200

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 7)

    file = os.path.join(
        warcprox_.options.crawl_log_dir,
        'test_crawl_log_6-%s-%s.log' % (hostname, port))

    assert os.path.exists(file)
    crawl_log_6 = open(file, 'rb').read()
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_6)

    # seems to vary depending on the environment
    assert crawl_log_6[24:31] == b' -6 ' or crawl_log_6[24:31] == b' -2 '
    assert crawl_log_6[31:42] == b' 0 '
    fields = crawl_log_6.split()
    assert len(fields) == 13
    assert fields[3].endswith(b'/connection-error')
    assert fields[4] == b'-'
    assert fields[5] == b'-'
    assert fields[6] == b'-'
    assert fields[7] == b'-'
    assert fields[8] == b'-'
    assert fields[9] == b'-'
    assert fields[10] == b'-'
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))
    assert set(extra_info.keys()) == {'exception'}

    # Test the same bad server to check for -404
    url = 'http://localhost-doesnt-exist:%s/connection-error' % http_daemon.server_port
    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_7'})}
    response = requests.get(url, proxies=archiving_proxies, headers=headers)

    # Verify the connection is cleaned up properly after the exception
    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
    response = requests.get(url, proxies=archiving_proxies)
    assert response.status_code == 200

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 8)

    file = os.path.join(
        warcprox_.options.crawl_log_dir,
        'test_crawl_log_7-%s-%s.log' % (hostname, port))

    assert os.path.exists(file)
    crawl_log_7 = open(file, 'rb').read()
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_7)
    assert crawl_log_7[24:31] == b' -404 '
    assert crawl_log_7[31:42] == b' 0 '
    fields = crawl_log_7.split()
    assert len(fields) == 13
    assert fields[3].endswith(b'/connection-error')
    assert fields[4] == b'-'
    assert fields[5] == b'-'
    assert fields[6] == b'-'
    assert fields[7] == b'-'
    assert fields[8] == b'-'
    assert fields[9] == b'-'
    assert fields[10] == b'-'
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))
    assert set(extra_info.keys()) == {'exception'}

    # Verify non-ascii urls are encoded properly
    url = 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port
    headers = {
        "Warcprox-Meta": json.dumps({"warc-prefix": "test_crawl_log_8",
            "metadata": {'seed': 'http://example.com/¶-non-ascii', 'hop_path': 'L', 'brozzled_url': 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port, 'hop_via_url': 'http://чунджа.kz/b/¶-non-ascii'}}),
    }
    response = requests.get(url, proxies=archiving_proxies, headers=headers)
    assert response.status_code == 200

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 9)

    file = os.path.join(
        warcprox_.options.crawl_log_dir,
        'test_crawl_log_8-%s-%s.log' % (hostname, port))

    assert os.path.exists(file)
    crawl_log_8 = open(file, 'rb').read()
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_8)
    assert crawl_log_8[24:31] == b' 200 '
    assert crawl_log_8[31:42] == b' 154 '
    fields = crawl_log_8.split()
    assert len(fields) == 13
    assert fields[3].endswith(b'/b/%C2%B6-non-ascii')
    assert fields[4] == b'L'
    assert fields[5].endswith(b'http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii')
    assert fields[6] == b'text/plain'
    assert fields[7] == b'-'
    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
    assert fields[9] == b'sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439'
    assert fields[10].endswith('/¶-non-ascii'.encode('utf-8'))
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))
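    # Illustrative sketch (not part of the test suite): a rough map of the
    # crawl log field positions asserted by index above. Only the positions
    # follow from the assertions and the CrawlLogger code; the names here are
    # approximate, borrowed from the Heritrix-style crawl log layout.
    CRAWL_LOG_FIELDS = [
        'log_timestamp', 'status', 'content_length', 'url', 'hop_path',
        'via_url', 'mimetype', 'worker_thread', 'fetch_timestamp',
        'payload_digest', 'seed', 'annotations', 'extra_info_json']
    assert len(CRAWL_LOG_FIELDS) == 13
    assert CRAWL_LOG_FIELDS.index('hop_path') == 4
    assert CRAWL_LOG_FIELDS.index('payload_digest') == 9
    assert CRAWL_LOG_FIELDS.index('extra_info_json') == 12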
def test_crawl_log_canonicalization():
    assert crawl_log.canonicalize_url(None) is None
    assert crawl_log.canonicalize_url("") == ''
    assert crawl_log.canonicalize_url("-") == '-'
    assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"
    assert crawl_log.canonicalize_url("Not a URL") == "Not a URL"
def test_long_warcprox_meta(
        warcprox_, http_daemon, archiving_proxies, playback_proxies):
    urls_before = warcprox_.proxy.running_stats.urls
@@ -1975,6 +2210,10 @@ def test_socket_timeout_response(

def test_empty_response(
        warcprox_, http_daemon, https_daemon, archiving_proxies,
        playback_proxies):
+    # localhost:server_port was added to the `bad_hostnames_ports` cache by
+    # previous tests and this causes subsequent tests to fail. We clear it.
+    warcprox_.proxy.bad_hostnames_ports.clear()

    url = 'http://localhost:%s/empty-response' % http_daemon.server_port
    response = requests.get(url, proxies=archiving_proxies, verify=False)
    assert response.status_code == 502
@@ -1990,6 +2229,10 @@ def test_payload_digest(warcprox_, http_daemon):
    Tests that digest is of RFC2616 "entity body"
    (transfer-decoded but not content-decoded)
    '''
+    # localhost:server_port was added to the `bad_hostnames_ports` cache by
+    # previous tests and this causes subsequent tests to fail. We clear it.
+    warcprox_.proxy.bad_hostnames_ports.clear()

    class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler):
        def __init__(self, url):
            self.path = url
@@ -2045,24 +2288,6 @@ def test_payload_digest(warcprox_, http_daemon):
    req, prox_rec_res = mitm.do_GET()
    assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_GZIP_SHA1

-def test_trough_segment_promotion(warcprox_):
-    if not warcprox_.options.rethinkdb_trough_db_url:
-        return
-    cli = warcprox.trough.TroughClient(
-        warcprox_.options.rethinkdb_trough_db_url, 3)
-    promoted = []
-    def mock(segment_id):
-        promoted.append(segment_id)
-    cli.promote = mock
-    cli.register_schema('default', 'create table foo (bar varchar(100))')
-    cli.write('my_seg', 'insert into foo (bar) values ("boof")')
-    assert promoted == []
-    time.sleep(3)
-    assert promoted == ['my_seg']
-    promoted = []
-    time.sleep(3)
-    assert promoted == []
-
def test_dedup_min_text_size(http_daemon, warcprox_, archiving_proxies):
    """We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we
    try to download content smaller than these limits to make sure that it is
@@ -2223,6 +2448,23 @@ def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies):
    with pytest.raises(StopIteration):
        next(rec_iter)

+def test_incomplete_read(http_daemon, warcprox_, archiving_proxies):
+    urls_before = warcprox_.proxy.running_stats.urls
+
+    # see https://github.com/internetarchive/warcprox/pull/123
+    url = 'http://localhost:%s/incomplete-read' % http_daemon.server_port
+    with pytest.raises(requests.exceptions.ChunkedEncodingError):
+        response = requests.get(
+            url, proxies=archiving_proxies, verify=False, timeout=10)
+
+    # although `requests.get` raises exception here, other clients like
+    # browsers put up with the server misbehavior; warcprox does too, and will
+    # record the response verbatim in the warc; this `wait()` call tests
+    # that a warc record is written
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
+
if __name__ == '__main__':
    pytest.main()
@@ -1,7 +1,7 @@
"""
warcprox/__init__.py - warcprox package main file, contains some utility code

-Copyright (C) 2013-2019 Internet Archive
+Copyright (C) 2013-2021 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -78,6 +78,15 @@ class RequestBlockedByRule(Exception):
    def __str__(self):
        return "%s: %s" % (self.__class__.__name__, self.msg)

+class BadRequest(Exception):
+    '''
+    Raised in case of a request deemed unacceptable by warcprox.
+    '''
+    def __init__(self, msg):
+        self.msg = msg
+    def __str__(self):
+        return "%s: %s" % (self.__class__.__name__, self.msg)
+
class BasePostfetchProcessor(threading.Thread):
    logger = logging.getLogger("warcprox.BasePostfetchProcessor")
@@ -166,8 +175,10 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor):

class BaseBatchPostfetchProcessor(BasePostfetchProcessor):
    MAX_BATCH_SIZE = 500
-    MAX_BATCH_SEC = 10
-    MIN_BATCH_SEC = 2.0
+    MAX_BATCH_SEC = 60
+    MIN_BATCH_SEC = 30
+    # these updated batch seconds values have resulted in fewer reported dedup
+    # errors and otherwise have worked well in qa

    def _get_process_put(self):
        batch = []
@@ -33,7 +33,7 @@ import hashlib
import threading
import datetime
import doublethink
-import rethinkdb as r
+from rethinkdb import RethinkDB; r = RethinkDB()
from warcprox.dedup import DedupableMixin

class RethinkCaptures:
@@ -71,7 +71,7 @@ class RethinkCaptures:
                    "unexpected result saving batch of %s: %s "
                    "entries" % (len(self._batch), result))
            if result["replaced"] > 0 or result["unchanged"] > 0:
-                self.logger.warn(
+                self.logger.warning(
                        "inserted=%s replaced=%s unchanged=%s in big "
                        "captures table (normally replaced=0 and "
                        "unchanged=0)", result["inserted"],
@@ -148,7 +148,7 @@ class RethinkCaptures:
                        recorded_url.payload_digest.digest()
                    ).decode("utf-8")
            else:
-                self.logger.warn(
+                self.logger.warning(
                        "digest type is %r but big captures table is indexed "
                        "by sha1",
                        recorded_url.payload_digest.name)
@@ -157,8 +157,11 @@ class RethinkCaptures:
            sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

        if (recorded_url.warcprox_meta
-                and "dedup-bucket" in recorded_url.warcprox_meta):
-            bucket = recorded_url.warcprox_meta["dedup-bucket"]
+                and "dedup-buckets" in recorded_url.warcprox_meta):
+            for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                if not bucket_mode == 'ro':
+                    # maybe this is the right thing to do here? or should we return an entry for each? or ?
+                    break
        else:
            bucket = "__unspecified__"
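# Illustrative sketch (not part of this changeset): the shape of the new
# "dedup-buckets" field in the Warcprox-Meta request header that replaces the
# old single "dedup-bucket" string. Bucket names here are made up; per the
# code above, a value of 'ro' means the bucket is only consulted for dedup
# lookups, while any other value (e.g. 'rw') also allows new captures to be
# saved to it.
import json

warcprox_meta = {
    'warc-prefix': 'job1',
    'dedup-buckets': {'shared-collection': 'ro', 'job1-bucket': 'rw'},
}
headers = {'Warcprox-Meta': json.dumps(warcprox_meta)}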
warcprox/certauth.py (new file, 278 lines)
@@ -0,0 +1,278 @@
import logging
import os
import random
from argparse import ArgumentParser
from datetime import datetime, timedelta
import threading

from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID

# =================================================================
# Valid for 3 years from now
# Max validity is 39 months:
# https://casecurity.org/2015/02/19/ssl-certificate-validity-periods-limited-to-39-months-starting-in-april/
CERT_NOT_AFTER = 3 * 365 * 24 * 60 * 60

CERTS_DIR = './ca/certs/'

CERT_NAME = 'certauth sample CA'

DEF_HASH_FUNC = hashes.SHA256()


# =================================================================
class CertificateAuthority(object):
    """
    Utility class for signing individual certificate
    with a root cert.

    Static generate_ca_root() method for creating the root cert

    All certs saved on filesystem. Individual certs are stored
    in specified certs_dir and reused if previously created.
    """

    def __init__(self, ca_file, certs_dir, ca_name,
                 overwrite=False,
                 cert_not_before=0,
                 cert_not_after=CERT_NOT_AFTER):

        assert(ca_file)
        self.ca_file = ca_file

        assert(certs_dir)
        self.certs_dir = certs_dir

        assert(ca_name)
        self.ca_name = ca_name

        self._file_created = False

        self.cert_not_before = cert_not_before
        self.cert_not_after = cert_not_after

        if not os.path.exists(certs_dir):
            os.makedirs(certs_dir)

        # if file doesn't exist or overwrite is true
        # create new root cert
        if (overwrite or not os.path.isfile(ca_file)):
            self.cert, self.key = self.generate_ca_root(ca_file, ca_name)
            self._file_created = True

        # read previously created root cert
        else:
            self.cert, self.key = self.read_pem(ca_file)

        self._lock = threading.Lock()

    def cert_for_host(self, host, overwrite=False, wildcard=False):
        with self._lock:
            host_filename = os.path.join(self.certs_dir, host) + '.pem'

            if not overwrite and os.path.exists(host_filename):
                self._file_created = False
                return host_filename

            self.generate_host_cert(host, self.cert, self.key, host_filename,
                                    wildcard)

            self._file_created = True
            return host_filename

    def get_wildcard_cert(self, cert_host):
        host_parts = cert_host.split('.', 1)
        if len(host_parts) == 2 and '.' in host_parts[1]:
            cert_host = host_parts[1]

        certfile = self.cert_for_host(cert_host,
                                      wildcard=True)

        return certfile

    def get_root_PKCS12(self):
        return serialization.pkcs12.serialize_key_and_certificates(
            name=b"root",
            key=self.key,
            cert=self.cert,
            cas=None,
            encryption_algorithm=serialization.NoEncryption()
        )

    def _make_cert(self, certname):
        subject = issuer = x509.Name([
            x509.NameAttribute(NameOID.COMMON_NAME, certname),
        ])
        cert = x509.CertificateBuilder().subject_name(
            subject
        ).issuer_name(
            issuer
        ).public_key(
            self.key.public_key()
        ).serial_number(
            random.randint(0, 2**64 - 1)
        ).not_valid_before(
            datetime.utcnow()
        ).not_valid_after(
            datetime.utcnow() + timedelta(seconds=self.cert_not_after)
        ).add_extension(
            x509.BasicConstraints(ca=True, path_length=0), critical=True,
        ).add_extension(
            x509.KeyUsage(key_cert_sign=True, crl_sign=True, digital_signature=False,
                          content_commitment=False, key_encipherment=False,
                          data_encipherment=False, key_agreement=False, encipher_only=False,
                          decipher_only=False), critical=True
        ).add_extension(
            x509.SubjectKeyIdentifier.from_public_key(self.key.public_key()), critical=False
        ).sign(self.key, DEF_HASH_FUNC, default_backend())
        return cert

    def generate_ca_root(self, ca_file, ca_name, hash_func=DEF_HASH_FUNC):
        # Generate key
        key = rsa.generate_private_key(
            public_exponent=65537,
            key_size=2048,
            backend=default_backend()
        )

        # Generate cert
        self.key = key
        cert = self._make_cert(ca_name)

        # Write cert + key
        self.write_pem(ca_file, cert, key)
        return cert, key

    def generate_host_cert(self, host, root_cert, root_key, host_filename,
                           wildcard=False, hash_func=DEF_HASH_FUNC):

        host = host.encode('utf-8')

        # Generate CSR
        csr = x509.CertificateSigningRequestBuilder().subject_name(
            x509.Name([
                x509.NameAttribute(NameOID.COMMON_NAME, host.decode('utf-8')),
            ])
        ).sign(self.key, hash_func, default_backend())

        # Generate Cert
        cert_builder = x509.CertificateBuilder().subject_name(
            csr.subject
        ).issuer_name(
            root_cert.subject
        ).public_key(
            csr.public_key()
        ).serial_number(
            random.randint(0, 2**64 - 1)
        ).not_valid_before(
            datetime.utcnow()
        ).not_valid_after(
            datetime.utcnow() + timedelta(seconds=self.cert_not_after)
        )

        if wildcard:
            cert_builder = cert_builder.add_extension(
                x509.SubjectAlternativeName([
                    x509.DNSName(host.decode('utf-8')),
                    x509.DNSName('*.' + host.decode('utf-8')),
                ]),
                critical=False,
            )

        cert = cert_builder.sign(root_key, hash_func, default_backend())

        # Write cert + key
        self.write_pem(host_filename, cert, self.key)
        return cert, self.key

    def write_pem(self, filename, cert, key):
        with open(filename, 'wb+') as f:
            f.write(key.private_bytes(
                encoding=serialization.Encoding.PEM,
                format=serialization.PrivateFormat.TraditionalOpenSSL,
                encryption_algorithm=serialization.NoEncryption()
            ))
            f.write(cert.public_bytes(serialization.Encoding.PEM))

    def read_pem(self, filename):
        with open(filename, 'rb') as f:
            cert = x509.load_pem_x509_certificate(f.read(), default_backend())
            f.seek(0)
            key = serialization.load_pem_private_key(f.read(), password=None, backend=default_backend())

        return cert, key


# =================================================================
def main(args=None):
    parser = ArgumentParser(description='Certificate Authority Cert Maker Tools')

    parser.add_argument('root_ca_cert',
                        help='Path to existing or new root CA file')

    parser.add_argument('-c', '--certname', action='store', default=CERT_NAME,
                        help='Name for root certificate')

    parser.add_argument('-n', '--hostname',
                        help='Hostname certificate to create')

    parser.add_argument('-d', '--certs-dir', default=CERTS_DIR,
                        help='Directory for host certificates')

    parser.add_argument('-f', '--force', action='store_true',
                        help='Overwrite certificates if they already exist')

    parser.add_argument('-w', '--wildcard_cert', action='store_true',
                        help='add wildcard SAN to host: *.<host>, <host>')

    r = parser.parse_args(args=args)

    certs_dir = r.certs_dir
    wildcard = r.wildcard_cert

    root_cert = r.root_ca_cert
    hostname = r.hostname

    if not hostname:
        overwrite = r.force
    else:
        overwrite = False

    ca = CertificateAuthority(ca_file=root_cert,
                              certs_dir=r.certs_dir,
                              ca_name=r.certname,
                              overwrite=overwrite)

    # Just creating the root cert
    if not hostname:
        if ca._file_created:
            print('Created new root cert: "' + root_cert + '"')
            return 0
        else:
            print('Root cert "' + root_cert +
                  '" already exists,' + ' use -f to overwrite')
            return 1

    # Sign a certificate for a given host
    overwrite = r.force
    host_filename = ca.cert_for_host(hostname,
                                     overwrite, wildcard)

    if ca._file_created:
        print('Created new cert "' + hostname +
              '" signed by root cert ' +
              root_cert)
        return 0

    else:
        print('Cert for "' + hostname + '" already exists,' +
              ' use -f to overwrite')
        return 1


if __name__ == "__main__":  # pragma: no cover
    main()
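# Illustrative sketch (not part of certauth.py): typical programmatic use of
# the CertificateAuthority class defined above. The file paths and CA name
# here are hypothetical.
ca = CertificateAuthority(
        ca_file='./ca/warcprox-ca.pem', certs_dir='./ca/certs',
        ca_name='warcprox CA')
# generates ./ca/certs/example.com.pem on first use, reuses it afterwards
host_pem = ca.cert_for_host('example.com')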
@@ -31,12 +31,12 @@ import sys
import gc
import datetime
import warcprox
-import certauth
import functools
import doublethink
import importlib
import queue
import socket
+import os

class Factory:
    @staticmethod
@@ -110,7 +110,7 @@ class Factory:
            assert hasattr(plugin, 'notify') ^ hasattr(plugin, '_startup')
            return plugin
        except Exception as e:
-            logging.fatal('problem with plugin class %r: %s', qualname, e)
+            logging.fatal('problem with plugin class %r', qualname, exc_info=1)
            sys.exit(1)

    @staticmethod
@@ -441,7 +441,12 @@ class WarcproxController(object):
                    exc_info=True)
            pass
        finally:
-            self.shutdown()
+            try:
+                self.shutdown()
+            except:
+                self.logger.critical("graceful shutdown failed", exc_info=True)
+                self.logger.critical("killing myself -9")
+                os.kill(os.getpid(), 9)

    def _dump_profiling(self):
        import pstats, tempfile, os, io
@@ -25,6 +25,8 @@ import json
import os
import warcprox
import socket
+import rfc3986
+from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError

class CrawlLogger(object):
    def __init__(self, dir_, options=warcprox.Options()):
@@ -40,7 +42,12 @@ class CrawlLogger(object):
    def notify(self, recorded_url, records):
        # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
        now = datetime.datetime.utcnow()
-        extra_info = {'contentSize': recorded_url.size,}
+        status = self.get_artificial_status(recorded_url)
+        extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
+        if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
+            extra_info['exception'] = str(recorded_url.exception).replace(" ", "_")
+            if (hasattr(recorded_url, 'message') and recorded_url.message is not None):
+                extra_info['exceptionMessage'] = str(recorded_url.message).replace(" ", "_")
        if records:
            extra_info['warcFilename'] = records[0].warc_filename
            extra_info['warcFileOffset'] = records[0].offset
@@ -51,23 +58,50 @@ class CrawlLogger(object):
            payload_digest = warcprox.digest_str(
                    recorded_url.payload_digest,
                    self.options.base32)
-        else:
+        elif records is not None and len(records) > 0:
            # WARCPROX_WRITE_RECORD request
            content_length = int(records[0].get_header(b'Content-Length'))
            payload_digest = records[0].get_header(b'WARC-Payload-Digest')
+        else:
+            content_length = 0
+            payload_digest = '-'
+        logging.info('warcprox_meta %s', recorded_url.warcprox_meta)
+
+        hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
+        # URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
+        brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
+
+        if hop_path is None and brozzled_url is None and hop_via_url is None:
+            # No hop info headers provided
+            hop_path = "-"
+            via_url = recorded_url.referer or '-'
+        else:
+            if hop_path is None:
+                hop_path = "-"
+            if hop_via_url is None:
+                hop_via_url = "-"
+            # Prefer referer header. Otherwise use provided via_url
+            via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
+            logging.info('brozzled_url:%s recorded_url:%s', brozzled_url, recorded_url.url)
+            if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
+                # Requested page is not the Brozzled url, thus we are an embed or redirect.
+                via_url = brozzled_url
+                hop_path = "B" if hop_path == "-" else "".join([hop_path, "B"])

        fields = [
            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
-            '% 5s' % recorded_url.status,
+            '% 5s' % status,
            '% 10s' % content_length,
            recorded_url.url,
-            '-', # hop path
+            hop_path,
-            recorded_url.referer or '-',
+            via_url,
-            recorded_url.mimetype or '-',
+            recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
            '-',
            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                recorded_url.timestamp,
                recorded_url.timestamp.microsecond//1000,
-                recorded_url.duration.microseconds//1000),
+                recorded_url.duration.microseconds//1000) if (recorded_url.timestamp is not None and recorded_url.duration is not None) else '-',
            payload_digest,
            recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
            'duplicate:digest' if records and records[0].type == b'revisit' else '-',
@@ -80,7 +114,6 @@ class CrawlLogger(object):
            except:
                pass
        line = b' '.join(fields) + b'\n'
-
        prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
        filename = '%s-%s-%s.log' % (
            prefix, self.hostname, self.options.server_port)
@@ -89,3 +122,43 @@ class CrawlLogger(object):
        with open(crawl_log_path, 'ab') as f:
            f.write(line)
+
+    def get_artificial_status(self, recorded_url):
+        # urllib3 does not specify DNS errors. We must parse them from the exception string.
+        # Unfortunately, the errors are reported differently on different systems.
+        # https://stackoverflow.com/questions/40145631
+
+        if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (MaxRetryError, )):
+            return '-8'
+        elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (NewConnectionError, )):
+            exception_string = str(recorded_url.exception)
+            if ("[Errno 11001] getaddrinfo failed" in exception_string or  # Windows
+                    "[Errno -2] Name or service not known" in exception_string or  # Linux
+                    "[Errno -3] Temporary failure in name resolution" in exception_string or  # Linux
+                    "[Errno 8] nodename nor servname " in exception_string):  # OS X
+                return '-6'  # DNS Failure
+            else:
+                return '-2'  # Other Connection Failure
+        elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )):
+            return '-2'  # Connection Timeout
+        elif isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
+            # synthetic status, used when some other status (such as connection-lost)
+            # is considered by policy the same as a document-not-found
+            # Cached failures result in FailedUrl with no Exception
+            return '-404'
+        else:
+            return recorded_url.status
+
+def canonicalize_url(url):
+    # URL needs to be split out to separately encode the hostname from the rest of the path.
+    # hostname will be idna encoded (punycode)
+    # The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
+    if url is None or url == '-' or url == '':
+        return url
+    try:
+        parsed_url = rfc3986.urlparse(url)
+        encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+        return encoded_url.unsplit()
+    except (TypeError, ValueError, AttributeError) as e:
+        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
+        return url
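# Illustrative sketch (not from this changeset): how the hop-path logic in
# notify() above composes for an embedded resource. The URLs and the starting
# hop_path value here are made up.
hop_path = 'L'
brozzled_url = 'http://example.com/page'
requested_url = 'http://example.com/style.css'
if requested_url != brozzled_url:
    # requested page is not the brozzled url, so it is logged as an embed,
    # with the brozzled page as the via url
    hop_path = "B" if hop_path == "-" else "".join([hop_path, "B"])
assert hop_path == 'LB'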
@@ -1,7 +1,7 @@
'''
warcprox/dedup.py - identical payload digest deduplication using sqlite db

-Copyright (C) 2013-2018 Internet Archive
+Copyright (C) 2013-2021 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -26,7 +26,6 @@ import os
import json
from hanzo import warctools
import warcprox
-import warcprox.trough
import sqlite3
import doublethink
import datetime
@@ -47,11 +46,15 @@ class DedupableMixin(object):
    def should_dedup(self, recorded_url):
        """Check if we should try to run dedup on resource based on payload
        size compared with min text/binary dedup size options.
-        When we use option --dedup-only-with-bucket, `dedup-bucket` is required
+        When we use option --dedup-only-with-bucket, `dedup-buckets` is required
        in Warcprox-Meta to perform dedup.
+        If recorded_url.do_not_archive is True, we skip dedup. This record will
+        not be written to WARC anyway.
        Return Boolean.
        """
-        if self.dedup_only_with_bucket and "dedup-bucket" not in recorded_url.warcprox_meta:
+        if recorded_url.do_not_archive:
+            return False
+        if self.dedup_only_with_bucket and "dedup-buckets" not in recorded_url.warcprox_meta:
            return False
        if recorded_url.is_text():
            return recorded_url.response_recorder.payload_size() > self.min_text_size
@@ -65,14 +68,19 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
        self.dedup_db = dedup_db

    def _process_url(self, recorded_url):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
+            return
        if (recorded_url.response_recorder
                and recorded_url.payload_digest
                and self.should_dedup(recorded_url)):
            digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32)
-            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
-                recorded_url.dedup_info = self.dedup_db.lookup(
-                        digest_key, recorded_url.warcprox_meta["dedup-bucket"],
-                        recorded_url.url)
+            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
+                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                    recorded_url.dedup_info = self.dedup_db.lookup(
+                            digest_key, bucket, recorded_url.url)
+                    if recorded_url.dedup_info:
+                        # we found an existing capture
+                        break
            else:
                recorded_url.dedup_info = self.dedup_db.lookup(
                        digest_key, url=recorded_url.url)
@@ -148,10 +156,12 @@ class DedupDb(DedupableMixin):
                and self.should_dedup(recorded_url)):
            digest_key = warcprox.digest_str(
                    recorded_url.payload_digest, self.options.base32)
-            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
-                self.save(
-                        digest_key, records[0],
-                        bucket=recorded_url.warcprox_meta["dedup-bucket"])
+            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
+                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                    if not bucket_mode == "ro":
+                        self.save(
+                                digest_key, records[0],
+                                bucket=bucket)
            else:
                self.save(digest_key, records[0])
@@ -213,8 +223,10 @@ class RethinkDedupDb(DedupDb, DedupableMixin):
                and self.should_dedup(recorded_url)):
            digest_key = warcprox.digest_str(
                    recorded_url.payload_digest, self.options.base32)
-            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
-                self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["dedup-bucket"])
+            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
+                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                    if not bucket_mode == 'ro':
+                        self.save(digest_key, records[0], bucket=bucket)
            else:
                self.save(digest_key, records[0])
@@ -259,6 +271,9 @@ class CdxServerDedup(DedupDb):
        performance optimisation to handle that. limit < 0 is very inefficient
        in general. Maybe it could be configurable in the future.

+        Skip dedup for URLs with session params. These URLs are certainly
+        unique and highly volatile, we cannot dedup them.
+
        :param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
            Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
        :param url: Target URL string
@@ -267,6 +282,8 @@ class CdxServerDedup(DedupDb):
        """
        u = url.decode("utf-8") if isinstance(url, bytes) else url
        try:
+            if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
+                return None
            result = self.http_pool.request('GET', self.cdx_url, fields=dict(
                url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
                limit=-1))
@@ -347,11 +364,12 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
                and recorded_url.warc_records[0].type == b'response'
                and self.trough_dedup_db.should_dedup(recorded_url)):
            if (recorded_url.warcprox_meta
-                    and 'dedup-bucket' in recorded_url.warcprox_meta):
-                bucket = recorded_url.warcprox_meta['dedup-bucket']
+                    and 'dedup-buckets' in recorded_url.warcprox_meta):
+                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                    if not bucket_mode == 'ro':
+                        buckets[bucket].append(recorded_url)
            else:
-                bucket = '__unspecified__'
+                buckets['__unspecified__'].append(recorded_url)
-            buckets[bucket].append(recorded_url)
        return buckets

    def _process_batch(self, batch):
@@ -366,6 +384,9 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
                    self.trough_dedup_db.batch_save,
                    buckets[bucket], bucket)
            fs[future] = bucket
+            logging.debug(
+                'storing dedup info for %s urls '
+                'in bucket %s', len(buckets[bucket]), bucket)

        # wait for results
        try:
@@ -374,7 +395,7 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
        except futures.TimeoutError as e:
            # the remaining threads actually keep running in this case,
            # there's no way to stop them, but that should be harmless
-            logging.warn(
+            logging.warning(
                    'timed out saving dedup info to trough', exc_info=True)

class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
@@ -394,21 +415,32 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
        '''
        buckets = collections.defaultdict(list)
        discards = []
+        # for duplicate checks, see https://webarchive.jira.com/browse/WT-31
+        hash_plus_urls = set()
        for recorded_url in batch:
+            if not recorded_url.payload_digest:
+                discards.append('n/a')
+                continue
+            payload_hash = warcprox.digest_str(
+                    recorded_url.payload_digest, self.options.base32)
+            hash_plus_url = b''.join((payload_hash, recorded_url.url))
            if (recorded_url.response_recorder
-                    and recorded_url.payload_digest
+                    and hash_plus_url not in hash_plus_urls
                    and self.trough_dedup_db.should_dedup(recorded_url)):
+                hash_plus_urls.add(hash_plus_url)
                if (recorded_url.warcprox_meta
-                        and 'dedup-bucket' in recorded_url.warcprox_meta):
-                    bucket = recorded_url.warcprox_meta['dedup-bucket']
+                        and 'dedup-buckets' in recorded_url.warcprox_meta):
+                    for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                        buckets[bucket].append(recorded_url)
                else:
-                    bucket = '__unspecified__'
+                    buckets['__unspecified__'].append(recorded_url)
-                buckets[bucket].append(recorded_url)
            else:
-                discards.append(
-                        warcprox.digest_str(
-                            recorded_url.payload_digest, self.options.base32)
-                        if recorded_url.payload_digest else 'n/a')
+                if hash_plus_url in hash_plus_urls:
+                    self.logger.debug(
+                            'discarding duplicate and setting do_not_archive for %s, hash %s',
+                            recorded_url.url, payload_hash)
+                    recorded_url.do_not_archive = True
+                discards.append(payload_hash)
        self.logger.debug(
                'len(batch)=%s len(discards)=%s buckets=%s',
                len(batch), len(discards),
@@ -458,7 +490,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
                    recorded_url.dedup_info = entry
        except Exception as e:
            # batch_lookup raised exception or something
-            logging.warn(
+            logging.warning(
                    'problem looking up dedup info for %s urls '
                    'in bucket %s', len(buckets[bucket]), bucket,
                    exc_info=True)
@@ -474,7 +506,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
        except futures.TimeoutError as e:
            # the remaining threads actually keep running in this case,
            # there's no way to stop them, but that should be harmless
-            self.logger.warn(
+            self.logger.warning(
                    'timed out loading dedup info from trough', exc_info=True)

class TroughDedupDb(DedupDb, DedupableMixin):
@@ -487,16 +519,24 @@ class TroughDedupDb(DedupDb, DedupableMixin):
    SCHEMA_SQL = ('create table dedup (\n'
            '    digest_key varchar(100) primary key,\n'
            '    url varchar(2100) not null,\n'
-            '    date datetime not null,\n'
+            '    date varchar(100) not null,\n'
            '    id varchar(100));\n')  # warc record id
    WRITE_SQL_TMPL = ('insert or ignore into dedup\n'
            '(digest_key, url, date, id)\n'
            'values (%s, %s, %s, %s);')

    def __init__(self, options=warcprox.Options()):
+        try:
+            import trough.client
+        except ImportError as e:
+            logging.critical(
+                    '%s: %s\n\nYou might need to run "pip install '
+                    'warcprox[trough]".', type(e).__name__, e)
+            sys.exit(1)
+
        DedupableMixin.__init__(self, options)
        self.options = options
-        self._trough_cli = warcprox.trough.TroughClient(
+        self._trough_cli = trough.client.TroughClient(
                options.rethinkdb_trough_db_url, promotion_interval=60*60)

    def loader(self, *args, **kwargs):
@@ -518,9 +558,13 @@ class TroughDedupDb(DedupDb, DedupableMixin):
        record_id = response_record.get_header(warctools.WarcRecord.ID)
        url = response_record.get_header(warctools.WarcRecord.URL)
        warc_date = response_record.get_header(warctools.WarcRecord.DATE)
-        self._trough_cli.write(
-                bucket, self.WRITE_SQL_TMPL,
-                (digest_key, url, warc_date, record_id), self.SCHEMA_ID)
+        try:
+            self._trough_cli.write(
+                    bucket, self.WRITE_SQL_TMPL,
+                    (digest_key, url, warc_date, record_id), self.SCHEMA_ID)
+        except:
+            self.logger.warning(
+                    'problem posting dedup data to trough', exc_info=True)

    def batch_save(self, batch, bucket='__unspecified__'):
        sql_tmpl = ('insert or ignore into dedup\n'

@@ -535,12 +579,22 @@ class TroughDedupDb(DedupDb, DedupableMixin):
                recorded_url.url,
                recorded_url.warc_records[0].date,
                recorded_url.warc_records[0].id,])
-        self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
+        try:
+            self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
+        except:
+            self.logger.warning(
+                    'problem posting dedup data to trough', exc_info=True)

    def lookup(self, digest_key, bucket='__unspecified__', url=None):
-        results = self._trough_cli.read(
-                bucket, 'select * from dedup where digest_key=%s;',
-                (digest_key,))
+        try:
+            results = self._trough_cli.read(
+                    bucket, 'select * from dedup where digest_key=%s;',
+                    (digest_key,))
+        except:
+            self.logger.warning(
+                    'problem reading dedup data from trough', exc_info=True)
+            return None

        if results:
            assert len(results) == 1  # sanity check (digest_key is primary key)
            result = results[0]
@ -557,7 +611,14 @@ class TroughDedupDb(DedupDb, DedupableMixin):
|
|||||||
'''Returns [{'digest_key': ..., 'url': ..., 'date': ...}, ...]'''
|
'''Returns [{'digest_key': ..., 'url': ..., 'date': ...}, ...]'''
|
||||||
sql_tmpl = 'select * from dedup where digest_key in (%s)' % (
|
sql_tmpl = 'select * from dedup where digest_key in (%s)' % (
|
||||||
','.join('%s' for i in range(len(digest_keys))))
|
','.join('%s' for i in range(len(digest_keys))))
|
||||||
results = self._trough_cli.read(bucket, sql_tmpl, digest_keys)
|
|
||||||
|
try:
|
||||||
|
results = self._trough_cli.read(bucket, sql_tmpl, digest_keys)
|
||||||
|
except:
|
||||||
|
self.logger.warning(
|
||||||
|
'problem reading dedup data from trough', exc_info=True)
|
||||||
|
results = None
|
||||||
|
|
||||||
if results is None:
|
if results is None:
|
||||||
return []
|
return []
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
@ -576,9 +637,11 @@ class TroughDedupDb(DedupDb, DedupableMixin):
|
|||||||
and self.should_dedup(recorded_url)):
|
and self.should_dedup(recorded_url)):
|
||||||
digest_key = warcprox.digest_str(
|
digest_key = warcprox.digest_str(
|
||||||
recorded_url.payload_digest, self.options.base32)
|
recorded_url.payload_digest, self.options.base32)
|
||||||
if recorded_url.warcprox_meta and 'dedup-bucket' in recorded_url.warcprox_meta:
|
if recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta:
|
||||||
self.save(
|
for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
|
||||||
digest_key, records[0],
|
if not bucket_mode == 'ro':
|
||||||
bucket=recorded_url.warcprox_meta['dedup-bucket'])
|
self.save(
|
||||||
|
digest_key, records[0],
|
||||||
|
bucket=bucket)
|
||||||
else:
|
else:
|
||||||
self.save(digest_key, records[0])
|
self.save(digest_key, records[0])
|
||||||
|
@ -30,6 +30,7 @@ except ImportError:
|
|||||||
import Queue as queue
|
import Queue as queue
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import logging.config
|
||||||
import sys
|
import sys
|
||||||
import hashlib
|
import hashlib
|
||||||
import argparse
|
import argparse
|
||||||
@ -38,7 +39,7 @@ import socket
|
|||||||
import traceback
|
import traceback
|
||||||
import signal
|
import signal
|
||||||
import threading
|
import threading
|
||||||
import certauth.certauth
|
import yaml
|
||||||
import warcprox
|
import warcprox
|
||||||
import doublethink
|
import doublethink
|
||||||
import cryptography.hazmat.backends.openssl
|
import cryptography.hazmat.backends.openssl
|
||||||
@ -89,9 +90,11 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
|
|||||||
help='where to store and load generated certificates')
|
help='where to store and load generated certificates')
|
||||||
warcprox/main.py

     arg_parser.add_argument('-d', '--dir', dest='directory',
             default='./warcs', help='where to write warcs')
+    arg_parser.add_argument('--subdir-prefix', dest='subdir_prefix', action='store_true',
+            help='write warcs to --dir subdir equal to the current warc-prefix'),
     arg_parser.add_argument('--warc-filename', dest='warc_filename',
             default='{prefix}-{timestamp17}-{serialno}-{randomtoken}',
-            help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}')
+            help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}, {port}')
     arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
             help='write gzip-compressed warc records')
     hidden.add_argument(
@@ -205,6 +208,15 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
             default=None, help=(
                 'host:port of tor socks proxy, used only to connect to '
                 '.onion sites'))
+    arg_parser.add_argument(
+            '--socks-proxy', dest='socks_proxy',
+            default=None, help='host:port of socks proxy, used for all traffic if activated')
+    arg_parser.add_argument(
+            '--socks-proxy-username', dest='socks_proxy_username',
+            default=None, help='optional socks proxy username')
+    arg_parser.add_argument(
+            '--socks-proxy-password', dest='socks_proxy_password',
+            default=None, help='optional socks proxy password')
     hidden.add_argument(
             '--socket-timeout', dest='socket_timeout', type=float, default=60,
             help=suppress(
@@ -239,6 +251,9 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
     arg_parser.add_argument(
             '--trace', dest='trace', action='store_true',
             help='very verbose logging')
+    arg_parser.add_argument(
+            '--logging-conf-file', dest='logging_conf_file', default=None,
+            help=('reads logging configuration from a YAML file'))
     arg_parser.add_argument(
             '--version', action='version',
             version="warcprox {}".format(warcprox.__version__))
@@ -259,7 +274,7 @@ def dump_state(signum=None, frame=None):
         except Exception as e:
             state_strs.append('<n/a:%r>' % e)

-    logging.warn(
+    logging.warning(
             'dumping state (caught signal %s)\n%s',
             signum, '\n'.join(state_strs))

@@ -297,11 +312,17 @@ def main(argv=None):
     else:
         loglevel = logging.INFO

+    logging.root.handlers = []
     logging.basicConfig(
             stream=sys.stdout, level=loglevel, format=(
                 '%(asctime)s %(process)d %(levelname)s %(threadName)s '
                 '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))

+    if args.logging_conf_file:
+        with open(args.logging_conf_file, 'r') as fd:
+            conf = yaml.safe_load(fd)
+            logging.config.dictConfig(conf)
+
     # see https://github.com/pyca/cryptography/issues/2911
     cryptography.hazmat.backends.openssl.backend.activate_builtin_random()

@@ -316,7 +337,11 @@ def main(argv=None):
         # SIGQUIT does not exist on some platforms (windows)
         pass

-    controller.run_until_shutdown()
+    try:
+        controller.run_until_shutdown()
+    except:
+        logging.fatal('unhandled exception in controller', exc_info=True)
+        sys.exit(1)

 def ensure_rethinkdb_tables(argv=None):
     '''
@@ -388,7 +413,7 @@ def ensure_rethinkdb_tables(argv=None):
         did_something = True
     if args.rethinkdb_trough_db_url:
         dedup_db = warcprox.dedup.TroughDedupDb(options)
-        logging.warn(
+        logging.warning(
                 'trough is responsible for creating most of the rethinkdb '
                 'tables that it uses')
         did_something = True
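For illustration: the --warc-filename template above is ordinary Python str.format-style substitution over the variables listed in the help text. The sketch below is hypothetical (the field values are invented) and is not warcprox's own writer code; it only shows how such a template renders.

    # Illustrative only: render a --warc-filename style template with str.format().
    template = '{prefix}-{timestamp17}-{serialno}-{randomtoken}'
    filename = template.format(
            prefix='my-crawl',
            timestamp17='20220101123456789',  # 17-digit UTC timestamp (made-up value)
            serialno='00000',
            randomtoken='a1b2c3d4') + '.warc.gz'
    print(filename)  # my-crawl-20220101123456789-00000-a1b2c3d4.warc.gz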
warcprox/mitmproxy.py

@@ -35,6 +35,13 @@ try:
     import urllib.parse as urllib_parse
 except ImportError:
     import urlparse as urllib_parse
+# In python2/3, urllib parse caches in memory URL parsing results to avoid
+# repeating the process for the same URL. The problem is that the default
+# in memory cache size is just 20.
+# https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py#L80
+# since we do a lot of URL parsing, it makes sense to increase cache size.
+urllib_parse.MAX_CACHE_SIZE = 2000
+
 try:
     import http.client as http_client
     # In python3 http.client.parse_headers() enforces http_client._MAXLINE
@@ -45,6 +52,11 @@ try:
     http_client._MAXLINE = 4194304 # 4 MiB
 except ImportError:
     import httplib as http_client
+# http_client has an arbitrary limit of 100 HTTP Headers which is too low and
+# it raises an HTTPException if the target URL has more.
+# https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L113
+http_client._MAXHEADERS = 7000
+
 import json
 import socket
 import logging
@@ -52,6 +64,7 @@ import ssl
 import warcprox
 import threading
 import datetime
+import random
 import socks
 import tempfile
 import hashlib
@@ -64,8 +77,14 @@ import urlcanon
 import time
 import collections
 import cProfile
+from urllib3 import PoolManager
 from urllib3.util import is_connection_dropped
+from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError
 import doublethink
+from cachetools import TTLCache
+from threading import RLock
+
+from .certauth import CertificateAuthority

 class ProxyingRecorder(object):
     """
@@ -100,7 +119,7 @@ class ProxyingRecorder(object):
             self.proxy_client.sendall(hunk)
         except BaseException as e:
             self._proxy_client_conn_open = False
-            self.logger.warn(
+            self.logger.warning(
                     '%s sending data to proxy client for url %s',
                     e, self.url)
             self.logger.info(
@@ -203,6 +222,28 @@ def via_header_value(orig, request_version):
     via = via + '%s %s' % (request_version, 'warcprox')
     return via

+
+# Ref and detailed description about cipher selection at
+# https://github.com/urllib3/urllib3/blob/f070ec2e6f6c545f40d9196e5246df10c72e48e1/src/urllib3/util/ssl_.py#L170
+SSL_CIPHERS = [
+    "ECDHE+AESGCM",
+    "ECDHE+CHACHA20",
+    "DH+AESGCM",
+    "ECDH+AES",
+    "DH+AES",
+    "RSA+AESGCM",
+    "RSA+AES",
+    "!aNULL",
+    "!eNULL",
+    "!MD5",
+    "!DSS",
+    "!AESCCM",
+    "DHE+AESGCM",
+    "DHE+CHACHA20",
+    "ECDH+AESGCM",
+]
+
+
 class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     '''
     An http proxy implementation of BaseHTTPRequestHandler, that acts as a
@@ -210,9 +251,16 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     and records the bytes in transit as it proxies them.
     '''
     logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")

     _socket_timeout = 60
     _max_resource_size = None
     _tmp_file_max_memory_size = 512 * 1024
+    onion_tor_socks_proxy_host = None
+    onion_tor_socks_proxy_port = None
+    socks_proxy_host = None
+    socks_proxy_port = None
+    socks_proxy_username = None
+    socks_proxy_password = None

     def __init__(self, request, client_address, server):
         threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
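The SSL_CIPHERS list introduced above is later shuffled and joined into an OpenSSL cipher string so that outgoing TLS handshakes do not always present the identical fingerprint. The following is a minimal standalone sketch of that idea, assuming only the standard library; it is not the handler code itself.

    import random
    import ssl

    ciphers = ["ECDHE+AESGCM", "ECDHE+CHACHA20", "DH+AESGCM", "ECDH+AES",
               "RSA+AESGCM", "!aNULL", "!eNULL", "!MD5"]

    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    # Shuffling reorders the cipher preference list on every run, which varies
    # the ClientHello; set_ciphers() accepts an OpenSSL-style ':'-joined string.
    random.shuffle(ciphers)
    context.set_ciphers(":".join(ciphers))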
@@ -228,7 +276,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         else:
             self.url = self.path
             u = urllib_parse.urlparse(self.url)
-            if u.scheme != 'http':
+            if u.scheme != 'http' or u.netloc == '':
                 raise Exception(
                         'unable to parse request %r as a proxy request' % (
                             self.requestline))
@@ -240,6 +288,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                     query=u.query, fragment=u.fragment))
             self.hostname = urlcanon.normalize_host(host).decode('ascii')

+    def _hostname_port_cache_key(self):
+        return '%s:%s' % (self.hostname, self.port)
+
     def _connect_to_remote_server(self):
         '''
         Connect to destination.
@@ -251,7 +302,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         '''
         self._conn_pool = self.server.remote_connection_pool.connection_from_host(
                 host=self.hostname, port=int(self.port), scheme='http',
-                pool_kwargs={'maxsize': 6, 'timeout': self._socket_timeout})
+                pool_kwargs={'maxsize': 12, 'timeout': self._socket_timeout})
+
+        remote_ip = None

         self._remote_server_conn = self._conn_pool._get_conn()
         if is_connection_dropped(self._remote_server_conn):
@@ -266,8 +319,21 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                         port=self.onion_tor_socks_proxy_port, rdns=True)
                 self._remote_server_conn.sock.settimeout(self._socket_timeout)
                 self._remote_server_conn.sock.connect((self.hostname, int(self.port)))
+            elif self.socks_proxy_host and self.socks_proxy_port:
+                self.logger.info(
+                        "using socks proxy at %s:%s to connect to %s",
+                        self.socks_proxy_host, self.socks_proxy_port, self.hostname)
+                self._remote_server_conn.sock = socks.socksocket()
+                self._remote_server_conn.sock.set_proxy(
+                        socks.SOCKS5, addr=self.socks_proxy_host,
+                        port=self.socks_proxy_port, rdns=True,
+                        username=self.socks_proxy_username,
+                        password=self.socks_proxy_password)
+                self._remote_server_conn.sock.settimeout(self._socket_timeout)
+                self._remote_server_conn.sock.connect((self.hostname, int(self.port)))
             else:
                 self._remote_server_conn.connect()
+                remote_ip = self._remote_server_conn.sock.getpeername()[0]

         # Wrap socket if SSL is required
         if self.is_connect:
@@ -275,6 +341,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 context = ssl.create_default_context()
                 context.check_hostname = False
                 context.verify_mode = ssl.CERT_NONE
+                # randomize TLS fingerprint to evade anti-web-bot systems
+                random.shuffle(SSL_CIPHERS)
+                context.set_ciphers(":".join(SSL_CIPHERS))
                 self._remote_server_conn.sock = context.wrap_socket(
                         self._remote_server_conn.sock,
                         server_hostname=self.hostname)
@@ -283,12 +352,17 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                     self._remote_server_conn.sock = ssl.wrap_socket(
                             self._remote_server_conn.sock)
                 except ssl.SSLError:
-                    self.logger.warn(
+                    self.logger.warning(
                             "failed to establish ssl connection to %s; "
                             "python ssl library does not support SNI, "
                             "consider upgrading to python 2.7.9+ or 3.4+",
                             self.hostname)
                     raise
+            except ssl.SSLError as e:
+                self.logger.error(
+                        'error connecting to %s (%s) port %s: %s',
+                        self.hostname, remote_ip, self.port, e)
+                raise
         return self._remote_server_conn.sock

     def _transition_to_ssl(self):
@@ -328,11 +402,11 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 self.logger.error(
                         "problem handling %r: %r", self.requestline, e)
                 if type(e) is socket.timeout:
-                    self.send_error(504, str(e))
+                    self.send_error(504, str(e), exception=e)
                 else:
                     self.send_error(500, str(e))
             except Exception as f:
-                self.logger.warn("failed to send error response ({}) to proxy client: {}".format(e, f))
+                self.logger.warning("failed to send error response ({}) to proxy client: {}".format(e, f))
             return

         # Reload!
@@ -368,25 +442,55 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             else:
                 self._determine_host_port()
                 assert self.url
+            # Check if target hostname:port is in `bad_hostnames_ports` cache
+            # to avoid retrying to connect. Cached value is http status code.
+            cached = None
+            hostname_port = self._hostname_port_cache_key()
+            with self.server.bad_hostnames_ports_lock:
+                cached = self.server.bad_hostnames_ports.get(hostname_port)
+            if cached:
+                self.logger.info('Cannot connect to %s (cache)', hostname_port)
+                self.send_error(cached, exception=Exception('Cached Failed Connection'))
+                return
             # Connect to destination
             self._connect_to_remote_server()
         except warcprox.RequestBlockedByRule as e:
             # limit enforcers have already sent the appropriate response
             self.logger.info("%r: %r", self.requestline, e)
             return
+        except warcprox.BadRequest as e:
+            self.send_error(400, e.msg)
+            return
         except Exception as e:
+            # If connection fails, add hostname:port to cache to avoid slow
+            # subsequent reconnection attempts. `NewConnectionError` can be
+            # caused by many types of errors which are handled by urllib3.
+            response_code = 500
+            cache = False
+            if isinstance(e, (socket.timeout, TimeoutError,)):
+                response_code = 504
+                cache = True
+            elif isinstance(e, HTTPError):
+                response_code = 502
+                cache = True
+
+            if cache:
+                host_port = self._hostname_port_cache_key()
+                with self.server.bad_hostnames_ports_lock:
+                    self.server.bad_hostnames_ports[host_port] = response_code
+                self.logger.info('bad_hostnames_ports cache size: %d',
+                                 len(self.server.bad_hostnames_ports))
             self.logger.error(
                     "problem processing request %r: %r",
                     self.requestline, e, exc_info=True)
-            self.send_error(500, str(e))
+            self.send_error(response_code, exception=e)
             return

         try:
             return self._proxy_request()
         except Exception as e:
             if self.server.shutting_down:
-                self.logger.warn(
+                self.logger.warning(
                         'sending 503 warcprox shutting down %r: %r',
                         self.requestline, e)
                 self.send_error(503, 'warcprox shutting down')
@@ -394,10 +498,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 self.logger.error(
                         'error from remote server(?) %r: %r',
                         self.requestline, e, exc_info=True)
-                self.send_error(502, str(e))
+                self.send_error(502)
             return

-    def send_error(self, code, message=None, explain=None):
+    def send_error(self, code, message=None, explain=None, exception=None):
         # BaseHTTPRequestHandler.send_response_only() in http/server.py
         # does this:
         #     if not hasattr(self, '_headers_buffer'):
@@ -410,9 +514,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         try:
             return http_server.BaseHTTPRequestHandler.send_error(
                     self, code, message, explain)
-        except:
-            self.logger.error(
-                    'send_error(%r, %r, %r) raised exception', exc_info=True)
+        except Exception as e:
+            level = logging.ERROR
+            if isinstance(e, OSError) and e.errno == 9:
+                level = logging.TRACE
+            self.logger.log(
+                    level, 'send_error(%r, %r, %r) raised exception',
+                    exc_info=True)
             return None

     def _proxy_request(self, extra_response_headers={}):
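The bad_hostnames_ports cache used above is a cachetools TTLCache guarded by an RLock, mapping 'host:port' to the HTTP status that should be returned without retrying the connection. A rough sketch of the pattern follows, assuming only that cachetools is installed; it is a simplified illustration, not the handler's actual code path.

    import threading
    from cachetools import TTLCache

    bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)  # entries expire after 60 seconds
    bad_hostnames_ports_lock = threading.RLock()

    def check_cached_failure(hostname, port):
        key = '%s:%s' % (hostname, port)
        with bad_hostnames_ports_lock:  # TTLCache itself is not thread-safe
            return bad_hostnames_ports.get(key)

    def remember_failure(hostname, port, status_code):
        key = '%s:%s' % (hostname, port)
        with bad_hostnames_ports_lock:
            bad_hostnames_ports[key] = status_code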
@@ -424,6 +532,33 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self.server.unregister_remote_server_sock(
                     self._remote_server_conn.sock)

+    def _swallow_hop_by_hop_headers(self):
+        '''
+        Swallow headers that don't make sense to forward on, i.e.
+        most hop-by-hop headers.
+
+        http://tools.ietf.org/html/rfc2616#section-13.5.
+        '''
+        # self.headers is an email.message.Message, which is case-insensitive
+        # and doesn't throw KeyError in __delitem__
+        for key in (
+                'Warcprox-Meta', 'Connection', 'Proxy-Connection', 'Keep-Alive',
+                'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
+            del self.headers[key]
+
+    def _build_request(self):
+        req_str = '{} {} {}\r\n'.format(
+                self.command, self.path, self.request_version)
+
+        # Add headers to the request
+        # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
+        req_str += '\r\n'.join(
+                '{}: {}'.format(k,v) for (k,v) in self.headers.items())
+
+        req = req_str.encode('latin1') + b'\r\n\r\n'
+
+        return req
+
     def _inner_proxy_request(self, extra_response_headers={}):
         '''
         Sends the request to the remote server, then uses a ProxyingRecorder to
@@ -435,29 +570,11 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         It may contain extra HTTP headers such as ``Warcprox-Meta`` which
         are written in the WARC record for this request.
         '''
-        # Build request
-        req_str = '{} {} {}\r\n'.format(
-                self.command, self.path, self.request_version)
-
-        # Swallow headers that don't make sense to forward on, i.e. most
-        # hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5.
-        # self.headers is an email.message.Message, which is case-insensitive
-        # and doesn't throw KeyError in __delitem__
-        for key in (
-                'Connection', 'Proxy-Connection', 'Keep-Alive',
-                'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
-            del self.headers[key]
-
+        self._swallow_hop_by_hop_headers()
         self.headers['Via'] = via_header_value(
                 self.headers.get('Via'),
                 self.request_version.replace('HTTP/', ''))
-
-        # Add headers to the request
-        # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
-        req_str += '\r\n'.join(
-                '{}: {}'.format(k,v) for (k,v) in self.headers.items())
-
-        req = req_str.encode('latin1') + b'\r\n\r\n'
+        req = self._build_request()

         # Append message body if present to the request
         if 'Content-Length' in self.headers:
@@ -478,9 +595,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 tmp_file_max_memory_size=self._tmp_file_max_memory_size)
         prox_rec_res.begin(extra_response_headers=extra_response_headers)

-        buf = prox_rec_res.read(65536)
+        buf = None
         while buf != b'':
-            buf = prox_rec_res.read(65536)
+            try:
+                buf = prox_rec_res.read(65536)
+            except http_client.IncompleteRead as e:
+                self.logger.warning('%s from %s', e, self.url)
+                buf = e.partial

             if (self._max_resource_size and
                     prox_rec_res.recorder.len > self._max_resource_size):
                 prox_rec_res.truncated = b'length'
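The read loop above tolerates http.client.IncompleteRead by keeping whatever bytes did arrive (e.partial) instead of aborting the transfer. A simplified standalone sketch of that pattern against any http.client response object (it stops after the first truncation, which the proxy's own loop handles slightly differently):

    import http.client

    def read_all_tolerantly(response, chunk_size=65536):
        '''Drain an http.client response, keeping partial data on IncompleteRead.'''
        chunks = []
        while True:
            try:
                buf = response.read(chunk_size)
            except http.client.IncompleteRead as e:
                # e.partial holds the bytes that arrived before the peer hung up
                chunks.append(e.partial)
                break
            if buf == b'':
                break
            chunks.append(buf)
        return b''.join(chunks)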
@@ -506,9 +628,31 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             # put it back in the pool to reuse it later.
             if not is_connection_dropped(self._remote_server_conn):
                 self._conn_pool._put_conn(self._remote_server_conn)
-        except:
-            self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
-            self._remote_server_conn.sock.close()
+        except Exception as e:
+            # A common error is to connect to the remote server successfully
+            # but raise a `RemoteDisconnected` exception when trying to begin
+            # downloading. Its caused by prox_rec_res.begin(...) which calls
+            # http_client._read_status(). The connection fails there.
+            # https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L275
+            # Another case is when the connection is fine but the response
+            # status is problematic, raising `BadStatusLine`.
+            # https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L296
+            # In both cases, the host is bad and we must add it to
+            # `bad_hostnames_ports` cache.
+            if isinstance(e, (http_client.RemoteDisconnected,
+                              http_client.BadStatusLine)):
+                host_port = self._hostname_port_cache_key()
+                with self.server.bad_hostnames_ports_lock:
+                    self.server.bad_hostnames_ports[host_port] = 502
+                self.logger.info('bad_hostnames_ports cache size: %d',
+                                 len(self.server.bad_hostnames_ports))
+
+            # Close the connection only if its still open. If its already
+            # closed, an `OSError` "([Errno 107] Transport endpoint is not
+            # connected)" would be raised.
+            if not is_connection_dropped(self._remote_server_conn):
+                self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
+                self._remote_server_conn.sock.close()
             raise
         finally:
             if prox_rec_res:
@@ -521,7 +665,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         return self.do_COMMAND

     def log_error(self, fmt, *args):
-        self.logger.warn(fmt, *args)
+        self.logger.warning(fmt, *args)

 class PooledMixIn(socketserver.ThreadingMixIn):
     logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn")
@@ -667,6 +811,63 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
         Abort active connections to remote servers to achieve prompt shutdown.
         '''
         self.shutting_down = True
-        for sock in self.remote_server_socks:
+        for sock in list(self.remote_server_socks):
             self.shutdown_request(sock)

+
+class SingleThreadedMitmProxy(http_server.HTTPServer):
+    logger = logging.getLogger('warcprox.warcproxy.SingleThreadedMitmProxy')
+
+    def __init__(
+            self, MitmProxyHandlerClass=MitmProxyHandler,
+            options=warcprox.Options()):
+        self.options = options
+
+        # TTLCache is not thread-safe. Access to the shared cache from multiple
+        # threads must be properly synchronized with an RLock according to ref:
+        # https://cachetools.readthedocs.io/en/latest/
+        self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
+        self.bad_hostnames_ports_lock = RLock()
+
+        self.remote_connection_pool = PoolManager(
+                num_pools=max((options.max_threads or 0) // 6, 400), maxsize=6)
+
+        if options.onion_tor_socks_proxy:
+            try:
+                host, port = options.onion_tor_socks_proxy.split(':')
+                MitmProxyHandlerClass.onion_tor_socks_proxy_host = host
+                MitmProxyHandlerClass.onion_tor_socks_proxy_port = int(port)
+            except ValueError:
+                MitmProxyHandlerClass.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
+                MitmProxyHandlerClass.onion_tor_socks_proxy_port = None
+        if options.socks_proxy:
+            host, port = options.socks_proxy.split(':')
+            MitmProxyHandlerClass.socks_proxy_host = host
+            MitmProxyHandlerClass.socks_proxy_port = int(port)
+            if options.socks_proxy_username:
+                MitmProxyHandlerClass.socks_proxy_username = options.socks_proxy_username
+            if options.socks_proxy_password:
+                MitmProxyHandlerClass.socks_proxy_password = options.socks_proxy_password
+
+        if options.socket_timeout:
+            MitmProxyHandlerClass._socket_timeout = options.socket_timeout
+        if options.max_resource_size:
+            MitmProxyHandlerClass._max_resource_size = options.max_resource_size
+        if options.tmp_file_max_memory_size:
+            MitmProxyHandlerClass._tmp_file_max_memory_size = options.tmp_file_max_memory_size
+
+        self.digest_algorithm = options.digest_algorithm or 'sha1'
+
+        ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
+        self.ca = CertificateAuthority(
+                ca_file=options.cacert or 'warcprox-ca.pem',
+                certs_dir=options.certs_dir or './warcprox-ca',
+                ca_name=ca_name)
+
+        server_address = (
+                options.address or 'localhost',
+                options.port if options.port is not None else 8000)
+
+        http_server.HTTPServer.__init__(
+                self, server_address, MitmProxyHandlerClass,
+                bind_and_activate=True)
warcprox/playback.py

@@ -42,6 +42,7 @@ from warcprox.mitmproxy import MitmProxyHandler
 import warcprox
 import sqlite3
 import threading
+from cachetools import TTLCache

 class PlaybackProxyHandler(MitmProxyHandler):
     logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler")
@@ -219,6 +220,8 @@ class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
         self.playback_index_db = playback_index_db
         self.warcs_dir = options.directory
         self.options = options
+        self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
+        self.bad_hostnames_ports_lock = threading.RLock()

     def server_activate(self):
         http_server.HTTPServer.server_activate(self)
|
@ -29,7 +29,7 @@ import doublethink
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import rethinkdb as r
|
from rethinkdb import RethinkDB; r = RethinkDB()
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
@ -81,7 +81,7 @@ def unravel_buckets(url, warcprox_meta):
|
|||||||
for bucket in warcprox_meta["stats"]["buckets"]:
|
for bucket in warcprox_meta["stats"]["buckets"]:
|
||||||
if isinstance(bucket, dict):
|
if isinstance(bucket, dict):
|
||||||
if not 'bucket' in bucket:
|
if not 'bucket' in bucket:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'ignoring invalid stats bucket in '
|
'ignoring invalid stats bucket in '
|
||||||
'warcprox-meta header %s', bucket)
|
'warcprox-meta header %s', bucket)
|
||||||
continue
|
continue
|
||||||
@ -162,6 +162,8 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
def _tally_batch(self, batch):
|
def _tally_batch(self, batch):
|
||||||
batch_buckets = {}
|
batch_buckets = {}
|
||||||
for recorded_url in batch:
|
for recorded_url in batch:
|
||||||
|
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
|
||||||
|
continue
|
||||||
for bucket in self.buckets(recorded_url):
|
for bucket in self.buckets(recorded_url):
|
||||||
bucket_stats = batch_buckets.get(bucket)
|
bucket_stats = batch_buckets.get(bucket)
|
||||||
if not bucket_stats:
|
if not bucket_stats:
|
||||||
@ -297,6 +299,8 @@ class RunningStats:
|
|||||||
(self.first_snap_time - 120 + i * 10, 0, 0))
|
(self.first_snap_time - 120 + i * 10, 0, 0))
|
||||||
|
|
||||||
def notify(self, recorded_url, records):
|
def notify(self, recorded_url, records):
|
||||||
|
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
|
||||||
|
return
|
||||||
with self._lock:
|
with self._lock:
|
||||||
self.urls += 1
|
self.urls += 1
|
||||||
if records:
|
if records:
|
||||||
|
warcprox/trough.py (deleted file)

@@ -1,246 +0,0 @@
-'''
-warcprox/trough.py - trough client code
-
-Copyright (C) 2017 Internet Archive
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
-USA.
-'''
-
-from __future__ import absolute_import
-
-import logging
-import os
-import json
-import requests
-import doublethink
-import rethinkdb as r
-import datetime
-import threading
-import time
-
-class TroughClient(object):
-    logger = logging.getLogger("warcprox.trough.TroughClient")
-
-    def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
-        '''
-        TroughClient constructor
-
-        Args:
-            rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
-                trough configuration database
-            promotion_interval: if specified, `TroughClient` will spawn a
-                thread that "promotes" (pushed to hdfs) "dirty" trough segments
-                (segments that have received writes) periodically, sleeping for
-                `promotion_interval` seconds between cycles (default None)
-        '''
-        parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
-        self.rr = doublethink.Rethinker(
-                servers=parsed.hosts, db=parsed.database)
-        self.svcreg = doublethink.ServiceRegistry(self.rr)
-        self._write_url_cache = {}
-        self._read_url_cache = {}
-        self._dirty_segments = set()
-        self._dirty_segments_lock = threading.RLock()
-
-        self.promotion_interval = promotion_interval
-        self._promoter_thread = None
-        if promotion_interval:
-            self._promoter_thread = threading.Thread(
-                    target=self._promotrix, name='TroughClient-promoter')
-            self._promoter_thread.setDaemon(True)
-            self._promoter_thread.start()
-
-    def _promotrix(self):
-        while True:
-            time.sleep(self.promotion_interval)
-            try:
-                with self._dirty_segments_lock:
-                    dirty_segments = list(self._dirty_segments)
-                    self._dirty_segments.clear()
-                logging.info(
-                        'promoting %s trough segments', len(dirty_segments))
-                for segment_id in dirty_segments:
-                    try:
-                        self.promote(segment_id)
-                    except:
-                        logging.error(
-                                'problem promoting segment %s', segment_id,
-                                exc_info=True)
-            except:
-                logging.error(
-                        'caught exception doing segment promotion',
-                        exc_info=True)
-
-    def promote(self, segment_id):
-        url = os.path.join(self.segment_manager_url(), 'promote')
-        payload_dict = {'segment': segment_id}
-        response = requests.post(url, json=payload_dict, timeout=21600)
-        if response.status_code != 200:
-            raise Exception(
-                    'Received %s: %r in response to POST %s with data %s' % (
-                        response.status_code, response.text, url,
-                        json.dumps(payload_dict)))
-
-    @staticmethod
-    def sql_value(x):
-        if x is None:
-            return 'null'
-        elif isinstance(x, datetime.datetime):
-            return 'datetime(%r)' % x.isoformat()
-        elif isinstance(x, bool):
-            return int(x)
-        elif isinstance(x, str) or isinstance(x, bytes):
-            # the only character that needs escaped in sqlite string literals
-            # is single-quote, which is escaped as two single-quotes
-            if isinstance(x, bytes):
-                s = x.decode('utf-8')
-            else:
-                s = x
-            return "'" + s.replace("'", "''") + "'"
-        elif isinstance(x, (int, float)):
-            return x
-        else:
-            raise Exception(
-                    "don't know how to make an sql value from %r (%r)" % (
-                        x, type(x)))
-
-    def segment_manager_url(self):
-        master_node = self.svcreg.unique_service('trough-sync-master')
-        assert master_node
-        return master_node['url']
-
-    def write_url_nocache(self, segment_id, schema_id='default'):
-        provision_url = os.path.join(self.segment_manager_url(), 'provision')
-        payload_dict = {'segment': segment_id, 'schema': schema_id}
-        response = requests.post(provision_url, json=payload_dict, timeout=600)
-        if response.status_code != 200:
-            raise Exception(
-                    'Received %s: %r in response to POST %s with data %s' % (
-                        response.status_code, response.text, provision_url,
-                        json.dumps(payload_dict)))
-        result_dict = response.json()
-        # assert result_dict['schema'] == schema_id  # previously provisioned?
-        return result_dict['write_url']
-
-    def read_url_nocache(self, segment_id):
-        reql = self.rr.table('services').get_all(
-                segment_id, index='segment').filter(
-                        {'role':'trough-read'}).filter(
-                                lambda svc: r.now().sub(
-                                    svc['last_heartbeat']).lt(svc['ttl'])
-                                ).order_by('load')
-        self.logger.debug('querying rethinkdb: %r', reql)
-        results = reql.run()
-        if results:
-            return results[0]['url']
-        else:
-            return None
-
-    def write_url(self, segment_id, schema_id='default'):
-        if not segment_id in self._write_url_cache:
-            self._write_url_cache[segment_id] = self.write_url_nocache(
-                    segment_id, schema_id)
-            self.logger.info(
-                    'segment %r write url is %r', segment_id,
-                    self._write_url_cache[segment_id])
-        return self._write_url_cache[segment_id]
-
-    def read_url(self, segment_id):
-        if not self._read_url_cache.get(segment_id):
-            self._read_url_cache[segment_id] = self.read_url_nocache(segment_id)
-            self.logger.info(
-                    'segment %r read url is %r', segment_id,
-                    self._read_url_cache[segment_id])
-        return self._read_url_cache[segment_id]
-
-    def write(self, segment_id, sql_tmpl, values=(), schema_id='default'):
-        write_url = self.write_url(segment_id, schema_id)
-        sql = sql_tmpl % tuple(self.sql_value(v) for v in values)
-        sql_bytes = sql.encode('utf-8')
-
-        try:
-            response = requests.post(
-                    write_url, sql_bytes, timeout=600,
-                    headers={'content-type': 'application/sql;charset=utf-8'})
-            if response.status_code != 200:
-                raise Exception(
-                        'Received %s: %r in response to POST %s with data %r' % (
-                            response.status_code, response.text, write_url, sql))
-            if segment_id not in self._dirty_segments:
-                with self._dirty_segments_lock:
-                    self._dirty_segments.add(segment_id)
-        except:
-            self._write_url_cache.pop(segment_id, None)
-            self.logger.error(
-                    'problem with trough write url %r', write_url,
-                    exc_info=True)
-            return
-        if response.status_code != 200:
-            self._write_url_cache.pop(segment_id, None)
-            self.logger.warn(
-                    'unexpected response %r %r %r from %r to sql=%r',
-                    response.status_code, response.reason, response.text,
-                    write_url, sql)
-            return
-        self.logger.debug('posted to %s: %r', write_url, sql)
-
-    def read(self, segment_id, sql_tmpl, values=()):
-        read_url = self.read_url(segment_id)
-        if not read_url:
-            return None
-        sql = sql_tmpl % tuple(self.sql_value(v) for v in values)
-        sql_bytes = sql.encode('utf-8')
-        try:
-            response = requests.post(
-                    read_url, sql_bytes, timeout=600,
-                    headers={'content-type': 'application/sql;charset=utf-8'})
-        except:
-            self._read_url_cache.pop(segment_id, None)
-            self.logger.error(
-                    'problem with trough read url %r', read_url, exc_info=True)
-            return None
-        if response.status_code != 200:
-            self._read_url_cache.pop(segment_id, None)
-            self.logger.warn(
-                    'unexpected response %r %r %r from %r to sql=%r',
-                    response.status_code, response.reason, response.text,
-                    read_url, sql)
-            return None
-        self.logger.trace(
-                'got %r from posting query %r to %r', response.text, sql,
-                read_url)
-        results = json.loads(response.text)
-        return results
-
-    def schema_exists(self, schema_id):
-        url = os.path.join(self.segment_manager_url(), 'schema', schema_id)
-        response = requests.get(url, timeout=60)
-        if response.status_code == 200:
-            return True
-        elif response.status_code == 404:
-            return False
-        else:
-            response.raise_for_status()
-
-    def register_schema(self, schema_id, sql):
-        url = os.path.join(
-                self.segment_manager_url(), 'schema', schema_id, 'sql')
-        response = requests.put(url, sql, timeout=600)
-        if response.status_code not in (201, 204):
-            raise Exception(
-                    'Received %s: %r in response to PUT %r with data %r' % (
-                        response.status_code, response.text, sql, url))
warcprox/warc.py

@@ -125,48 +125,59 @@ class WarcRecordBuilder:
             headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
         if content_type is not None:
             headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
-        if payload_digest is not None:
-            headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
         # truncated value may be 'length' or 'time'
         if truncated is not None:
             headers.append((b'WARC-Truncated', truncated))
+        if content_length is not None:
+            headers.append((
+                warctools.WarcRecord.CONTENT_LENGTH,
+                str(content_length).encode('latin1')))
+
         if recorder is not None:
-            if content_length is not None:
-                headers.append((
-                    warctools.WarcRecord.CONTENT_LENGTH,
-                    str(content_length).encode('latin1')))
-            else:
+            if payload_digest is not None:
+                headers.append(
+                    (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
+            if content_length is None:
                 headers.append((
                     warctools.WarcRecord.CONTENT_LENGTH,
                     str(len(recorder)).encode('latin1')))
             headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                 warcprox.digest_str(recorder.block_digest, self.base32)))
             recorder.tempfile.seek(0)
-            record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
+            record = warctools.WarcRecord(
+                headers=headers, content_file=recorder.tempfile)
         else:
-            if content_length is not None:
-                headers.append((
-                    warctools.WarcRecord.CONTENT_LENGTH,
-                    str(content_length).encode('latin1')))
-            else:
+            if content_length is None:
                 headers.append((
                     warctools.WarcRecord.CONTENT_LENGTH,
                     str(len(data)).encode('latin1')))
-            # no http headers so block digest == payload digest
-            if not payload_digest:
-                payload_digest = warcprox.digest_str(
+
+            block_digest = None
+            if not hasattr(data, 'read'):
+                block_digest = warcprox.digest_str(
                     hashlib.new(self.digest_algorithm, data), self.base32)
-                headers.append((
-                    warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
-            headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest))
+
+            if not content_type.lower().startswith(b'application/http'):
+                # no http headers, so block digest == payload digest
+                if payload_digest and not block_digest:
+                    block_digest = payload_digest
+                elif block_digest and not payload_digest:
+                    payload_digest = block_digest
+
+            if block_digest:
+                headers.append(
+                    (warctools.WarcRecord.BLOCK_DIGEST, block_digest))
+            if payload_digest:
+                headers.append(
+                    (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
+
             if hasattr(data, 'read'):
                 record = warctools.WarcRecord(
                     headers=headers, content_file=data)
             else:
                 content_tuple = content_type, data
                 record = warctools.WarcRecord(
-                    headers=headers, content=content_tuple)
+                    headers=headers, content=(content_type, data))

         return record
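For context on the block-digest/payload-digest handling above: when a record carries raw data with no HTTP headers, the two digests coincide, and each digest is a labelled hash (for example base32-encoded SHA-1). The snippet below is an illustrative stand-alone computation under that assumption; it is not the warcprox.digest_str implementation.

    import base64
    import hashlib

    def labelled_digest(data, algorithm='sha1', base32=True):
        # Hash the bytes and prefix the value with the algorithm name,
        # as WARC digest headers conventionally do.
        h = hashlib.new(algorithm, data)
        if base32:
            value = base64.b32encode(h.digest()).decode('ascii')
        else:
            value = h.hexdigest()
        return '%s:%s' % (h.name, value)

    print(labelled_digest(b'hello warc'))  # prints something like 'sha1:<base32 digest>'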
warcprox/warcproxy.py

@@ -2,7 +2,7 @@
 warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic,
 enqueue info on the recorded url queue

-Copyright (C) 2013-2018 Internet Archive
+Copyright (C) 2013-2022 Internet Archive

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -38,15 +38,16 @@ import logging
 import json
 import socket
 from hanzo import warctools
-from certauth.certauth import CertificateAuthority
 import warcprox
 import datetime
 import urlcanon
 import os
-from urllib3 import PoolManager
 import tempfile
 import hashlib
 import doublethink
+import re
+import zlib
+import base64

 class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     '''
@@ -167,7 +168,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         if warcprox_meta and 'warc-prefix' in warcprox_meta and (
                 '/' in warcprox_meta['warc-prefix']
                 or '\\' in warcprox_meta['warc-prefix']):
-            raise Exception(
+            raise warcprox.BadRequest(
                     "request rejected by warcprox: slash and backslash are not "
                     "permitted in warc-prefix")

@@ -176,6 +177,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
             warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
             self._security_check(warcprox_meta)
             self._enforce_limits(warcprox_meta)
+            if 'compressed_blocks' in warcprox_meta:
+                # b64decode and decompress
+                blocks_decompressed = zlib.decompress(base64.b64decode(warcprox_meta['compressed_blocks']))
+                # decode() and json.loads
+                warcprox_meta['blocks'] = json.loads(blocks_decompressed.decode())
+                # delete compressed_blocks (just in case?)
+                del warcprox_meta['compressed_blocks']
             self._enforce_blocks(warcprox_meta)

     def _connect_to_remote_server(self):
@@ -189,16 +197,21 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         self._enforce_limits_and_blocks()
         return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)

-    def _proxy_request(self):
-        warcprox_meta = None
+    def _parse_warcprox_meta(self):
+        '''
+        :return: Warcprox-Meta request header value as a dictionary, or None
+        '''
         raw_warcprox_meta = self.headers.get('Warcprox-Meta')
         self.logger.trace(
                 'request for %s Warcprox-Meta header: %s', self.url,
                 raw_warcprox_meta)
         if raw_warcprox_meta:
-            warcprox_meta = json.loads(raw_warcprox_meta)
-            del self.headers['Warcprox-Meta']
+            return json.loads(raw_warcprox_meta)
+        else:
+            return None

+    def _proxy_request(self):
+        warcprox_meta = self._parse_warcprox_meta()
         remote_ip = self._remote_server_conn.sock.getpeername()[0]
         timestamp = doublethink.utcnow()
         extra_response_headers = {}
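The compressed_blocks handling above is the receiving side of an optimization that lets a client send a large 'blocks' rule list compactly inside the Warcprox-Meta header: the list is zlib-compressed and base64-encoded. Below is a hedged sketch of what the sending side could look like; the rule contents and header composition are illustrative assumptions that simply mirror the decode path shown in the diff.

    import base64
    import json
    import zlib

    # hypothetical block rules; the exact rule schema is not defined here
    blocks = [{'domain': 'example.com'}, {'url_match': 'STRING_MATCH', 'value': '/private/'}]

    # compress + base64-encode, the inverse of zlib.decompress(base64.b64decode(...)) above
    compressed_blocks = base64.b64encode(
            zlib.compress(json.dumps(blocks).encode())).decode('ascii')

    warcprox_meta = {'warc-prefix': 'my-crawl', 'compressed_blocks': compressed_blocks}
    headers = {'Warcprox-Meta': json.dumps(warcprox_meta)}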
@ -345,15 +358,43 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
|
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
def send_error(self, code, message=None, explain=None, exception=None):
|
||||||
|
super().send_error(code, message=message, explain=explain, exception=exception)
|
||||||
|
|
||||||
|
# If error happens during CONNECT handling and before the inner request, self.url
|
||||||
|
# is unset, and self.path is something like 'example.com:443'
|
||||||
|
urlish = self.url or self.path
|
||||||
|
|
||||||
|
warcprox_meta = self._parse_warcprox_meta()
|
||||||
|
self._swallow_hop_by_hop_headers()
|
||||||
|
request_data = self._build_request()
|
||||||
|
|
||||||
|
failed_url = FailedUrl(
|
||||||
|
url=urlish,
|
||||||
|
request_data=request_data,
|
||||||
|
warcprox_meta=warcprox_meta,
|
||||||
|
status=code,
|
||||||
|
client_ip=self.client_address[0],
|
||||||
|
method=self.command,
|
||||||
|
timestamp=doublethink.utcnow(),
|
||||||
|
host=self.hostname,
|
||||||
|
duration=None,
|
||||||
|
referer=self.headers.get('referer'),
|
||||||
|
do_not_archive=True,
|
||||||
|
message=message,
|
||||||
|
exception=exception)
|
||||||
|
|
||||||
|
self.server.recorded_url_q.put(failed_url)
|
||||||
|
|
||||||
def log_message(self, fmt, *args):
|
def log_message(self, fmt, *args):
|
||||||
# logging better handled elsewhere?
|
# logging better handled elsewhere?
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
RE_MIMETYPE = re.compile(r'[;\s]')
|
||||||
|
|
||||||
class RecordedUrl:
|
class RequestedUrl:
|
||||||
logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
|
logger = logging.getLogger("warcprox.warcproxy.RequestedUrl")
|
||||||
|
def __init__(self, url, request_data, response_recorder=None, remote_ip=None,
|
||||||
def __init__(self, url, request_data, response_recorder, remote_ip,
|
|
||||||
warcprox_meta=None, content_type=None, custom_type=None,
|
warcprox_meta=None, content_type=None, custom_type=None,
|
||||||
status=None, size=None, client_ip=None, method=None,
|
status=None, size=None, client_ip=None, method=None,
|
||||||
timestamp=None, host=None, duration=None, referer=None,
|
timestamp=None, host=None, duration=None, referer=None,
|
||||||
@ -366,19 +407,20 @@ class RecordedUrl:
|
|||||||
else:
|
else:
|
||||||
self.url = url
|
self.url = url
|
||||||
|
|
||||||
if type(remote_ip) is not bytes:
|
|
||||||
self.remote_ip = remote_ip.encode('ascii')
|
|
||||||
else:
|
|
||||||
self.remote_ip = remote_ip
|
|
||||||
|
|
||||||
self.request_data = request_data
|
self.request_data = request_data
|
||||||
self.response_recorder = response_recorder
|
self.response_recorder = response_recorder
|
||||||
|
|
||||||
if warcprox_meta:
|
if warcprox_meta:
|
||||||
if 'captures-bucket' in warcprox_meta:
|
if 'captures-bucket' in warcprox_meta:
|
||||||
# backward compatibility
|
# backward compatibility
|
||||||
warcprox_meta['dedup-bucket'] = warcprox_meta['captures-bucket']
|
warcprox_meta['dedup-buckets'] = {}
|
||||||
|
warcprox_meta['dedup-buckets'][warcprox_meta['captures-bucket']] = 'rw'
|
||||||
del warcprox_meta['captures-bucket']
|
del warcprox_meta['captures-bucket']
|
||||||
|
if 'dedup-bucket' in warcprox_meta:
|
||||||
|
# more backwards compatibility
|
||||||
|
warcprox_meta['dedup-buckets'] = {}
|
||||||
|
warcprox_meta['dedup-buckets'][warcprox_meta['dedup-bucket']] = 'rw'
|
||||||
|
del warcprox_meta['dedup-bucket']
|
||||||
self.warcprox_meta = warcprox_meta
|
self.warcprox_meta = warcprox_meta
|
||||||
else:
|
else:
|
||||||
self.warcprox_meta = {}
|
self.warcprox_meta = {}
|
||||||
@ -387,9 +429,8 @@ class RecordedUrl:
|
|||||||
|
|
||||||
self.mimetype = content_type
|
self.mimetype = content_type
|
||||||
if self.mimetype:
|
if self.mimetype:
|
||||||
n = self.mimetype.find(";")
|
# chop off subtype, and ensure there's no whitespace
|
||||||
if n >= 0:
|
self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
|
||||||
self.mimetype = self.mimetype[:n]
|
|
||||||
|
|
||||||
self.custom_type = custom_type
|
self.custom_type = custom_type
|
||||||
self.status = status
|
self.status = status
|
||||||
@@ -405,6 +446,43 @@ class RecordedUrl:
         self.warc_records = warc_records
         self.do_not_archive = do_not_archive
 
+class FailedUrl(RequestedUrl):
+    logger = logging.getLogger("warcprox.warcproxy.FailedUrl")
+
+    def __init__(self, url, request_data, warcprox_meta=None, status=None,
+            client_ip=None, method=None, timestamp=None, host=None, duration=None,
+            referer=None, do_not_archive=True, message=None, exception=None):
+
+        super().__init__(url, request_data, warcprox_meta=warcprox_meta,
+                status=status, client_ip=client_ip, method=method,
+                timestamp=timestamp, host=host, duration=duration,
+                referer=referer, do_not_archive=do_not_archive)
+
+        self.message = message
+        self.exception = exception
+
+class RecordedUrl(RequestedUrl):
+    logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
+
+    def __init__(self, url, request_data, response_recorder, remote_ip,
+            warcprox_meta=None, content_type=None, custom_type=None,
+            status=None, size=None, client_ip=None, method=None,
+            timestamp=None, host=None, duration=None, referer=None,
+            payload_digest=None, truncated=None, warc_records=None,
+            do_not_archive=False):
+
+        super().__init__(url, request_data, response_recorder=response_recorder,
+                warcprox_meta=warcprox_meta, content_type=content_type,
+                custom_type=custom_type, status=status, size=size, client_ip=client_ip,
+                method=method, timestamp=timestamp, host=host, duration=duration,
+                referer=referer, payload_digest=payload_digest, truncated=truncated,
+                warc_records=warc_records, do_not_archive=do_not_archive)
+
+        if type(remote_ip) is not bytes:
+            self.remote_ip = remote_ip.encode('ascii')
+        else:
+            self.remote_ip = remote_ip
+
     def is_text(self):
         """Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types
         Alternative method: try to decode('ascii') first N bytes to make sure
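With this hunk the old `RecordedUrl` is split into a `RequestedUrl` base class plus two subclasses: `RecordedUrl` for fetches that produced a response (and so carry a `remote_ip`), and `FailedUrl` for fetches that errored out, which default to `do_not_archive=True`. A hedged construction example with made-up field values:

```python
# Illustrative only: how the proxy might represent a fetch that failed;
# the URL, request bytes and error below are made up.
from warcprox.warcproxy import FailedUrl

failed = FailedUrl(
        b'http://example.com/', b'GET / HTTP/1.1\r\n\r\n',
        status=502, method='GET', host='example.com',
        message='connection refused', exception=ConnectionRefusedError())
assert failed.do_not_archive        # defaults to True for failed fetches
assert failed.warcprox_meta == {}   # no Warcprox-Meta header was supplied
```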
@@ -420,51 +498,20 @@ class RecordedUrl:
 # inherit from object so that multiple inheritance from this class works
 # properly in python 2
 # http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
-class SingleThreadedWarcProxy(http_server.HTTPServer, object):
+class SingleThreadedWarcProxy(warcprox.mitmproxy.SingleThreadedMitmProxy):
     logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
 
     def __init__(
             self, stats_db=None, status_callback=None,
             options=warcprox.Options()):
         self.start_time = doublethink.utcnow()
 
+        warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
+                self, WarcProxyHandler, options)
+
         self.status_callback = status_callback
         self.stats_db = stats_db
-        self.options = options
-        self.remote_connection_pool = PoolManager(
-            num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200)
-        server_address = (
-            options.address or 'localhost',
-            options.port if options.port is not None else 8000)
-
-        if options.onion_tor_socks_proxy:
-            try:
-                host, port = options.onion_tor_socks_proxy.split(':')
-                WarcProxyHandler.onion_tor_socks_proxy_host = host
-                WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
-            except ValueError:
-                WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
-                WarcProxyHandler.onion_tor_socks_proxy_port = None
-
-        if options.socket_timeout:
-            WarcProxyHandler._socket_timeout = options.socket_timeout
-        if options.max_resource_size:
-            WarcProxyHandler._max_resource_size = options.max_resource_size
-        if options.tmp_file_max_memory_size:
-            WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size
-
-        http_server.HTTPServer.__init__(
-            self, server_address, WarcProxyHandler, bind_and_activate=True)
-
-        self.digest_algorithm = options.digest_algorithm or 'sha1'
-
-        ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
-        self.ca = CertificateAuthority(
-            ca_file=options.cacert or 'warcprox-ca.pem',
-            certs_dir=options.certs_dir or './warcprox-ca',
-            ca_name=ca_name)
-
         self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
 
         self.running_stats = warcprox.stats.RunningStats()
 
     def status(self):
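After this refactor `SingleThreadedWarcProxy` no longer does its own socket binding, CA setup, timeout handling and Tor/SOCKS configuration; that all moves into `warcprox.mitmproxy.SingleThreadedMitmProxy.__init__`. A minimal construction sketch, assuming `warcprox.Options` accepts these keyword arguments (it is an argparse-style namespace):

```python
# Minimal sketch; binds the listening socket when constructed. The Options
# keyword arguments here are assumptions, not taken from this diff.
import warcprox
from warcprox.warcproxy import SingleThreadedWarcProxy

options = warcprox.Options(address='localhost', port=8000)
proxy = SingleThreadedWarcProxy(options=options)
# server address, CA, connection pool and timeouts are now configured by
# warcprox.mitmproxy.SingleThreadedMitmProxy.__init__, not by this class
```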
@@ -530,6 +577,6 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
         self.remote_connection_pool.clear()
 
     def handle_error(self, request, client_address):
-        self.logger.warn(
+        self.logger.warning(
             "exception processing request %s from %s", request,
             client_address, exc_info=True)
@@ -51,10 +51,14 @@ class WarcWriter:
         self.finalname = None
         self.gzip = options.gzip or False
         self.prefix = options.prefix or 'warcprox'
+        self.port = options.port or 8000
         self.open_suffix = '' if options.no_warc_open_suffix else '.open'
         self.rollover_size = options.rollover_size or 1000000000
         self.rollover_idle_time = options.rollover_idle_time or None
-        self.directory = options.directory or './warcs'
+        if options.subdir_prefix and options.prefix:
+            self.directory = os.path.sep.join([options.directory, options.prefix]) or './warcs'
+        else:
+            self.directory = options.directory or './warcs'
         self.filename_template = options.warc_filename or \
                 '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
         self.last_activity = time.time()
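New behavior in the hunk above: when `options.subdir_prefix` is set and a prefix is configured, WARCs land in a per-prefix subdirectory of the warcs directory instead of directly in it. A quick path sketch with made-up option values:

```python
import os

# Made-up option values, mirroring the branch added above.
directory, prefix, subdir_prefix = './warcs', 'my-job', True

if subdir_prefix and prefix:
    directory = os.path.sep.join([directory, prefix])
print(directory)  # ./warcs/my-job on POSIX
```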
@@ -67,7 +71,7 @@ class WarcWriter:
         """WARC filename is configurable with CLI parameter --warc-filename.
         Default: '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
         Available variables are: prefix, timestamp14, timestamp17, serialno,
-        randomtoken, hostname, shorthostname.
+        randomtoken, hostname, shorthostname, port.
         Extension ``.warc`` or ``.warc.gz`` is appended automatically.
         """
         hostname = socket.getfqdn()
@@ -77,7 +81,7 @@ class WarcWriter:
                 timestamp17=warcprox.timestamp17(),
                 serialno='{:05d}'.format(serial),
                 randomtoken=self.randomtoken, hostname=hostname,
-                shorthostname=shorthostname)
+                shorthostname=shorthostname, port=self.port)
         if self.gzip:
             fname = fname + '.warc.gz'
         else:
|
|||||||
'''
|
'''
|
||||||
Ensures `self.f` is ready to write the next warc record.
|
Ensures `self.f` is ready to write the next warc record.
|
||||||
|
|
||||||
Closes current warc if size limit has been reached. Then, if warc is
|
If warc is not open, opens one, and writes the warcinfo record.
|
||||||
not open, opens one, and writes the warcinfo record.
|
|
||||||
'''
|
'''
|
||||||
self.maybe_size_rollover()
|
|
||||||
if not self.f:
|
if not self.f:
|
||||||
serial = self.serial
|
serial = self.serial
|
||||||
self.serial += 1
|
self.serial += 1
|
||||||
@@ -136,11 +138,14 @@ class WarcWriter:
         records = self.record_builder.build_warc_records(recorded_url)
 
         self.ensure_open()
+        total_warc_file_size = None
         for record in records:
             offset = self.f.tell()
             record.write_to(self.f, gzip=self.gzip)
             record.offset = offset
-            record.length = self.f.tell() - offset
+            offset2 = self.f.tell()
+            record.length = offset2 - offset
+            total_warc_file_size = offset2
             record.warc_filename = self.finalname
             self.logger.trace(
                 'wrote warc record: warc_type=%s content_length=%s '
@@ -150,7 +155,8 @@ class WarcWriter:
                 self.path, record.get_header(warctools.WarcRecord.URL))
         self.f.flush()
         self.last_activity = time.time()
+        # Closes current warc if size limit has been reached.
+        self.maybe_size_rollover(total_warc_file_size)
         return records
 
     def close(self):
@@ -165,14 +171,17 @@ class WarcWriter:
         if self.open_suffix == '':
             try:
                 fcntl.lockf(self.f, fcntl.LOCK_UN)
-            except IOError as exc:
+            except Exception as exc:
                 self.logger.error(
                     'could not unlock file %s (%s)', self.path, exc)
-        self.f.close()
-        finalpath = os.path.sep.join(
-            [self.directory, self.finalname])
-        os.rename(self.path, finalpath)
+        try:
+            self.f.close()
+            finalpath = os.path.sep.join(
+                [self.directory, self.finalname])
+            os.rename(self.path, finalpath)
+        except Exception as exc:
+            self.logger.error(
+                'could not close and rename file %s (%s)', self.path, exc)
         self.path = None
         self.f = None
 
@@ -185,11 +194,11 @@ class WarcWriter:
                 self.finalname, time.time() - self.last_activity)
             self.close()
 
-    def maybe_size_rollover(self):
-        if self.path and os.path.getsize(self.path) > self.rollover_size:
+    def maybe_size_rollover(self, total_warc_file_size):
+        if total_warc_file_size and total_warc_file_size > self.rollover_size:
             self.logger.info(
                 'rolling over %s because it has reached %s bytes in size',
-                self.finalname, os.path.getsize(self.path))
+                self.finalname, total_warc_file_size)
             self.close()
 
 class WarcWriterPool:
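Net effect of the writer hunks: instead of calling `os.path.getsize()` for every rollover check, the writer tracks the file size from the `tell()` offsets it already has after writing each record, and `maybe_size_rollover()` is now called once per URL after the records are flushed. A small standalone sketch of that bookkeeping:

```python
# Standalone sketch of the new size bookkeeping: track bytes written via
# tell() offsets rather than stat()ing the WARC file for every record.
import io

rollover_size = 1000
f = io.BytesIO()
total_warc_file_size = None
for payload in (b'a' * 400, b'b' * 700):
    offset = f.tell()
    f.write(payload)
    total_warc_file_size = f.tell()  # value maybe_size_rollover() now receives
if total_warc_file_size and total_warc_file_size > rollover_size:
    print('would roll over at', total_warc_file_size, 'bytes')
```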
@@ -72,6 +72,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
         self.close_prefix_reqs.put(prefix)
 
     def _process_url(self, recorded_url):
+        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
+            return
         try:
             records = []
             if self._should_archive(recorded_url):
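Since `FailedUrl` objects now travel down the postfetch queue, the writer processor returns early for them; only `RecordedUrl` instances reach `_should_archive()` and the WARC writer. The same guard would apply to any custom postfetch processor; a hedged sketch (the class below is hypothetical, not part of warcprox):

```python
# Hypothetical custom postfetch processor using the same early-return guard
# as WarcWriterProcessor above; MyProcessor is not part of warcprox.
import warcprox
import warcprox.warcproxy

class MyProcessor(warcprox.BaseStandardPostfetchProcessor):
    def _process_url(self, recorded_url):
        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
            return  # nothing was fetched, so nothing to post-process
        ...  # handle RecordedUrl instances here
```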