Mirror of https://github.com/internetarchive/warcprox.git, synced 2025-01-18 13:22:09 +01:00
Compare commits
226 Commits
Commits (SHA1):

369e8a4657 66ad775188 fbed60ff38 d6b9058e3b bfe18aeaf1 6028e523f3 7ce00f001c 0e565889e1
01832c3cc5 ef774f5f29 c3ce3b160a 14d2a0c005 aef8ca7012 701b659510 10d36cc943 a65b8b82b9
6756ba60fa 2068c037ea f00ca5c336 c0ea6ef00f f7d4286b54 56e0b17dc9 af52dec469 848c089afa
9fd5a22502 3d653e023c 4cb8e0d5dc a20ad226cb bc0da12c48 8f0039de02 c620d7dd19 4fbf523a3e
3b5d9d8ef0 5e779af2e9 a90c9c3dd4 99a825c055 c01d58df78 6eb2bd1265 d864ea91ee 83c109bc9b
1cc08233d6 ca02c22ff7 1fd3b2c7a1 ba14480a2d 50a4f35e5f 9973d28de9 ee9e375560 c008c2eca7
7958921053 329fef31a8 d253ea85c3 8418fe10ba fcd9b2b3bd 731cfe80cc 9521042a23 daa925db17
d96dd5d842 1e3d22aba4 5ae1291e37 05daafa19e ade2373711 3a234d0cec 366ed5155f c027659001
9e8ea5bb45 bc3d1e6d00 6b372e2f3f 5d8fbf7038 a969430b37 aeecb6515f e1eddb8fa7 d7aec77597
bcaf293081 7d4c8dcb4e da089e0a92 3eeccd0016 5e5a74f204 b67f1ad0f3 e6a1a7dd7e e744075913
1476bfec8c b57ec9c589 e61099ff5f 0e23a31a31 7f406b7942 5f1c8c75fa e0732ffaf4 b8057825d8
e2e2c02802 f19ead0058 36784de174 ce1f32dc41 ae11daedc1 456698fe06 d90367f21f 8078ee7af9
c649355285 21351094ec edeae3b21a b34419543f 5e397e9bca d0b21f5dc4 36711c0148 a5e9c27223
de9219e646 5c15582be5 47731c61c1 90fba01514 a8cd53bfe4 ee6bc151e1 ca0197330d 469b41773a
91fcc054c4 3f5251ed60 f54e1b37c7 47ec5d7644 4ceebe1fa9 e88a88f247 f9c9443d2f ac959c6db5
ad652b407c fe19bb268f f77c152037 22d786f72e 52e83632dd 1f852f5f36 a34b7be431 d1b52f8d80
da9c4b0b4e af0fe2892c a09901dcef 407e890258 8460a670b2 6536516375 8f20fc014e 84a46e4323
88a7f79a7e a8cd219da7 2b408b3af0 1aa6b0c5d6 fce1c3d722 932001c921 a4253d5425 48d96fbc79
c0fcf59c86 79aab697e2 51c4f6d622 8c52bd8442 81a945e840 0abb1808b2 4ca10a22d8 740a80bfdb
c7f8a8f223 2d6eefd8c6 76abe4b753 d133565061 6ee7ab36a2 957bd079e8 8c31ec2916 bbf3fad1dc
f51f2ec225 2772b80fab 8ed93fea37 5b30dd4576 f0d2898326 89041e83b4 75e789c15f bbe41bc900
89d987a181 41d7f0be53 653dec71ae 1a8c719422 50d29bdf80 16489b99d9 dfc081fff8 ddcde36982
be7048844b 38d6e4337d de01d498cb 3298128e0c f207e32f50 5de2569430 10327d28c9 0d268659ab
5ced2588d4 98b3c1f80b 21731a2dfe 7560c0946d 2ca84ae023 4893a8eac0 c048b05d46 ac3d238a3d
0cab6fc4bf 794cc29c80 5633ae6a9c 3f08639553 a25971e06b f2eebae641 a291de086d cb2a07bff2
1e0a0ca63a df7b46d94f 436a27b19e b0367a9c82 878ab0977f c8f1c64494 6e6b43eb79 c70bf2e2b9
adca46427d 5a7a4ff710 2824ee6a5b dde2c3efda 99fb998e1d 660989939e 1133715331 53f13d3536
98f50ca296 e04ffa5a36 25281376f6 cb72af015a a780f1774c 16e3302d36 8fd1af1d04 150c1e67c6
79d09d013b 0882a2b174
.travis.yml (deleted)
@@ -1,70 +0,0 @@
sudo: required

language: python
python:
- 3.6
- 3.5
- 3.4
- 2.7
- pypy
- pypy3
- 3.7-dev
- nightly

matrix:
  allow_failures:
  - python: nightly
  - python: 3.7-dev
  - python: 2.7
  - python: pypy

addons:
  apt:
    packages:
    - tor

services:
- docker

before_install:
- sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778
- docker network create --driver=bridge trough
- docker run --detach --network=trough --hostname=rethinkdb --name=rethinkdb --publish=28015:28015 rethinkdb
- docker run --detach --network=trough --hostname=hadoop --name=hadoop chalimartines/cdh5-pseudo-distributed
- docker run --detach --network=trough --hostname=trough --name=trough --volume="$PWD/tests/run-trough.sh:/run-trough.sh" --publish=6111:6111 --publish=6112:6112 --publish=6222:6222 --publish=6444:6444 python:3.6 bash /run-trough.sh
- cat /etc/hosts
- echo | sudo tee -a /etc/hosts # travis-ci default doesn't end with a newline 🙄
- echo 127.0.0.1 rethinkdb | sudo tee -a /etc/hosts
- echo 127.0.0.1 hadoop | sudo tee -a /etc/hosts
- echo 127.0.0.1 trough | sudo tee -a /etc/hosts
- cat /etc/hosts
- ping -c2 trough

install:
- pip install . pytest requests warcio mock

before_script:
- docker exec trough bash -c 'while ! test -e /tmp/trough-read.out ; do sleep 0.5 ; done' || true
- docker logs --timestamps --details trough
- ps ww -fHe
- docker ps

script:
- py.test -v --tb=native tests
- py.test -v --tb=native --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests
- py.test -v --tb=native --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests
- py.test -v --tb=native --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests

after_script:
- ps ww -fHe
- docker exec trough cat /tmp/trough-write.out
- docker exec trough cat /tmp/trough-segment-manager-server.out
- docker exec trough cat /tmp/trough-segment-manager-local.out
- docker exec trough cat /tmp/trough-sync-server.out
- docker exec trough cat /tmp/trough-sync-local.out
- docker exec trough cat /tmp/trough-read.out

notifications:
  slack:
    secure: UJzNe+kEJ8QhNxrdqObroisJAO2ipr+Sr2+u1e2euQdIkacyX+nZ88jSk6uDKniAemSfFDI8Ty5a7++2wSbE//Hr3jOSNOJMZLzockafzvIYrq9bP7V97j1gQ4u7liWd19VBnbf0pULuwEfy/n5PdOBR/TiPrgMuYjfZseV+alo=
    secure: S1SK52178uywcWLMO4S5POdjMv1MQjR061CKprjVn2d8x5RBbg8QZtumA6Xt+pByvJzh8vk+ITHCN57tcdi51yL6Z0QauXwxwzTsZmjrhxWOybAO2uOHliqQSDgxKcbXIqJKg7Yv19eLQYWDVJVGuwlMfVBS0hOHtTTpVuLuGuc=
README.rst
@@ -1,7 +1,5 @@
Warcprox - WARC writing MITM HTTP/S proxy
*****************************************
.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
   :target: https://travis-ci.org/internetarchive/warcprox

Warcprox is an HTTP proxy designed for web archiving applications. When used in
parallel with `brozzler <https://github.com/internetarchive/brozzler>`_ it

@@ -89,12 +87,13 @@ for deduplication works similarly to deduplication by `Heritrix
4. If not found,

   a. Write ``response`` record with full payload
   b. Store new entry in deduplication database
   b. Store new entry in deduplication database (can be disabled, see
      `Warcprox-Meta HTTP request header <api.rst#warcprox-meta-http-request-header>`_)

The deduplication database is partitioned into different "buckets". URLs are
deduplicated only against other captures in the same bucket. If specified, the
``dedup-bucket`` field of the `Warcprox-Meta HTTP request header
<api.rst#warcprox-meta-http-request-header>`_ determines the bucket. Otherwise,
``dedup-buckets`` field of the `Warcprox-Meta HTTP request header
<api.rst#warcprox-meta-http-request-header>`_ determines the bucket(s). Otherwise,
the default bucket is used.

Deduplication can be disabled entirely by starting warcprox with the argument
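The bucket behavior described above is driven entirely by the ``Warcprox-Meta``
request header on traffic routed through warcprox. A minimal client-side sketch
(the proxy address, WARC prefix, and bucket names here are illustrative
assumptions, not values taken from this changeset):

    import json
    import requests

    # warcprox listening locally on its default port (assumption)
    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}

    # write to a custom warc-prefix and deduplicate read-write against
    # "bucket_a" while also consulting "shared" read-only
    warcprox_meta = {
        'warc-prefix': 'my-crawl',
        'dedup-buckets': {'bucket_a': 'rw', 'shared': 'ro'},
    }
    response = requests.get(
        'http://example.com/', proxies=proxies,
        headers={'Warcprox-Meta': json.dumps(warcprox_meta)},
        verify=False)  # warcprox re-signs TLS with its own CA
    print(response.status_code)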
__init__.py (new file, empty)
api.rst
@@ -137,14 +137,16 @@ Example::

    Warcprox-Meta: {"warc-prefix": "special-warc"}

``dedup-bucket`` (string)
``dedup-buckets`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~~~
Specifies the deduplication bucket. For more information about deduplication
Specifies the deduplication bucket(s). For more information about deduplication
see `<README.rst#deduplication>`_.

Example::
Examples::

    Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"}
    Warcprox-Meta: {"dedup-buckets":{"my-dedup-bucket":"rw"}}

    Warcprox-Meta: {"dedup-buckets":{"my-dedup-bucket":"rw", "my-read-only-dedup-bucket": "ro"}}
``blocks`` (list)
~~~~~~~~~~~~~~~~~

@@ -184,6 +186,22 @@ to evaluate the block rules. In particular, this circumstance prevails when the
browser controlled by brozzler is requesting images, javascript, css, and so
on, embedded in a page.

``compressed_blocks`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the ``blocks`` header is large, it may be useful or necessary to compress it.
``compressed_blocks`` is a string containing a zlib- and base64-encoded
``blocks`` list. If both ``blocks`` and ``compressed_blocks`` are provided,
warcprox will use the value of ``compressed_blocks``; however, this behavior
is not guaranteed.

Example::

    Warcprox-Meta: {"compressed_blocks": "eJwVykEKgCAQQNGryKwt90F0kGgxlZSgzuCMFIR3r7b//fkBkVoUBgMbJetvTBy9se5U5cFBs+aBnRKG/D8J44XF91XAGpC6ipaQj58u7iIdIfd88oSbBsrjF6gqtOUFJ5YjwQ=="}

Is equivalent to::

    {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}
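A value for ``compressed_blocks`` can be produced with the Python standard
library, since it is just the JSON ``blocks`` list compressed with zlib and
then base64-encoded. A minimal sketch (the rule list is the one from the
example above; the exact base64 output may differ byte-for-byte from the
example depending on zlib settings)::

    import base64
    import json
    import zlib

    blocks = [
        {"ssurt": "com,example,//http:/"},
        {"domain": "malware.us", "substring": "wp-login.php?action=logout"},
    ]

    # zlib-compress the JSON-serialized blocks list, then base64-encode it
    compressed = base64.b64encode(
        zlib.compress(json.dumps(blocks).encode('utf-8'))).decode('ascii')

    header_value = json.dumps({"compressed_blocks": compressed})
    # header_value can now be sent as the Warcprox-Meta request header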
``stats`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~
``stats`` is a dictionary with only one field understood by warcprox,
pyproject.toml (new file)
@@ -0,0 +1,28 @@
[project]
name = "warcprox"
authors = [
    { name="Noah Levitt", email="nlevitt@archive.org" },
]
maintainers = [
    { name="Vangelis Banos", email="vangelis@archive.org" },
    { name="Adam Miller", email="adam@archive.org" },
    { name="Barbara Miller", email="barbara@archive.org" },
    { name="Alex Dempsey", email="avdempsey@archive.org" },
]
description = "WARC writing MITM HTTP/S proxy"
readme = "README.rst"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
]
dynamic = [ "version", "license", "scripts", "dependencies", "optional-dependencies" ]

[project.urls]
Homepage = "https://github.com/internetarchive/warcprox"
Issues = "https://github.com/internetarchive/warcprox/issues"

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
setup.py
@@ -2,7 +2,7 @@
'''
setup.py - setuptools installation configuration for warcprox

Copyright (C) 2013-2018 Internet Archive
Copyright (C) 2013-2024 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License

@@ -24,15 +24,17 @@ import sys
import setuptools

deps = [
    'certauth==1.1.6',
    'warctools>=4.10.0,<=4.10.0',
    'urlcanon>=0.1.dev16,<=0.1.dev23',
    'doublethink>=0.2.0.dev87,<=0.2.0.dev88',
    'urllib3>=1.23,<=1.23',
    'requests>=2.0.1,<=2.19.1',
    'PySocks>=1.6.8,<=1.6.8',
    'cryptography>=2.3,<=2.3.1',
    'idna>=2.5,<2.8',
    'warctools>=4.10.0',
    'urlcanon>=0.3.0',
    'doublethink==0.4.9',
    'urllib3>=1.23',
    'requests>=2.0.1',
    'PySocks>=1.6.8',
    'cryptography>=39,<40',
    'idna',
    'PyYAML>=5.1',
    'cachetools',
    'rfc3986>=1.5.0',
]
try:
    import concurrent.futures

@@ -41,7 +43,7 @@ except:

setuptools.setup(
    name='warcprox',
    version='2.4b3',
    version='2.6.1',
    description='WARC writing MITM HTTP/S proxy',
    url='https://github.com/internetarchive/warcprox',
    author='Noah Levitt',

@@ -50,6 +52,8 @@ setuptools.setup(
    license='GPL',
    packages=['warcprox'],
    install_requires=deps,
    # preferred trough 'trough @ git+https://github.com/internetarchive/trough.git@jammy_focal'
    extras_require={'trough': 'trough'},
    setup_requires=['pytest-runner'],
    tests_require=['mock', 'pytest', 'warcio'],
    entry_points={

@@ -64,13 +68,12 @@ setuptools.setup(
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Topic :: Internet :: Proxy Servers',
        'Topic :: Internet :: WWW/HTTP',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: System :: Archiving',
    ])
Dockerfile
@@ -19,7 +19,7 @@
# USA.
#

FROM phusion/baseimage
FROM ubuntu:focal-20220404
MAINTAINER Noah Levitt <nlevitt@archive.org>

# see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile

@@ -28,10 +28,11 @@ MAINTAINER Noah Levitt <nlevitt@archive.org>
ENV LANG=C.UTF-8

RUN apt-get update && apt-get --auto-remove -y dist-upgrade
RUN apt-get install -y ca-certificates curl gnupg wget

# Add the RethinkDB repository and public key
RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
    && echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add -
RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \
    && apt-get update && apt-get -y install rethinkdb

RUN mkdir -vp /etc/service/rethinkdb \

@@ -57,30 +58,59 @@ RUN mkdir -vp /etc/service/tor \
    && chmod a+x /etc/service/tor/run

# hadoop hdfs for trough
RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
    && . /etc/lsb-release \
    && echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list

RUN apt-get update
RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get install -y openjdk-8-jdk openssh-server

RUN su hdfs -c 'hdfs namenode -format'
# set java home
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
    && cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
# setup ssh with no passphrase
RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \
    && cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys

RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
    && cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$, <property>\n <name>dfs.permissions.enabled</name>\n <value>false</value>\n </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \
    && tar xfz hadoop-2.7.3.tar.gz \
    && mv /hadoop-2.7.3 /usr/local/hadoop \
    && rm /hadoop-2.7.3.tar.gz

RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
    && chmod a+x /etc/my_init.d/50_start_hdfs.sh
# hadoop environment variables
ENV HADOOP_HOME=/usr/local/hadoop
ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

RUN apt-get install -y libsqlite3-dev
# hadoop-store
RUN mkdir -p $HADOOP_HOME/hdfs/namenode \
    && mkdir -p $HADOOP_HOME/hdfs/datanode

# Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html
COPY config/ /tmp/
RUN mv /tmp/ssh_config $HOME/.ssh/config \
    && mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
    && mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \
    && mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
    && mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \
    && cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \
    && mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml

# Add startup script
ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh

# set permissions
RUN chmod 744 -R $HADOOP_HOME

# format namenode
RUN $HADOOP_HOME/bin/hdfs namenode -format

# run hadoop services
#ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash

RUN apt-get install -y libsqlite3-dev build-essential

# trough itself
RUN virtualenv -p python3 /opt/trough-ve3 \
    && . /opt/trough-ve3/bin/activate \
    && pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string \
    && pip install git+https://github.com/nlevitt/snakebite.git@py3 \
    && pip install git+https://github.com/internetarchive/trough.git

RUN mkdir -vp /etc/service/trough-sync-local \

@@ -107,3 +137,4 @@ RUN mkdir -vp /etc/service/trough-segment-manager-server \
    && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \
    && chmod a+x /etc/service/trough-segment-manager-server/run

RUN apt-get install -y daemontools daemontools-run
tests/run-tests.sh
@@ -31,15 +31,18 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

docker build -t internetarchive/warcprox-tests $script_dir

docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \
    bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
        && (cd /warcprox && git diff HEAD) | patch -p1 \
        && virtualenv -p python3 /tmp/venv \
        && source /tmp/venv/bin/activate \
        && pip --log-file /tmp/pip.log install . pytest mock requests warcio \
        && py.test -v tests \
        && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
        && pip --log-file /tmp/pip.log install . pytest mock requests warcio trough \
        && py.test -v tests; \
        svscan /etc/service & \
        sleep 10; \
        py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
        && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
        && /usr/local/hadoop/hadoop-services.sh \
        && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
        "
tests/run-trough.sh
@@ -5,7 +5,7 @@

set -x

pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string
pip install git+https://github.com/nlevitt/snakebite.git@py3
pip install git+https://github.com/internetarchive/trough.git

mkdir /etc/trough
tests/test_certauth.py (new file)
@@ -0,0 +1,89 @@
import os
import shutil

from warcprox.certauth import main, CertificateAuthority
import tempfile
from OpenSSL import crypto
import datetime
import time

def setup_module():
    global TEST_CA_DIR
    TEST_CA_DIR = tempfile.mkdtemp()

    global TEST_CA_ROOT
    TEST_CA_ROOT = os.path.join(TEST_CA_DIR, 'certauth_test_ca.pem')

def teardown_module():
    shutil.rmtree(TEST_CA_DIR)
    assert not os.path.isdir(TEST_CA_DIR)
    assert not os.path.isfile(TEST_CA_ROOT)

def test_create_root():
    ret = main([TEST_CA_ROOT, '-c', 'Test Root Cert'])
    assert ret == 0

def test_create_host_cert():
    ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '-n', 'example.com'])
    assert ret == 0
    certfile = os.path.join(TEST_CA_DIR, 'example.com.pem')
    assert os.path.isfile(certfile)

def test_create_wildcard_host_cert_force_overwrite():
    ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '--hostname', 'example.com', '-w', '-f'])
    assert ret == 0
    certfile = os.path.join(TEST_CA_DIR, 'example.com.pem')
    assert os.path.isfile(certfile)

def test_explicit_wildcard():
    ca = CertificateAuthority(TEST_CA_ROOT, TEST_CA_DIR, 'Test CA')
    filename = ca.get_wildcard_cert('test.example.proxy')
    certfile = os.path.join(TEST_CA_DIR, 'example.proxy.pem')
    assert filename == certfile
    assert os.path.isfile(certfile)
    os.remove(certfile)

def test_create_already_exists():
    ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '-n', 'example.com', '-w'])
    assert ret == 1
    certfile = os.path.join(TEST_CA_DIR, 'example.com.pem')
    assert os.path.isfile(certfile)
    # remove now
    os.remove(certfile)

def test_create_root_already_exists():
    ret = main([TEST_CA_ROOT])
    # not created, already exists
    assert ret == 1
    # remove now
    os.remove(TEST_CA_ROOT)

def test_create_root_subdir():
    # create a new cert in a subdirectory
    subdir = os.path.join(TEST_CA_DIR, 'subdir')

    ca_file = os.path.join(subdir, 'certauth_test_ca.pem')

    ca = CertificateAuthority(ca_file, subdir, 'Test CA',
                              cert_not_before=-60 * 60,
                              cert_not_after=60 * 60 * 24 * 3)

    assert os.path.isdir(subdir)
    assert os.path.isfile(ca_file)

    buff = ca.get_root_PKCS12()
    assert len(buff) > 0

    expected_not_before = datetime.datetime.utcnow() - datetime.timedelta(seconds=60 * 60)
    expected_not_after = datetime.datetime.utcnow() + datetime.timedelta(seconds=60 * 60 * 24 * 3)

    cert = crypto.load_pkcs12(buff).get_certificate()

    actual_not_before = datetime.datetime.strptime(
        cert.get_notBefore().decode('ascii'), '%Y%m%d%H%M%SZ')
    actual_not_after = datetime.datetime.strptime(
        cert.get_notAfter().decode('ascii'), '%Y%m%d%H%M%SZ')

    time.mktime(expected_not_before.utctimetuple())
    assert abs((time.mktime(actual_not_before.utctimetuple()) - time.mktime(expected_not_before.utctimetuple()))) < 10
    assert abs((time.mktime(actual_not_after.utctimetuple()) - time.mktime(expected_not_after.utctimetuple()))) < 10
tests/test_warcprox.py
@@ -3,7 +3,7 @@
'''
tests/test_warcprox.py - automated tests for warcprox

Copyright (C) 2013-2018 Internet Archive
Copyright (C) 2013-2019 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License

@@ -52,6 +52,7 @@ import mock
import email.message
import socketserver
from concurrent import futures
import urllib.parse

try:
    import http.server as http_server

@@ -67,6 +68,7 @@ import certauth.certauth

import warcprox
import warcprox.main
import warcprox.crawl_log as crawl_log

try:
    import http.client as http_client

@@ -93,9 +95,11 @@ logging.basicConfig(
    stream=sys.stdout, level=logging.TRACE,
    format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
    '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

logging.getLogger("urllib3").setLevel(logging.WARN)
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
import urllib3 ; urllib3.disable_warnings()
import requests.packages.urllib3 ; requests.packages.urllib3.disable_warnings()

def wait(callback, timeout=10):
    start = time.time()

@@ -144,7 +148,7 @@ def dump_state(signum=None, frame=None):
        stack = traceback.format_stack(sys._current_frames()[th.ident])
        state_strs.append("".join(stack))

    logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))
    logging.warning("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))

signal.signal(signal.SIGQUIT, dump_state)

@@ -173,8 +177,10 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
    def build_response(self):
        m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
        if m is not None:
            special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
            seg1 = urllib.parse.unquote(m.group(1))
            seg2 = urllib.parse.unquote(m.group(2))
            special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8')
            payload = 'I am the warcprox test payload! {}!\n'.format(10*seg2).encode('utf-8')
            headers = (b'HTTP/1.1 200 OK\r\n'
                       + b'Content-Type: text/plain\r\n'
                       + special_header + b'\r\n'

@@ -279,6 +285,21 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
            payload = b'Test.'
            actual_headers = (b'Content-Type: text/plain\r\n'
                              + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
        elif self.path == '/incomplete-read':
            headers = (b'HTTP/1.1 200 OK\r\n'
                       + b'Content-Type: text/plain\r\n'
                       + b'Transfer-Encoding: chunked\r\n'
                       + b'\r\n')
            # payload = b'''1\r\na'''
            payload = chunkify(
                b'Server closes connection when client expects next chunk')
            payload = payload[:-7]
        elif self.path == '/space_in_content_type':
            payload = b'test'
            headers = (b'HTTP/1.1 200 OK\r\n'
                       + b'Content-Type: \r\n'
                       + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
                       + b'\r\n')
        else:
            payload = b'404 Not Found\n'
            headers = (b'HTTP/1.1 404 Not Found\r\n'
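The ``/incomplete-read`` handler above relies on a ``chunkify()`` helper that is
defined elsewhere in the test module and is not part of this diff. As an
illustration only, a helper with the behaviour the handler assumes (wrapping a
payload in HTTP/1.1 chunked transfer encoding, so that trimming the final bytes
simulates a connection dropped before the terminating chunk) might look like
this:

    def chunkify(buf, chunk_size=13):
        # Illustrative sketch, not the helper actually used by the test suite.
        # Emits each chunk as "<hex length>\r\n<data>\r\n" and finishes with the
        # zero-length terminating chunk "0\r\n\r\n".
        out = b''
        for i in range(0, len(buf), chunk_size):
            chunk = buf[i:i + chunk_size]
            out += ('%x' % len(chunk)).encode('ascii') + b'\r\n' + chunk + b'\r\n'
        out += b'0\r\n\r\n'
        return out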
@@ -292,7 +313,9 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
        headers, payload = self.build_response()
        self.connection.sendall(headers)
        self.connection.sendall(payload)
        if self.path in ('/missing-content-length', '/empty-response'):
        if self.path in (
                '/missing-content-length', '/empty-response',
                '/incomplete-read'):
            # server must close the connection, else client has no idea if
            # there is more data coming
            self.connection.shutdown(socket.SHUT_RDWR)

@@ -446,7 +469,7 @@ def warcprox_(request, http_daemon, https_daemon):
            logging.info('dropping rethinkdb database %r', parsed.database)
            rr.db_drop(parsed.database).run()
        except Exception as e:
            logging.warn(
            logging.warning(
                'problem deleting rethinkdb database %r: %s',
                parsed.database, e)
    logging.info('deleting working directory %r', work_dir)

@@ -777,7 +800,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
    url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port)

    # archive url1 bucket_a
    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_a"})}
    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-buckets":{"bucket_a":"rw"}})}
    response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers)
    assert response.status_code == 200
    assert response.headers['warcprox-test-header'] == 'k!'

@@ -803,7 +826,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
    assert dedup_lookup is None

    # archive url2 bucket_b
    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_b"})}
    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-buckets":{"bucket_b":""}})}
    response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers)
    assert response.status_code == 200
    assert response.headers['warcprox-test-header'] == 'k!'

@@ -844,10 +867,9 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
    # close the warc
    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"]
    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"]
    warc = writer._available_warcs.queue[0]
    warc_path = os.path.join(warc.directory, warc.finalname)
    warc_path = os.path.join(writer.directory, writer.finalname)
    assert not os.path.exists(warc_path)
    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"].close()
    assert os.path.exists(warc_path)

    # read the warc

@@ -904,6 +926,71 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
    finally:
        fh.close()

def test_dedup_buckets_readonly(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies):
    urls_before = warcprox_.proxy.running_stats.urls

    url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port)

    # archive url1
    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets_readonly",
                                            "dedup-buckets":{"bucket_1":"rw", "bucket_2":"ro"}})
              }
    response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers)
    assert response.status_code == 200
    assert response.headers['warcprox-test-header'] == 'k!'
    assert response.content == b'I am the warcprox test payload! llllllllll!\n'

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

    # check url1 in dedup db bucket_1 (rw)
    # logging.info('looking up sha1:bc3fac8847c9412f49d955e626fb58a76befbf81 in bucket_1')
    dedup_lookup = warcprox_.dedup_db.lookup(
        b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_1")
    assert dedup_lookup
    assert dedup_lookup['url'] == url1.encode('ascii')
    assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id'])
    assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date'])
    record_id = dedup_lookup['id']
    dedup_date = dedup_lookup['date']

    # check url1 not in dedup db bucket_2 (ro)
    dedup_lookup = warcprox_.dedup_db.lookup(
        b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_2")
    assert dedup_lookup is None

    # close the warc
    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"]
    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"]
    warc_path = os.path.join(writer.directory, writer.finalname)
    assert not os.path.exists(warc_path)
    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"].close()
    assert os.path.exists(warc_path)

    # read the warc
    fh = warctools.ArchiveRecord.open_archive(warc_path)
    record_iter = fh.read_records(limit=None, offsets=True)
    try:
        (offset, record, errors) = next(record_iter)
        assert record.type == b'warcinfo'

        # url1 bucket_1
        (offset, record, errors) = next(record_iter)
        assert record.type == b'response'
        assert record.url == url1.encode('ascii')
        # check for duplicate warc record headers
        assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
        assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
        (offset, record, errors) = next(record_iter)
        assert record.type == b'request'

        # that's all folks
        assert next(record_iter)[1] == None
        assert next(record_iter, None) == None

    finally:
        fh.close()

def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archiving_proxies):
    urls_before = warcprox_.proxy.running_stats.urls
    revisits_before = warcprox_.proxy.stats_db.value(

@@ -916,7 +1003,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
            http_daemon.server_port, i)
        headers = {"Warcprox-Meta": json.dumps({
            "warc-prefix":"test_dedup_buckets",
            "dedup-bucket":"bucket_%s" % i})}
            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
        pool.submit(
            requests.get, url, proxies=archiving_proxies, verify=False,
            headers=headers)

@@ -932,7 +1019,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
            http_daemon.server_port, -i - 1)
        headers = {"Warcprox-Meta": json.dumps({
            "warc-prefix":"test_dedup_buckets",
            "dedup-bucket":"bucket_%s" % i})}
            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
        pool.submit(
            requests.get, url, proxies=archiving_proxies, verify=False,
            headers=headers)

@@ -947,7 +1034,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
            http_daemon.server_port, i)
        headers = {"Warcprox-Meta": json.dumps({
            "warc-prefix":"test_dedup_buckets",
            "dedup-bucket":"bucket_%s" % i})}
            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
        pool.submit(
            requests.get, url, proxies=archiving_proxies, verify=False,
            headers=headers)

@@ -966,12 +1053,12 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
        },
        {
            "url_match": "SURT_MATCH",
            "value": "http://(localhost:%s,)/fuh/" % (http_daemon.server_port),
            "value": "http://(localhost,:%s)/fuh/" % (http_daemon.server_port),
        },
        {
            "url_match": "SURT_MATCH",
            # this rule won't match because of http scheme, https port
            "value": "http://(localhost:%s,)/fuh/" % (https_daemon.server_port),
            "value": "http://(localhost,:%s)/fuh/" % (https_daemon.server_port),
        },
        {
            "domain": "bad.domain.com",

@@ -1274,7 +1361,7 @@ def test_domain_data_soft_limit(
    warcprox_.proxy.remote_connection_pool.clear()

    # novel, pushes stats over the limit
    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port)
    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/%7E'.format(https_daemon.server_port)
    response = requests.get(
        url, proxies=archiving_proxies, headers=headers, stream=True,
        verify=False)

@@ -1401,7 +1488,7 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
    assert not 'content-length' in response.headers

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2, timeout=20)

def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
    """We try to load a 300k response but we use --max-resource-size=200000 in

@@ -1488,7 +1575,7 @@ def test_dedup_ok_flag(
    assert dedup_lookup is None

    # archive with dedup_ok:False
    request_meta = {'dedup-bucket':'test_dedup_ok_flag','dedup-ok':False}
    request_meta = {'dedup-buckets':{'test_dedup_ok_flag':''},'dedup-ok':False}
    headers = {'Warcprox-Meta': json.dumps(request_meta)}
    response = requests.get(
        url, proxies=archiving_proxies, headers=headers, verify=False)

@@ -1506,7 +1593,7 @@ def test_dedup_ok_flag(
    assert dedup_lookup is None

    # archive without dedup_ok:False
    request_meta = {'dedup-bucket':'test_dedup_ok_flag'}
    request_meta = {'dedup-buckets':{'test_dedup_ok_flag':''}}
    headers = {'Warcprox-Meta': json.dumps(request_meta)}
    response = requests.get(
        url, proxies=archiving_proxies, headers=headers, verify=False)

@@ -1612,13 +1699,11 @@ def test_controller_with_defaults():
    assert not wwp.writer_pool.default_warc_writer.record_builder.base32
    assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'


class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor):
    CHAIN_POSITION = 'early'
    def _process_url(self):
        pass


def test_load_plugin():
    options = warcprox.Options(port=0, plugins=[
        'warcprox.stats.RunningStats',

@@ -1701,7 +1786,7 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
    assert response.status_code == 200
    assert not 'via' in playback_response

    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer.path
    with open(warc, 'rb') as f:
        for record in warcio.archiveiterator.ArchiveIterator(f):
            if record.rec_headers.get_header('warc-target-uri') == url:

@@ -1714,13 +1799,13 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
    url = 'http://localhost:%s/b/b' % http_daemon.server_port
    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})}
    response = requests.get(url, proxies=archiving_proxies, headers=headers)
    assert response.status_code == 500
    assert response.status_code == 400
    assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'

    url = 'http://localhost:%s/b/c' % http_daemon.server_port
    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})}
    response = requests.get(url, proxies=archiving_proxies, headers=headers)
    assert response.status_code == 500
    assert response.status_code == 400
    assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'

def test_crawl_log(warcprox_, http_daemon, archiving_proxies):

@@ -1763,7 +1848,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
    crawl_log = open(default_crawl_log_path, 'rb').read()
    # tests will fail in year 3000 :)
    assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log)
    assert crawl_log[24:31] == b' 200 '
    assert crawl_log[31:42] == b' 54 '
    fields = crawl_log.split()

@@ -1783,7 +1868,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
    assert extra_info['contentSize'] == 145

    crawl_log_1 = open(file, 'rb').read()
    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_1)
    assert crawl_log_1[24:31] == b' 200 '
    assert crawl_log_1[31:42] == b' 54 '
    fields = crawl_log_1.split()

@@ -1821,7 +1906,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
    crawl_log_2 = open(file, 'rb').read()

    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_2)
    assert crawl_log_2[24:31] == b' 200 '
    assert crawl_log_2[31:42] == b' 54 '
    fields = crawl_log_2.split()

@@ -1854,7 +1939,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
    assert os.path.exists(file)
    crawl_log_3 = open(file, 'rb').read()
    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3)
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_3)
    assert crawl_log_3[24:31] == b' 200 '
    assert crawl_log_3[31:42] == b' 0 '
    fields = crawl_log_3.split()

@@ -1894,7 +1979,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
    assert os.path.exists(file)
    crawl_log_4 = open(file, 'rb').read()

    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4)
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_4)
    assert crawl_log_4[24:31] == b' 204 '
    assert crawl_log_4[31:42] == b' 38 '
    fields = crawl_log_4.split()
@@ -1914,6 +1999,155 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
    assert extra_info['contentSize'] == 38
    assert extra_info['method'] == 'WARCPROX_WRITE_RECORD'

    #Empty spae for Content Type
    url = 'http://localhost:%s/space_in_content_type' % http_daemon.server_port
    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_5'})}
    response = requests.get(url, proxies=archiving_proxies, headers=headers)

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 6)

    file = os.path.join(
        warcprox_.options.crawl_log_dir,
        'test_crawl_log_5-%s-%s.log' % (hostname, port))

    assert os.path.exists(file)
    crawl_log_5 = open(file, 'rb').read()
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_5)
    assert crawl_log_5[24:31] == b' 200 '
    assert crawl_log_5[31:42] == b' 4 '
    fields = crawl_log_5.split()
    assert len(fields) == 13
    assert fields[3].endswith(b'/space_in_content_type')
    assert fields[4] == b'-'
    assert fields[5] == b'-'
    assert fields[6] == b'-'
    assert fields[7] == b'-'
    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
    assert fields[9] == b'sha1:a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'
    assert fields[10] == b'-'
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))
    assert set(extra_info.keys()) == {
        'contentSize', 'warcFilename', 'warcFileOffset'}
    assert extra_info['contentSize'] == 59


    #Fetch Exception
    url = 'http://localhost-doesnt-exist:%s/connection-error' % http_daemon.server_port
    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_6'})}
    response = requests.get(url, proxies=archiving_proxies, headers=headers)

    #Verify the connection is cleaned up properly after the exception
    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
    response = requests.get(url, proxies=archiving_proxies)
    assert response.status_code == 200

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 7)

    file = os.path.join(
        warcprox_.options.crawl_log_dir,
        'test_crawl_log_6-%s-%s.log' % (hostname, port))

    assert os.path.exists(file)
    crawl_log_6 = open(file, 'rb').read()
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_6)

    #seems to vary depending on the environment
    assert crawl_log_6[24:31] == b' -6 ' or crawl_log_6[24:31] == b' -2 '
    assert crawl_log_6[31:42] == b' 0 '
    fields = crawl_log_6.split()
    assert len(fields) == 13
    assert fields[3].endswith(b'/connection-error')
    assert fields[4] == b'-'
    assert fields[5] == b'-'
    assert fields[6] == b'-'
    assert fields[7] == b'-'
    assert fields[8] == b'-'
    assert fields[9] == b'-'
    assert fields[10] == b'-'
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))
    assert set(extra_info.keys()) == {'exception'}

    #Test the same bad server to check for -404
    url = 'http://localhost-doesnt-exist:%s/connection-error' % http_daemon.server_port
    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_7'})}
    response = requests.get(url, proxies=archiving_proxies, headers=headers)

    #Verify the connection is cleaned up properly after the exception
    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
    response = requests.get(url, proxies=archiving_proxies)
    assert response.status_code == 200

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 8)

    file = os.path.join(
        warcprox_.options.crawl_log_dir,
        'test_crawl_log_7-%s-%s.log' % (hostname, port))

    assert os.path.exists(file)
    crawl_log_7 = open(file, 'rb').read()
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_7)
    assert crawl_log_7[24:31] == b' -404 '
    assert crawl_log_7[31:42] == b' 0 '
    fields = crawl_log_7.split()
    assert len(fields) == 13
    assert fields[3].endswith(b'/connection-error')
    assert fields[4] == b'-'
    assert fields[5] == b'-'
    assert fields[6] == b'-'
    assert fields[7] == b'-'
    assert fields[8] == b'-'
    assert fields[9] == b'-'
    assert fields[10] == b'-'
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))
    assert set(extra_info.keys()) == {'exception'}

    #Verify non-ascii urls are encoded properly
    url = 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port
    headers = {
        "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_8",
            "metadata":{'seed': 'http://example.com/¶-non-ascii', 'hop_path': 'L', 'brozzled_url': 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port, 'hop_via_url': 'http://чунджа.kz/b/¶-non-ascii'}}),
    }
    response = requests.get(url, proxies=archiving_proxies, headers=headers)
    assert response.status_code == 200

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 9)

    file = os.path.join(
        warcprox_.options.crawl_log_dir,
        'test_crawl_log_8-%s-%s.log' % (hostname, port))

    assert os.path.exists(file)
    crawl_log_8 = open(file, 'rb').read()
    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_8)
    assert crawl_log_8[24:31] == b' 200 '
    assert crawl_log_8[31:42] == b' 154 '
    fields = crawl_log_8.split()
    assert len(fields) == 13
    assert fields[3].endswith(b'/b/%C2%B6-non-ascii')
    assert fields[4] == b'L'
    assert fields[5].endswith(b'http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii')
    assert fields[6] == b'text/plain'
    assert fields[7] == b'-'
    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
    assert fields[9] == b'sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439'
    assert fields[10].endswith('/¶-non-ascii'.encode('utf-8'))
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))

def test_crawl_log_canonicalization():
    assert crawl_log.canonicalize_url(None) is None
    assert crawl_log.canonicalize_url("") is ''
    assert crawl_log.canonicalize_url("-") == '-'
    assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"
    assert crawl_log.canonicalize_url("Not a URL") == "Not a URL"

def test_long_warcprox_meta(
        warcprox_, http_daemon, archiving_proxies, playback_proxies):
    urls_before = warcprox_.proxy.running_stats.urls
@@ -1933,9 +2167,8 @@ def test_long_warcprox_meta(
    # check that warcprox-meta was parsed and honored ("warc-prefix" param)
    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"]
    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"]
    warc = writer._available_warcs.queue[0]
    warc_path = os.path.join(warc.directory, warc.finalname)
    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
    warc_path = os.path.join(writer.directory, writer.finalname)
    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"].close()
    assert os.path.exists(warc_path)

    # read the warc

@@ -1977,6 +2210,10 @@ def test_socket_timeout_response(
def test_empty_response(
        warcprox_, http_daemon, https_daemon, archiving_proxies,
        playback_proxies):
    # localhost:server_port was added to the `bad_hostnames_ports` cache by
    # previous tests and this causes subsequent tests to fail. We clear it.
    warcprox_.proxy.bad_hostnames_ports.clear()

    url = 'http://localhost:%s/empty-response' % http_daemon.server_port
    response = requests.get(url, proxies=archiving_proxies, verify=False)
    assert response.status_code == 502

@@ -1992,6 +2229,10 @@ def test_payload_digest(warcprox_, http_daemon):
    Tests that digest is of RFC2616 "entity body"
    (transfer-decoded but not content-decoded)
    '''
    # localhost:server_port was added to the `bad_hostnames_ports` cache by
    # previous tests and this causes subsequent tests to fail. We clear it.
    warcprox_.proxy.bad_hostnames_ports.clear()

    class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler):
        def __init__(self, url):
            self.path = url

@@ -2047,24 +2288,6 @@ def test_payload_digest(warcprox_, http_daemon):
    req, prox_rec_res = mitm.do_GET()
    assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_GZIP_SHA1

def test_trough_segment_promotion(warcprox_):
    if not warcprox_.options.rethinkdb_trough_db_url:
        return
    cli = warcprox.trough.TroughClient(
        warcprox_.options.rethinkdb_trough_db_url, 3)
    promoted = []
    def mock(segment_id):
        promoted.append(segment_id)
    cli.promote = mock
    cli.register_schema('default', 'create table foo (bar varchar(100))')
    cli.write('my_seg', 'insert into foo (bar) values ("boof")')
    assert promoted == []
    time.sleep(3)
    assert promoted == ['my_seg']
    promoted = []
    time.sleep(3)
    assert promoted == []

def test_dedup_min_text_size(http_daemon, warcprox_, archiving_proxies):
    """We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we
    try to download content smaller than these limits to make sure that it is

@@ -2118,7 +2341,7 @@ def test_dedup_min_text_size(http_daemon, warcprox_, archiving_proxies):
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

    # check that response records were written
    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer.path
    with open(warc, 'rb') as f:
        rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))
        record = next(rec_iter)

@@ -2198,7 +2421,7 @@ def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies):
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

    # check that response records were written
    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer.path
    with open(warc, 'rb') as f:
        rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))
        record = next(rec_iter)

@@ -2225,6 +2448,23 @@ def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies):
    with pytest.raises(StopIteration):
        next(rec_iter)

def test_incomplete_read(http_daemon, warcprox_, archiving_proxies):
    urls_before = warcprox_.proxy.running_stats.urls

    # see https://github.com/internetarchive/warcprox/pull/123
    url = 'http://localhost:%s/incomplete-read' % http_daemon.server_port
    with pytest.raises(requests.exceptions.ChunkedEncodingError):
        response = requests.get(
            url, proxies=archiving_proxies, verify=False, timeout=10)

    # although `requests.get` raises exception here, other clients like
    # browsers put up with the server misbehavior; warcprox does too, and will
    # record the response verbatim in the warc; this `wait()` call tests
    # that a warc record is written

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

if __name__ == '__main__':
    pytest.main()
@@ -1,7 +1,7 @@
'''
tests/test_writer.py - warcprox warc writing tests

Copyright (C) 2017 Internet Archive
Copyright (C) 2017-2019 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -36,6 +36,12 @@ import tempfile
import logging
import hashlib
import queue
import sys

logging.basicConfig(
        stream=sys.stdout, level=logging.TRACE,
        format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
        '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

def lock_file(q, filename):
    """Try to lock file and return 1 if successful, else return 0.
@@ -49,7 +55,6 @@ def lock_file(q, filename):
    except IOError:
        q.put('FAILED TO OBTAIN LOCK')


def test_warc_writer_locking(tmpdir):
    """Test if WarcWriter is locking WARC files.
    When we don't have the .open suffix, WarcWriter locks the file and the
@@ -64,7 +69,7 @@ def test_warc_writer_locking(tmpdir):

    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(Options(
        directory=dirname, no_warc_open_suffix=True, writer_threads=1))
        directory=dirname, no_warc_open_suffix=True))
    wwriter.write_records(recorded_url)
    warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
    assert warcs
@@ -75,7 +80,7 @@ def test_warc_writer_locking(tmpdir):
    p.start()
    p.join()
    assert q.get() == 'FAILED TO OBTAIN LOCK'
    wwriter.close_writer()
    wwriter.close()

    # locking must succeed after writer has closed the WARC file.
    p = Process(target=lock_file, args=(q, target_warc))
@@ -96,8 +101,7 @@ def test_special_dont_write_prefix():
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)

        wwt = warcprox.writerthread.WarcWriterProcessor(
            Options(prefix='-', writer_threads=1))
        wwt = warcprox.writerthread.WarcWriterProcessor(Options(prefix='-'))
        wwt.inq = queue.Queue(maxsize=1)
        wwt.outq = queue.Queue(maxsize=1)
        try:
@@ -131,7 +135,7 @@ def test_special_dont_write_prefix():
        wwt.join()

        wwt = warcprox.writerthread.WarcWriterProcessor(
            Options(writer_threads=1, blackout_period=60, prefix='foo'))
            Options(blackout_period=60, prefix='foo'))
        wwt.inq = queue.Queue(maxsize=1)
        wwt.outq = queue.Queue(maxsize=1)
        try:
@@ -199,14 +203,12 @@ def test_special_dont_write_prefix():
        wwt.stop.set()
        wwt.join()


def test_do_not_archive():
    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)

        wwt = warcprox.writerthread.WarcWriterProcessor(
            Options(writer_threads=1))
        wwt = warcprox.writerthread.WarcWriterProcessor()
        wwt.inq = queue.Queue(maxsize=1)
        wwt.outq = queue.Queue(maxsize=1)
        try:
@@ -240,7 +242,6 @@ def test_do_not_archive():
        wwt.stop.set()
        wwt.join()


def test_warc_writer_filename(tmpdir):
    """Test if WarcWriter is writing WARC files with custom filenames.
    """
@@ -253,11 +254,121 @@ def test_warc_writer_filename(tmpdir):

    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(Options(directory=dirname, prefix='foo',
        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}',
        writer_threads=1))
        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
    wwriter.write_records(recorded_url)
    warcs = [fn for fn in os.listdir(dirname)]
    assert warcs
    assert re.search(
        r'\d{17}_foo_\d{14}_00000.warc.open',
        wwriter._available_warcs.queue[0].path)
        r'\d{17}_foo_\d{14}_00000.warc.open', wwriter.path)

def test_close_for_prefix(tmpdir):
    wwp = warcprox.writerthread.WarcWriterProcessor(
        Options(directory=str(tmpdir)))
    wwp.inq = queue.Queue(maxsize=1)
    wwp.outq = queue.Queue(maxsize=1)

    try:
        wwp.start()

        # write a record to the default prefix
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url='http://example.com/1', content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest))
        time.sleep(0.5)
        rurl = wwp.outq.get() # wait for it to finish

        assert rurl.url == b'http://example.com/1'
        assert len(tmpdir.listdir()) == 1
        assert tmpdir.listdir()[0].basename.startswith('warcprox-')
        assert tmpdir.listdir()[0].basename.endswith('-00000.warc.open')
        assert tmpdir.listdir()[0].basename == wwp.writer_pool.default_warc_writer.finalname + '.open'

        # request close of default warc
        wwp.close_for_prefix()

        # write a record to some other prefix
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url='http://example.com/2', content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest,
            warcprox_meta={'warc-prefix': 'some-prefix'}))
        time.sleep(0.5)
        rurl = wwp.outq.get() # wait for it to finish

        assert rurl.url == b'http://example.com/2'
        assert len(tmpdir.listdir()) == 2
        basenames = sorted(f.basename for f in tmpdir.listdir())
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')

        # request close of warc with prefix
        wwp.close_for_prefix('some-prefix')

        # write another record to the default prefix
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url='http://example.com/3', content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest))
        time.sleep(0.5)
        rurl = wwp.outq.get() # wait for it to finish

        assert rurl.url == b'http://example.com/3'
        # now some-prefix warc is closed and a new default prefix warc is open
        basenames = sorted(f.basename for f in tmpdir.listdir())
        assert len(basenames) == 3
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')
        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00001.warc.open')

        # write another record to with prefix "some-prefix"
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url='http://example.com/4', content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest,
            warcprox_meta={'warc-prefix': 'some-prefix'}))
        time.sleep(0.5)
        rurl = wwp.outq.get() # wait for it to finish

        assert rurl.url == b'http://example.com/4'
        # new some-prefix warc will have a new random token and start over at
        # serial 00000
        basenames = sorted(f.basename for f in tmpdir.listdir())
        assert len(basenames) == 4
        assert basenames[0].startswith('some-prefix-')
        assert basenames[1].startswith('some-prefix-')
        # order of these two warcs depends on random token so we don't know
        # which is which
        assert basenames[0][-5:] != basenames[1][-5:]
        assert '-00000.' in basenames[0]
        assert '-00000.' in basenames[1]

        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00000.warc')
        assert basenames[3].startswith('warcprox-')
        assert basenames[3].endswith('-00001.warc.open')

    finally:
        wwp.stop.set()
        wwp.join()
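The locking behavior exercised above relies on POSIX advisory locks taken from a separate process. A rough sketch of a `lock_file` helper along those lines (the exact flags and messages in the test suite may differ; this is illustrative only):

    import fcntl

    def lock_file(q, filename):
        # Try to take an exclusive, non-blocking advisory lock on the warc
        # file and report the outcome back through the queue.
        try:
            fi = open(filename, 'ab')
            fcntl.lockf(fi, fcntl.LOCK_EX | fcntl.LOCK_NB)
            fi.close()
            q.put('OBTAINED LOCK')
        except IOError:
            q.put('FAILED TO OBTAIN LOCK')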
@@ -1,7 +1,7 @@
"""
warcprox/__init__.py - warcprox package main file, contains some utility code

Copyright (C) 2013-2018 Internet Archive
Copyright (C) 2013-2021 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -57,17 +57,6 @@ class Jsonner(json.JSONEncoder):
        else:
            return json.JSONEncoder.default(self, o)

class ThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
    '''
    `concurrent.futures.ThreadPoolExecutor` supporting a queue of limited size.

    If `max_queued` is set, calls to `submit()` will block if necessary until a
    free slot is available.
    '''
    def __init__(self, max_queued=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._work_queue = queue.Queue(maxsize=max_queued or 0)

# XXX linux-specific
def gettid():
    try:
@@ -89,14 +78,26 @@ class RequestBlockedByRule(Exception):
    def __str__(self):
        return "%s: %s" % (self.__class__.__name__, self.msg)

class BadRequest(Exception):
    '''
    Raised in case of a request deemed unacceptable by warcprox.
    '''
    def __init__(self, msg):
        self.msg = msg
    def __str__(self):
        return "%s: %s" % (self.__class__.__name__, self.msg)

class BasePostfetchProcessor(threading.Thread):
    logger = logging.getLogger("warcprox.BasePostfetchProcessor")

    def __init__(self, options=Options()):
    def __init__(self, options=Options(), controller=None, **kwargs):
        threading.Thread.__init__(self, name=self.__class__.__name__)
        self.options = options
        self.controller = controller

        self.stop = threading.Event()
        # these should be set before thread is started

        # these should be set by the caller before thread is started
        self.inq = None
        self.outq = None
        self.profiler = None
@@ -174,8 +175,10 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor):

class BaseBatchPostfetchProcessor(BasePostfetchProcessor):
    MAX_BATCH_SIZE = 500
    MAX_BATCH_SEC = 10
    MIN_BATCH_SEC = 2.0
    MAX_BATCH_SEC = 60
    MIN_BATCH_SEC = 30
    # these updated batch seconds values have resulted in fewer reported dedup
    # errors and otherwise have worked well in qa

    def _get_process_put(self):
        batch = []
@@ -216,8 +219,8 @@ class BaseBatchPostfetchProcessor(BasePostfetchProcessor):
        raise Exception('not implemented')

class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
    def __init__(self, listener, options=Options()):
        BaseStandardPostfetchProcessor.__init__(self, options)
    def __init__(self, listener, options=Options(), controller=None, **kwargs):
        BaseStandardPostfetchProcessor.__init__(self, options, controller, **kwargs)
        self.listener = listener
        self.name = listener.__class__.__name__
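The MIN_BATCH_SEC/MAX_BATCH_SEC knobs bound how long a batch processor accumulates urls before handing the batch on. A rough standalone sketch of such a loop, not warcprox's actual implementation, just the idea the constants imply:

    import queue
    import time

    class ExampleBatchLoop:
        MAX_BATCH_SIZE = 500
        MAX_BATCH_SEC = 60
        MIN_BATCH_SEC = 30

        def __init__(self):
            self.inq = queue.Queue()

        def _build_batch(self):
            # Accumulate items until the batch is full, or it has been
            # building for at least MIN_BATCH_SEC and the queue goes quiet,
            # or MAX_BATCH_SEC has elapsed.
            batch = []
            start = time.time()
            while True:
                try:
                    batch.append(self.inq.get(timeout=0.5))
                except queue.Empty:
                    if batch and time.time() - start >= self.MIN_BATCH_SEC:
                        break
                elapsed = time.time() - start
                if batch and (len(batch) >= self.MAX_BATCH_SIZE
                              or elapsed >= self.MAX_BATCH_SEC):
                    break
            return batch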
@@ -33,7 +33,7 @@ import hashlib
import threading
import datetime
import doublethink
import rethinkdb as r
from rethinkdb import RethinkDB; r = RethinkDB()
from warcprox.dedup import DedupableMixin

class RethinkCaptures:
@@ -71,7 +71,7 @@ class RethinkCaptures:
                        "unexpected result saving batch of %s: %s "
                        "entries" % (len(self._batch), result))
                    if result["replaced"] > 0 or result["unchanged"] > 0:
                        self.logger.warn(
                        self.logger.warning(
                            "inserted=%s replaced=%s unchanged=%s in big "
                            "captures table (normally replaced=0 and "
                            "unchanged=0)", result["inserted"],
@@ -148,7 +148,7 @@ class RethinkCaptures:
                    recorded_url.payload_digest.digest()
                ).decode("utf-8")
            else:
                self.logger.warn(
                self.logger.warning(
                    "digest type is %r but big captures table is indexed "
                    "by sha1",
                    recorded_url.payload_digest.name)
@@ -157,8 +157,11 @@ class RethinkCaptures:
            sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

        if (recorded_url.warcprox_meta
                and "dedup-bucket" in recorded_url.warcprox_meta):
            bucket = recorded_url.warcprox_meta["dedup-bucket"]
                and "dedup-buckets" in recorded_url.warcprox_meta):
            for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
                if not bucket_mode == 'ro':
                    # maybe this is the right thing to do here? or should we return an entry for each? or ?
                    break
        else:
            bucket = "__unspecified__"
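The change from a single `dedup-bucket` string to a `dedup-buckets` mapping lets a crawler name several buckets, each writable or read-only. A hypothetical Warcprox-Meta header illustrating the assumed shape (bucket names are invented; 'rw'/'ro' are the modes the code above checks for):

    import json

    # Dedup against two buckets: new captures are written only to the first
    # ('rw'); the second is consulted read-only ('ro').
    warcprox_meta = {
        'warc-prefix': 'job-123',
        'dedup-buckets': {
            'job-123': 'rw',
            'shared-collection': 'ro',
        },
    }
    print('Warcprox-Meta: ' + json.dumps(warcprox_meta))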
warcprox/certauth.py (new file, 278 lines)
@@ -0,0 +1,278 @@
import logging
import os
import random
from argparse import ArgumentParser
from datetime import datetime, timedelta
import threading

from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID

# =================================================================
# Valid for 3 years from now
# Max validity is 39 months:
# https://casecurity.org/2015/02/19/ssl-certificate-validity-periods-limited-to-39-months-starting-in-april/
CERT_NOT_AFTER = 3 * 365 * 24 * 60 * 60

CERTS_DIR = './ca/certs/'

CERT_NAME = 'certauth sample CA'

DEF_HASH_FUNC = hashes.SHA256()


# =================================================================
class CertificateAuthority(object):
    """
    Utility class for signing individual certificate
    with a root cert.

    Static generate_ca_root() method for creating the root cert

    All certs saved on filesystem. Individual certs are stored
    in specified certs_dir and reused if previously created.
    """

    def __init__(self, ca_file, certs_dir, ca_name,
                 overwrite=False,
                 cert_not_before=0,
                 cert_not_after=CERT_NOT_AFTER):

        assert(ca_file)
        self.ca_file = ca_file

        assert(certs_dir)
        self.certs_dir = certs_dir

        assert(ca_name)
        self.ca_name = ca_name

        self._file_created = False

        self.cert_not_before = cert_not_before
        self.cert_not_after = cert_not_after

        if not os.path.exists(certs_dir):
            os.makedirs(certs_dir)

        # if file doesn't exist or overwrite is true
        # create new root cert
        if (overwrite or not os.path.isfile(ca_file)):
            self.cert, self.key = self.generate_ca_root(ca_file, ca_name)
            self._file_created = True

        # read previously created root cert
        else:
            self.cert, self.key = self.read_pem(ca_file)

        self._lock = threading.Lock()

    def cert_for_host(self, host, overwrite=False, wildcard=False):
        with self._lock:
            host_filename = os.path.join(self.certs_dir, host) + '.pem'

            if not overwrite and os.path.exists(host_filename):
                self._file_created = False
                return host_filename

            self.generate_host_cert(host, self.cert, self.key, host_filename,
                                    wildcard)

            self._file_created = True
            return host_filename

    def get_wildcard_cert(self, cert_host):
        host_parts = cert_host.split('.', 1)
        if len(host_parts) == 2 and '.' in host_parts[1]:
            cert_host = host_parts[1]

        certfile = self.cert_for_host(cert_host,
                                      wildcard=True)

        return certfile

    def get_root_PKCS12(self):
        return serialization.pkcs12.serialize_key_and_certificates(
            name=b"root",
            key=self.key,
            cert=self.cert,
            cas=None,
            encryption_algorithm=serialization.NoEncryption()
        )

    def _make_cert(self, certname):
        subject = issuer = x509.Name([
            x509.NameAttribute(NameOID.COMMON_NAME, certname),
        ])
        cert = x509.CertificateBuilder().subject_name(
            subject
        ).issuer_name(
            issuer
        ).public_key(
            self.key.public_key()
        ).serial_number(
            random.randint(0, 2**64 - 1)
        ).not_valid_before(
            datetime.utcnow()
        ).not_valid_after(
            datetime.utcnow() + timedelta(seconds=self.cert_not_after)
        ).add_extension(
            x509.BasicConstraints(ca=True, path_length=0), critical=True,
        ).add_extension(
            x509.KeyUsage(key_cert_sign=True, crl_sign=True, digital_signature=False,
                content_commitment=False, key_encipherment=False,
                data_encipherment=False, key_agreement=False, encipher_only=False,
                decipher_only=False), critical=True
        ).add_extension(
            x509.SubjectKeyIdentifier.from_public_key(self.key.public_key()), critical=False
        ).sign(self.key, DEF_HASH_FUNC, default_backend())
        return cert

    def generate_ca_root(self, ca_file, ca_name, hash_func=DEF_HASH_FUNC):
        # Generate key
        key = rsa.generate_private_key(
            public_exponent=65537,
            key_size=2048,
            backend=default_backend()
        )

        # Generate cert
        self.key = key
        cert = self._make_cert(ca_name)

        # Write cert + key
        self.write_pem(ca_file, cert, key)
        return cert, key

    def generate_host_cert(self, host, root_cert, root_key, host_filename,
                           wildcard=False, hash_func=DEF_HASH_FUNC):

        host = host.encode('utf-8')

        # Generate CSR
        csr = x509.CertificateSigningRequestBuilder().subject_name(
            x509.Name([
                x509.NameAttribute(NameOID.COMMON_NAME, host.decode('utf-8')),
            ])
        ).sign(self.key, hash_func, default_backend())

        # Generate Cert
        cert_builder = x509.CertificateBuilder().subject_name(
            csr.subject
        ).issuer_name(
            root_cert.subject
        ).public_key(
            csr.public_key()
        ).serial_number(
            random.randint(0, 2**64 - 1)
        ).not_valid_before(
            datetime.utcnow()
        ).not_valid_after(
            datetime.utcnow() + timedelta(seconds=self.cert_not_after)
        )

        if wildcard:
            cert_builder = cert_builder.add_extension(
                x509.SubjectAlternativeName([
                    x509.DNSName(host.decode('utf-8')),
                    x509.DNSName('*.' + host.decode('utf-8')),
                ]),
                critical=False,
            )

        cert = cert_builder.sign(root_key, hash_func, default_backend())

        # Write cert + key
        self.write_pem(host_filename, cert, self.key)
        return cert, self.key

    def write_pem(self, filename, cert, key):
        with open(filename, 'wb+') as f:
            f.write(key.private_bytes(
                encoding=serialization.Encoding.PEM,
                format=serialization.PrivateFormat.TraditionalOpenSSL,
                encryption_algorithm=serialization.NoEncryption()
            ))
            f.write(cert.public_bytes(serialization.Encoding.PEM))

    def read_pem(self, filename):
        with open(filename, 'rb') as f:
            cert = x509.load_pem_x509_certificate(f.read(), default_backend())
            f.seek(0)
            key = serialization.load_pem_private_key(f.read(), password=None, backend=default_backend())

        return cert, key


# =================================================================
def main(args=None):
    parser = ArgumentParser(description='Certificate Authority Cert Maker Tools')

    parser.add_argument('root_ca_cert',
                        help='Path to existing or new root CA file')

    parser.add_argument('-c', '--certname', action='store', default=CERT_NAME,
                        help='Name for root certificate')

    parser.add_argument('-n', '--hostname',
                        help='Hostname certificate to create')

    parser.add_argument('-d', '--certs-dir', default=CERTS_DIR,
                        help='Directory for host certificates')

    parser.add_argument('-f', '--force', action='store_true',
                        help='Overwrite certificates if they already exist')

    parser.add_argument('-w', '--wildcard_cert', action='store_true',
                        help='add wildcard SAN to host: *.<host>, <host>')

    r = parser.parse_args(args=args)

    certs_dir = r.certs_dir
    wildcard = r.wildcard_cert

    root_cert = r.root_ca_cert
    hostname = r.hostname

    if not hostname:
        overwrite = r.force
    else:
        overwrite = False

    ca = CertificateAuthority(ca_file=root_cert,
                              certs_dir=r.certs_dir,
                              ca_name=r.certname,
                              overwrite=overwrite)

    # Just creating the root cert
    if not hostname:
        if ca._file_created:
            print('Created new root cert: "' + root_cert + '"')
            return 0
        else:
            print('Root cert "' + root_cert +
                  '" already exists,' + ' use -f to overwrite')
            return 1

    # Sign a certificate for a given host
    overwrite = r.force
    host_filename = ca.cert_for_host(hostname,
                                     overwrite, wildcard)

    if ca._file_created:
        print('Created new cert "' + hostname +
              '" signed by root cert ' +
              root_cert)
        return 0

    else:
        print('Cert for "' + hostname + '" already exists,' +
              ' use -f to overwrite')
        return 1


if __name__ == "__main__": #pragma: no cover
    main()
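A short usage sketch of the vendored CertificateAuthority above, the way the MITM proxy mints per-host certificates for intercepted HTTPS connections (the file paths and names here are illustrative only):

    from warcprox.certauth import CertificateAuthority

    # Create (or load) a root CA, then get a per-host certificate; host certs
    # are cached on disk in certs_dir and reused on subsequent calls.
    ca = CertificateAuthority(
        ca_file='./warcprox-ca.pem', certs_dir='./ca/certs',
        ca_name='warcprox example CA')
    host_pem = ca.cert_for_host('example.com')
    wildcard_pem = ca.get_wildcard_cert('www.example.com')  # adds *.example.com SAN
    print(host_pem, wildcard_pem)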
@@ -4,7 +4,7 @@ starting up and shutting down the various components of warcprox, and for
sending heartbeats to the service registry if configured to do so; also has
some memory profiling capabilities

Copyright (C) 2013-2018 Internet Archive
Copyright (C) 2013-2019 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -31,12 +31,12 @@ import sys
import gc
import datetime
import warcprox
import certauth
import functools
import doublethink
import importlib
import queue
import socket
import os

class Factory:
    @staticmethod
@@ -93,20 +93,24 @@ class Factory:
        return None

    @staticmethod
    def plugin(qualname, options):
    def plugin(qualname, options, controller=None):
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            try: # new plugins take `options` argument
                plugin = class_(options)
            except: # backward-compatibility
                plugin = class_()
            try:
                # new plugins take `options` and `controller` arguments
                plugin = class_(options, controller)
            except:
                try: # medium plugins take `options` argument
                    plugin = class_(options)
                except: # old plugins take no arguments
                    plugin = class_()
            # check that this is either a listener or a batch processor
            assert hasattr(plugin, 'notify') ^ hasattr(plugin, '_startup')
            return plugin
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            logging.fatal('problem with plugin class %r', qualname, exc_info=1)
            sys.exit(1)

    @staticmethod
@@ -143,10 +147,6 @@ class WarcproxController(object):
            self.playback_proxy = Factory.playback_proxy(
                    self.proxy.ca, self.options)

        # https://github.com/internetarchive/warcprox/wiki/benchmarking-number-of-threads
        if not self.options.writer_threads:
            self.options.writer_threads = 1

        self.build_postfetch_chain(self.proxy.recorded_url_q)

        self.service_registry = Factory.service_registry(options)
@@ -233,7 +233,7 @@ class WarcproxController(object):
                        crawl_logger, self.options))

        for qualname in self.options.plugins or []:
            plugin = Factory.plugin(qualname, self.options)
            plugin = Factory.plugin(qualname, self.options, self)
            if hasattr(plugin, 'notify'):
                self._postfetch_chain.append(
                        warcprox.ListenerPostfetchProcessor(
@@ -441,7 +441,12 @@ class WarcproxController(object):
                    exc_info=True)
                pass
        finally:
            self.shutdown()
            try:
                self.shutdown()
            except:
                self.logger.critical("graceful shutdown failed", exc_info=True)
                self.logger.critical("killing myself -9")
                os.kill(os.getpid(), 9)

    def _dump_profiling(self):
        import pstats, tempfile, os, io
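The three-way constructor fallback above implies a loose plugin contract: a "listener" plugin exposes notify(), is ideally constructed with (options, controller), and is wrapped in a ListenerPostfetchProcessor. A hypothetical plugin fitting the newest convention (class and module names invented for illustration):

    import logging

    class ExampleCountingPlugin:
        # Listener-style warcprox plugin sketch: it has notify() and no
        # _startup(), so the assert in Factory.plugin() treats it as a listener.
        def __init__(self, options=None, controller=None):
            self.options = options
            self.controller = controller
            self.count = 0

        def notify(self, recorded_url, records):
            # called from the postfetch chain after warc records are written
            self.count += 1
            logging.info('archived %s urls so far (last: %s)',
                         self.count, recorded_url.url)

It would presumably be enabled with warcprox's plugin option, e.g. --plugin mymodule.ExampleCountingPlugin, assuming the class is importable.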
@@ -25,6 +25,8 @@ import json
import os
import warcprox
import socket
import rfc3986
from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError

class CrawlLogger(object):
    def __init__(self, dir_, options=warcprox.Options()):
@@ -40,7 +42,12 @@ class CrawlLogger(object):
    def notify(self, recorded_url, records):
        # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
        now = datetime.datetime.utcnow()
        extra_info = {'contentSize': recorded_url.size,}
        status = self.get_artificial_status(recorded_url)
        extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
        if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
            extra_info['exception'] = str(recorded_url.exception).replace(" ", "_")
        if(hasattr(recorded_url, 'message') and recorded_url.message is not None):
            extra_info['exceptionMessage'] = str(recorded_url.message).replace(" ", "_")
        if records:
            extra_info['warcFilename'] = records[0].warc_filename
            extra_info['warcFileOffset'] = records[0].offset
@@ -51,23 +58,50 @@ class CrawlLogger(object):
            payload_digest = warcprox.digest_str(
                recorded_url.payload_digest,
                self.options.base32)
        else:
        elif records is not None and len(records) > 0:
            # WARCPROX_WRITE_RECORD request
            content_length = int(records[0].get_header(b'Content-Length'))
            payload_digest = records[0].get_header(b'WARC-Payload-Digest')
        else:
            content_length = 0
            payload_digest = '-'
        logging.info('warcprox_meta %s' , recorded_url.warcprox_meta)

        hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
        #URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
        brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
        hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))

        if hop_path is None and brozzled_url is None and hop_via_url is None:
            #No hop info headers provided
            hop_path = "-"
            via_url = recorded_url.referer or '-'
        else:
            if hop_path is None:
                hop_path = "-"
            if hop_via_url is None:
                hop_via_url = "-"
            #Prefer referer header. Otherwise use provided via_url
            via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
            logging.info('brozzled_url:%s recorded_url:%s' , brozzled_url, recorded_url.url)
            if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
                #Requested page is not the Brozzled url, thus we are an embed or redirect.
                via_url = brozzled_url
                hop_path = "B" if hop_path == "-" else "".join([hop_path,"B"])

        fields = [
            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
            '% 5s' % recorded_url.status,
            '% 5s' % status,
            '% 10s' % content_length,
            recorded_url.url,
            '-', # hop path
            recorded_url.referer or '-',
            recorded_url.mimetype or '-',
            hop_path,
            via_url,
            recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
            '-',
            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                recorded_url.timestamp,
                recorded_url.timestamp.microsecond//1000,
                recorded_url.duration.microseconds//1000),
                recorded_url.duration.microseconds//1000) if (recorded_url.timestamp is not None and recorded_url.duration is not None) else '-',
            payload_digest,
            recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
            'duplicate:digest' if records and records[0].type == b'revisit' else '-',
@@ -80,7 +114,6 @@ class CrawlLogger(object):
            except:
                pass
        line = b' '.join(fields) + b'\n'

        prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
        filename = '%s-%s-%s.log' % (
            prefix, self.hostname, self.options.server_port)
@@ -89,3 +122,43 @@ class CrawlLogger(object):
        with open(crawl_log_path, 'ab') as f:
            f.write(line)

    def get_artificial_status(self, recorded_url):
        # urllib3 Does not specify DNS errors. We must parse them from the exception string.
        # Unfortunately, the errors are reported differently on different systems.
        # https://stackoverflow.com/questions/40145631

        if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (MaxRetryError, )):
            return '-8'
        elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (NewConnectionError, )):
            exception_string=str(recorded_url.exception)
            if ("[Errno 11001] getaddrinfo failed" in exception_string or # Windows
                "[Errno -2] Name or service not known" in exception_string or # Linux
                "[Errno -3] Temporary failure in name resolution" in exception_string or # Linux
                "[Errno 8] nodename nor servname " in exception_string): # OS X
                return '-6' # DNS Failure
            else:
                return '-2' # Other Connection Failure
        elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )):
            return '-2' # Connection Timeout
        elif isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
            # synthetic status, used when some other status (such as connection-lost)
            # is considered by policy the same as a document-not-found
            # Cached failures result in FailedUrl with no Exception
            return '-404'
        else:
            return recorded_url.status

def canonicalize_url(url):
    #URL needs to be split out to separately encode the hostname from the rest of the path.
    #hostname will be idna encoded (punycode)
    #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
    if url is None or url == '-' or url == '':
        return url
    try:
        parsed_url=rfc3986.urlparse(url)
        encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna'))
        return encoded_url.unsplit()
    except (TypeError, ValueError, AttributeError) as e:
        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
        return url
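The crux of the canonicalization above is idna-encoding (punycode) the hostname so it can be compared against the plain-ascii URLs that show up in crawl logs. A tiny illustration of that step on its own, with a made-up URL:

    import rfc3986

    url = 'http://bücher.example/page'          # illustrative value
    host = rfc3986.urlparse(url).host
    print(host.encode('idna'))                  # b'xn--bcher-kva.example'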
@@ -1,7 +1,7 @@
'''
warcprox/dedup.py - identical payload digest deduplication using sqlite db

Copyright (C) 2013-2018 Internet Archive
Copyright (C) 2013-2021 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -26,7 +26,6 @@ import os
import json
from hanzo import warctools
import warcprox
import warcprox.trough
import sqlite3
import doublethink
import datetime
@@ -34,6 +33,7 @@ import urllib3
from urllib3.exceptions import HTTPError
import collections
from concurrent import futures
from functools import lru_cache

urllib3.disable_warnings()

@@ -46,11 +46,15 @@ class DedupableMixin(object):
    def should_dedup(self, recorded_url):
        """Check if we should try to run dedup on resource based on payload
        size compared with min text/binary dedup size options.
        When we use option --dedup-only-with-bucket, `dedup-bucket` is required
        When we use option --dedup-only-with-bucket, `dedup-buckets` is required
        in Warcprox-Meta to perform dedup.
        If recorded_url.do_not_archive is True, we skip dedup. This record will
        not be written to WARC anyway.
        Return Boolean.
        """
        if self.dedup_only_with_bucket and "dedup-bucket" not in recorded_url.warcprox_meta:
        if recorded_url.do_not_archive:
            return False
        if self.dedup_only_with_bucket and "dedup-buckets" not in recorded_url.warcprox_meta:
            return False
        if recorded_url.is_text():
            return recorded_url.response_recorder.payload_size() > self.min_text_size
@@ -64,14 +68,19 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
        self.dedup_db = dedup_db

    def _process_url(self, recorded_url):
        if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
            return
        if (recorded_url.response_recorder
                and recorded_url.payload_digest
                and self.should_dedup(recorded_url)):
            digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32)
            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
                recorded_url.dedup_info = self.dedup_db.lookup(
                    digest_key, recorded_url.warcprox_meta["dedup-bucket"],
                    recorded_url.url)
            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
                    recorded_url.dedup_info = self.dedup_db.lookup(
                        digest_key, bucket, recorded_url.url)
                    if recorded_url.dedup_info:
                        # we found an existing capture
                        break
            else:
                recorded_url.dedup_info = self.dedup_db.lookup(
                    digest_key, url=recorded_url.url)
@@ -147,10 +156,12 @@ class DedupDb(DedupableMixin):
                and self.should_dedup(recorded_url)):
            digest_key = warcprox.digest_str(
                recorded_url.payload_digest, self.options.base32)
            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
                self.save(
                    digest_key, records[0],
                    bucket=recorded_url.warcprox_meta["dedup-bucket"])
            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
                    if not bucket_mode == "ro":
                        self.save(
                            digest_key, records[0],
                            bucket=bucket)
            else:
                self.save(digest_key, records[0])

@@ -212,8 +223,10 @@ class RethinkDedupDb(DedupDb, DedupableMixin):
                and self.should_dedup(recorded_url)):
            digest_key = warcprox.digest_str(
                recorded_url.payload_digest, self.options.base32)
            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
                self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["dedup-bucket"])
            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
                    if not bucket_mode == 'ro':
                        self.save(digest_key, records[0], bucket=bucket)
            else:
                self.save(digest_key, records[0])

@@ -236,6 +249,7 @@ class CdxServerDedup(DedupDb):
            headers['Cookie'] = options.cdxserver_dedup_cookies
        self.http_pool = urllib3.PoolManager(maxsize=maxsize, retries=0,
                                             timeout=2.0, headers=headers)
        self.cached_lookup = lru_cache(maxsize=1024)(self.lookup)

    def loader(self, *args, **kwargs):
        return CdxServerDedupLoader(self, self.options)
@@ -257,6 +271,9 @@ class CdxServerDedup(DedupDb):
        performance optimisation to handle that. limit < 0 is very inefficient
        in general. Maybe it could be configurable in the future.

        Skip dedup for URLs with session params. These URLs are certainly
        unique and highly volatile, we cannot dedup them.

        :param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
            Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
        :param url: Target URL string
@@ -265,6 +282,8 @@ class CdxServerDedup(DedupDb):
        """
        u = url.decode("utf-8") if isinstance(url, bytes) else url
        try:
            if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
                return None
            result = self.http_pool.request('GET', self.cdx_url, fields=dict(
                url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
                limit=-1))
@@ -296,7 +315,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
    def __init__(self, cdx_dedup, options=warcprox.Options()):
        warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
        DedupableMixin.__init__(self, options)
        self.pool = futures.ThreadPoolExecutor(max_workers=400)
        self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
        self.batch = set()
        self.cdx_dedup = cdx_dedup

@@ -315,7 +334,10 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
        try:
            digest_key = warcprox.digest_str(recorded_url.payload_digest,
                                             self.options.base32)
            dedup_info = self.cdx_dedup.lookup(digest_key, recorded_url.url)
            dedup_info = self.cdx_dedup.cached_lookup(digest_key, recorded_url.url)
            cache_info = self.cdx_dedup.cached_lookup.cache_info()
            if (cache_info.hits + cache_info.misses) % 1000 == 0:
                self.logger.info(self.cdx_dedup.cached_lookup.cache_info())
            if dedup_info:
                recorded_url.dedup_info = dedup_info
        except ValueError as exc:
@@ -342,11 +364,12 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
                    and recorded_url.warc_records[0].type == b'response'
                    and self.trough_dedup_db.should_dedup(recorded_url)):
                if (recorded_url.warcprox_meta
                        and 'dedup-bucket' in recorded_url.warcprox_meta):
                    bucket = recorded_url.warcprox_meta['dedup-bucket']
                        and 'dedup-buckets' in recorded_url.warcprox_meta):
                    for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
                        if not bucket_mode == 'ro':
                            buckets[bucket].append(recorded_url)
                else:
                    bucket = '__unspecified__'
                    buckets[bucket].append(recorded_url)
                    buckets['__unspecified__'].append(recorded_url)
        return buckets

    def _process_batch(self, batch):
@@ -361,6 +384,9 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
                    self.trough_dedup_db.batch_save,
                    buckets[bucket], bucket)
            fs[future] = bucket
            logging.debug(
                'storing dedup info for %s urls '
                'in bucket %s', len(buckets[bucket]), bucket)

        # wait for results
        try:
@@ -369,7 +395,7 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
        except futures.TimeoutError as e:
            # the remaining threads actually keep running in this case,
            # there's no way to stop them, but that should be harmless
            logging.warn(
            logging.warning(
                'timed out saving dedup info to trough', exc_info=True)

class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
@@ -389,21 +415,32 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
        '''
        buckets = collections.defaultdict(list)
        discards = []
        # for duplicate checks, see https://webarchive.jira.com/browse/WT-31
        hash_plus_urls = set()
        for recorded_url in batch:
            if not recorded_url.payload_digest:
                discards.append('n/a')
                continue
            payload_hash = warcprox.digest_str(
                recorded_url.payload_digest, self.options.base32)
            hash_plus_url = b''.join((payload_hash, recorded_url.url))
            if (recorded_url.response_recorder
                    and recorded_url.payload_digest
                    and hash_plus_url not in hash_plus_urls
                    and self.trough_dedup_db.should_dedup(recorded_url)):
                hash_plus_urls.add(hash_plus_url)
                if (recorded_url.warcprox_meta
                        and 'dedup-bucket' in recorded_url.warcprox_meta):
                    bucket = recorded_url.warcprox_meta['dedup-bucket']
                        and 'dedup-buckets' in recorded_url.warcprox_meta):
                    for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
                        buckets[bucket].append(recorded_url)
                else:
                    bucket = '__unspecified__'
                    buckets[bucket].append(recorded_url)
                    buckets['__unspecified__'].append(recorded_url)
            else:
                discards.append(
                    warcprox.digest_str(
                        recorded_url.payload_digest, self.options.base32)
                    if recorded_url.payload_digest else 'n/a')
                if hash_plus_url in hash_plus_urls:
                    self.logger.debug(
                        'discarding duplicate and setting do_not_archive for %s, hash %s',
                        recorded_url.url, payload_hash)
                    recorded_url.do_not_archive = True
                discards.append(payload_hash)
        self.logger.debug(
            'len(batch)=%s len(discards)=%s buckets=%s',
            len(batch), len(discards),
@@ -453,7 +490,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
                        recorded_url.dedup_info = entry
            except Exception as e:
                # batch_lookup raised exception or something
                logging.warn(
                logging.warning(
                    'problem looking up dedup info for %s urls '
                    'in bucket %s', len(buckets[bucket]), bucket,
                    exc_info=True)
@@ -469,7 +506,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
        except futures.TimeoutError as e:
            # the remaining threads actually keep running in this case,
            # there's no way to stop them, but that should be harmless
            self.logger.warn(
            self.logger.warning(
                'timed out loading dedup info from trough', exc_info=True)

class TroughDedupDb(DedupDb, DedupableMixin):
@@ -482,16 +519,24 @@ class TroughDedupDb(DedupDb, DedupableMixin):
    SCHEMA_SQL = ('create table dedup (\n'
                  ' digest_key varchar(100) primary key,\n'
                  ' url varchar(2100) not null,\n'
                  ' date datetime not null,\n'
                  ' date varchar(100) not null,\n'
                  ' id varchar(100));\n') # warc record id
    WRITE_SQL_TMPL = ('insert or ignore into dedup\n'
                      '(digest_key, url, date, id)\n'
                      'values (%s, %s, %s, %s);')

    def __init__(self, options=warcprox.Options()):
        try:
            import trough.client
        except ImportError as e:
            logging.critical(
                '%s: %s\n\nYou might need to run "pip install '
                'warcprox[trough]".', type(e).__name__, e)
            sys.exit(1)

        DedupableMixin.__init__(self, options)
        self.options = options
        self._trough_cli = warcprox.trough.TroughClient(
        self._trough_cli = trough.client.TroughClient(
            options.rethinkdb_trough_db_url, promotion_interval=60*60)

    def loader(self, *args, **kwargs):
@@ -513,9 +558,13 @@ class TroughDedupDb(DedupDb, DedupableMixin):
        record_id = response_record.get_header(warctools.WarcRecord.ID)
        url = response_record.get_header(warctools.WarcRecord.URL)
        warc_date = response_record.get_header(warctools.WarcRecord.DATE)
        self._trough_cli.write(
            bucket, self.WRITE_SQL_TMPL,
            (digest_key, url, warc_date, record_id), self.SCHEMA_ID)
        try:
            self._trough_cli.write(
                bucket, self.WRITE_SQL_TMPL,
                (digest_key, url, warc_date, record_id), self.SCHEMA_ID)
        except:
            self.logger.warning(
                'problem posting dedup data to trough', exc_info=True)

    def batch_save(self, batch, bucket='__unspecified__'):
        sql_tmpl = ('insert or ignore into dedup\n'
@@ -530,12 +579,22 @@ class TroughDedupDb(DedupDb, DedupableMixin):
                recorded_url.url,
                recorded_url.warc_records[0].date,
                recorded_url.warc_records[0].id,])
        self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
        try:
            self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
        except:
            self.logger.warning(
                'problem posting dedup data to trough', exc_info=True)

    def lookup(self, digest_key, bucket='__unspecified__', url=None):
        results = self._trough_cli.read(
            bucket, 'select * from dedup where digest_key=%s;',
            (digest_key,))
        try:
            results = self._trough_cli.read(
                bucket, 'select * from dedup where digest_key=%s;',
                (digest_key,))
        except:
            self.logger.warning(
                'problem reading dedup data from trough', exc_info=True)
            return None

        if results:
            assert len(results) == 1 # sanity check (digest_key is primary key)
            result = results[0]
@@ -552,7 +611,14 @@ class TroughDedupDb(DedupDb, DedupableMixin):
        '''Returns [{'digest_key': ..., 'url': ..., 'date': ...}, ...]'''
        sql_tmpl = 'select * from dedup where digest_key in (%s)' % (
            ','.join('%s' for i in range(len(digest_keys))))
        results = self._trough_cli.read(bucket, sql_tmpl, digest_keys)

        try:
            results = self._trough_cli.read(bucket, sql_tmpl, digest_keys)
        except:
            self.logger.warning(
                'problem reading dedup data from trough', exc_info=True)
            results = None

        if results is None:
            return []
        self.logger.debug(
@@ -571,9 +637,11 @@ class TroughDedupDb(DedupDb, DedupableMixin):
                and self.should_dedup(recorded_url)):
            digest_key = warcprox.digest_str(
                recorded_url.payload_digest, self.options.base32)
            if recorded_url.warcprox_meta and 'dedup-bucket' in recorded_url.warcprox_meta:
                self.save(
                    digest_key, records[0],
                    bucket=recorded_url.warcprox_meta['dedup-bucket'])
            if recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta:
                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
                    if not bucket_mode == 'ro':
                        self.save(
                            digest_key, records[0],
                            bucket=bucket)
            else:
                self.save(digest_key, records[0])
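The cached CDX lookups above use a common Python pattern: wrapping a bound method with functools.lru_cache at construction time so each instance gets its own cache. A standalone sketch of the same idea, with an invented class standing in for the real cdx server client:

    from functools import lru_cache

    class ExampleLookupClient:
        def __init__(self):
            # wrap the bound method per instance instead of decorating the
            # method at class definition time
            self.cached_lookup = lru_cache(maxsize=1024)(self.lookup)

        def lookup(self, digest_key, url):
            # stand-in for an expensive remote query (e.g. a cdx server request)
            return {'digest_key': digest_key, 'url': url}

    client = ExampleLookupClient()
    client.cached_lookup(b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A', 'http://example.com/')
    client.cached_lookup(b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A', 'http://example.com/')
    print(client.cached_lookup.cache_info())   # hits=1 misses=1 ...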
@@ -4,7 +4,7 @@
warcprox/main.py - entrypoint for warcprox executable, parses command line
arguments, initializes components, starts controller, handles signals

Copyright (C) 2013-2018 Internet Archive
Copyright (C) 2013-2019 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -30,6 +30,7 @@ except ImportError:
    import Queue as queue

import logging
import logging.config
import sys
import hashlib
import argparse
@@ -38,7 +39,7 @@ import socket
import traceback
import signal
import threading
import certauth.certauth
import yaml
import warcprox
import doublethink
import cryptography.hazmat.backends.openssl
@@ -89,9 +90,11 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
        help='where to store and load generated certificates')
    arg_parser.add_argument('-d', '--dir', dest='directory',
        default='./warcs', help='where to write warcs')
    arg_parser.add_argument('--subdir-prefix', dest='subdir_prefix', action='store_true',
        help='write warcs to --dir subdir equal to the current warc-prefix'),
    arg_parser.add_argument('--warc-filename', dest='warc_filename',
        default='{prefix}-{timestamp17}-{serialno}-{randomtoken}',
        help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}')
        help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}, {port}')
    arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
        help='write gzip-compressed warc records')
    hidden.add_argument(
@@ -168,6 +171,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
        help=suppress(
            'value of Cookie header to include in requests to the cdx '
            'server, when using --cdxserver-dedup'))
    hidden.add_argument(
        '--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
        type=int, default=50, help=suppress(
            'maximum number of cdx server dedup threads'))
    arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
        type=int, default=0,
        help=('try to dedup text resources with payload size over this limit in bytes'))
@@ -196,16 +203,20 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
        help=suppress(
            'turn on performance profiling; summary statistics are dumped '
            'every 10 minutes and at shutdown'))
    hidden.add_argument(
        '--writer-threads', dest='writer_threads', type=int, default=1,
        help=suppress(
            'number of warc writer threads; caution, see '
            'https://github.com/internetarchive/warcprox/issues/101'))
    arg_parser.add_argument(
        '--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
        default=None, help=(
            'host:port of tor socks proxy, used only to connect to '
            '.onion sites'))
    arg_parser.add_argument(
        '--socks-proxy', dest='socks_proxy',
        default=None, help='host:port of socks proxy, used for all traffic if activated')
    arg_parser.add_argument(
        '--socks-proxy-username', dest='socks_proxy_username',
        default=None, help='optional socks proxy username')
    arg_parser.add_argument(
        '--socks-proxy-password', dest='socks_proxy_password',
        default=None, help='optional socks proxy password')
    hidden.add_argument(
        '--socket-timeout', dest='socket_timeout', type=float, default=60,
        help=suppress(
@@ -240,6 +251,9 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
    arg_parser.add_argument(
        '--trace', dest='trace', action='store_true',
        help='very verbose logging')
    arg_parser.add_argument(
        '--logging-conf-file', dest='logging_conf_file', default=None,
        help=('reads logging configuration from a YAML file'))
    arg_parser.add_argument(
        '--version', action='version',
        version="warcprox {}".format(warcprox.__version__))
@@ -260,7 +274,7 @@ def dump_state(signum=None, frame=None):
        except Exception as e:
            state_strs.append('<n/a:%r>' % e)

    logging.warn(
    logging.warning(
        'dumping state (caught signal %s)\n%s',
        signum, '\n'.join(state_strs))

@@ -298,11 +312,17 @@ def main(argv=None):
    else:
        loglevel = logging.INFO

    logging.root.handlers = []
    logging.basicConfig(
        stream=sys.stdout, level=loglevel, format=(
            '%(asctime)s %(process)d %(levelname)s %(threadName)s '
            '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))

    if args.logging_conf_file:
        with open(args.logging_conf_file, 'r') as fd:
            conf = yaml.safe_load(fd)
            logging.config.dictConfig(conf)

    # see https://github.com/pyca/cryptography/issues/2911
    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()

@@ -317,7 +337,11 @@ def main(argv=None):
        # SIGQUIT does not exist on some platforms (windows)
        pass

    controller.run_until_shutdown()
    try:
        controller.run_until_shutdown()
    except:
        logging.fatal('unhandled exception in controller', exc_info=True)
        sys.exit(1)

def ensure_rethinkdb_tables(argv=None):
    '''
@@ -389,7 +413,7 @@ def ensure_rethinkdb_tables(argv=None):
        did_something = True
    if args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
        logging.warn(
        logging.warning(
            'trough is responsible for creating most of the rethinkdb '
            'tables that it uses')
        did_something = True
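The new --logging-conf-file option feeds a YAML document straight into logging.config.dictConfig. A minimal sketch of the kind of configuration that could be passed this way (handler names and levels here are only an example, not a recommended setup):

    import logging.config
    import yaml

    # Equivalent of a small file passed via --logging-conf-file.
    conf = yaml.safe_load('''
    version: 1
    disable_existing_loggers: false
    formatters:
      brief:
        format: '%(asctime)s %(levelname)s %(name)s %(message)s'
    handlers:
      console:
        class: logging.StreamHandler
        formatter: brief
        stream: ext://sys.stdout
    root:
      level: INFO
      handlers: [console]
    loggers:
      warcprox.dedup:
        level: DEBUG
    ''')
    logging.config.dictConfig(conf)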
@ -35,6 +35,13 @@ try:
|
||||
import urllib.parse as urllib_parse
|
||||
except ImportError:
|
||||
import urlparse as urllib_parse
|
||||
# In python2/3, urllib parse caches in memory URL parsing results to avoid
|
||||
# repeating the process for the same URL. The problem is that the default
|
||||
# in memory cache size is just 20.
|
||||
# https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py#L80
|
||||
# since we do a lot of URL parsing, it makes sense to increase cache size.
|
||||
urllib_parse.MAX_CACHE_SIZE = 2000
|
||||
|
||||
try:
|
||||
import http.client as http_client
|
||||
# In python3 http.client.parse_headers() enforces http_client._MAXLINE
|
||||
@ -45,6 +52,11 @@ try:
|
||||
http_client._MAXLINE = 4194304 # 4 MiB
|
||||
except ImportError:
|
||||
import httplib as http_client
|
||||
# http_client has an arbitrary limit of 100 HTTP Headers which is too low and
|
||||
# it raises an HTTPException if the target URL has more.
|
||||
# https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L113
|
||||
http_client._MAXHEADERS = 7000
|
||||
|
||||
import json
|
||||
import socket
|
||||
import logging
|
||||
@ -52,6 +64,7 @@ import ssl
|
||||
import warcprox
|
||||
import threading
|
||||
import datetime
|
||||
import random
|
||||
import socks
|
||||
import tempfile
|
||||
import hashlib
|
||||
@ -64,8 +77,14 @@ import urlcanon
|
||||
import time
|
||||
import collections
|
||||
import cProfile
|
||||
from urllib3 import PoolManager
|
||||
from urllib3.util import is_connection_dropped
|
||||
from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError
|
||||
import doublethink
|
||||
from cachetools import TTLCache
|
||||
from threading import RLock
|
||||
|
||||
from .certauth import CertificateAuthority
|
||||
|
||||
class ProxyingRecorder(object):
|
||||
"""
|
||||
@ -100,7 +119,7 @@ class ProxyingRecorder(object):
|
||||
self.proxy_client.sendall(hunk)
|
||||
except BaseException as e:
|
||||
self._proxy_client_conn_open = False
|
||||
self.logger.warn(
|
||||
self.logger.warning(
|
||||
'%s sending data to proxy client for url %s',
|
||||
e, self.url)
|
||||
self.logger.info(
|
||||
@ -203,6 +222,28 @@ def via_header_value(orig, request_version):
|
||||
via = via + '%s %s' % (request_version, 'warcprox')
|
||||
return via
|
||||
|
||||
|
||||
# Ref and detailed description about cipher selection at
|
||||
# https://github.com/urllib3/urllib3/blob/f070ec2e6f6c545f40d9196e5246df10c72e48e1/src/urllib3/util/ssl_.py#L170
|
||||
SSL_CIPHERS = [
|
||||
"ECDHE+AESGCM",
|
||||
"ECDHE+CHACHA20",
|
||||
"DH+AESGCM",
|
||||
"ECDH+AES",
|
||||
"DH+AES",
|
||||
"RSA+AESGCM",
|
||||
"RSA+AES",
|
||||
"!aNULL",
|
||||
"!eNULL",
|
||||
"!MD5",
|
||||
"!DSS",
|
||||
"!AESCCM",
|
||||
"DHE+AESGCM",
|
||||
"DHE+CHACHA20",
|
||||
"ECDH+AESGCM",
|
||||
]
|
||||
|
||||
|
||||
class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
'''
|
||||
An http proxy implementation of BaseHTTPRequestHandler, that acts as a
|
||||
@ -210,9 +251,16 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
and records the bytes in transit as it proxies them.
|
||||
'''
|
||||
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
|
||||
|
||||
_socket_timeout = 60
|
||||
_max_resource_size = None
|
||||
_tmp_file_max_memory_size = 512 * 1024
|
||||
onion_tor_socks_proxy_host = None
|
||||
onion_tor_socks_proxy_port = None
|
||||
socks_proxy_host = None
|
||||
socks_proxy_port = None
|
||||
socks_proxy_username = None
|
||||
socks_proxy_password = None
|
||||
|
||||
def __init__(self, request, client_address, server):
|
||||
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
|
||||
@ -228,7 +276,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
else:
|
||||
self.url = self.path
|
||||
u = urllib_parse.urlparse(self.url)
|
||||
if u.scheme != 'http':
|
||||
if u.scheme != 'http' or u.netloc == '':
|
||||
raise Exception(
|
||||
'unable to parse request %r as a proxy request' % (
|
||||
self.requestline))
|
||||
@ -240,6 +288,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
query=u.query, fragment=u.fragment))
|
||||
self.hostname = urlcanon.normalize_host(host).decode('ascii')
|
||||
|
||||
def _hostname_port_cache_key(self):
|
||||
return '%s:%s' % (self.hostname, self.port)
|
||||
|
||||
def _connect_to_remote_server(self):
|
||||
'''
|
||||
Connect to destination.
|
||||
@ -251,7 +302,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
'''
|
||||
self._conn_pool = self.server.remote_connection_pool.connection_from_host(
|
||||
host=self.hostname, port=int(self.port), scheme='http',
|
||||
pool_kwargs={'maxsize': 6, 'timeout': self._socket_timeout})
|
||||
pool_kwargs={'maxsize': 12, 'timeout': self._socket_timeout})
|
||||
|
||||
remote_ip = None
|
||||
|
||||
self._remote_server_conn = self._conn_pool._get_conn()
|
||||
if is_connection_dropped(self._remote_server_conn):
|
||||
@ -266,8 +319,21 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
port=self.onion_tor_socks_proxy_port, rdns=True)
|
||||
self._remote_server_conn.sock.settimeout(self._socket_timeout)
|
||||
self._remote_server_conn.sock.connect((self.hostname, int(self.port)))
|
||||
elif self.socks_proxy_host and self.socks_proxy_port:
self.logger.info(
"using socks proxy at %s:%s to connect to %s",
self.socks_proxy_host, self.socks_proxy_port, self.hostname)
self._remote_server_conn.sock = socks.socksocket()
self._remote_server_conn.sock.set_proxy(
socks.SOCKS5, addr=self.socks_proxy_host,
port=self.socks_proxy_port, rdns=True,
username=self.socks_proxy_username,
password=self.socks_proxy_password)
self._remote_server_conn.sock.settimeout(self._socket_timeout)
self._remote_server_conn.sock.connect((self.hostname, int(self.port)))
else:
|
||||
self._remote_server_conn.connect()
|
||||
remote_ip = self._remote_server_conn.sock.getpeername()[0]
|
||||
|
||||
# Wrap socket if SSL is required
|
||||
if self.is_connect:
|
||||
@ -275,6 +341,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
# randomize TLS fingerprint to evade anti-web-bot systems
random.shuffle(SSL_CIPHERS)
context.set_ciphers(":".join(SSL_CIPHERS))
self._remote_server_conn.sock = context.wrap_socket(
self._remote_server_conn.sock,
server_hostname=self.hostname)
@ -283,12 +352,17 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self._remote_server_conn.sock = ssl.wrap_socket(
|
||||
self._remote_server_conn.sock)
|
||||
except ssl.SSLError:
|
||||
self.logger.warn(
|
||||
self.logger.warning(
|
||||
"failed to establish ssl connection to %s; "
|
||||
"python ssl library does not support SNI, "
|
||||
"consider upgrading to python 2.7.9+ or 3.4+",
|
||||
self.hostname)
|
||||
raise
|
||||
except ssl.SSLError as e:
|
||||
self.logger.error(
|
||||
'error connecting to %s (%s) port %s: %s',
|
||||
self.hostname, remote_ip, self.port, e)
|
||||
raise
|
||||
return self._remote_server_conn.sock
|
||||
|
||||
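A standalone sketch of the PySocks calls the handler above makes when a SOCKS proxy is configured; the address, port and credentials here are placeholder values, not anything from warcprox:

import socks  # PySocks

sock = socks.socksocket()
sock.set_proxy(
    socks.SOCKS5, addr='127.0.0.1', port=1080, rdns=True,
    username='user', password='secret')   # username/password are optional
sock.settimeout(60)
sock.connect(('example.com', 80))
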
def _transition_to_ssl(self):
|
||||
@ -328,11 +402,11 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self.logger.error(
|
||||
"problem handling %r: %r", self.requestline, e)
|
||||
if type(e) is socket.timeout:
|
||||
self.send_error(504, str(e))
|
||||
self.send_error(504, str(e), exception=e)
|
||||
else:
|
||||
self.send_error(500, str(e))
|
||||
except Exception as f:
|
||||
self.logger.warn("failed to send error response ({}) to proxy client: {}".format(e, f))
|
||||
self.logger.warning("failed to send error response ({}) to proxy client: {}".format(e, f))
|
||||
return
|
||||
|
||||
# Reload!
|
||||
@ -368,25 +442,55 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
else:
|
||||
self._determine_host_port()
|
||||
assert self.url
|
||||
|
||||
# Check if target hostname:port is in `bad_hostnames_ports` cache
# to avoid retrying to connect. Cached value is http status code.
cached = None
hostname_port = self._hostname_port_cache_key()
with self.server.bad_hostnames_ports_lock:
cached = self.server.bad_hostnames_ports.get(hostname_port)
if cached:
self.logger.info('Cannot connect to %s (cache)', hostname_port)
self.send_error(cached, exception=Exception('Cached Failed Connection'))
return
# Connect to destination
|
||||
self._connect_to_remote_server()
|
||||
except warcprox.RequestBlockedByRule as e:
|
||||
# limit enforcers have already sent the appropriate response
|
||||
self.logger.info("%r: %r", self.requestline, e)
|
||||
return
|
||||
except warcprox.BadRequest as e:
|
||||
self.send_error(400, e.msg)
|
||||
return
|
||||
except Exception as e:
|
||||
# If connection fails, add hostname:port to cache to avoid slow
|
||||
# subsequent reconnection attempts. `NewConnectionError` can be
|
||||
# caused by many types of errors which are handled by urllib3.
|
||||
response_code = 500
|
||||
cache = False
|
||||
if isinstance(e, (socket.timeout, TimeoutError,)):
|
||||
response_code = 504
|
||||
cache = True
|
||||
elif isinstance(e, HTTPError):
|
||||
response_code = 502
|
||||
cache = True
|
||||
|
||||
if cache:
|
||||
host_port = self._hostname_port_cache_key()
|
||||
with self.server.bad_hostnames_ports_lock:
|
||||
self.server.bad_hostnames_ports[host_port] = response_code
|
||||
self.logger.info('bad_hostnames_ports cache size: %d',
|
||||
len(self.server.bad_hostnames_ports))
|
||||
self.logger.error(
|
||||
"problem processing request %r: %r",
|
||||
self.requestline, e, exc_info=True)
|
||||
self.send_error(500, str(e))
|
||||
self.send_error(response_code, exception=e)
|
||||
return
|
||||
|
||||
try:
|
||||
return self._proxy_request()
|
||||
except Exception as e:
|
||||
if self.server.shutting_down:
|
||||
self.logger.warn(
|
||||
self.logger.warning(
|
||||
'sending 503 warcprox shutting down %r: %r',
|
||||
self.requestline, e)
|
||||
self.send_error(503, 'warcprox shutting down')
|
||||
@ -394,10 +498,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self.logger.error(
|
||||
'error from remote server(?) %r: %r',
|
||||
self.requestline, e, exc_info=True)
|
||||
self.send_error(502, str(e))
|
||||
self.send_error(502)
|
||||
return
|
||||
|
||||
def send_error(self, code, message=None, explain=None):
|
||||
def send_error(self, code, message=None, explain=None, exception=None):
|
||||
# BaseHTTPRequestHandler.send_response_only() in http/server.py
|
||||
# does this:
|
||||
# if not hasattr(self, '_headers_buffer'):
|
||||
@ -410,9 +514,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
try:
|
||||
return http_server.BaseHTTPRequestHandler.send_error(
|
||||
self, code, message, explain)
|
||||
except:
|
||||
self.logger.error(
|
||||
'send_error(%r, %r, %r) raised exception', exc_info=True)
|
||||
except Exception as e:
|
||||
level = logging.ERROR
|
||||
if isinstance(e, OSError) and e.errno == 9:
|
||||
level = logging.TRACE
|
||||
self.logger.log(
|
||||
level, 'send_error(%r, %r, %r) raised exception',
|
||||
exc_info=True)
|
||||
return None
|
||||
|
||||
def _proxy_request(self, extra_response_headers={}):
|
||||
@ -424,6 +532,33 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self.server.unregister_remote_server_sock(
|
||||
self._remote_server_conn.sock)
|
||||
|
||||
def _swallow_hop_by_hop_headers(self):
'''
Swallow headers that don't make sense to forward on, i.e.
most hop-by-hop headers.

http://tools.ietf.org/html/rfc2616#section-13.5.
'''
# self.headers is an email.message.Message, which is case-insensitive
# and doesn't throw KeyError in __delitem__
for key in (
'Warcprox-Meta', 'Connection', 'Proxy-Connection', 'Keep-Alive',
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
del self.headers[key]

def _build_request(self):
req_str = '{} {} {}\r\n'.format(
self.command, self.path, self.request_version)

# Add headers to the request
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
req_str += '\r\n'.join(
'{}: {}'.format(k,v) for (k,v) in self.headers.items())

req = req_str.encode('latin1') + b'\r\n\r\n'

return req

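To make the effect of these two helpers concrete, here is a rough standalone equivalent; the request line and header values are invented for the example:

from email.message import Message

headers = Message()
headers['Host'] = 'example.com'
headers['Connection'] = 'keep-alive'                   # hop-by-hop, dropped
headers['Warcprox-Meta'] = '{"warc-prefix": "job1"}'   # internal, dropped
headers['Accept'] = '*/*'

for key in ('Warcprox-Meta', 'Connection', 'Proxy-Connection', 'Keep-Alive',
            'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
    del headers[key]   # Message.__delitem__ is case-insensitive and never raises

req_str = 'GET /index.html HTTP/1.1\r\n'
req_str += '\r\n'.join('{}: {}'.format(k, v) for k, v in headers.items())
req = req_str.encode('latin1') + b'\r\n\r\n'
# req == b'GET /index.html HTTP/1.1\r\nHost: example.com\r\nAccept: */*\r\n\r\n'
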
def _inner_proxy_request(self, extra_response_headers={}):
|
||||
'''
|
||||
Sends the request to the remote server, then uses a ProxyingRecorder to
|
||||
@ -435,29 +570,11 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
It may contain extra HTTP headers such as ``Warcprox-Meta`` which
|
||||
are written in the WARC record for this request.
|
||||
'''
|
||||
# Build request
|
||||
req_str = '{} {} {}\r\n'.format(
|
||||
self.command, self.path, self.request_version)
|
||||
|
||||
# Swallow headers that don't make sense to forward on, i.e. most
|
||||
# hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5.
|
||||
# self.headers is an email.message.Message, which is case-insensitive
|
||||
# and doesn't throw KeyError in __delitem__
|
||||
for key in (
|
||||
'Connection', 'Proxy-Connection', 'Keep-Alive',
|
||||
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
|
||||
del self.headers[key]
|
||||
|
||||
self._swallow_hop_by_hop_headers()
|
||||
self.headers['Via'] = via_header_value(
|
||||
self.headers.get('Via'),
|
||||
self.request_version.replace('HTTP/', ''))
|
||||
|
||||
# Add headers to the request
|
||||
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
|
||||
req_str += '\r\n'.join(
|
||||
'{}: {}'.format(k,v) for (k,v) in self.headers.items())
|
||||
|
||||
req = req_str.encode('latin1') + b'\r\n\r\n'
|
||||
req = self._build_request()
|
||||
|
||||
# Append message body if present to the request
|
||||
if 'Content-Length' in self.headers:
|
||||
@ -478,9 +595,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
tmp_file_max_memory_size=self._tmp_file_max_memory_size)
|
||||
prox_rec_res.begin(extra_response_headers=extra_response_headers)
|
||||
|
||||
buf = prox_rec_res.read(65536)
|
||||
buf = None
|
||||
while buf != b'':
|
||||
buf = prox_rec_res.read(65536)
|
||||
try:
|
||||
buf = prox_rec_res.read(65536)
|
||||
except http_client.IncompleteRead as e:
|
||||
self.logger.warning('%s from %s', e, self.url)
|
||||
buf = e.partial
|
||||
|
||||
if (self._max_resource_size and
|
||||
prox_rec_res.recorder.len > self._max_resource_size):
|
||||
prox_rec_res.truncated = b'length'
|
||||
@ -506,9 +628,31 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
# put it back in the pool to reuse it later.
|
||||
if not is_connection_dropped(self._remote_server_conn):
|
||||
self._conn_pool._put_conn(self._remote_server_conn)
|
||||
except:
|
||||
self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
|
||||
self._remote_server_conn.sock.close()
|
||||
except Exception as e:
# A common error is to connect to the remote server successfully
# but raise a `RemoteDisconnected` exception when trying to begin
# downloading. It's caused by prox_rec_res.begin(...) which calls
# http_client._read_status(). The connection fails there.
# https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L275
# Another case is when the connection is fine but the response
# status is problematic, raising `BadStatusLine`.
# https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L296
# In both cases, the host is bad and we must add it to the
# `bad_hostnames_ports` cache.
if isinstance(e, (http_client.RemoteDisconnected,
http_client.BadStatusLine)):
host_port = self._hostname_port_cache_key()
with self.server.bad_hostnames_ports_lock:
self.server.bad_hostnames_ports[host_port] = 502
self.logger.info('bad_hostnames_ports cache size: %d',
len(self.server.bad_hostnames_ports))

# Close the connection only if it's still open. If it's already
# closed, an `OSError` "([Errno 107] Transport endpoint is not
# connected)" would be raised.
if not is_connection_dropped(self._remote_server_conn):
self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
self._remote_server_conn.sock.close()
raise
|
||||
finally:
|
||||
if prox_rec_res:
|
||||
@ -521,7 +665,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
return self.do_COMMAND
|
||||
|
||||
def log_error(self, fmt, *args):
|
||||
self.logger.warn(fmt, *args)
|
||||
self.logger.warning(fmt, *args)
|
||||
|
||||
class PooledMixIn(socketserver.ThreadingMixIn):
|
||||
logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn")
|
||||
@ -667,6 +811,63 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
|
||||
Abort active connections to remote servers to achieve prompt shutdown.
|
||||
'''
|
||||
self.shutting_down = True
|
||||
for sock in self.remote_server_socks:
|
||||
for sock in list(self.remote_server_socks):
|
||||
self.shutdown_request(sock)
|
||||
|
||||
class SingleThreadedMitmProxy(http_server.HTTPServer):
|
||||
logger = logging.getLogger('warcprox.warcproxy.SingleThreadedMitmProxy')
|
||||
|
||||
def __init__(
|
||||
self, MitmProxyHandlerClass=MitmProxyHandler,
|
||||
options=warcprox.Options()):
|
||||
self.options = options
|
||||
|
||||
# TTLCache is not thread-safe. Access to the shared cache from multiple
# threads must be properly synchronized with an RLock according to ref:
# https://cachetools.readthedocs.io/en/latest/
self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
self.bad_hostnames_ports_lock = RLock()

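An isolated sketch of the pattern used for this cache (the key format and status values mirror the handler code; the helper names are made up): each failed hostname:port maps to an HTTP status code and expires after 60 seconds, and every access is guarded by the lock because cachetools caches are not thread-safe.

from threading import RLock
from cachetools import TTLCache

bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)   # entries expire after 60s
bad_hostnames_ports_lock = RLock()

def remember_failure(hostname, port, status=502):
    with bad_hostnames_ports_lock:
        bad_hostnames_ports['%s:%s' % (hostname, port)] = status

def cached_failure(hostname, port):
    with bad_hostnames_ports_lock:
        return bad_hostnames_ports.get('%s:%s' % (hostname, port))
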
self.remote_connection_pool = PoolManager(
|
||||
num_pools=max((options.max_threads or 0) // 6, 400), maxsize=6)
|
||||
|
||||
if options.onion_tor_socks_proxy:
|
||||
try:
|
||||
host, port = options.onion_tor_socks_proxy.split(':')
|
||||
MitmProxyHandlerClass.onion_tor_socks_proxy_host = host
|
||||
MitmProxyHandlerClass.onion_tor_socks_proxy_port = int(port)
|
||||
except ValueError:
|
||||
MitmProxyHandlerClass.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
|
||||
MitmProxyHandlerClass.onion_tor_socks_proxy_port = None
|
||||
if options.socks_proxy:
|
||||
host, port = options.socks_proxy.split(':')
|
||||
MitmProxyHandlerClass.socks_proxy_host = host
|
||||
MitmProxyHandlerClass.socks_proxy_port = int(port)
|
||||
if options.socks_proxy_username:
|
||||
MitmProxyHandlerClass.socks_proxy_username = options.socks_proxy_username
|
||||
if options.socks_proxy_password:
|
||||
MitmProxyHandlerClass.socks_proxy_password = options.socks_proxy_password
|
||||
|
||||
if options.socket_timeout:
|
||||
MitmProxyHandlerClass._socket_timeout = options.socket_timeout
|
||||
if options.max_resource_size:
|
||||
MitmProxyHandlerClass._max_resource_size = options.max_resource_size
|
||||
if options.tmp_file_max_memory_size:
|
||||
MitmProxyHandlerClass._tmp_file_max_memory_size = options.tmp_file_max_memory_size
|
||||
|
||||
self.digest_algorithm = options.digest_algorithm or 'sha1'
|
||||
|
||||
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
|
||||
self.ca = CertificateAuthority(
|
||||
ca_file=options.cacert or 'warcprox-ca.pem',
|
||||
certs_dir=options.certs_dir or './warcprox-ca',
|
||||
ca_name=ca_name)
|
||||
|
||||
server_address = (
|
||||
options.address or 'localhost',
|
||||
options.port if options.port is not None else 8000)
|
||||
|
||||
http_server.HTTPServer.__init__(
|
||||
self, server_address, MitmProxyHandlerClass,
|
||||
bind_and_activate=True)
|
||||
|
||||
|
@ -42,6 +42,7 @@ from warcprox.mitmproxy import MitmProxyHandler
|
||||
import warcprox
|
||||
import sqlite3
|
||||
import threading
|
||||
from cachetools import TTLCache
|
||||
|
||||
class PlaybackProxyHandler(MitmProxyHandler):
|
||||
logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler")
|
||||
@ -219,6 +220,8 @@ class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
|
||||
self.playback_index_db = playback_index_db
|
||||
self.warcs_dir = options.directory
|
||||
self.options = options
|
||||
self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
|
||||
self.bad_hostnames_ports_lock = threading.RLock()
|
||||
|
||||
def server_activate(self):
|
||||
http_server.HTTPServer.server_activate(self)
|
||||
|
@ -29,7 +29,7 @@ import doublethink
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import rethinkdb as r
|
||||
from rethinkdb import RethinkDB; r = RethinkDB()
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
@ -81,7 +81,7 @@ def unravel_buckets(url, warcprox_meta):
|
||||
for bucket in warcprox_meta["stats"]["buckets"]:
|
||||
if isinstance(bucket, dict):
|
||||
if not 'bucket' in bucket:
|
||||
self.logger.warn(
|
||||
self.logger.warning(
|
||||
'ignoring invalid stats bucket in '
|
||||
'warcprox-meta header %s', bucket)
|
||||
continue
|
||||
@ -162,6 +162,8 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
|
||||
def _tally_batch(self, batch):
|
||||
batch_buckets = {}
|
||||
for recorded_url in batch:
|
||||
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
|
||||
continue
|
||||
for bucket in self.buckets(recorded_url):
|
||||
bucket_stats = batch_buckets.get(bucket)
|
||||
if not bucket_stats:
|
||||
@ -297,6 +299,8 @@ class RunningStats:
|
||||
(self.first_snap_time - 120 + i * 10, 0, 0))
|
||||
|
||||
def notify(self, recorded_url, records):
|
||||
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
|
||||
return
|
||||
with self._lock:
|
||||
self.urls += 1
|
||||
if records:
|
||||
|
@ -1,246 +0,0 @@
|
||||
'''
|
||||
warcprox/trough.py - trough client code
|
||||
|
||||
Copyright (C) 2017 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
USA.
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import logging
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import doublethink
|
||||
import rethinkdb as r
|
||||
import datetime
|
||||
import threading
|
||||
import time
|
||||
|
||||
class TroughClient(object):
|
||||
logger = logging.getLogger("warcprox.trough.TroughClient")
|
||||
|
||||
def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
|
||||
'''
|
||||
TroughClient constructor
|
||||
|
||||
Args:
|
||||
rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
|
||||
trough configuration database
|
||||
promotion_interval: if specified, `TroughClient` will spawn a
|
||||
thread that "promotes" (pushed to hdfs) "dirty" trough segments
|
||||
(segments that have received writes) periodically, sleeping for
|
||||
`promotion_interval` seconds between cycles (default None)
|
||||
'''
|
||||
parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
|
||||
self.rr = doublethink.Rethinker(
|
||||
servers=parsed.hosts, db=parsed.database)
|
||||
self.svcreg = doublethink.ServiceRegistry(self.rr)
|
||||
self._write_url_cache = {}
|
||||
self._read_url_cache = {}
|
||||
self._dirty_segments = set()
|
||||
self._dirty_segments_lock = threading.RLock()
|
||||
|
||||
self.promotion_interval = promotion_interval
|
||||
self._promoter_thread = None
|
||||
if promotion_interval:
|
||||
self._promoter_thread = threading.Thread(
|
||||
target=self._promotrix, name='TroughClient-promoter')
|
||||
self._promoter_thread.setDaemon(True)
|
||||
self._promoter_thread.start()
|
||||
|
||||
def _promotrix(self):
|
||||
while True:
|
||||
time.sleep(self.promotion_interval)
|
||||
try:
|
||||
with self._dirty_segments_lock:
|
||||
dirty_segments = list(self._dirty_segments)
|
||||
self._dirty_segments.clear()
|
||||
logging.info(
|
||||
'promoting %s trough segments', len(dirty_segments))
|
||||
for segment_id in dirty_segments:
|
||||
try:
|
||||
self.promote(segment_id)
|
||||
except:
|
||||
logging.error(
|
||||
'problem promoting segment %s', segment_id,
|
||||
exc_info=True)
|
||||
except:
|
||||
logging.error(
|
||||
'caught exception doing segment promotion',
|
||||
exc_info=True)
|
||||
|
||||
def promote(self, segment_id):
|
||||
url = os.path.join(self.segment_manager_url(), 'promote')
|
||||
payload_dict = {'segment': segment_id}
|
||||
response = requests.post(url, json=payload_dict, timeout=21600)
|
||||
if response.status_code != 200:
|
||||
raise Exception(
|
||||
'Received %s: %r in response to POST %s with data %s' % (
|
||||
response.status_code, response.text, url,
|
||||
json.dumps(payload_dict)))
|
||||
|
||||
@staticmethod
|
||||
def sql_value(x):
|
||||
if x is None:
|
||||
return 'null'
|
||||
elif isinstance(x, datetime.datetime):
|
||||
return 'datetime(%r)' % x.isoformat()
|
||||
elif isinstance(x, bool):
|
||||
return int(x)
|
||||
elif isinstance(x, str) or isinstance(x, bytes):
|
||||
# the only character that needs escaped in sqlite string literals
|
||||
# is single-quote, which is escaped as two single-quotes
|
||||
if isinstance(x, bytes):
|
||||
s = x.decode('utf-8')
|
||||
else:
|
||||
s = x
|
||||
return "'" + s.replace("'", "''") + "'"
|
||||
elif isinstance(x, (int, float)):
|
||||
return x
|
||||
else:
|
||||
raise Exception(
|
||||
"don't know how to make an sql value from %r (%r)" % (
|
||||
x, type(x)))
|
||||
|
||||
def segment_manager_url(self):
|
||||
master_node = self.svcreg.unique_service('trough-sync-master')
|
||||
assert master_node
|
||||
return master_node['url']
|
||||
|
||||
def write_url_nocache(self, segment_id, schema_id='default'):
|
||||
provision_url = os.path.join(self.segment_manager_url(), 'provision')
|
||||
payload_dict = {'segment': segment_id, 'schema': schema_id}
|
||||
response = requests.post(provision_url, json=payload_dict, timeout=600)
|
||||
if response.status_code != 200:
|
||||
raise Exception(
|
||||
'Received %s: %r in response to POST %s with data %s' % (
|
||||
response.status_code, response.text, provision_url,
|
||||
json.dumps(payload_dict)))
|
||||
result_dict = response.json()
|
||||
# assert result_dict['schema'] == schema_id # previously provisioned?
|
||||
return result_dict['write_url']
|
||||
|
||||
def read_url_nocache(self, segment_id):
|
||||
reql = self.rr.table('services').get_all(
|
||||
segment_id, index='segment').filter(
|
||||
{'role':'trough-read'}).filter(
|
||||
lambda svc: r.now().sub(
|
||||
svc['last_heartbeat']).lt(svc['ttl'])
|
||||
).order_by('load')
|
||||
self.logger.debug('querying rethinkdb: %r', reql)
|
||||
results = reql.run()
|
||||
if results:
|
||||
return results[0]['url']
|
||||
else:
|
||||
return None
|
||||
|
||||
def write_url(self, segment_id, schema_id='default'):
|
||||
if not segment_id in self._write_url_cache:
|
||||
self._write_url_cache[segment_id] = self.write_url_nocache(
|
||||
segment_id, schema_id)
|
||||
self.logger.info(
|
||||
'segment %r write url is %r', segment_id,
|
||||
self._write_url_cache[segment_id])
|
||||
return self._write_url_cache[segment_id]
|
||||
|
||||
def read_url(self, segment_id):
|
||||
if not self._read_url_cache.get(segment_id):
|
||||
self._read_url_cache[segment_id] = self.read_url_nocache(segment_id)
|
||||
self.logger.info(
|
||||
'segment %r read url is %r', segment_id,
|
||||
self._read_url_cache[segment_id])
|
||||
return self._read_url_cache[segment_id]
|
||||
|
||||
def write(self, segment_id, sql_tmpl, values=(), schema_id='default'):
|
||||
write_url = self.write_url(segment_id, schema_id)
|
||||
sql = sql_tmpl % tuple(self.sql_value(v) for v in values)
|
||||
sql_bytes = sql.encode('utf-8')
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
write_url, sql_bytes, timeout=600,
|
||||
headers={'content-type': 'application/sql;charset=utf-8'})
|
||||
if response.status_code != 200:
|
||||
raise Exception(
|
||||
'Received %s: %r in response to POST %s with data %r' % (
|
||||
response.status_code, response.text, write_url, sql))
|
||||
if segment_id not in self._dirty_segments:
|
||||
with self._dirty_segments_lock:
|
||||
self._dirty_segments.add(segment_id)
|
||||
except:
|
||||
self._write_url_cache.pop(segment_id, None)
|
||||
self.logger.error(
|
||||
'problem with trough write url %r', write_url,
|
||||
exc_info=True)
|
||||
return
|
||||
if response.status_code != 200:
|
||||
self._write_url_cache.pop(segment_id, None)
|
||||
self.logger.warn(
|
||||
'unexpected response %r %r %r from %r to sql=%r',
|
||||
response.status_code, response.reason, response.text,
|
||||
write_url, sql)
|
||||
return
|
||||
self.logger.debug('posted to %s: %r', write_url, sql)
|
||||
|
||||
def read(self, segment_id, sql_tmpl, values=()):
|
||||
read_url = self.read_url(segment_id)
|
||||
if not read_url:
|
||||
return None
|
||||
sql = sql_tmpl % tuple(self.sql_value(v) for v in values)
|
||||
sql_bytes = sql.encode('utf-8')
|
||||
try:
|
||||
response = requests.post(
|
||||
read_url, sql_bytes, timeout=600,
|
||||
headers={'content-type': 'application/sql;charset=utf-8'})
|
||||
except:
|
||||
self._read_url_cache.pop(segment_id, None)
|
||||
self.logger.error(
|
||||
'problem with trough read url %r', read_url, exc_info=True)
|
||||
return None
|
||||
if response.status_code != 200:
|
||||
self._read_url_cache.pop(segment_id, None)
|
||||
self.logger.warn(
|
||||
'unexpected response %r %r %r from %r to sql=%r',
|
||||
response.status_code, response.reason, response.text,
|
||||
read_url, sql)
|
||||
return None
|
||||
self.logger.trace(
|
||||
'got %r from posting query %r to %r', response.text, sql,
|
||||
read_url)
|
||||
results = json.loads(response.text)
|
||||
return results
|
||||
|
||||
def schema_exists(self, schema_id):
|
||||
url = os.path.join(self.segment_manager_url(), 'schema', schema_id)
|
||||
response = requests.get(url, timeout=60)
|
||||
if response.status_code == 200:
|
||||
return True
|
||||
elif response.status_code == 404:
|
||||
return False
|
||||
else:
|
||||
response.raise_for_status()
|
||||
|
||||
def register_schema(self, schema_id, sql):
|
||||
url = os.path.join(
|
||||
self.segment_manager_url(), 'schema', schema_id, 'sql')
|
||||
response = requests.put(url, sql, timeout=600)
|
||||
if response.status_code not in (201, 204):
|
||||
raise Exception(
|
||||
'Received %s: %r in response to PUT %r with data %r' % (
|
||||
response.status_code, response.text, sql, url))
|
||||
|
@ -125,48 +125,59 @@ class WarcRecordBuilder:
|
||||
headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
|
||||
if content_type is not None:
|
||||
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
|
||||
if payload_digest is not None:
|
||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
|
||||
# truncated value may be 'length' or 'time'
|
||||
if truncated is not None:
|
||||
headers.append((b'WARC-Truncated', truncated))
|
||||
if content_length is not None:
|
||||
headers.append((
|
||||
warctools.WarcRecord.CONTENT_LENGTH,
|
||||
str(content_length).encode('latin1')))
|
||||
|
||||
if recorder is not None:
|
||||
if content_length is not None:
|
||||
headers.append((
|
||||
warctools.WarcRecord.CONTENT_LENGTH,
|
||||
str(content_length).encode('latin1')))
|
||||
else:
|
||||
if payload_digest is not None:
|
||||
headers.append(
|
||||
(warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
|
||||
if content_length is None:
|
||||
headers.append((
|
||||
warctools.WarcRecord.CONTENT_LENGTH,
|
||||
str(len(recorder)).encode('latin1')))
|
||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||
warcprox.digest_str(recorder.block_digest, self.base32)))
|
||||
recorder.tempfile.seek(0)
|
||||
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
|
||||
record = warctools.WarcRecord(
|
||||
headers=headers, content_file=recorder.tempfile)
|
||||
else:
|
||||
if content_length is not None:
|
||||
headers.append((
|
||||
warctools.WarcRecord.CONTENT_LENGTH,
|
||||
str(content_length).encode('latin1')))
|
||||
else:
|
||||
if content_length is None:
|
||||
headers.append((
|
||||
warctools.WarcRecord.CONTENT_LENGTH,
|
||||
str(len(data)).encode('latin1')))
|
||||
# no http headers so block digest == payload digest
|
||||
if not payload_digest:
|
||||
payload_digest = warcprox.digest_str(
|
||||
|
||||
block_digest = None
|
||||
if not hasattr(data, 'read'):
|
||||
block_digest = warcprox.digest_str(
|
||||
hashlib.new(self.digest_algorithm, data), self.base32)
|
||||
headers.append((
|
||||
warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
|
||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest))
|
||||
|
||||
if not content_type.lower().startswith(b'application/http'):
|
||||
# no http headers, so block digest == payload digest
|
||||
if payload_digest and not block_digest:
|
||||
block_digest = payload_digest
|
||||
elif block_digest and not payload_digest:
|
||||
payload_digest = block_digest
|
||||
|
||||
if block_digest:
|
||||
headers.append(
|
||||
(warctools.WarcRecord.BLOCK_DIGEST, block_digest))
|
||||
if payload_digest:
|
||||
headers.append(
|
||||
(warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
record = warctools.WarcRecord(
|
||||
headers=headers, content_file=data)
|
||||
else:
|
||||
content_tuple = content_type, data
|
||||
record = warctools.WarcRecord(
|
||||
headers=headers, content=content_tuple)
|
||||
headers=headers, content=(content_type, data))
|
||||
|
||||
return record
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic,
|
||||
enqueue info on the recorded url queue
|
||||
|
||||
Copyright (C) 2013-2018 Internet Archive
|
||||
Copyright (C) 2013-2022 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
@ -38,15 +38,16 @@ import logging
|
||||
import json
|
||||
import socket
|
||||
from hanzo import warctools
|
||||
from certauth.certauth import CertificateAuthority
|
||||
import warcprox
|
||||
import datetime
|
||||
import urlcanon
|
||||
import os
|
||||
from urllib3 import PoolManager
|
||||
import tempfile
|
||||
import hashlib
|
||||
import doublethink
|
||||
import re
|
||||
import zlib
|
||||
import base64
|
||||
|
||||
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
'''
|
||||
@ -167,7 +168,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
if warcprox_meta and 'warc-prefix' in warcprox_meta and (
|
||||
'/' in warcprox_meta['warc-prefix']
|
||||
or '\\' in warcprox_meta['warc-prefix']):
|
||||
raise Exception(
|
||||
raise warcprox.BadRequest(
|
||||
"request rejected by warcprox: slash and backslash are not "
|
||||
"permitted in warc-prefix")
|
||||
|
||||
@ -176,6 +177,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
|
||||
self._security_check(warcprox_meta)
|
||||
self._enforce_limits(warcprox_meta)
|
||||
if 'compressed_blocks' in warcprox_meta:
# b64decode and decompress
blocks_decompressed = zlib.decompress(base64.b64decode(warcprox_meta['compressed_blocks']))
# decode() and json.loads
warcprox_meta['blocks'] = json.loads(blocks_decompressed.decode())
# delete compressed_blocks (just in case?)
del warcprox_meta['compressed_blocks']
self._enforce_blocks(warcprox_meta)

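For reference, a sketch of the inverse operation a crawler-side client would perform to squeeze a large "blocks" list through the Warcprox-Meta header; the rule contents below are invented for the example:

import base64
import json
import zlib

blocks = [{'domain': 'example.com'},
          {'url_match': 'STRING_MATCH', 'value': '/private/'}]

warcprox_meta = {
    'compressed_blocks': base64.b64encode(
        zlib.compress(json.dumps(blocks).encode())).decode('ascii'),
}

# warcprox then reverses this, exactly as above:
recovered = json.loads(
    zlib.decompress(base64.b64decode(warcprox_meta['compressed_blocks'])).decode())
assert recovered == blocks
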
def _connect_to_remote_server(self):
|
||||
@ -189,16 +197,21 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
self._enforce_limits_and_blocks()
|
||||
return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
|
||||
|
||||
def _proxy_request(self):
|
||||
warcprox_meta = None
|
||||
def _parse_warcprox_meta(self):
|
||||
'''
|
||||
:return: Warcprox-Meta request header value as a dictionary, or None
|
||||
'''
|
||||
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
|
||||
self.logger.trace(
|
||||
'request for %s Warcprox-Meta header: %s', self.url,
|
||||
raw_warcprox_meta)
|
||||
'request for %s Warcprox-Meta header: %s', self.url,
|
||||
raw_warcprox_meta)
|
||||
if raw_warcprox_meta:
|
||||
warcprox_meta = json.loads(raw_warcprox_meta)
|
||||
del self.headers['Warcprox-Meta']
|
||||
return json.loads(raw_warcprox_meta)
|
||||
else:
|
||||
return None
|
||||
|
||||
def _proxy_request(self):
|
||||
warcprox_meta = self._parse_warcprox_meta()
|
||||
remote_ip = self._remote_server_conn.sock.getpeername()[0]
|
||||
timestamp = doublethink.utcnow()
|
||||
extra_response_headers = {}
|
||||
@ -345,15 +358,43 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
|
||||
raise
|
||||
|
||||
def send_error(self, code, message=None, explain=None, exception=None):
|
||||
super().send_error(code, message=message, explain=explain, exception=exception)
|
||||
|
||||
# If error happens during CONNECT handling and before the inner request, self.url
|
||||
# is unset, and self.path is something like 'example.com:443'
|
||||
urlish = self.url or self.path
|
||||
|
||||
warcprox_meta = self._parse_warcprox_meta()
|
||||
self._swallow_hop_by_hop_headers()
|
||||
request_data = self._build_request()
|
||||
|
||||
failed_url = FailedUrl(
|
||||
url=urlish,
|
||||
request_data=request_data,
|
||||
warcprox_meta=warcprox_meta,
|
||||
status=code,
|
||||
client_ip=self.client_address[0],
|
||||
method=self.command,
|
||||
timestamp=doublethink.utcnow(),
|
||||
host=self.hostname,
|
||||
duration=None,
|
||||
referer=self.headers.get('referer'),
|
||||
do_not_archive=True,
|
||||
message=message,
|
||||
exception=exception)
|
||||
|
||||
self.server.recorded_url_q.put(failed_url)
|
||||
|
||||
def log_message(self, fmt, *args):
|
||||
# logging better handled elsewhere?
|
||||
pass
|
||||
|
||||
RE_MIMETYPE = re.compile(r'[;\s]')
|
||||
|
||||
class RecordedUrl:
|
||||
logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
|
||||
|
||||
def __init__(self, url, request_data, response_recorder, remote_ip,
|
||||
class RequestedUrl:
|
||||
logger = logging.getLogger("warcprox.warcproxy.RequestedUrl")
|
||||
def __init__(self, url, request_data, response_recorder=None, remote_ip=None,
|
||||
warcprox_meta=None, content_type=None, custom_type=None,
|
||||
status=None, size=None, client_ip=None, method=None,
|
||||
timestamp=None, host=None, duration=None, referer=None,
|
||||
@ -366,19 +407,20 @@ class RecordedUrl:
|
||||
else:
|
||||
self.url = url
|
||||
|
||||
if type(remote_ip) is not bytes:
|
||||
self.remote_ip = remote_ip.encode('ascii')
|
||||
else:
|
||||
self.remote_ip = remote_ip
|
||||
|
||||
self.request_data = request_data
|
||||
self.response_recorder = response_recorder
|
||||
|
||||
if warcprox_meta:
|
||||
if 'captures-bucket' in warcprox_meta:
|
||||
# backward compatibility
|
||||
warcprox_meta['dedup-bucket'] = warcprox_meta['captures-bucket']
|
||||
warcprox_meta['dedup-buckets'] = {}
|
||||
warcprox_meta['dedup-buckets'][warcprox_meta['captures-bucket']] = 'rw'
|
||||
del warcprox_meta['captures-bucket']
|
||||
if 'dedup-bucket' in warcprox_meta:
|
||||
# more backwards compatibility
|
||||
warcprox_meta['dedup-buckets'] = {}
|
||||
warcprox_meta['dedup-buckets'][warcprox_meta['dedup-bucket']] = 'rw'
|
||||
del warcprox_meta['dedup-bucket']
|
||||
self.warcprox_meta = warcprox_meta
|
||||
else:
|
||||
self.warcprox_meta = {}
|
||||
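As a small illustration of the backwards-compatibility shim above (bucket name invented): an older client sending the deprecated single-bucket key ends up with the newer multi-bucket form.

import json

meta = json.loads('{"dedup-bucket": "my-collection"}')
# the constructor rewrites the deprecated key to the read/write bucket map
meta['dedup-buckets'] = {meta.pop('dedup-bucket'): 'rw'}
assert meta == {'dedup-buckets': {'my-collection': 'rw'}}
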
@ -387,9 +429,8 @@ class RecordedUrl:
|
||||
|
||||
self.mimetype = content_type
|
||||
if self.mimetype:
|
||||
n = self.mimetype.find(";")
|
||||
if n >= 0:
|
||||
self.mimetype = self.mimetype[:n]
|
||||
# chop off subtype, and ensure there's no whitespace
|
||||
self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]
|
||||
|
||||
self.custom_type = custom_type
|
||||
self.status = status
|
||||
@ -405,6 +446,43 @@ class RecordedUrl:
|
||||
self.warc_records = warc_records
|
||||
self.do_not_archive = do_not_archive
|
||||
|
||||
class FailedUrl(RequestedUrl):
|
||||
logger = logging.getLogger("warcprox.warcproxy.FailedUrl")
|
||||
|
||||
def __init__(self, url, request_data, warcprox_meta=None, status=None,
|
||||
client_ip=None, method=None, timestamp=None, host=None, duration=None,
|
||||
referer=None, do_not_archive=True, message=None, exception=None):
|
||||
|
||||
super().__init__(url, request_data, warcprox_meta=warcprox_meta,
|
||||
status=status, client_ip=client_ip, method=method,
|
||||
timestamp=timestamp, host=host, duration=duration,
|
||||
referer=referer, do_not_archive=do_not_archive)
|
||||
|
||||
self.message = message
|
||||
self.exception = exception
|
||||
|
||||
class RecordedUrl(RequestedUrl):
|
||||
logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
|
||||
|
||||
def __init__(self, url, request_data, response_recorder, remote_ip,
|
||||
warcprox_meta=None, content_type=None, custom_type=None,
|
||||
status=None, size=None, client_ip=None, method=None,
|
||||
timestamp=None, host=None, duration=None, referer=None,
|
||||
payload_digest=None, truncated=None, warc_records=None,
|
||||
do_not_archive=False):
|
||||
|
||||
super().__init__(url, request_data, response_recorder=response_recorder,
|
||||
warcprox_meta=warcprox_meta, content_type=content_type,
|
||||
custom_type=custom_type, status=status, size=size, client_ip=client_ip,
|
||||
method=method, timestamp=timestamp, host=host, duration=duration,
|
||||
referer=referer, payload_digest=payload_digest, truncated=truncated,
|
||||
warc_records=warc_records, do_not_archive=do_not_archive)
|
||||
|
||||
if type(remote_ip) is not bytes:
|
||||
self.remote_ip = remote_ip.encode('ascii')
|
||||
else:
|
||||
self.remote_ip = remote_ip
|
||||
|
||||
def is_text(self):
|
||||
"""Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types
|
||||
Alternative method: try to decode('ascii') first N bytes to make sure
|
||||
@ -420,51 +498,20 @@ class RecordedUrl:
|
||||
# inherit from object so that multiple inheritance from this class works
|
||||
# properly in python 2
|
||||
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
|
||||
class SingleThreadedWarcProxy(http_server.HTTPServer, object):
|
||||
class SingleThreadedWarcProxy(warcprox.mitmproxy.SingleThreadedMitmProxy):
|
||||
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
|
||||
|
||||
def __init__(
|
||||
self, stats_db=None, status_callback=None,
|
||||
options=warcprox.Options()):
|
||||
self.start_time = doublethink.utcnow()
|
||||
|
||||
warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
|
||||
self, WarcProxyHandler, options)
|
||||
|
||||
self.status_callback = status_callback
|
||||
self.stats_db = stats_db
|
||||
self.options = options
|
||||
self.remote_connection_pool = PoolManager(
|
||||
num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200)
|
||||
server_address = (
|
||||
options.address or 'localhost',
|
||||
options.port if options.port is not None else 8000)
|
||||
|
||||
if options.onion_tor_socks_proxy:
|
||||
try:
|
||||
host, port = options.onion_tor_socks_proxy.split(':')
|
||||
WarcProxyHandler.onion_tor_socks_proxy_host = host
|
||||
WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
|
||||
except ValueError:
|
||||
WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
|
||||
WarcProxyHandler.onion_tor_socks_proxy_port = None
|
||||
|
||||
if options.socket_timeout:
|
||||
WarcProxyHandler._socket_timeout = options.socket_timeout
|
||||
if options.max_resource_size:
|
||||
WarcProxyHandler._max_resource_size = options.max_resource_size
|
||||
if options.tmp_file_max_memory_size:
|
||||
WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size
|
||||
|
||||
http_server.HTTPServer.__init__(
|
||||
self, server_address, WarcProxyHandler, bind_and_activate=True)
|
||||
|
||||
self.digest_algorithm = options.digest_algorithm or 'sha1'
|
||||
|
||||
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
|
||||
self.ca = CertificateAuthority(
|
||||
ca_file=options.cacert or 'warcprox-ca.pem',
|
||||
certs_dir=options.certs_dir or './warcprox-ca',
|
||||
ca_name=ca_name)
|
||||
|
||||
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
|
||||
|
||||
self.running_stats = warcprox.stats.RunningStats()
|
||||
|
||||
def status(self):
|
||||
@ -530,6 +577,6 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
|
||||
self.remote_connection_pool.clear()
|
||||
|
||||
def handle_error(self, request, client_address):
|
||||
self.logger.warn(
|
||||
self.logger.warning(
|
||||
"exception processing request %s from %s", request,
|
||||
client_address, exc_info=True)
|
||||
|
@ -1,7 +1,7 @@
|
||||
'''
|
||||
warcprox/writer.py - warc writer, manages and writes records to warc files
|
||||
|
||||
Copyright (C) 2013-2017 Internet Archive
|
||||
Copyright (C) 2013-2019 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
@ -29,41 +29,49 @@ import warcprox
|
||||
import os
|
||||
import socket
|
||||
import random
|
||||
import threading
|
||||
try:
|
||||
import queue
|
||||
except ImportError:
|
||||
import Queue as queue
|
||||
import contextlib
|
||||
|
||||
class _OneWritableWarc:
|
||||
class WarcWriter:
|
||||
'''
|
||||
Utility class used by WarcWriter
|
||||
A writer for one warc prefix, which rolls over to new warc file,
|
||||
incrementing serial number, when size limit is hit. Should only be used
|
||||
from one thread.
|
||||
'''
|
||||
logger = logging.getLogger('warcprox.writer.WarcWriter')
|
||||
|
||||
logger = logging.getLogger('warcprox.writer._OneWritableWarc')
|
||||
def __init__(self, options=warcprox.Options()):
|
||||
self.options = options
|
||||
|
||||
self.gzip = options.gzip or False
|
||||
self.record_builder = warcprox.warc.WarcRecordBuilder(
|
||||
digest_algorithm=options.digest_algorithm or 'sha1',
|
||||
base32=options.base32)
|
||||
|
||||
def __init__(self, options=warcprox.Options(), randomtoken='0'):
|
||||
self.f = None
|
||||
self.path = None
|
||||
self.finalname = None
|
||||
self.gzip = options.gzip or False
|
||||
self.prefix = options.prefix or 'warcprox'
|
||||
self.port = options.port or 8000
|
||||
self.open_suffix = '' if options.no_warc_open_suffix else '.open'
|
||||
self.randomtoken = randomtoken
|
||||
self.rollover_size = options.rollover_size or 1000000000
|
||||
self.rollover_idle_time = options.rollover_idle_time or None
|
||||
self.directory = options.directory or './warcs'
|
||||
if options.subdir_prefix and options.prefix:
|
||||
self.directory = os.path.sep.join([options.directory, options.prefix]) or './warcs'
|
||||
else:
|
||||
self.directory = options.directory or './warcs'
|
||||
self.filename_template = options.warc_filename or \
|
||||
'{prefix}-{timestamp17}-{randomtoken}-{serialno}'
|
||||
self.last_activity = time.time()
|
||||
self.serial = 0
|
||||
self.randomtoken = ''.join(
|
||||
random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
|
||||
|
||||
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
|
||||
def next_filename(self, serial):
|
||||
def filename(self, serial):
|
||||
"""WARC filename is configurable with CLI parameter --warc-filename.
|
||||
Default: '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
|
||||
Available variables are: prefix, timestamp14, timestamp17, serialno,
|
||||
randomtoken, hostname, shorthostname.
|
||||
randomtoken, hostname, shorthostname, port.
|
||||
Extension ``.warc`` or ``.warc.gz`` is appended automatically.
|
||||
"""
|
||||
hostname = socket.getfqdn()
|
||||
@ -73,7 +81,7 @@ class _OneWritableWarc:
|
||||
timestamp17=warcprox.timestamp17(),
|
||||
serialno='{:05d}'.format(serial),
|
||||
randomtoken=self.randomtoken, hostname=hostname,
|
||||
shorthostname=shorthostname)
|
||||
shorthostname=shorthostname, port=self.port)
|
||||
if self.gzip:
|
||||
fname = fname + '.warc.gz'
|
||||
else:
|
||||
@ -81,13 +89,17 @@ class _OneWritableWarc:
|
||||
return fname
|
||||
|
||||
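As an illustration of the template documented above (all field values made up), the default pattern expands roughly like this; unused variables are simply ignored by str.format:

template = '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
fname = template.format(
    prefix='warcprox', timestamp14='20220301123456',
    timestamp17='20220301123456789', serialno='{:05d}'.format(0),
    randomtoken='f00df00d', hostname='crawler01.example.org',
    shorthostname='crawler01', port=8000)
# -> 'warcprox-20220301123456789-f00df00d-00000', then '.warc' or '.warc.gz' is appended
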
def open(self, serial):
|
||||
'''
|
||||
Opens a new warc file with filename prefix `self.prefix` and serial
|
||||
number `self.serial` and assigns file handle to `self.f`.
|
||||
'''
|
||||
if not os.path.exists(self.directory):
|
||||
self.logger.info(
|
||||
"warc destination directory %s doesn't exist, creating it",
|
||||
self.directory)
|
||||
os.mkdir(self.directory)
|
||||
|
||||
self.finalname = self.next_filename(serial)
|
||||
self.finalname = self.filename(serial)
|
||||
self.logger.trace('opening %s', self.finalname)
|
||||
self.path = os.path.sep.join(
|
||||
[self.directory, self.finalname + self.open_suffix])
|
||||
@ -103,20 +115,73 @@ class _OneWritableWarc:
|
||||
'could not lock file %s (%s)', self.path, exc)
|
||||
return self.f
|
||||
|
||||
def ensure_open(self):
|
||||
'''
|
||||
Ensures `self.f` is ready to write the next warc record.
|
||||
|
||||
If warc is not open, opens one, and writes the warcinfo record.
|
||||
'''
|
||||
if not self.f:
|
||||
serial = self.serial
|
||||
self.serial += 1
|
||||
self.open(serial)
|
||||
warcinfo = self.record_builder.build_warcinfo_record(self.finalname)
|
||||
self.logger.debug('warcinfo.headers=%s', warcinfo.headers)
|
||||
warcinfo.write_to(self.f, gzip=self.gzip)
|
||||
|
||||
def write_records(self, recorded_url):
|
||||
'''
|
||||
Returns tuple of records written, which are instances of
|
||||
`hanzo.warctools.warc.WarcRecord`, decorated with `warc_filename` and
|
||||
`offset` attributes.
|
||||
'''
|
||||
records = self.record_builder.build_warc_records(recorded_url)
|
||||
|
||||
self.ensure_open()
|
||||
total_warc_file_size = None
|
||||
for record in records:
|
||||
offset = self.f.tell()
|
||||
record.write_to(self.f, gzip=self.gzip)
|
||||
record.offset = offset
|
||||
offset2 = self.f.tell()
|
||||
record.length = offset2 - offset
|
||||
total_warc_file_size = offset2
|
||||
record.warc_filename = self.finalname
|
||||
self.logger.trace(
|
||||
'wrote warc record: warc_type=%s content_length=%s '
|
||||
'digest=%s offset=%d warc=%s url=%s', record.type,
|
||||
record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
|
||||
record.get_header(b'WARC-Payload-Digest'), record.offset,
|
||||
self.path, record.get_header(warctools.WarcRecord.URL))
|
||||
self.f.flush()
|
||||
self.last_activity = time.time()
|
||||
# Closes current warc if size limit has been reached.
|
||||
self.maybe_size_rollover(total_warc_file_size)
|
||||
return records
|
||||
|
||||
def close(self):
|
||||
'''
|
||||
Closes out the active warc.
|
||||
|
||||
The next call to `write_records()` will write to a a new warc file with
|
||||
the serial number incremented.
|
||||
'''
|
||||
if self.path:
|
||||
self.logger.trace('closing %s', self.finalname)
|
||||
if self.open_suffix == '':
|
||||
try:
|
||||
fcntl.lockf(self.f, fcntl.LOCK_UN)
|
||||
except IOError as exc:
|
||||
except Exception as exc:
|
||||
self.logger.error(
|
||||
'could not unlock file %s (%s)', self.path, exc)
|
||||
self.f.close()
|
||||
finalpath = os.path.sep.join(
|
||||
[self.directory, self.finalname])
|
||||
os.rename(self.path, finalpath)
|
||||
|
||||
try:
|
||||
self.f.close()
|
||||
finalpath = os.path.sep.join(
|
||||
[self.directory, self.finalname])
|
||||
os.rename(self.path, finalpath)
|
||||
except Exception as exc:
|
||||
self.logger.error(
|
||||
'could not close and rename file %s (%s)', self.path, exc)
|
||||
self.path = None
|
||||
self.f = None
|
||||
|
||||
@ -129,119 +194,23 @@ class _OneWritableWarc:
|
||||
self.finalname, time.time() - self.last_activity)
|
||||
self.close()
|
||||
|
||||
def maybe_size_rollover(self):
|
||||
if self.path and os.path.getsize(self.path) > self.rollover_size:
|
||||
def maybe_size_rollover(self, total_warc_file_size):
|
||||
if total_warc_file_size and total_warc_file_size > self.rollover_size:
|
||||
self.logger.info(
|
||||
'rolling over %s because it has reached %s bytes in size',
|
||||
self.finalname, os.path.getsize(self.path))
|
||||
self.finalname, total_warc_file_size)
|
||||
self.close()
|
||||
|
||||
class WarcWriter:
|
||||
logger = logging.getLogger('warcprox.writer.WarcWriter')
|
||||
|
||||
def __init__(self, options=warcprox.Options()):
|
||||
self.options = options
|
||||
|
||||
self.gzip = options.gzip or False
|
||||
self.record_builder = warcprox.warc.WarcRecordBuilder(
|
||||
digest_algorithm=options.digest_algorithm or 'sha1',
|
||||
base32=options.base32)
|
||||
|
||||
self._available_warcs = queue.Queue()
|
||||
self._warc_count = 0
|
||||
self._warc_count_lock = threading.Lock()
|
||||
|
||||
self._serial = 0
|
||||
self._serial_lock = threading.Lock()
|
||||
|
||||
self._randomtoken = ''.join(
|
||||
random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
|
||||
|
||||
def _bespeak_warc(self):
|
||||
try:
|
||||
return self._available_warcs.get(block=False)
|
||||
except queue.Empty:
|
||||
with self._warc_count_lock:
|
||||
if self._warc_count < self.options.writer_threads:
|
||||
self._warc_count += 1
|
||||
return _OneWritableWarc(self.options, self._randomtoken)
|
||||
# else we're maxed out, wait for one to free up
|
||||
return self._available_warcs.get(block=True)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _warc(self):
|
||||
warc = self._bespeak_warc()
|
||||
|
||||
warc.maybe_size_rollover()
|
||||
|
||||
# lazy file open
|
||||
if warc.f == None:
|
||||
with self._serial_lock:
|
||||
serial = self._serial
|
||||
self._serial += 1
|
||||
warc.open(serial)
|
||||
warcinfo = self.record_builder.build_warcinfo_record(warc.finalname)
|
||||
self.logger.debug('warcinfo.headers=%s', warcinfo.headers)
|
||||
warcinfo.write_to(warc.f, gzip=self.gzip)
|
||||
|
||||
yield warc
|
||||
|
||||
# __exit__()
|
||||
warc.f.flush()
|
||||
warc.last_activity = time.time()
|
||||
self._available_warcs.put(warc)
|
||||
|
||||
def write_records(self, recorded_url):
|
||||
"""Returns tuple of records written, which are instances of
|
||||
hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
|
||||
"offset" attributes."""
|
||||
records = self.record_builder.build_warc_records(recorded_url)
|
||||
|
||||
with self._warc() as warc:
|
||||
for record in records:
|
||||
offset = warc.f.tell()
|
||||
record.write_to(warc.f, gzip=self.gzip)
|
||||
record.offset = offset
|
||||
record.length = warc.f.tell() - offset
|
||||
record.warc_filename = warc.finalname
|
||||
self.logger.trace(
|
||||
'wrote warc record: warc_type=%s content_length=%s '
|
||||
'digest=%s offset=%d warc=%s url=%s',
|
||||
record.type,
|
||||
record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
|
||||
record.get_header(b'WARC-Payload-Digest'),
|
||||
record.offset, warc.path,
|
||||
record.get_header(warctools.WarcRecord.URL))
|
||||
|
||||
return records
|
||||
|
||||
def maybe_idle_rollover(self):
|
||||
warcs = []
|
||||
while True:
|
||||
try:
|
||||
warc = self._available_warcs.get(block=False)
|
||||
warcs.append(warc)
|
||||
except queue.Empty:
|
||||
break
|
||||
for warc in warcs:
|
||||
warc.maybe_idle_rollover()
|
||||
self._available_warcs.put(warc)
|
||||
|
||||
def close_writer(self):
|
||||
while self._warc_count > 0:
|
||||
with self._warc_count_lock:
|
||||
warc = self._available_warcs.get()
|
||||
warc.close()
|
||||
self._warc_count -= 1
|
||||
|
||||
class WarcWriterPool:
|
||||
'''
|
||||
A `WarcWriter` per warc prefix. Should only be used from one thread.
|
||||
'''
|
||||
logger = logging.getLogger("warcprox.writer.WarcWriterPool")
|
||||
|
||||
def __init__(self, options=warcprox.Options()):
|
||||
self.default_warc_writer = WarcWriter(options)
|
||||
self.warc_writers = {} # {prefix:WarcWriter}
|
||||
self.options = options
|
||||
self._lock = threading.RLock()
|
||||
self._last_maybe = time.time()
|
||||
|
||||
# chooses writer for filename specified by warcprox_meta["warc-prefix"] if set
|
||||
@ -251,16 +220,17 @@ class WarcWriterPool:
|
||||
# self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
|
||||
options = warcprox.Options(**vars(self.options))
|
||||
options.prefix = recorded_url.warcprox_meta["warc-prefix"]
|
||||
with self._lock:
|
||||
if not options.prefix in self.warc_writers:
|
||||
self.warc_writers[options.prefix] = WarcWriter(options)
|
||||
w = self.warc_writers[options.prefix]
|
||||
if not options.prefix in self.warc_writers:
|
||||
self.warc_writers[options.prefix] = WarcWriter(options)
|
||||
w = self.warc_writers[options.prefix]
|
||||
return w
|
||||
|
||||
def write_records(self, recorded_url):
|
||||
"""Returns tuple of records written, which are instances of
|
||||
hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
|
||||
"offset" attributes."""
|
||||
'''
|
||||
Returns tuple of records written, which are instances of
|
||||
`hanzo.warctools.warc.WarcRecord`, decorated with `warc_filename` and
|
||||
`offset` attributes.
|
||||
'''
|
||||
return self._writer(recorded_url).write_records(recorded_url)
|
||||
|
||||
def maybe_idle_rollover(self):
|
||||
@ -271,7 +241,20 @@ class WarcWriterPool:
|
||||
self._last_maybe = time.time()
|
||||
|
||||
def close_writers(self):
|
||||
self.default_warc_writer.close_writer()
|
||||
for w in self.warc_writers.values():
|
||||
w.close_writer()
|
||||
self.default_warc_writer.close()
|
||||
for prefix, writer in list(self.warc_writers.items()):
|
||||
del self.warc_writers[prefix]
|
||||
writer.close()
|
||||
|
||||
def close_for_prefix(self, prefix=None):
|
||||
'''
|
||||
Close warc writer for the given warc prefix, or the default prefix if
|
||||
`prefix` is `None`.
|
||||
'''
|
||||
if prefix and prefix in self.warc_writers:
|
||||
writer = self.warc_writers[prefix]
|
||||
del self.warc_writers[prefix]
|
||||
writer.close()
|
||||
else:
|
||||
self.default_warc_writer.close()
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
warcprox/writerthread.py - warc writer thread, reads from the recorded url
|
||||
queue, writes warc records, runs final tasks after warc records are written
|
||||
|
||||
Copyright (C) 2013-2018 Internet Archive
|
||||
Copyright (C) 2013-2019 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
@ -33,6 +33,10 @@ import warcprox
|
||||
from concurrent import futures
|
||||
from datetime import datetime
|
||||
import threading
|
||||
try:
|
||||
import queue
|
||||
except ImportError:
|
||||
import Queue as queue
|
||||
|
||||
class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
||||
logger = logging.getLogger("warcprox.writerthread.WarcWriterProcessor")
|
||||
@ -43,47 +47,33 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
||||
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
|
||||
self.writer_pool = warcprox.writer.WarcWriterPool(options)
|
||||
self.method_filter = set(method.upper() for method in self.options.method_filter or [])
|
||||
|
||||
# set max_queued small, because self.inq is already handling queueing
|
||||
self.thread_local = threading.local()
|
||||
self.thread_profilers = {}
|
||||
# for us; but give it a little breathing room to make sure it can keep
|
||||
# worker threads busy
|
||||
self.pool = warcprox.ThreadPoolExecutor(
|
||||
max_workers=options.writer_threads or 1,
|
||||
max_queued=10 * (options.writer_threads or 1))
|
||||
self.batch = set()
|
||||
self.blackout_period = options.blackout_period or 0
|
||||
|
||||
def _startup(self):
|
||||
self.logger.info('%s warc writer threads', self.pool._max_workers)
|
||||
warcprox.BaseStandardPostfetchProcessor._startup(self)
|
||||
self.close_prefix_reqs = queue.Queue()
|
||||
|
||||
def _get_process_put(self):
|
||||
try:
|
||||
recorded_url = self.inq.get(block=True, timeout=0.5)
|
||||
self.batch.add(recorded_url)
|
||||
self.pool.submit(self._wrap_process_url, recorded_url)
|
||||
finally:
|
||||
self.writer_pool.maybe_idle_rollover()
|
||||
while True:
|
||||
try:
|
||||
prefix = self.close_prefix_reqs.get_nowait()
|
||||
self.writer_pool.close_for_prefix(prefix)
|
||||
except queue.Empty:
|
||||
break
|
||||
self.writer_pool.maybe_idle_rollover()
|
||||
super()._get_process_put()
|
||||
|
||||
def _wrap_process_url(self, recorded_url):
|
||||
if not getattr(self.thread_local, 'name_set', False):
|
||||
threading.current_thread().name = 'WarcWriterThread(tid=%s)' % warcprox.gettid()
|
||||
self.thread_local.name_set = True
|
||||
if self.options.profile:
|
||||
import cProfile
|
||||
if not hasattr(self.thread_local, 'profiler'):
|
||||
self.thread_local.profiler = cProfile.Profile()
|
||||
tid = threading.current_thread().ident
|
||||
self.thread_profilers[tid] = self.thread_local.profiler
|
||||
self.thread_local.profiler.enable()
|
||||
self._process_url(recorded_url)
|
||||
self.thread_local.profiler.disable()
|
||||
else:
|
||||
self._process_url(recorded_url)
|
||||
def close_for_prefix(self, prefix=None):
'''
Request close of warc writer for the given warc prefix, or the default
prefix if `prefix` is `None`.

This API exists so that some code from outside of warcprox proper (in a
third-party plugin for example) can close open warcs promptly when it
knows they are finished.
'''
self.close_prefix_reqs.put(prefix)

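A hypothetical third-party plugin could use this to finalize a job's warcs as soon as the job ends; the processor variable and prefix below are placeholders:

# given a reference to the running WarcWriterProcessor:
writer_processor.close_for_prefix('job-1234')   # close warcs for that prefix
writer_processor.close_for_prefix()             # or close the default prefix
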
def _process_url(self, recorded_url):
|
||||
if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
|
||||
return
|
||||
try:
|
||||
records = []
|
||||
if self._should_archive(recorded_url):
|
||||
@ -97,10 +87,6 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
||||
logging.error(
|
||||
'caught exception processing %s', recorded_url.url,
|
||||
exc_info=True)
|
||||
finally:
|
||||
self.batch.remove(recorded_url)
|
||||
if self.outq:
|
||||
self.outq.put(recorded_url)
|
||||
|
||||
def _filter_accepts(self, recorded_url):
|
||||
if not self.method_filter:
|
||||
|