Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)

Commit f906312800: Merge branch 'dedup-fixes' into qa
@@ -1,20 +1,19 @@
 sudo: required
+dist: xenial
 language: python
 python:
+- 3.7
 - 3.6
 - 3.5
 - 3.4
 - 2.7
 - pypy
-- pypy3
-- 3.7-dev
+- pypy3.5
 - nightly

 matrix:
   allow_failures:
   - python: nightly
-  - python: 3.7-dev
   - python: 2.7
   - python: pypy

@@ -89,12 +89,13 @@ for deduplication works similarly to deduplication by `Heritrix

 4. If not found,

    a. Write ``response`` record with full payload
-   b. Store new entry in deduplication database
+   b. Store new entry in deduplication database (can be disabled, see
+      `Warcprox-Meta HTTP request header <api.rst#warcprox-meta-http-request-header>`_)

 The deduplication database is partitioned into different "buckets". URLs are
 deduplicated only against other captures in the same bucket. If specified, the
-``dedup-bucket`` field of the `Warcprox-Meta HTTP request header
-<api.rst#warcprox-meta-http-request-header>`_ determines the bucket. Otherwise,
+``dedup-buckets`` field of the `Warcprox-Meta HTTP request header
+<api.rst#warcprox-meta-http-request-header>`_ determines the bucket(s). Otherwise,
 the default bucket is used.

 Deduplication can be disabled entirely by starting warcprox with the argument
api.rst
@@ -137,14 +137,16 @@ Example::

     Warcprox-Meta: {"warc-prefix": "special-warc"}

-``dedup-bucket`` (string)
+``dedup-buckets`` (string)
 ~~~~~~~~~~~~~~~~~~~~~~~~~
-Specifies the deduplication bucket. For more information about deduplication
+Specifies the deduplication bucket(s). For more information about deduplication
 see `<README.rst#deduplication>`_.

-Example::
+Examples::

-    Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"}
+    Warcprox-Meta: {"dedup-buckets":{"my-dedup-bucket":"rw"}}
+
+    Warcprox-Meta: {"dedup-buckets":{"my-dedup-bucket":"rw", "my-read-only-dedup-bucket": "ro"}}

 ``blocks`` (list)
 ~~~~~~~~~~~~~~~~~
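For orientation (not part of the changeset), a minimal client-side sketch of the new ``dedup-buckets`` syntax as exercised by the tests further down; the proxy address, warc-prefix, and bucket names are illustrative::

    import json

    import requests

    # address of a running warcprox instance (illustrative)
    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}

    # "rw" buckets are consulted for existing captures and updated with new
    # ones; "ro" buckets are only consulted, never written to
    warcprox_meta = {
        'warc-prefix': 'my-crawl',
        'dedup-buckets': {'my-dedup-bucket': 'rw', 'my-read-only-dedup-bucket': 'ro'},
    }
    headers = {'Warcprox-Meta': json.dumps(warcprox_meta)}

    # verify=False because warcprox man-in-the-middles https with its own CA
    response = requests.get(
            'https://example.com/', proxies=proxies, headers=headers,
            verify=False)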
setup.py
@@ -25,14 +25,16 @@ import setuptools

 deps = [
     'certauth==1.1.6',
-    'warctools>=4.10.0,<=4.10.0',
-    'urlcanon>=0.1.dev16',
+    'warctools>=4.10.0',
+    'urlcanon>=0.3.0',
     'doublethink>=0.2.0.dev87',
-    'urllib3>=1.23',
+    'urllib3>=1.14,<1.25',
     'requests>=2.0.1',
     'PySocks>=1.6.8',
     'cryptography>=2.3',
     'idna>=2.5',
+    'PyYAML>=5.1',
+    'cachetools',
 ]
 try:
     import concurrent.futures
@@ -41,7 +43,7 @@ except:

 setuptools.setup(
     name='warcprox',
-    version='2.4b4.dev195',
+    version='2.4.14',
     description='WARC writing MITM HTTP/S proxy',
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',
@@ -80,7 +80,7 @@ RUN apt-get install -y libsqlite3-dev
 # trough itself
 RUN virtualenv -p python3 /opt/trough-ve3 \
     && . /opt/trough-ve3/bin/activate \
-    && pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string \
+    && pip install git+https://github.com/nlevitt/snakebite.git@py3 \
     && pip install git+https://github.com/internetarchive/trough.git

 RUN mkdir -vp /etc/service/trough-sync-local \
@@ -5,7 +5,7 @@

 set -x

-pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string
+pip install git+https://github.com/nlevitt/snakebite.git@py3
 pip install git+https://github.com/internetarchive/trough.git

 mkdir /etc/trough
@@ -93,9 +93,11 @@ logging.basicConfig(
     stream=sys.stdout, level=logging.TRACE,
     format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
         '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

+logging.getLogger("urllib3").setLevel(logging.WARN)
 logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
-warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
-warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
+import urllib3 ; urllib3.disable_warnings()
+import requests.packages.urllib3 ; requests.packages.urllib3.disable_warnings()

 def wait(callback, timeout=10):
     start = time.time()
@@ -144,7 +146,7 @@ def dump_state(signum=None, frame=None):
         stack = traceback.format_stack(sys._current_frames()[th.ident])
         state_strs.append("".join(stack))

-    logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))
+    logging.warning("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)))

 signal.signal(signal.SIGQUIT, dump_state)

@@ -279,6 +281,15 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
             payload = b'Test.'
             actual_headers = (b'Content-Type: text/plain\r\n'
                             + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
+        elif self.path == '/incomplete-read':
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                     + b'Content-Type: text/plain\r\n'
+                     + b'Transfer-Encoding: chunked\r\n'
+                     + b'\r\n')
+            # payload = b'''1\r\na'''
+            payload = chunkify(
+                    b'Server closes connection when client expects next chunk')
+            payload = payload[:-7]
         else:
             payload = b'404 Not Found\n'
             headers = (b'HTTP/1.1 404 Not Found\r\n'
@@ -292,7 +303,9 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
         headers, payload = self.build_response()
         self.connection.sendall(headers)
         self.connection.sendall(payload)
-        if self.path in ('/missing-content-length', '/empty-response'):
+        if self.path in (
+                '/missing-content-length', '/empty-response',
+                '/incomplete-read'):
             # server must close the connection, else client has no idea if
             # there is more data coming
             self.connection.shutdown(socket.SHUT_RDWR)
@@ -446,7 +459,7 @@ def warcprox_(request, http_daemon, https_daemon):
             logging.info('dropping rethinkdb database %r', parsed.database)
             rr.db_drop(parsed.database).run()
         except Exception as e:
-            logging.warn(
+            logging.warning(
                     'problem deleting rethinkdb database %r: %s',
                     parsed.database, e)
     logging.info('deleting working directory %r', work_dir)
@@ -777,7 +790,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port)

     # archive url1 bucket_a
-    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_a"})}
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-buckets":{"bucket_a":"rw"}})}
     response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers)
     assert response.status_code == 200
     assert response.headers['warcprox-test-header'] == 'k!'
@@ -803,7 +816,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     assert dedup_lookup is None

     # archive url2 bucket_b
-    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_b"})}
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-buckets":{"bucket_b":""}})}
     response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers)
     assert response.status_code == 200
     assert response.headers['warcprox-test-header'] == 'k!'
@@ -903,6 +916,71 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     finally:
         fh.close()

+def test_dedup_buckets_readonly(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies):
+    urls_before = warcprox_.proxy.running_stats.urls
+
+    url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port)
+
+    # archive url1
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets_readonly",
+                                            "dedup-buckets":{"bucket_1":"rw", "bucket_2":"ro"}})
+              }
+    response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'k!'
+    assert response.content == b'I am the warcprox test payload! llllllllll!\n'
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
+
+    # check url1 in dedup db bucket_1 (rw)
+    # logging.info('looking up sha1:bc3fac8847c9412f49d955e626fb58a76befbf81 in bucket_1')
+    dedup_lookup = warcprox_.dedup_db.lookup(
+            b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_1")
+    assert dedup_lookup
+    assert dedup_lookup['url'] == url1.encode('ascii')
+    assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id'])
+    assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date'])
+    record_id = dedup_lookup['id']
+    dedup_date = dedup_lookup['date']
+
+    # check url1 not in dedup db bucket_2 (ro)
+    dedup_lookup = warcprox_.dedup_db.lookup(
+            b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_2")
+    assert dedup_lookup is None
+
+    # close the warc
+    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"]
+    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"]
+    warc_path = os.path.join(writer.directory, writer.finalname)
+    assert not os.path.exists(warc_path)
+    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets_readonly"].close()
+    assert os.path.exists(warc_path)
+
+    # read the warc
+    fh = warctools.ArchiveRecord.open_archive(warc_path)
+    record_iter = fh.read_records(limit=None, offsets=True)
+    try:
+        (offset, record, errors) = next(record_iter)
+        assert record.type == b'warcinfo'
+
+        # url1 bucket_1
+        (offset, record, errors) = next(record_iter)
+        assert record.type == b'response'
+        assert record.url == url1.encode('ascii')
+        # check for duplicate warc record headers
+        assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
+        assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
+        (offset, record, errors) = next(record_iter)
+        assert record.type == b'request'
+
+        # that's all folks
+        assert next(record_iter)[1] == None
+        assert next(record_iter, None) == None
+
+    finally:
+        fh.close()
+
 def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
     revisits_before = warcprox_.proxy.stats_db.value(
@@ -915,7 +993,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
                 http_daemon.server_port, i)
         headers = {"Warcprox-Meta": json.dumps({
             "warc-prefix":"test_dedup_buckets",
-            "dedup-bucket":"bucket_%s" % i})}
+            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
         pool.submit(
                 requests.get, url, proxies=archiving_proxies, verify=False,
                 headers=headers)
@@ -931,7 +1009,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
                 http_daemon.server_port, -i - 1)
         headers = {"Warcprox-Meta": json.dumps({
             "warc-prefix":"test_dedup_buckets",
-            "dedup-bucket":"bucket_%s" % i})}
+            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
         pool.submit(
                 requests.get, url, proxies=archiving_proxies, verify=False,
                 headers=headers)
@@ -946,7 +1024,7 @@ def test_dedup_bucket_concurrency(https_daemon, http_daemon, warcprox_, archivin
                 http_daemon.server_port, i)
         headers = {"Warcprox-Meta": json.dumps({
             "warc-prefix":"test_dedup_buckets",
-            "dedup-bucket":"bucket_%s" % i})}
+            "dedup-buckets":{"bucket_%s" % i:"rw"}})}
         pool.submit(
                 requests.get, url, proxies=archiving_proxies, verify=False,
                 headers=headers)
@@ -965,12 +1043,12 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
         },
         {
             "url_match": "SURT_MATCH",
-            "value": "http://(localhost:%s,)/fuh/" % (http_daemon.server_port),
+            "value": "http://(localhost,:%s)/fuh/" % (http_daemon.server_port),
         },
         {
             "url_match": "SURT_MATCH",
             # this rule won't match because of http scheme, https port
-            "value": "http://(localhost:%s,)/fuh/" % (https_daemon.server_port),
+            "value": "http://(localhost,:%s)/fuh/" % (https_daemon.server_port),
         },
         {
             "domain": "bad.domain.com",
@@ -1487,7 +1565,7 @@ def test_dedup_ok_flag(
     assert dedup_lookup is None

     # archive with dedup_ok:False
-    request_meta = {'dedup-bucket':'test_dedup_ok_flag','dedup-ok':False}
+    request_meta = {'dedup-buckets':{'test_dedup_ok_flag':''},'dedup-ok':False}
    headers = {'Warcprox-Meta': json.dumps(request_meta)}
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, verify=False)
@@ -1505,7 +1583,7 @@ def test_dedup_ok_flag(
     assert dedup_lookup is None

     # archive without dedup_ok:False
-    request_meta = {'dedup-bucket':'test_dedup_ok_flag'}
+    request_meta = {'dedup-buckets':{'test_dedup_ok_flag':''}}
     headers = {'Warcprox-Meta': json.dumps(request_meta)}
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, verify=False)
@@ -1611,13 +1689,11 @@ def test_controller_with_defaults():
     assert not wwp.writer_pool.default_warc_writer.record_builder.base32
     assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'


 class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor):
     CHAIN_POSITION = 'early'
     def _process_url(self):
         pass


 def test_load_plugin():
     options = warcprox.Options(port=0, plugins=[
         'warcprox.stats.RunningStats',
@@ -1714,13 +1790,13 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
     url = 'http://localhost:%s/b/b' % http_daemon.server_port
     headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})}
     response = requests.get(url, proxies=archiving_proxies, headers=headers)
-    assert response.status_code == 500
+    assert response.status_code == 400
     assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'

     url = 'http://localhost:%s/b/c' % http_daemon.server_port
     headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})}
     response = requests.get(url, proxies=archiving_proxies, headers=headers)
-    assert response.status_code == 500
+    assert response.status_code == 400
     assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'

 def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
@@ -1763,7 +1839,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):

     crawl_log = open(default_crawl_log_path, 'rb').read()
     # tests will fail in year 3000 :)
-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b' 200 '
     assert crawl_log[31:42] == b' 54 '
     fields = crawl_log.split()
@@ -1783,7 +1859,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert extra_info['contentSize'] == 145

     crawl_log_1 = open(file, 'rb').read()
-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_1)
     assert crawl_log_1[24:31] == b' 200 '
     assert crawl_log_1[31:42] == b' 54 '
     fields = crawl_log_1.split()
@@ -1821,7 +1897,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):

     crawl_log_2 = open(file, 'rb').read()

-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_2)
     assert crawl_log_2[24:31] == b' 200 '
     assert crawl_log_2[31:42] == b' 54 '
     fields = crawl_log_2.split()
@@ -1854,7 +1930,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):

     assert os.path.exists(file)
     crawl_log_3 = open(file, 'rb').read()
-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_3)
     assert crawl_log_3[24:31] == b' 200 '
     assert crawl_log_3[31:42] == b' 0 '
     fields = crawl_log_3.split()
@@ -1894,7 +1970,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert os.path.exists(file)
     crawl_log_4 = open(file, 'rb').read()

-    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4)
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_4)
     assert crawl_log_4[24:31] == b' 204 '
     assert crawl_log_4[31:42] == b' 38 '
     fields = crawl_log_4.split()
@@ -1976,6 +2052,10 @@ def test_socket_timeout_response(
 def test_empty_response(
         warcprox_, http_daemon, https_daemon, archiving_proxies,
         playback_proxies):
+    # localhost:server_port was added to the `bad_hostnames_ports` cache by
+    # previous tests and this causes subsequent tests to fail. We clear it.
+    warcprox_.proxy.bad_hostnames_ports.clear()
+
     url = 'http://localhost:%s/empty-response' % http_daemon.server_port
     response = requests.get(url, proxies=archiving_proxies, verify=False)
     assert response.status_code == 502
@@ -1991,6 +2071,10 @@ def test_payload_digest(warcprox_, http_daemon):
     Tests that digest is of RFC2616 "entity body"
     (transfer-decoded but not content-decoded)
     '''
+    # localhost:server_port was added to the `bad_hostnames_ports` cache by
+    # previous tests and this causes subsequent tests to fail. We clear it.
+    warcprox_.proxy.bad_hostnames_ports.clear()
+
     class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler):
         def __init__(self, url):
             self.path = url
@@ -2224,6 +2308,23 @@ def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies):
     with pytest.raises(StopIteration):
         next(rec_iter)

+def test_incomplete_read(http_daemon, warcprox_, archiving_proxies):
+    urls_before = warcprox_.proxy.running_stats.urls
+
+    # see https://github.com/internetarchive/warcprox/pull/123
+    url = 'http://localhost:%s/incomplete-read' % http_daemon.server_port
+    with pytest.raises(requests.exceptions.ChunkedEncodingError):
+        response = requests.get(
+                url, proxies=archiving_proxies, verify=False, timeout=10)
+
+    # although `requests.get` raises exception here, other clients like
+    # browsers put up with the server misbehavior; warcprox does too, and will
+    # record the response verbatim in the warc; this `wait()` call tests
+    # that a warc record is written
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
+
 if __name__ == '__main__':
     pytest.main()

@@ -78,6 +78,15 @@ class RequestBlockedByRule(Exception):
     def __str__(self):
         return "%s: %s" % (self.__class__.__name__, self.msg)

+class BadRequest(Exception):
+    '''
+    Raised in case of a request deemed unacceptable by warcprox.
+    '''
+    def __init__(self, msg):
+        self.msg = msg
+    def __str__(self):
+        return "%s: %s" % (self.__class__.__name__, self.msg)
+
 class BasePostfetchProcessor(threading.Thread):
     logger = logging.getLogger("warcprox.BasePostfetchProcessor")

@@ -71,7 +71,7 @@ class RethinkCaptures:
                         "unexpected result saving batch of %s: %s "
                         "entries" % (len(self._batch), result))
             if result["replaced"] > 0 or result["unchanged"] > 0:
-                self.logger.warn(
+                self.logger.warning(
                         "inserted=%s replaced=%s unchanged=%s in big "
                         "captures table (normally replaced=0 and "
                         "unchanged=0)", result["inserted"],
@@ -148,7 +148,7 @@ class RethinkCaptures:
                     recorded_url.payload_digest.digest()
                     ).decode("utf-8")
         else:
-            self.logger.warn(
+            self.logger.warning(
                     "digest type is %r but big captures table is indexed "
                     "by sha1",
                     recorded_url.payload_digest.name)
@@ -157,8 +157,11 @@ class RethinkCaptures:
             sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

         if (recorded_url.warcprox_meta
-                and "dedup-bucket" in recorded_url.warcprox_meta):
-            bucket = recorded_url.warcprox_meta["dedup-bucket"]
+                and "dedup-buckets" in recorded_url.warcprox_meta):
+            for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                if not bucket_mode == 'ro':
+                    # maybe this is the right thing to do here? or should we return an entry for each? or ?
+                    break
         else:
             bucket = "__unspecified__"

@@ -441,7 +441,12 @@ class WarcproxController(object):
                     exc_info=True)
             pass
         finally:
-            self.shutdown()
+            try:
+                self.shutdown()
+            except:
+                self.logger.critical("graceful shutdown failed", exc_info=True)
+                self.logger.critical("killing myself -9")
+                os.kill(os.getpid(), 9)

     def _dump_profiling(self):
         import pstats, tempfile, os, io
@@ -34,6 +34,7 @@ import urllib3
 from urllib3.exceptions import HTTPError
 import collections
 from concurrent import futures
+from functools import lru_cache

 urllib3.disable_warnings()

@@ -46,11 +47,11 @@ class DedupableMixin(object):
     def should_dedup(self, recorded_url):
         """Check if we should try to run dedup on resource based on payload
         size compared with min text/binary dedup size options.
-        When we use option --dedup-only-with-bucket, `dedup-bucket` is required
+        When we use option --dedup-only-with-bucket, `dedup-buckets` is required
         in Warcprox-Meta to perform dedup.
         Return Boolean.
         """
-        if self.dedup_only_with_bucket and "dedup-bucket" not in recorded_url.warcprox_meta:
+        if self.dedup_only_with_bucket and "dedup-buckets" not in recorded_url.warcprox_meta:
             return False
         if recorded_url.is_text():
             return recorded_url.response_recorder.payload_size() > self.min_text_size
@@ -68,10 +69,13 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
                 and recorded_url.payload_digest
                 and self.should_dedup(recorded_url)):
             digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32)
-            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
-                recorded_url.dedup_info = self.dedup_db.lookup(
-                        digest_key, recorded_url.warcprox_meta["dedup-bucket"],
-                        recorded_url.url)
+            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
+                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                    recorded_url.dedup_info = self.dedup_db.lookup(
+                            digest_key, bucket, recorded_url.url)
+                    if recorded_url.dedup_info:
+                        # we found an existing capture
+                        break
             else:
                 recorded_url.dedup_info = self.dedup_db.lookup(
                         digest_key, url=recorded_url.url)
@@ -147,10 +151,12 @@ class DedupDb(DedupableMixin):
                 and self.should_dedup(recorded_url)):
             digest_key = warcprox.digest_str(
                     recorded_url.payload_digest, self.options.base32)
-            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
-                self.save(
-                        digest_key, records[0],
-                        bucket=recorded_url.warcprox_meta["dedup-bucket"])
+            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
+                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                    if not bucket_mode == "ro":
+                        self.save(
+                                digest_key, records[0],
+                                bucket=bucket)
             else:
                 self.save(digest_key, records[0])

@@ -212,8 +218,10 @@ class RethinkDedupDb(DedupDb, DedupableMixin):
                 and self.should_dedup(recorded_url)):
             digest_key = warcprox.digest_str(
                     recorded_url.payload_digest, self.options.base32)
-            if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
-                self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["dedup-bucket"])
+            if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
+                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                    if not bucket_mode == 'ro':
+                        self.save(digest_key, records[0], bucket=bucket)
             else:
                 self.save(digest_key, records[0])

@@ -236,6 +244,7 @@ class CdxServerDedup(DedupDb):
             headers['Cookie'] = options.cdxserver_dedup_cookies
         self.http_pool = urllib3.PoolManager(maxsize=maxsize, retries=0,
                                              timeout=2.0, headers=headers)
+        self.cached_lookup = lru_cache(maxsize=1024)(self.lookup)

     def loader(self, *args, **kwargs):
         return CdxServerDedupLoader(self, self.options)
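Aside (not part of the changeset): ``cached_lookup`` wraps the bound ``lookup`` method with ``functools.lru_cache`` at construction time, so each ``CdxServerDedup`` instance gets its own cache. A standalone sketch of the idiom, with illustrative names and return values::

    from functools import lru_cache

    class CdxClient:
        def __init__(self):
            # wrapping the *bound* method gives each instance its own cache,
            # sized independently of any other instance
            self.cached_lookup = lru_cache(maxsize=1024)(self.lookup)

        def lookup(self, digest_key, url):
            # stand-in for the real cdx server query (illustrative)
            return {'digest': digest_key, 'url': url}

    client = CdxClient()
    client.cached_lookup('sha1:deadbeef', 'http://example.com/')
    client.cached_lookup('sha1:deadbeef', 'http://example.com/')
    print(client.cached_lookup.cache_info())  # hits=1, misses=1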
@@ -296,7 +305,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
     def __init__(self, cdx_dedup, options=warcprox.Options()):
         warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
         DedupableMixin.__init__(self, options)
-        self.pool = futures.ThreadPoolExecutor(max_workers=400)
+        self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
         self.batch = set()
         self.cdx_dedup = cdx_dedup

@@ -315,7 +324,10 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
         try:
             digest_key = warcprox.digest_str(recorded_url.payload_digest,
                                              self.options.base32)
-            dedup_info = self.cdx_dedup.lookup(digest_key, recorded_url.url)
+            dedup_info = self.cdx_dedup.cached_lookup(digest_key, recorded_url.url)
+            cache_info = self.cdx_dedup.cached_lookup.cache_info()
+            if (cache_info.hits + cache_info.misses) % 1000 == 0:
+                self.logger.info(self.cdx_dedup.cached_lookup.cache_info())
             if dedup_info:
                 recorded_url.dedup_info = dedup_info
         except ValueError as exc:
@@ -342,11 +354,12 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
                     and recorded_url.warc_records[0].type == b'response'
                     and self.trough_dedup_db.should_dedup(recorded_url)):
                 if (recorded_url.warcprox_meta
-                        and 'dedup-bucket' in recorded_url.warcprox_meta):
-                    bucket = recorded_url.warcprox_meta['dedup-bucket']
+                        and 'dedup-buckets' in recorded_url.warcprox_meta):
+                    for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                        if not bucket_mode == 'ro':
+                            buckets[bucket].append(recorded_url)
                 else:
-                    bucket = '__unspecified__'
-                buckets[bucket].append(recorded_url)
+                    buckets['__unspecified__'].append(recorded_url)
         return buckets

     def _process_batch(self, batch):
@@ -369,7 +382,7 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
         except futures.TimeoutError as e:
             # the remaining threads actually keep running in this case,
             # there's no way to stop them, but that should be harmless
-            logging.warn(
+            logging.warning(
                     'timed out saving dedup info to trough', exc_info=True)

 class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
@@ -394,11 +407,11 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
                     and recorded_url.payload_digest
                     and self.trough_dedup_db.should_dedup(recorded_url)):
                 if (recorded_url.warcprox_meta
-                        and 'dedup-bucket' in recorded_url.warcprox_meta):
-                    bucket = recorded_url.warcprox_meta['dedup-bucket']
+                        and 'dedup-buckets' in recorded_url.warcprox_meta):
+                    for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                        buckets[bucket].append(recorded_url)
                 else:
-                    bucket = '__unspecified__'
-                buckets[bucket].append(recorded_url)
+                    buckets['__unspecified__'].append(recorded_url)
             else:
                 discards.append(
                         warcprox.digest_str(
@@ -453,7 +466,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
                         recorded_url.dedup_info = entry
             except Exception as e:
                 # batch_lookup raised exception or something
-                logging.warn(
+                logging.warning(
                         'problem looking up dedup info for %s urls '
                         'in bucket %s', len(buckets[bucket]), bucket,
                         exc_info=True)
@@ -469,7 +482,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
         except futures.TimeoutError as e:
             # the remaining threads actually keep running in this case,
             # there's no way to stop them, but that should be harmless
-            self.logger.warn(
+            self.logger.warning(
                     'timed out loading dedup info from trough', exc_info=True)

 class TroughDedupDb(DedupDb, DedupableMixin):
@@ -571,9 +584,11 @@ class TroughDedupDb(DedupDb, DedupableMixin):
                 and self.should_dedup(recorded_url)):
             digest_key = warcprox.digest_str(
                     recorded_url.payload_digest, self.options.base32)
-            if recorded_url.warcprox_meta and 'dedup-bucket' in recorded_url.warcprox_meta:
-                self.save(
-                        digest_key, records[0],
-                        bucket=recorded_url.warcprox_meta['dedup-bucket'])
+            if recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta:
+                for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
+                    if not bucket_mode == 'ro':
+                        self.save(
+                                digest_key, records[0],
+                                bucket=bucket)
             else:
                 self.save(digest_key, records[0])
@@ -30,6 +30,7 @@ except ImportError:
     import Queue as queue

 import logging
+import logging.config
 import sys
 import hashlib
 import argparse
@@ -39,6 +40,7 @@ import traceback
 import signal
 import threading
 import certauth.certauth
+import yaml
 import warcprox
 import doublethink
 import cryptography.hazmat.backends.openssl
@@ -168,6 +170,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
             help=suppress(
                 'value of Cookie header to include in requests to the cdx '
                 'server, when using --cdxserver-dedup'))
+    hidden.add_argument(
+            '--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
+            type=int, default=50, help=suppress(
+                'maximum number of cdx server dedup threads'))
     arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
             type=int, default=0,
             help=('try to dedup text resources with payload size over this limit in bytes'))
@@ -235,6 +241,9 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
     arg_parser.add_argument(
             '--trace', dest='trace', action='store_true',
             help='very verbose logging')
+    arg_parser.add_argument(
+            '--logging-conf-file', dest='logging_conf_file', default=None,
+            help=('reads logging configuration from a YAML file'))
     arg_parser.add_argument(
             '--version', action='version',
             version="warcprox {}".format(warcprox.__version__))
@@ -255,7 +264,7 @@ def dump_state(signum=None, frame=None):
         except Exception as e:
             state_strs.append('<n/a:%r>' % e)

-    logging.warn(
+    logging.warning(
             'dumping state (caught signal %s)\n%s',
             signum, '\n'.join(state_strs))

@@ -298,6 +307,11 @@ def main(argv=None):
                 '%(asctime)s %(process)d %(levelname)s %(threadName)s '
                 '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))

+    if args.logging_conf_file:
+        with open(args.logging_conf_file, 'r') as fd:
+            conf = yaml.safe_load(fd)
+            logging.config.dictConfig(conf)
+
     # see https://github.com/pyca/cryptography/issues/2911
     cryptography.hazmat.backends.openssl.backend.activate_builtin_random()

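Aside (not part of the changeset): the file passed to the new ``--logging-conf-file`` option is read with ``yaml.safe_load()`` and handed to ``logging.config.dictConfig()``, so any mapping that follows the standard dictConfig schema works. A minimal sketch with illustrative handler, formatter, and level choices::

    import logging.config

    import yaml

    # equivalent of a small YAML file passed via --logging-conf-file;
    # the keys follow the logging.config.dictConfig schema
    conf_yaml = '''
    version: 1
    disable_existing_loggers: false
    formatters:
      verbose:
        format: '%(asctime)s %(process)d %(levelname)s %(name)s %(message)s'
    handlers:
      console:
        class: logging.StreamHandler
        formatter: verbose
    root:
      level: INFO
      handlers: [console]
    '''

    logging.config.dictConfig(yaml.safe_load(conf_yaml))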
@@ -312,7 +326,11 @@ def main(argv=None):
         # SIGQUIT does not exist on some platforms (windows)
         pass

-    controller.run_until_shutdown()
+    try:
+        controller.run_until_shutdown()
+    except:
+        logging.fatal('unhandled exception in controller', exc_info=True)
+        sys.exit(1)

 def ensure_rethinkdb_tables(argv=None):
     '''
@@ -384,7 +402,7 @@ def ensure_rethinkdb_tables(argv=None):
         did_something = True
     if args.rethinkdb_trough_db_url:
         dedup_db = warcprox.dedup.TroughDedupDb(options)
-        logging.warn(
+        logging.warning(
                 'trough is responsible for creating most of the rethinkdb '
                 'tables that it uses')
         did_something = True
@@ -35,6 +35,13 @@ try:
     import urllib.parse as urllib_parse
 except ImportError:
     import urlparse as urllib_parse
+# In python2/3, urllib parse caches in memory URL parsing results to avoid
+# repeating the process for the same URL. The problem is that the default
+# in memory cache size is just 20.
+# https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py#L80
+# since we do a lot of URL parsing, it makes sense to increase cache size.
+urllib_parse.MAX_CACHE_SIZE = 2000
+
 try:
     import http.client as http_client
     # In python3 http.client.parse_headers() enforces http_client._MAXLINE
@@ -45,6 +52,11 @@ try:
     http_client._MAXLINE = 4194304 # 4 MiB
 except ImportError:
     import httplib as http_client
+# http_client has an arbitrary limit of 100 HTTP Headers which is too low and
+# it raises an HTTPException if the target URL has more.
+# https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L113
+http_client._MAXHEADERS = 7000
+
 import json
 import socket
 import logging
@@ -64,8 +76,13 @@ import urlcanon
 import time
 import collections
 import cProfile
+from urllib3 import PoolManager
 from urllib3.util import is_connection_dropped
+from urllib3.exceptions import TimeoutError, HTTPError
 import doublethink
+from cachetools import TTLCache
+from threading import RLock
+from certauth.certauth import CertificateAuthority

 class ProxyingRecorder(object):
     """
@@ -100,7 +117,7 @@ class ProxyingRecorder(object):
                 self.proxy_client.sendall(hunk)
             except BaseException as e:
                 self._proxy_client_conn_open = False
-                self.logger.warn(
+                self.logger.warning(
                         '%s sending data to proxy client for url %s',
                         e, self.url)
                 self.logger.info(
@@ -210,9 +227,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     and records the bytes in transit as it proxies them.
     '''
     logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")

     _socket_timeout = 60
     _max_resource_size = None
     _tmp_file_max_memory_size = 512 * 1024
+    onion_tor_socks_proxy_host = None
+    onion_tor_socks_proxy_port = None

     def __init__(self, request, client_address, server):
         threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
@@ -228,7 +248,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         else:
             self.url = self.path
             u = urllib_parse.urlparse(self.url)
-            if u.scheme != 'http':
+            if u.scheme != 'http' or u.netloc == '':
                 raise Exception(
                         'unable to parse request %r as a proxy request' % (
                             self.requestline))
@@ -240,6 +260,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 query=u.query, fragment=u.fragment))
         self.hostname = urlcanon.normalize_host(host).decode('ascii')

+    def _hostname_port_cache_key(self):
+        return '%s:%s' % (self.hostname, self.port)
+
     def _connect_to_remote_server(self):
         '''
         Connect to destination.
@@ -251,7 +274,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         '''
         self._conn_pool = self.server.remote_connection_pool.connection_from_host(
                 host=self.hostname, port=int(self.port), scheme='http',
-                pool_kwargs={'maxsize': 6, 'timeout': self._socket_timeout})
+                pool_kwargs={'maxsize': 12, 'timeout': self._socket_timeout})

         self._remote_server_conn = self._conn_pool._get_conn()
         if is_connection_dropped(self._remote_server_conn):
@@ -283,7 +306,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                     self._remote_server_conn.sock = ssl.wrap_socket(
                             self._remote_server_conn.sock)
                 except ssl.SSLError:
-                    self.logger.warn(
+                    self.logger.warning(
                             "failed to establish ssl connection to %s; "
                             "python ssl library does not support SNI, "
                             "consider upgrading to python 2.7.9+ or 3.4+",
@@ -332,7 +355,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 else:
                     self.send_error(500, str(e))
             except Exception as f:
-                self.logger.warn("failed to send error response ({}) to proxy client: {}".format(e, f))
+                self.logger.warning("failed to send error response ({}) to proxy client: {}".format(e, f))
             return

         # Reload!
@ -368,25 +391,55 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
else:
|
else:
|
||||||
self._determine_host_port()
|
self._determine_host_port()
|
||||||
assert self.url
|
assert self.url
|
||||||
|
# Check if target hostname:port is in `bad_hostnames_ports` cache
|
||||||
|
# to avoid retrying to connect. Cached value is http status code.
|
||||||
|
cached = None
|
||||||
|
hostname_port = self._hostname_port_cache_key()
|
||||||
|
with self.server.bad_hostnames_ports_lock:
|
||||||
|
cached = self.server.bad_hostnames_ports.get(hostname_port)
|
||||||
|
if cached:
|
||||||
|
self.logger.info('Cannot connect to %s (cache)', hostname_port)
|
||||||
|
self.send_error(cached)
|
||||||
|
return
|
||||||
# Connect to destination
|
# Connect to destination
|
||||||
self._connect_to_remote_server()
|
self._connect_to_remote_server()
|
||||||
except warcprox.RequestBlockedByRule as e:
|
except warcprox.RequestBlockedByRule as e:
|
||||||
# limit enforcers have already sent the appropriate response
|
# limit enforcers have already sent the appropriate response
|
||||||
self.logger.info("%r: %r", self.requestline, e)
|
self.logger.info("%r: %r", self.requestline, e)
|
||||||
return
|
return
|
||||||
|
except warcprox.BadRequest as e:
|
||||||
|
self.send_error(400, e.msg)
|
||||||
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# If connection fails, add hostname:port to cache to avoid slow
|
||||||
|
# subsequent reconnection attempts. `NewConnectionError` can be
|
||||||
|
# caused by many types of errors which are handled by urllib3.
|
||||||
|
response_code = 500
|
||||||
|
cache = False
|
||||||
|
if isinstance(e, (socket.timeout, TimeoutError,)):
|
||||||
|
response_code = 504
|
||||||
|
cache = True
|
||||||
|
elif isinstance(e, HTTPError):
|
||||||
|
response_code = 502
|
||||||
|
cache = True
|
||||||
|
|
||||||
|
if cache:
|
||||||
|
host_port = self._hostname_port_cache_key()
|
||||||
|
with self.server.bad_hostnames_ports_lock:
|
||||||
|
self.server.bad_hostnames_ports[host_port] = response_code
|
||||||
|
self.logger.info('bad_hostnames_ports cache size: %d',
|
||||||
|
len(self.server.bad_hostnames_ports))
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"problem processing request %r: %r",
|
"problem processing request %r: %r",
|
||||||
self.requestline, e, exc_info=True)
|
self.requestline, e, exc_info=True)
|
||||||
self.send_error(500, str(e))
|
self.send_error(response_code)
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return self._proxy_request()
|
return self._proxy_request()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.server.shutting_down:
|
if self.server.shutting_down:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'sending 503 warcprox shutting down %r: %r',
|
'sending 503 warcprox shutting down %r: %r',
|
||||||
self.requestline, e)
|
self.requestline, e)
|
||||||
self.send_error(503, 'warcprox shutting down')
|
self.send_error(503, 'warcprox shutting down')
|
||||||
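The hunk above introduces a fail-fast cache: hostname:port pairs that recently failed to connect are remembered in a `TTLCache` guarded by an `RLock` (the cache itself is not thread-safe), and later requests to the same host are answered immediately with the cached status code instead of timing out again. A standalone sketch of the same pattern; only the TTLCache + RLock usage is taken from the hunk, and the helper names are illustrative, not warcprox functions:

    from threading import RLock
    from cachetools import TTLCache

    bad_hosts = TTLCache(maxsize=1024, ttl=60)   # entries expire after 60 seconds
    bad_hosts_lock = RLock()                     # TTLCache is not thread-safe

    def check_cached_failure(hostname_port):
        # return the cached http status code, or None if the host is not known-bad
        with bad_hosts_lock:
            return bad_hosts.get(hostname_port)

    def record_failure(hostname_port, status_code):
        with bad_hosts_lock:
            bad_hosts[hostname_port] = status_code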
@ -394,7 +447,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
            self.logger.error(
                    'error from remote server(?) %r: %r',
                    self.requestline, e, exc_info=True)
-           self.send_error(502, str(e))
+           self.send_error(502)
            return

    def send_error(self, code, message=None, explain=None):
@ -410,9 +463,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
        try:
            return http_server.BaseHTTPRequestHandler.send_error(
                    self, code, message, explain)
-       except:
-           self.logger.error(
-                   'send_error(%r, %r, %r) raised exception', exc_info=True)
+       except Exception as e:
+           level = logging.ERROR
+           if isinstance(e, OSError) and e.errno == 9:
+               level = logging.TRACE
+           self.logger.log(
+                   level, 'send_error(%r, %r, %r) raised exception',
+                   exc_info=True)
            return None

    def _proxy_request(self, extra_response_headers={}):
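`send_error` now demotes the log message when the failure is `OSError` errno 9 (bad file descriptor, i.e. the client socket is already gone). `logging.TRACE` is not a standard library level; warcprox appears to register a custom level for it elsewhere in the package. A minimal sketch of how such a level can be registered; the numeric value and helper are assumptions, not warcprox's actual definition:

    import logging

    # assumed sketch: make logging.TRACE exist, below DEBUG
    logging.TRACE = logging.DEBUG - 5
    logging.addLevelName(logging.TRACE, 'TRACE')

    def trace(self, msg, *args, **kwargs):
        if self.isEnabledFor(logging.TRACE):
            self._log(logging.TRACE, msg, args, **kwargs)

    logging.Logger.trace = trace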
@ -478,9 +535,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                    tmp_file_max_memory_size=self._tmp_file_max_memory_size)
            prox_rec_res.begin(extra_response_headers=extra_response_headers)

-           buf = prox_rec_res.read(65536)
+           buf = None
            while buf != b'':
-               buf = prox_rec_res.read(65536)
+               try:
+                   buf = prox_rec_res.read(65536)
+               except http_client.IncompleteRead as e:
+                   self.logger.warn('%s from %s', e, self.url)
+                   buf = e.partial
+
                if (self._max_resource_size and
                        prox_rec_res.recorder.len > self._max_resource_size):
                    prox_rec_res.truncated = b'length'
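The change above lets the download loop survive `http.client.IncompleteRead`: whatever bytes did arrive are kept via `e.partial` and the capture continues (possibly truncated) instead of aborting. A minimal self-contained version of the same loop shape, where `resp` stands in for the proxied response object; warcprox's real loop also records the bytes and enforces the maximum resource size shown in the hunk:

    import http.client as http_client

    def drain(resp, chunk_size=65536):
        total = 0
        buf = None
        while buf != b'':
            try:
                buf = resp.read(chunk_size)
            except http_client.IncompleteRead as e:
                buf = e.partial          # keep the partial data; the loop will end
            total += len(buf)
        return total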
@ -506,7 +568,19 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
            # put it back in the pool to reuse it later.
            if not is_connection_dropped(self._remote_server_conn):
                self._conn_pool._put_conn(self._remote_server_conn)
-       except:
+       except Exception as e:
+           # A common error is to connect to the remote server successfully
+           # but raise a `RemoteDisconnected` exception when trying to begin
+           # downloading. Its caused by prox_rec_res.begin(...) which calls
+           # http_client._read_status(). In that case, the host is also bad
+           # and we must add it to `bad_hostnames_ports` cache.
+           if isinstance(e, http_client.RemoteDisconnected):
+               host_port = self._hostname_port_cache_key()
+               with self.server.bad_hostnames_ports_lock:
+                   self.server.bad_hostnames_ports[host_port] = 502
+               self.logger.info('bad_hostnames_ports cache size: %d',
+                       len(self.server.bad_hostnames_ports))
+
            self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
            self._remote_server_conn.sock.close()
            raise
@ -521,7 +595,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
        return self.do_COMMAND

    def log_error(self, fmt, *args):
-       self.logger.warn(fmt, *args)
+       self.logger.warning(fmt, *args)

class PooledMixIn(socketserver.ThreadingMixIn):
    logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn")
@ -670,3 +744,52 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
        for sock in self.remote_server_socks:
            self.shutdown_request(sock)

+class SingleThreadedMitmProxy(http_server.HTTPServer):
+    logger = logging.getLogger('warcprox.warcproxy.SingleThreadedMitmProxy')
+
+    def __init__(
+            self, MitmProxyHandlerClass=MitmProxyHandler,
+            options=warcprox.Options()):
+        self.options = options
+
+        # TTLCache is not thread-safe. Access to the shared cache from multiple
+        # threads must be properly synchronized with an RLock according to ref:
+        # https://cachetools.readthedocs.io/en/latest/
+        self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
+        self.bad_hostnames_ports_lock = RLock()
+
+        self.remote_connection_pool = PoolManager(
+            num_pools=max((options.max_threads or 0) // 6, 400))
+
+        if options.onion_tor_socks_proxy:
+            try:
+                host, port = options.onion_tor_socks_proxy.split(':')
+                MitmProxyHandlerClass.onion_tor_socks_proxy_host = host
+                MitmProxyHandlerClass.onion_tor_socks_proxy_port = int(port)
+            except ValueError:
+                MitmProxyHandlerClass.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
+                MitmProxyHandlerClass.onion_tor_socks_proxy_port = None
+
+        if options.socket_timeout:
+            MitmProxyHandlerClass._socket_timeout = options.socket_timeout
+        if options.max_resource_size:
+            MitmProxyHandlerClass._max_resource_size = options.max_resource_size
+        if options.tmp_file_max_memory_size:
+            MitmProxyHandlerClass._tmp_file_max_memory_size = options.tmp_file_max_memory_size
+
+        self.digest_algorithm = options.digest_algorithm or 'sha1'
+
+        ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
+        self.ca = CertificateAuthority(
+            ca_file=options.cacert or 'warcprox-ca.pem',
+            certs_dir=options.certs_dir or './warcprox-ca',
+            ca_name=ca_name)
+
+        server_address = (
+            options.address or 'localhost',
+            options.port if options.port is not None else 8000)
+
+        http_server.HTTPServer.__init__(
+            self, server_address, MitmProxyHandlerClass,
+            bind_and_activate=True)
+
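The new `SingleThreadedMitmProxy` base class gathers server setup that was previously done in `SingleThreadedWarcProxy`: the `bad_hostnames_ports` cache, the urllib3 `PoolManager`, handler tuning from `Options`, the certificate authority, and the listening socket. A rough usage sketch follows; it assumes only the attributes visible in the hunk above, and the localhost:8000 fallback comes from the constructor itself:

    import warcprox
    from warcprox.mitmproxy import SingleThreadedMitmProxy

    # run the generic MITM proxy by itself (not warcprox's normal entry point,
    # which builds WarcProxy instead)
    options = warcprox.Options()      # unset address/port fall back to localhost:8000
    proxy = SingleThreadedMitmProxy(options=options)
    try:
        proxy.serve_forever()         # inherited from http.server.HTTPServer
    except KeyboardInterrupt:
        proxy.server_close()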
@ -42,6 +42,7 @@ from warcprox.mitmproxy import MitmProxyHandler
import warcprox
import sqlite3
import threading
+from cachetools import TTLCache

class PlaybackProxyHandler(MitmProxyHandler):
    logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler")
@ -219,6 +220,8 @@ class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
        self.playback_index_db = playback_index_db
        self.warcs_dir = options.directory
        self.options = options
+       self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
+       self.bad_hostnames_ports_lock = threading.RLock()

    def server_activate(self):
        http_server.HTTPServer.server_activate(self)
@ -81,7 +81,7 @@ def unravel_buckets(url, warcprox_meta):
    for bucket in warcprox_meta["stats"]["buckets"]:
        if isinstance(bucket, dict):
            if not 'bucket' in bucket:
-               self.logger.warn(
+               self.logger.warning(
                        'ignoring invalid stats bucket in '
                        'warcprox-meta header %s', bucket)
                continue
@ -190,7 +190,7 @@ class TroughClient(object):
            return
        if response.status_code != 200:
            self._write_url_cache.pop(segment_id, None)
-           self.logger.warn(
+           self.logger.warning(
                    'unexpected response %r %r %r from %r to sql=%r',
                    response.status_code, response.reason, response.text,
                    write_url, sql)
@ -125,48 +125,59 @@ class WarcRecordBuilder:
            headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
        if content_type is not None:
            headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
-       if payload_digest is not None:
-           headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
        # truncated value may be 'length' or 'time'
        if truncated is not None:
            headers.append((b'WARC-Truncated', truncated))
+       if content_length is not None:
+           headers.append((
+               warctools.WarcRecord.CONTENT_LENGTH,
+               str(content_length).encode('latin1')))
+
        if recorder is not None:
-           if content_length is not None:
-               headers.append((
-                   warctools.WarcRecord.CONTENT_LENGTH,
-                   str(content_length).encode('latin1')))
-           else:
+           if payload_digest is not None:
+               headers.append(
+                   (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
+           if content_length is None:
                headers.append((
                    warctools.WarcRecord.CONTENT_LENGTH,
                    str(len(recorder)).encode('latin1')))
            headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                warcprox.digest_str(recorder.block_digest, self.base32)))
            recorder.tempfile.seek(0)
-           record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
+           record = warctools.WarcRecord(
+               headers=headers, content_file=recorder.tempfile)
        else:
-           if content_length is not None:
-               headers.append((
-                   warctools.WarcRecord.CONTENT_LENGTH,
-                   str(content_length).encode('latin1')))
-           else:
+           if content_length is None:
                headers.append((
                    warctools.WarcRecord.CONTENT_LENGTH,
                    str(len(data)).encode('latin1')))
-           # no http headers so block digest == payload digest
-           if not payload_digest:
-               payload_digest = warcprox.digest_str(
+           block_digest = None
+           if not hasattr(data, 'read'):
+               block_digest = warcprox.digest_str(
                    hashlib.new(self.digest_algorithm, data), self.base32)
-               headers.append((
-                   warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
-           headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest))
+           if not content_type.lower().startswith(b'application/http'):
+               # no http headers, so block digest == payload digest
+               if payload_digest and not block_digest:
+                   block_digest = payload_digest
+               elif block_digest and not payload_digest:
+                   payload_digest = block_digest
+
+           if block_digest:
+               headers.append(
+                   (warctools.WarcRecord.BLOCK_DIGEST, block_digest))
+           if payload_digest:
+               headers.append(
+                   (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
+
            if hasattr(data, 'read'):
                record = warctools.WarcRecord(
                    headers=headers, content_file=data)
            else:
                content_tuple = content_type, data
                record = warctools.WarcRecord(
-                   headers=headers, content=content_tuple)
+                   headers=headers, content=(content_type, data))

        return record

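The rewritten record-building branch above distinguishes the block digest (over the raw record block) from the payload digest (over the HTTP payload), and only treats them as interchangeable when the record carries no HTTP headers (content type not `application/http`). A compressed illustration of that decision for buffered data; `digest_of` and `reconcile_digests` are illustrative helpers, not warcprox's API:

    import hashlib

    def digest_of(data, algorithm='sha1'):
        h = hashlib.new(algorithm, data)
        return ('%s:%s' % (h.name, h.hexdigest())).encode('ascii')

    def reconcile_digests(data, content_type, payload_digest=None):
        block_digest = None
        if not hasattr(data, 'read'):            # only buffered bytes are hashed here
            block_digest = digest_of(data)
        if not content_type.lower().startswith(b'application/http'):
            # no http headers, so block digest == payload digest
            if payload_digest and not block_digest:
                block_digest = payload_digest
            elif block_digest and not payload_digest:
                payload_digest = block_digest
        return block_digest, payload_digest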
@ -38,15 +38,14 @@ import logging
import json
import socket
from hanzo import warctools
-from certauth.certauth import CertificateAuthority
import warcprox
import datetime
import urlcanon
import os
-from urllib3 import PoolManager
import tempfile
import hashlib
import doublethink
+import re

class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
    '''
@ -167,7 +166,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
        if warcprox_meta and 'warc-prefix' in warcprox_meta and (
                '/' in warcprox_meta['warc-prefix']
                or '\\' in warcprox_meta['warc-prefix']):
-           raise Exception(
+           raise warcprox.BadRequest(
                    "request rejected by warcprox: slash and backslash are not "
                    "permitted in warc-prefix")

@ -349,6 +348,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
            # logging better handled elsewhere?
            pass

+RE_MIMETYPE = re.compile(r'[;\s]')

class RecordedUrl:
    logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
@ -377,8 +377,14 @@ class RecordedUrl:
        if warcprox_meta:
            if 'captures-bucket' in warcprox_meta:
                # backward compatibility
-               warcprox_meta['dedup-bucket'] = warcprox_meta['captures-bucket']
+               warcprox_meta['dedup-buckets'] = {}
+               warcprox_meta['dedup-buckets'][warcprox_meta['captures-bucket']] = 'rw'
                del warcprox_meta['captures-bucket']
+           if 'dedup-bucket' in warcprox_meta:
+               # more backwards compatibility
+               warcprox_meta['dedup-buckets'] = {}
+               warcprox_meta['dedup-buckets'][warcprox_meta['dedup-bucket']] = 'rw'
+               del warcprox_meta['dedup-bucket']
            self.warcprox_meta = warcprox_meta
        else:
            self.warcprox_meta = {}
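The hunk above keeps old clients working: the legacy single-bucket fields `captures-bucket` and `dedup-bucket` in the Warcprox-Meta header are rewritten into the newer `dedup-buckets` map, with the legacy bucket granted `'rw'` access. A standalone sketch of that normalization; `normalize_dedup_buckets` is an illustrative name, not a warcprox function:

    def normalize_dedup_buckets(warcprox_meta):
        # rewrite legacy single-bucket fields into the dedup-buckets map
        for legacy_key in ('captures-bucket', 'dedup-bucket'):
            if legacy_key in warcprox_meta:
                warcprox_meta['dedup-buckets'] = {warcprox_meta[legacy_key]: 'rw'}
                del warcprox_meta[legacy_key]
        return warcprox_meta

    # e.g. {'captures-bucket': 'my-bucket'} becomes {'dedup-buckets': {'my-bucket': 'rw'}}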
@ -387,9 +393,8 @@ class RecordedUrl:

        self.mimetype = content_type
        if self.mimetype:
-           n = self.mimetype.find(";")
-           if n >= 0:
-               self.mimetype = self.mimetype[:n]
+           # chop off subtype, and ensure there's no whitespace
+           self.mimetype = RE_MIMETYPE.split(self.mimetype, 2)[0]

        self.custom_type = custom_type
        self.status = status
|
|||||||
# inherit from object so that multiple inheritance from this class works
|
# inherit from object so that multiple inheritance from this class works
|
||||||
# properly in python 2
|
# properly in python 2
|
||||||
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
|
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
|
||||||
class SingleThreadedWarcProxy(http_server.HTTPServer, object):
|
class SingleThreadedWarcProxy(warcprox.mitmproxy.SingleThreadedMitmProxy):
|
||||||
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
|
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, stats_db=None, status_callback=None,
|
self, stats_db=None, status_callback=None,
|
||||||
options=warcprox.Options()):
|
options=warcprox.Options()):
|
||||||
self.start_time = doublethink.utcnow()
|
self.start_time = doublethink.utcnow()
|
||||||
|
|
||||||
|
warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
|
||||||
|
self, WarcProxyHandler, options)
|
||||||
|
|
||||||
self.status_callback = status_callback
|
self.status_callback = status_callback
|
||||||
self.stats_db = stats_db
|
self.stats_db = stats_db
|
||||||
self.options = options
|
|
||||||
self.remote_connection_pool = PoolManager(
|
|
||||||
num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200)
|
|
||||||
server_address = (
|
|
||||||
options.address or 'localhost',
|
|
||||||
options.port if options.port is not None else 8000)
|
|
||||||
|
|
||||||
if options.onion_tor_socks_proxy:
|
|
||||||
try:
|
|
||||||
host, port = options.onion_tor_socks_proxy.split(':')
|
|
||||||
WarcProxyHandler.onion_tor_socks_proxy_host = host
|
|
||||||
WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
|
|
||||||
except ValueError:
|
|
||||||
WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
|
|
||||||
WarcProxyHandler.onion_tor_socks_proxy_port = None
|
|
||||||
|
|
||||||
if options.socket_timeout:
|
|
||||||
WarcProxyHandler._socket_timeout = options.socket_timeout
|
|
||||||
if options.max_resource_size:
|
|
||||||
WarcProxyHandler._max_resource_size = options.max_resource_size
|
|
||||||
if options.tmp_file_max_memory_size:
|
|
||||||
WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size
|
|
||||||
|
|
||||||
http_server.HTTPServer.__init__(
|
|
||||||
self, server_address, WarcProxyHandler, bind_and_activate=True)
|
|
||||||
|
|
||||||
self.digest_algorithm = options.digest_algorithm or 'sha1'
|
|
||||||
|
|
||||||
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
|
|
||||||
self.ca = CertificateAuthority(
|
|
||||||
ca_file=options.cacert or 'warcprox-ca.pem',
|
|
||||||
certs_dir=options.certs_dir or './warcprox-ca',
|
|
||||||
ca_name=ca_name)
|
|
||||||
|
|
||||||
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
|
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
|
||||||
|
|
||||||
self.running_stats = warcprox.stats.RunningStats()
|
self.running_stats = warcprox.stats.RunningStats()
|
||||||
|
|
||||||
def status(self):
|
def status(self):
|
||||||
@ -530,6 +504,6 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
        self.remote_connection_pool.clear()

    def handle_error(self, request, client_address):
-       self.logger.warn(
+       self.logger.warning(
                "exception processing request %s from %s", request,
                client_address, exc_info=True)
@ -149,6 +149,7 @@ class WarcWriter:
                record.get_header(b'WARC-Payload-Digest'), record.offset,
                self.path, record.get_header(warctools.WarcRecord.URL))
        self.f.flush()
+       self.last_activity = time.time()

        return records
