mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge branch 'master' into qa
* master:
  respect CA-related command line options
  bump version number after pull request
  use trick to avoid dns looking up local ip
  close open warcs at shutdown
  Remove unused writer.tell() call in Writer.write_records
This commit is contained in:
commit
aa4a9bf3ab
2
setup.py
2
setup.py
@@ -52,7 +52,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.4b1.dev139',
|
version='2.4b1.dev143',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@@ -1,23 +1,23 @@
|
|||||||
#
|
'''
|
||||||
# warcprox/warc.py - assembles warc records
|
warcprox/warc.py - assembles warc records
|
||||||
#
|
|
||||||
# Copyright (C) 2013-2016 Internet Archive
|
Copyright (C) 2013-2018 Internet Archive
|
||||||
#
|
|
||||||
# This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
# modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
# as published by the Free Software Foundation; either version 2
|
as published by the Free Software Foundation; either version 2
|
||||||
# of the License, or (at your option) any later version.
|
of the License, or (at your option) any later version.
|
||||||
#
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
This program is distributed in the hope that it will be useful,
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
# GNU General Public License for more details.
|
GNU General Public License for more details.
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU General Public License
|
You should have received a copy of the GNU General Public License
|
||||||
# along with this program; if not, write to the Free Software
|
along with this program; if not, write to the Free Software
|
||||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||||
# USA.
|
USA.
|
||||||
#
|
'''
|
||||||
|
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
@@ -140,6 +140,13 @@ class WarcRecordBuilder:
|
|||||||
|
|
||||||
return record
|
return record
|
||||||
|
|
||||||
|
def _local_address(self):
|
||||||
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||||
|
s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable
|
||||||
|
output = s.getsockname()[0]
|
||||||
|
s.close()
|
||||||
|
return output
|
||||||
|
|
||||||
def build_warcinfo_record(self, filename):
|
def build_warcinfo_record(self, filename):
|
||||||
warc_record_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
|
warc_record_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
|
||||||
record_id = warctools.WarcRecord.random_warc_uuid()
|
record_id = warctools.WarcRecord.random_warc_uuid()
|
||||||
@@ -154,7 +161,7 @@ class WarcRecordBuilder:
|
|||||||
warcinfo_fields.append(b'software: warcprox ' + warcprox.__version__.encode('latin1'))
|
warcinfo_fields.append(b'software: warcprox ' + warcprox.__version__.encode('latin1'))
|
||||||
hostname = socket.gethostname()
|
hostname = socket.gethostname()
|
||||||
warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
|
warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
|
||||||
warcinfo_fields.append('ip: {}'.format(socket.gethostbyname(hostname)).encode('latin1'))
|
warcinfo_fields.append(('ip: %s' % self._local_address()).encode('latin1'))
|
||||||
warcinfo_fields.append(b'format: WARC File Format 1.0')
|
warcinfo_fields.append(b'format: WARC File Format 1.0')
|
||||||
# warcinfo_fields.append('robots: ignore')
|
# warcinfo_fields.append('robots: ignore')
|
||||||
# warcinfo_fields.append('description: {0}'.format(self.description))
|
# warcinfo_fields.append('description: {0}'.format(self.description))
|
||||||
|
@@ -404,7 +404,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
|
|||||||
|
|
||||||
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
|
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
|
||||||
self.ca = CertificateAuthority(
|
self.ca = CertificateAuthority(
|
||||||
ca_file='warcprox-ca.pem', certs_dir='./warcprox-ca',
|
ca_file=options.cacert or 'warcprox-ca.pem',
|
||||||
|
certs_dir=options.certs_dir or './warcprox-ca',
|
||||||
ca_name=ca_name)
|
ca_name=ca_name)
|
||||||
|
|
||||||
self.recorded_url_q = warcprox.TimestampedQueue(
|
self.recorded_url_q = warcprox.TimestampedQueue(
|
||||||
|
@@ -158,7 +158,6 @@ class WarcWriter:
|
|||||||
|
|
||||||
with self._lock:
|
with self._lock:
|
||||||
writer = self._writer()
|
writer = self._writer()
|
||||||
recordset_offset = writer.tell()
|
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
offset = writer.tell()
|
offset = writer.tell()
|
||||||
|
@@ -89,3 +89,7 @@ class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
recorded_url.method, recorded_url.url.decode("utf-8"),
|
recorded_url.method, recorded_url.url.decode("utf-8"),
|
||||||
recorded_url.mimetype, recorded_url.size, payload_digest,
|
recorded_url.mimetype, recorded_url.size, payload_digest,
|
||||||
type_, filename, offset)
|
type_, filename, offset)
|
||||||
|
|
||||||
|
def _shutdown(self):
|
||||||
|
self.writer_pool.close_writers()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user