mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
warcinfo record
This commit is contained in:
parent
556e969465
commit
b3b6406e71
76
warcprox.py
76
warcprox.py
@ -6,7 +6,7 @@ import BaseHTTPServer, SocketServer
|
|||||||
import httplib
|
import httplib
|
||||||
import socket
|
import socket
|
||||||
import urlparse
|
import urlparse
|
||||||
import OpenSSL.crypto, OpenSSL.crypto
|
import OpenSSL
|
||||||
import ssl
|
import ssl
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
@ -274,20 +274,22 @@ class WarcRecordQueuer(RequestInterceptorPlugin, ResponseInterceptorPlugin):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def make_warc_uuid(self, text):
|
@staticmethod
|
||||||
|
def make_warc_uuid(text):
|
||||||
return "<urn:uuid:{0}>".format(uuid.UUID(hashlib.sha1(text).hexdigest()[0:32]))
|
return "<urn:uuid:{0}>".format(uuid.UUID(hashlib.sha1(text).hexdigest()[0:32]))
|
||||||
|
|
||||||
|
|
||||||
def do_response(self, data):
|
def do_response(self, data):
|
||||||
logging.info('{0} << {1}'.format(self.url, repr(data[:100])))
|
logging.info('{0} << {1}'.format(self.url, repr(data[:100])))
|
||||||
|
|
||||||
warc_record_id = self.make_warc_uuid("{0} {1}".format(self.url, datetime.now().isoformat()))
|
warc_record_date = warctools.warc.warc_datetime_str(datetime.now())
|
||||||
|
warc_record_id = WarcRecordQueuer.make_warc_uuid("{0} {1}".format(self.url, warc_record_date))
|
||||||
logging.info('{0}: {1}'.format(warctools.WarcRecord.ID, warc_record_id))
|
logging.info('{0}: {1}'.format(warctools.WarcRecord.ID, warc_record_id))
|
||||||
|
|
||||||
headers = []
|
headers = []
|
||||||
headers.append((warctools.WarcRecord.ID, warc_record_id))
|
headers.append((warctools.WarcRecord.ID, warc_record_id))
|
||||||
headers.append((warctools.WarcRecord.URL, self.url))
|
headers.append((warctools.WarcRecord.URL, self.url))
|
||||||
headers.append((warctools.WarcRecord.DATE, warctools.warc.warc_datetime_str(datetime.now())))
|
headers.append((warctools.WarcRecord.DATE, warc_record_date))
|
||||||
# headers.append((warctools.WarcRecord.IP_ADDRESS, ip))
|
# headers.append((warctools.WarcRecord.IP_ADDRESS, ip))
|
||||||
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.RESPONSE))
|
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.RESPONSE))
|
||||||
|
|
||||||
@ -329,13 +331,57 @@ class WarcWriterThread(threading.Thread):
|
|||||||
|
|
||||||
|
|
||||||
def _close_writer(self):
|
def _close_writer(self):
|
||||||
final_name = self._fpath[:-5]
|
if self._fpath:
|
||||||
logging.info('closing {0}'.format(final_name))
|
final_name = self._fpath[:-5]
|
||||||
self._f.close()
|
logging.info('closing {0}'.format(final_name))
|
||||||
os.rename(self._fpath, final_name)
|
self._f.close()
|
||||||
|
os.rename(self._fpath, final_name)
|
||||||
|
|
||||||
self._fpath = None
|
self._fpath = None
|
||||||
self._f = None
|
self._f = None
|
||||||
|
|
||||||
|
# WARC/1.0
|
||||||
|
# WARC-Type: warcinfo
|
||||||
|
# WARC-Date: 2013-10-15T22:11:29Z
|
||||||
|
# WARC-Filename: ARCHIVEIT-3714-WEEKLY-14487-20131015221129606-00000-wbgrp-crawl105.us.archive.org-6442.warc.gz
|
||||||
|
# WARC-Record-ID: <urn:uuid:8c5d5d7d-11df-4a83-9999-8d6c8244316b>
|
||||||
|
# Content-Type: application/warc-fields
|
||||||
|
# Content-Length: 713
|
||||||
|
#
|
||||||
|
# software: Heritrix/3.1.2-SNAPSHOT-20131011-0101 http://crawler.archive.org
|
||||||
|
# ip: 207.241.226.68
|
||||||
|
# hostname: wbgrp-crawl105.us.archive.org
|
||||||
|
# format: WARC File Format 1.0
|
||||||
|
# conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
|
||||||
|
# isPartOf: 3714-20131015221121926
|
||||||
|
# description: recurrence=WEEKLY, maxDuration=259200, maxDocumentCount=null, isTestCrawl=false, isPatchCrawl=false, oneTimeSubtype=null, seedCount=1, accountId
|
||||||
|
# robots: obey
|
||||||
|
# http-header-user-agent: Mozilla/5.0 (compatible; archive.org_bot; Archive-It; +http://archive-it.org/files/site-owners.html)
|
||||||
|
def _make_warcinfo_record(self, filename):
|
||||||
|
warc_record_date = warctools.warc.warc_datetime_str(datetime.now())
|
||||||
|
warc_record_id = WarcRecordQueuer.make_warc_uuid("{0} {1}".format(filename, warc_record_date))
|
||||||
|
|
||||||
|
headers = []
|
||||||
|
headers.append((warctools.WarcRecord.ID, warc_record_id))
|
||||||
|
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
|
||||||
|
headers.append((warctools.WarcRecord.FILENAME, filename))
|
||||||
|
headers.append((warctools.WarcRecord.DATE, warc_record_date))
|
||||||
|
# headers.append((warctools.WarcRecord.IP_ADDRESS, ip))
|
||||||
|
|
||||||
|
warcinfo_fields = []
|
||||||
|
warcinfo_fields.append('software: warcprox.py https://github.com/nlevitt/warcprox')
|
||||||
|
hostname = socket.gethostname()
|
||||||
|
warcinfo_fields.append('hostname: {0}'.format(hostname))
|
||||||
|
warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)))
|
||||||
|
warcinfo_fields.append('format: WARC File Format 1.0')
|
||||||
|
warcinfo_fields.append('robots: ignore') # XXX implement robots support
|
||||||
|
# warcinfo_fields.append('description: {0}'.format(self.description))
|
||||||
|
# warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
|
||||||
|
data = '\r\n'.join(warcinfo_fields) + '\r\n'
|
||||||
|
|
||||||
|
warcrecord = warctools.WarcRecord(headers=headers, content=('application/warc-fields', data))
|
||||||
|
|
||||||
|
return warcrecord
|
||||||
|
|
||||||
|
|
||||||
# <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
|
# <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
|
||||||
@ -344,9 +390,15 @@ class WarcWriterThread(threading.Thread):
|
|||||||
self._close_writer()
|
self._close_writer()
|
||||||
|
|
||||||
if self._f == None:
|
if self._f == None:
|
||||||
self._fpath = '{0}/{1}-{2}-{3:05d}-{4}-{5}-{6}.warc{7}.open'.format(
|
filename = '{}-{}-{:05d}-{}-{}-{}.warc{}'.format(
|
||||||
self.directory, self.prefix, self.timestamp17(), self._serial, os.getpid(), socket.gethostname(), self.port, '.gz' if self.gzip else '')
|
self.prefix, self.timestamp17(), self._serial, os.getpid(),
|
||||||
|
socket.gethostname(), self.port, '.gz' if self.gzip else '')
|
||||||
|
self._fpath = '{0}/{1}.open'.format(self.directory, filename)
|
||||||
self._f = open(self._fpath, 'wb')
|
self._f = open(self._fpath, 'wb')
|
||||||
|
|
||||||
|
warcinfo_record = self._make_warcinfo_record(filename)
|
||||||
|
warcinfo_record.write_to(self._f, gzip=self.gzip)
|
||||||
|
|
||||||
self._serial += 1
|
self._serial += 1
|
||||||
|
|
||||||
return self._f
|
return self._f
|
||||||
|
Loading…
x
Reference in New Issue
Block a user